From b4f79022adb138bfe850696a6ef7470aa06d06ee Mon Sep 17 00:00:00 2001
From: Xiaoxuan Meng
Date: Fri, 18 Oct 2024 22:58:28 -0700
Subject: [PATCH] Global memory arbitration optimization (#11262)

Summary:
This PR adds a global memory arbitration optimization that decouples the frontend memory arbitration request from the backend slow memory arbitration processing. This makes it possible to (1) remove the global locking inside the arbitrator for slow memory arbitration processing; (2) respect the end-to-end memory arbitration time budget to avoid the node-level deadlocks that have occurred in production in the past; (3) apply backend optimizations such as query-level memory reclamation parallelism; and (4) support more advanced memory arbitration policies, such as preferring to abort younger queries over older ones so that an old, slow query still has a chance to run to completion, and making a more consistent abort choice across nodes in a distributed execution environment like Prestissimo instead of relying solely on a query's current capacity. The existing memory design doc will be updated to reflect the internal changes in a followup.

This change was shadowed in the Prestissimo batch shadow and the LBM stress test for reliability (in the LBM stress test, the memory checker receives the pushback signal from jemalloc earlier). For a performance measure, taking a spill-heavy query from Prestissimo batch production and running 25 instances of it in parallel, the landing time (from the start of the first query to the end of the last query) was reduced from 22 minutes to 14 minutes, and the average execution time was cut in half. The speedup comes mostly from query-level spill parallelism as well as the reduced global locking. The total memory arbitration time dropped from 40k minutes to 17k minutes, and the driver queue time dropped from 19k minutes to 7k minutes. The stress test has some flakiness, but the reduction in memory arbitration wall time is consistent across runs, and the total amount of spilled data is also consistent across runs at around 22TB. A followup needs to fix the issues on the spilling execution path exposed by the stress test.
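Design sketch (illustrative only): the decoupling described above — frontend threads post an arbitration request and wait with an end-to-end timeout while a dedicated backend thread performs the slow reclaim work and fulfills waiters — can be summarized with the minimal, self-contained C++ sketch below. This is not the Velox API: the names ArbitrationQueueSketch, Waiter, and requestCapacity are invented for the example, the reclaim work is stubbed out, and abort/error handling is omitted; the real implementation is in SharedArbitrator in this patch.

#include <chrono>
#include <condition_variable>
#include <cstdint>
#include <future>
#include <map>
#include <mutex>
#include <thread>

// A waiting capacity-growth request. The promise is fulfilled by the
// background arbitration thread with the number of bytes granted.
struct Waiter {
  uint64_t requestBytes;
  std::promise<uint64_t> grantedBytes;
};

class ArbitrationQueueSketch {
 public:
  ArbitrationQueueSketch() : controller_([this] { run(); }) {}

  ~ArbitrationQueueSketch() {
    {
      std::lock_guard<std::mutex> l(mu_);
      stop_ = true;
    }
    cv_.notify_one();
    controller_.join();
  }

  // Frontend path: enqueue the request, wake the controller, and wait with an
  // end-to-end timeout. Returns the granted bytes, or 0 on timeout so the
  // caller can fail the allocation instead of blocking indefinitely.
  uint64_t requestCapacity(
      uint64_t participantId,
      uint64_t bytes,
      std::chrono::milliseconds timeout) {
    Waiter waiter{bytes, {}};
    auto granted = waiter.grantedBytes.get_future();
    {
      std::lock_guard<std::mutex> l(mu_);
      waiters_.emplace(participantId, &waiter);
    }
    cv_.notify_one();
    if (granted.wait_for(timeout) != std::future_status::ready) {
      std::lock_guard<std::mutex> l(mu_);
      if (waiters_.erase(participantId) == 1) {
        return 0;  // Still queued: give up to respect the time budget.
      }
      // Otherwise the controller already dequeued this waiter and will
      // fulfill the promise shortly; fall through and take the result.
    }
    return granted.get();
  }

 private:
  // Backend path: the slow work (victim selection, spilling, aborting) runs
  // on this single thread, outside any lock held by requesting threads.
  void run() {
    for (;;) {
      std::map<uint64_t, Waiter*> pending;
      {
        std::unique_lock<std::mutex> l(mu_);
        cv_.wait(l, [&] { return stop_ || !waiters_.empty(); });
        if (stop_) {
          return;
        }
        pending.swap(waiters_);
      }
      for (auto& entry : pending) {
        // Placeholder for reclaim-by-spill / reclaim-by-abort; the sketch
        // simply grants each request to stay self-contained.
        entry.second->grantedBytes.set_value(entry.second->requestBytes);
      }
    }
  }

  std::mutex mu_;
  std::condition_variable cv_;
  // Keyed by participant id so older participants (smaller ids) are served
  // first, mirroring the "let old queries finish" policy described above.
  std::map<uint64_t, Waiter*> waiters_;
  bool stop_{false};
  std::thread controller_;
};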
Orri's review patch: P1654857111 Reviewed By: tanjialiang, oerling Differential Revision: D63902323 --- velox/common/base/Counters.cpp | 18 + velox/common/base/Counters.h | 10 +- velox/common/base/SuccinctPrinter.cpp | 8 +- velox/common/base/VeloxException.h | 67 +- .../common/base/tests/SuccinctPrinterTest.cpp | 11 + velox/common/memory/ArbitrationOperation.h | 2 +- .../common/memory/ArbitrationParticipant.cpp | 45 +- velox/common/memory/ArbitrationParticipant.h | 34 +- velox/common/memory/MemoryArbitrator.cpp | 7 + velox/common/memory/MemoryArbitrator.h | 2 +- velox/common/memory/MemoryPool.cpp | 4 +- velox/common/memory/SharedArbitrator.cpp | 1666 +++++---- velox/common/memory/SharedArbitrator.h | 661 ++-- .../tests/ArbitrationParticipantTest.cpp | 164 +- .../memory/tests/MemoryArbitratorTest.cpp | 12 +- .../common/memory/tests/MemoryManagerTest.cpp | 23 +- velox/common/memory/tests/MemoryPoolTest.cpp | 38 +- .../memory/tests/MockSharedArbitratorTest.cpp | 2988 ++++++++++------- .../memory/tests/SharedArbitratorTest.cpp | 21 +- .../memory/tests/SharedArbitratorTestUtil.h | 92 + velox/core/PlanNode.h | 2 +- velox/docs/monitoring/metrics.rst | 10 +- velox/exec/Task.cpp | 2 +- velox/exec/fuzzer/MemoryArbitrationFuzzer.cpp | 9 +- velox/exec/tests/AggregationTest.cpp | 277 +- velox/exec/tests/HashJoinTest.cpp | 3 +- velox/exec/tests/utils/ArbitratorTestUtil.h | 2 +- velox/exec/tests/utils/OperatorTestBase.cpp | 10 +- velox/exec/tests/utils/OperatorTestBase.h | 4 +- 29 files changed, 3530 insertions(+), 2662 deletions(-) create mode 100644 velox/common/memory/tests/SharedArbitratorTestUtil.h diff --git a/velox/common/base/Counters.cpp b/velox/common/base/Counters.cpp index 4b8dbf5156b2..1ff64da4232c 100644 --- a/velox/common/base/Counters.cpp +++ b/velox/common/base/Counters.cpp @@ -371,6 +371,24 @@ void registerVeloxMetrics() { kMetricArbitratorGlobalArbitrationCount, facebook::velox::StatType::COUNT); + // The number of victims distribution of a global arbitration run [0, 32] with + // 32 buckets. It is configured to report the number of victims at P50, P90, + // P99, and P100 percentiles. + DEFINE_HISTOGRAM_METRIC( + kMetricArbitratorGlobalArbitrationNumReclaimVictims, + 1, + 0, + 32, + 50, + 90, + 99, + 100); + + // The number of victim query memory pool having nothing to spill. + DEFINE_METRIC( + kMetricArbitratorGlobalArbitrationFailedVictimCount, + facebook::velox::StatType::COUNT); + // The time distribution of a global arbitration run [0, 600s] with 20 // buckets. It is configured to report the latency at P50, P90, P99, and P100 // percentiles. 
diff --git a/velox/common/base/Counters.h b/velox/common/base/Counters.h index 0d26052bc929..4ee6da39d9c1 100644 --- a/velox/common/base/Counters.h +++ b/velox/common/base/Counters.h @@ -56,7 +56,7 @@ constexpr folly::StringPiece kMetricTaskMemoryReclaimWaitTimeoutCount{ "velox.task_memory_reclaim_wait_timeout_count"}; constexpr folly::StringPiece kMetricOpMemoryReclaimTimeMs{ - "velox.op_memory_reclaim_ms"}; + "velox.op_memory_reclaim_time_ms"}; constexpr folly::StringPiece kMetricOpMemoryReclaimedBytes{ "velox.op_memory_reclaim_bytes"}; @@ -88,6 +88,14 @@ constexpr folly::StringPiece kMetricArbitratorLocalArbitrationCount{ constexpr folly::StringPiece kMetricArbitratorGlobalArbitrationCount{ "velox.arbitrator_global_arbitration_count"}; +constexpr folly::StringPiece + kMetricArbitratorGlobalArbitrationNumReclaimVictims{ + "velox.arbitrator_global_arbitration_num_reclaim_victims"}; + +constexpr folly::StringPiece + kMetricArbitratorGlobalArbitrationFailedVictimCount{ + "velox.arbitrator_global_arbitration_failed_victim_count"}; + constexpr folly::StringPiece kMetricArbitratorGlobalArbitrationBytes{ "velox.arbitrator_global_arbitration_bytes"}; diff --git a/velox/common/base/SuccinctPrinter.cpp b/velox/common/base/SuccinctPrinter.cpp index 1a0023da9147..2c3ae78691a2 100644 --- a/velox/common/base/SuccinctPrinter.cpp +++ b/velox/common/base/SuccinctPrinter.cpp @@ -40,7 +40,7 @@ namespace { /// Possible units are days(d), hours(h), minutes(m), seconds(s). std::string succinctSeconds(uint64_t seconds) { std::stringstream out; - int days = seconds / kSecondsInDay; + const uint64_t days = seconds / kSecondsInDay; bool isFirstUnit = true; if (days) { out << days << "d"; @@ -48,7 +48,7 @@ std::string succinctSeconds(uint64_t seconds) { } seconds -= days * kSecondsInDay; - int hours = seconds / kSecondsInHour; + const uint64_t hours = seconds / kSecondsInHour; if (days || hours) { if (!isFirstUnit) { out << " "; @@ -58,11 +58,12 @@ std::string succinctSeconds(uint64_t seconds) { } seconds -= hours * kSecondsInHour; - int minutes = seconds / kSecondsInMinute; + const uint64_t minutes = seconds / kSecondsInMinute; if (days || hours || minutes) { if (!isFirstUnit) { out << " "; } + out << minutes << "m"; isFirstUnit = false; } @@ -109,6 +110,7 @@ std::string succinctDuration(uint64_t duration, int unitOffset, int precision) { std::round((duration * 1.0) / kTimeUnitsInSecond[unitOffset]); return succinctSeconds(seconds); } + return succinctPrint( duration, &kTimeUnits[0], diff --git a/velox/common/base/VeloxException.h b/velox/common/base/VeloxException.h index bb22e9a490c2..fa8d18e15d09 100644 --- a/velox/common/base/VeloxException.h +++ b/velox/common/base/VeloxException.h @@ -40,80 +40,83 @@ namespace velox { namespace error_source { using namespace folly::string_literals; -// Errors where the root cause of the problem is either because of bad input -// or an unsupported pattern of use are classified with source USER. Examples -// of errors in this category include syntax errors, unavailable names or -// objects. +/// Errors where the root cause of the problem is either because of bad input +/// or an unsupported pattern of use are classified with source USER. Examples +/// of errors in this category include syntax errors, unavailable names or +/// objects. inline constexpr auto kErrorSourceUser = "USER"_fs; -// Errors where the root cause of the problem is an unexpected internal state in -// the system. 
+/// Errors where the root cause of the problem is an unexpected internal state +/// in the system. inline constexpr auto kErrorSourceRuntime = "RUNTIME"_fs; -// Errors where the root cause of the problem is some unreliable aspect of the -// system are classified with source SYSTEM. +/// Errors where the root cause of the problem is some unreliable aspect of the +/// system are classified with source SYSTEM. inline constexpr auto kErrorSourceSystem = "SYSTEM"_fs; } // namespace error_source namespace error_code { using namespace folly::string_literals; -//====================== User Error Codes ======================: +///====================== User Error Codes ======================: -// A generic user error code +/// A generic user error code inline constexpr auto kGenericUserError = "GENERIC_USER_ERROR"_fs; -// An error raised when an argument verification fails +/// An error raised when an argument verification fails inline constexpr auto kInvalidArgument = "INVALID_ARGUMENT"_fs; -// An error raised when a requested operation is not supported. +/// An error raised when a requested operation is not supported. inline constexpr auto kUnsupported = "UNSUPPORTED"_fs; -// Arithmetic errors - underflow, overflow, divide by zero etc. +/// Arithmetic errors - underflow, overflow, divide by zero etc. inline constexpr auto kArithmeticError = "ARITHMETIC_ERROR"_fs; -// Arithmetic errors - underflow, overflow, divide by zero etc. +/// Arithmetic errors - underflow, overflow, divide by zero etc. inline constexpr auto kSchemaMismatch = "SCHEMA_MISMATCH"_fs; -//====================== Runtime Error Codes ======================: +///====================== Runtime Error Codes ======================: -// An error raised when the current state of a component is invalid. +/// An error raised when the current state of a component is invalid. inline constexpr auto kInvalidState = "INVALID_STATE"_fs; -// An error raised when unreachable code point was executed. +/// An error raised when unreachable code point was executed. inline constexpr auto kUnreachableCode = "UNREACHABLE_CODE"_fs; -// An error raised when a requested operation is not yet supported. +/// An error raised when a requested operation is not yet supported. inline constexpr auto kNotImplemented = "NOT_IMPLEMENTED"_fs; -// An error raised when memory pool exceeds limits. +/// An error raised when memory pool exceeds limits. inline constexpr auto kMemCapExceeded = "MEM_CAP_EXCEEDED"_fs; -// An error raised when memory pool is aborted. +/// An error raised when memory pool is aborted. inline constexpr auto kMemAborted = "MEM_ABORTED"_fs; -// Error caused by memory allocation failure (inclusive of allocator memory cap -// exceeded). +/// An error raised when memory arbitration is timed out. +inline constexpr auto kMemArbitrationTimeout = "MEM_ARBITRATION_TIMEOUT"_fs; + +/// Error caused by memory allocation failure (inclusive of allocator memory cap +/// exceeded). inline constexpr auto kMemAllocError = "MEM_ALLOC_ERROR"_fs; -// Error caused by failing to allocate cache buffer space for IO. +/// Error caused by failing to allocate cache buffer space for IO. inline constexpr auto kNoCacheSpace = "NO_CACHE_SPACE"_fs; -// An error raised when spill bytes exceeds limits. +/// An error raised when spill bytes exceeds limits. inline constexpr auto kSpillLimitExceeded = "SPILL_LIMIT_EXCEEDED"_fs; -// Errors indicating file read corruptions. +/// Errors indicating file read corruptions. 
inline constexpr auto kFileCorruption = "FILE_CORRUPTION"_fs; -// Errors indicating file not found. +/// Errors indicating file not found. inline constexpr auto kFileNotFound = "FILE_NOT_FOUND"_fs; -// We do not know how to classify it yet. +/// We do not know how to classify it yet. inline constexpr auto kUnknown = "UNKNOWN"_fs; -// VeloxRuntimeErrors due to unsupported input values such as unicode input to -// cast-varchar-to-integer and timestamps beyond the year 2037 to datetime -// functions. This kind of errors is allowed in expression fuzzer. +/// VeloxRuntimeErrors due to unsupported input values such as unicode input to +/// cast-varchar-to-integer and timestamps beyond the year 2037 to datetime +/// functions. This kind of errors is allowed in expression fuzzer. inline constexpr auto kUnsupportedInputUncatchable = "UNSUPPORTED_INPUT_UNCATCHABLE"_fs; } // namespace error_code @@ -160,12 +163,12 @@ class VeloxException : public std::exception { exceptionType, exceptionName) {} - // Inherited + /// Inherited const char* what() const noexcept override { return state_->what(); } - // Introduced nonvirtuals + /// Introduced nonvirtuals const process::StackTrace* stackTrace() const { return state_->stackTrace.get(); } diff --git a/velox/common/base/tests/SuccinctPrinterTest.cpp b/velox/common/base/tests/SuccinctPrinterTest.cpp index ffa38314f369..074ad11234f2 100644 --- a/velox/common/base/tests/SuccinctPrinterTest.cpp +++ b/velox/common/base/tests/SuccinctPrinterTest.cpp @@ -37,6 +37,9 @@ TEST(SuccinctPrinterTest, testSuccinctNanos) { EXPECT_EQ(succinctNanos(86'399'499'000'000), "23h 59m 59s"); EXPECT_EQ(succinctNanos(86'400'123'000'000), "1d 0h 0m 0s"); EXPECT_EQ(succinctNanos(867'661'789'000'000), "10d 1h 1m 2s"); + EXPECT_EQ( + succinctNanos(std::numeric_limits::max()), + "213503d 23h 34m 34s"); } TEST(SuccinctPrinterTest, testSuccinctMicros) { @@ -51,6 +54,9 @@ TEST(SuccinctPrinterTest, testSuccinctMicros) { EXPECT_EQ(succinctMicros(86'399'498), "1m 26s"); EXPECT_EQ(succinctMicros(86'400'123), "1m 26s"); EXPECT_EQ(succinctMicros(867'661'789), "14m 28s"); + EXPECT_EQ( + succinctMicros(std::numeric_limits::max()), + "213503982d 8h 1m 50s"); } TEST(SuccinctPrinterTest, testSuccinctMillis) { @@ -65,6 +71,9 @@ TEST(SuccinctPrinterTest, testSuccinctMillis) { EXPECT_EQ(succinctMillis(86'399'498), "23h 59m 59s"); EXPECT_EQ(succinctMillis(86'400'123), "1d 0h 0m 0s"); EXPECT_EQ(succinctMillis(867'661'789), "10d 1h 1m 2s"); + EXPECT_EQ( + succinctMillis(std::numeric_limits::max()), + "213503982334d 14h 25m 52s"); } TEST(SuccinctPrinterTest, testSuccinctBytes) { @@ -77,6 +86,8 @@ TEST(SuccinctPrinterTest, testSuccinctBytes) { EXPECT_EQ(succinctBytes(1'234'567'890), "1.15GB"); EXPECT_EQ(succinctBytes(1'099'511'627'776), "1.00TB"); EXPECT_EQ(succinctBytes(1234'099'511'627'776), "1122.41TB"); + EXPECT_EQ( + succinctBytes(std::numeric_limits::max()), "16777216.00TB"); } } // namespace facebook::velox diff --git a/velox/common/memory/ArbitrationOperation.h b/velox/common/memory/ArbitrationOperation.h index 7526c8c04505..0096d15ee057 100644 --- a/velox/common/memory/ArbitrationOperation.h +++ b/velox/common/memory/ArbitrationOperation.h @@ -105,7 +105,7 @@ class ArbitrationOperation { /// Invoked to mark the start of global arbitration. This is used to measure /// how much time spent in waiting for global arbitration. 
- void startGlobalArbitration() { + void recordGlobalArbitrationStartTime() { VELOX_CHECK_EQ(globalArbitrationStartTimeMs_, 0); VELOX_CHECK_EQ(state_, State::kRunning); globalArbitrationStartTimeMs_ = getCurrentTimeMs(); diff --git a/velox/common/memory/ArbitrationParticipant.cpp b/velox/common/memory/ArbitrationParticipant.cpp index aecb8087f49e..5a03ebb5033b 100644 --- a/velox/common/memory/ArbitrationParticipant.cpp +++ b/velox/common/memory/ArbitrationParticipant.cpp @@ -31,13 +31,15 @@ using namespace facebook::velox::memory; std::string ArbitrationParticipant::Config::toString() const { return fmt::format( - "initCapacity {}, minCapacity {}, fastExponentialGrowthCapacityLimit {}, slowCapacityGrowRatio {}, minFreeCapacity {}, minFreeCapacityRatio {}", + "initCapacity {}, minCapacity {}, fastExponentialGrowthCapacityLimit {}, slowCapacityGrowRatio {}, minFreeCapacity {}, minFreeCapacityRatio {}, minReclaimBytes {}, abortCapacityLimit {}", succinctBytes(initCapacity), succinctBytes(minCapacity), succinctBytes(fastExponentialGrowthCapacityLimit), slowCapacityGrowRatio, succinctBytes(minFreeCapacity), - minFreeCapacityRatio); + minFreeCapacityRatio, + succinctBytes(minReclaimBytes), + succinctBytes(abortCapacityLimit)); } ArbitrationParticipant::Config::Config( @@ -46,13 +48,17 @@ ArbitrationParticipant::Config::Config( uint64_t _fastExponentialGrowthCapacityLimit, double _slowCapacityGrowRatio, uint64_t _minFreeCapacity, - double _minFreeCapacityRatio) + double _minFreeCapacityRatio, + uint64_t _minReclaimBytes, + uint64_t _abortCapacityLimit) : initCapacity(_initCapacity), minCapacity(_minCapacity), fastExponentialGrowthCapacityLimit(_fastExponentialGrowthCapacityLimit), slowCapacityGrowRatio(_slowCapacityGrowRatio), minFreeCapacity(_minFreeCapacity), - minFreeCapacityRatio(_minFreeCapacityRatio) { + minFreeCapacityRatio(_minFreeCapacityRatio), + minReclaimBytes(_minReclaimBytes), + abortCapacityLimit(_abortCapacityLimit) { VELOX_CHECK_GE(slowCapacityGrowRatio, 0); VELOX_CHECK_EQ( fastExponentialGrowthCapacityLimit == 0, @@ -73,6 +79,10 @@ ArbitrationParticipant::Config::Config( "adjustment.", minFreeCapacity, minFreeCapacityRatio); + VELOX_CHECK( + bits::isPowerOfTwo(abortCapacityLimit), + "abortCapacityLimit {} not a power of two", + abortCapacityLimit); } std::shared_ptr ArbitrationParticipant::create( @@ -251,7 +261,9 @@ void ArbitrationParticipant::finishArbitration(ArbitrationOperation* op) { uint64_t ArbitrationParticipant::reclaim( uint64_t targetBytes, - uint64_t maxWaitTimeMs) noexcept { + uint64_t maxWaitTimeMs, + MemoryReclaimer::Stats& stats) noexcept { + targetBytes = std::max(targetBytes, config_->minReclaimBytes); if (targetBytes == 0) { return 0; } @@ -259,16 +271,17 @@ uint64_t ArbitrationParticipant::reclaim( TestValue::adjust( "facebook::velox::memory::ArbitrationParticipant::reclaim", this); uint64_t reclaimedBytes{0}; - MemoryReclaimer::Stats reclaimStats; try { ++numReclaims_; - pool_->reclaim(targetBytes, maxWaitTimeMs, reclaimStats); + VELOX_MEM_LOG(INFO) << "Reclaiming from memory pool " << pool_->name() + << " with target " << succinctBytes(targetBytes); + pool_->reclaim(targetBytes, maxWaitTimeMs, stats); + reclaimedBytes = shrink(/*reclaimAll=*/false); } catch (const std::exception& e) { VELOX_MEM_LOG(ERROR) << "Failed to reclaim from memory pool " << pool_->name() << ", aborting it: " << e.what(); - abortLocked(std::current_exception()); + reclaimedBytes = abortLocked(std::current_exception()); } - reclaimedBytes = shrink(/*reclaimAll=*/true); return 
reclaimedBytes; } @@ -286,6 +299,10 @@ bool ArbitrationParticipant::grow( uint64_t ArbitrationParticipant::shrink(bool reclaimAll) { std::lock_guard l(stateLock_); + return shrinkLocked(reclaimAll); +} + +uint64_t ArbitrationParticipant::shrinkLocked(bool reclaimAll) { ++numShrinks_; uint64_t reclaimedBytes{0}; @@ -316,18 +333,24 @@ uint64_t ArbitrationParticipant::abortLocked( if (aborted_) { return 0; } - aborted_ = true; } try { + VELOX_MEM_LOG(WARNING) << "Memory pool " << pool_->name() + << " is being aborted"; pool_->abort(error); } catch (const std::exception& e) { VELOX_MEM_LOG(WARNING) << "Failed to abort memory pool " << pool_->toString() << ", error: " << e.what(); } + VELOX_MEM_LOG(WARNING) << "Memory pool " << pool_->name() << " aborted"; // NOTE: no matter query memory pool abort throws or not, it should have been // marked as aborted to prevent any new memory arbitration operations. VELOX_CHECK(pool_->aborted()); - return shrink(/*reclaimAll=*/true); + + std::lock_guard l(stateLock_); + VELOX_CHECK(!aborted_); + aborted_ = true; + return shrinkLocked(/*reclaimAll=*/true); } bool ArbitrationParticipant::waitForReclaimOrAbort( diff --git a/velox/common/memory/ArbitrationParticipant.h b/velox/common/memory/ArbitrationParticipant.h index 0b115651ce6f..26c20b320160 100644 --- a/velox/common/memory/ArbitrationParticipant.h +++ b/velox/common/memory/ArbitrationParticipant.h @@ -25,6 +25,9 @@ #include "velox/common/memory/Memory.h" namespace facebook::velox::memory { +namespace test { +class ArbitrationParticipantTestHelper; +} class ArbitrationOperation; class ScopedArbitrationParticipant; @@ -79,13 +82,34 @@ class ArbitrationParticipant uint64_t minFreeCapacity; double minFreeCapacityRatio; + /// Specifies the minimum bytes to reclaim from a participant at a time. The + /// global arbitration also avoids to reclaim from a participant if its + /// reclaimable used capacity is less than this threshold. This is to + /// prevent inefficient memory reclaim operations on a participant with + /// small reclaimable used capacity which could causes a large number of + /// small spilled file on disk. + uint64_t minReclaimBytes; + + /// Specifies the starting memory capacity limit for global arbitration to + /// search for victim participant to reclaim used memory by abort. For + /// participants with capacity larger than the limit, the global arbitration + /// choose to abort the youngest participant which has the largest + /// participant id. This helps to let the old queries to run to completion. + /// The abort capacity limit is reduced by half if couldn't find a victim + /// participant until reaches to zero. + /// + /// NOTE: the limit must be zero or a power of 2. + uint64_t abortCapacityLimit; + Config( uint64_t _initCapacity, uint64_t _minCapacity, uint64_t _fastExponentialGrowthCapacityLimit, double _slowCapacityGrowRatio, uint64_t _minFreeCapacity, - double _minFreeCapacityRatio); + double _minFreeCapacityRatio, + uint64_t _minReclaimBytes, + uint64_t _abortCapacityLimit); std::string toString() const; }; @@ -184,7 +208,10 @@ class ArbitrationParticipant // Invoked to reclaim used memory from this memory pool with specified // 'targetBytes'. The function returns the actually freed capacity. - uint64_t reclaim(uint64_t targetBytes, uint64_t maxWaitTimeMs) noexcept; + uint64_t reclaim( + uint64_t targetBytes, + uint64_t maxWaitTimeMs, + MemoryReclaimer::Stats& stats) noexcept; /// Invoked to abort the query memory pool and returns the reclaimed bytes /// after abort. 
@@ -276,6 +303,8 @@ class ArbitrationParticipant // Aborts the query memory pool and returns the reclaimed bytes after abort. uint64_t abortLocked(const std::exception_ptr& error) noexcept; + uint64_t shrinkLocked(bool reclaimAll); + const uint64_t id_; const std::weak_ptr poolWeakPtr_; MemoryPool* const pool_; @@ -307,6 +336,7 @@ class ArbitrationParticipant mutable std::timed_mutex reclaimLock_; friend class ScopedArbitrationParticipant; + friend class test::ArbitrationParticipantTestHelper; }; /// The wrapper of the arbitration participant which holds a shared reference to diff --git a/velox/common/memory/MemoryArbitrator.cpp b/velox/common/memory/MemoryArbitrator.cpp index 648f8d615ab6..8dcce079e3c9 100644 --- a/velox/common/memory/MemoryArbitrator.cpp +++ b/velox/common/memory/MemoryArbitrator.cpp @@ -541,6 +541,13 @@ ScopedReclaimedBytesRecorder::~ScopedReclaimedBytesRecorder() { return; } const int64_t reservedBytesAfterReclaim = pool_->reservedBytes(); + if (reservedBytesAfterReclaim > reservedBytesBeforeReclaim_) { + LOG(ERROR) << "Unexpected reserved bytes growth from " << pool_->name() + << " after memory reclaim from " + << succinctBytes(reservedBytesBeforeReclaim_) << " to " + << succinctBytes(reservedBytesAfterReclaim) << ", current usage " + << succinctBytes(pool_->usedBytes()); + } *reclaimedBytes_ = reservedBytesBeforeReclaim_ - reservedBytesAfterReclaim; } } // namespace facebook::velox::memory diff --git a/velox/common/memory/MemoryArbitrator.h b/velox/common/memory/MemoryArbitrator.h index 8796733996df..043472b9b564 100644 --- a/velox/common/memory/MemoryArbitrator.h +++ b/velox/common/memory/MemoryArbitrator.h @@ -264,7 +264,7 @@ class MemoryReclaimer { /// due to reclaiming at non-reclaimable stage. uint64_t numNonReclaimableAttempts{0}; - /// The total execution time to do the reclaim in microseconds. + /// The total time to do the reclaim in microseconds. uint64_t reclaimExecTimeUs{0}; /// The total reclaimed memory bytes. diff --git a/velox/common/memory/MemoryPool.cpp b/velox/common/memory/MemoryPool.cpp index 39d2fde38a02..98a4ff3c8cc4 100644 --- a/velox/common/memory/MemoryPool.cpp +++ b/velox/common/memory/MemoryPool.cpp @@ -1119,10 +1119,10 @@ void MemoryPoolImpl::abort(const std::exception_ptr& error) { parent_->abort(error); return; } + setAbortError(error); if (reclaimer() == nullptr) { - VELOX_FAIL("Can't abort the memory pool {} without reclaimer", name_); + return; } - setAbortError(error); reclaimer()->abort(this, error); } diff --git a/velox/common/memory/SharedArbitrator.cpp b/velox/common/memory/SharedArbitrator.cpp index 5fea61692fb5..c9c553a6269c 100644 --- a/velox/common/memory/SharedArbitrator.cpp +++ b/velox/common/memory/SharedArbitrator.cpp @@ -15,8 +15,9 @@ */ #include "velox/common/memory/SharedArbitrator.h" +#include #include - +#include "velox/common/base/AsyncSource.h" #include "velox/common/base/Exceptions.h" #include "velox/common/base/RuntimeMetrics.h" #include "velox/common/config/Config.h" @@ -31,38 +32,21 @@ namespace facebook::velox::memory { using namespace facebook::velox::memory; namespace { +#define RETURN_IF_TRUE(func) \ + { \ + const bool ret = func; \ + if (ret) { \ + return ret; \ + } \ + } -// Returns the max capacity to grow of memory 'pool'. The calculation is based -// on a memory pool's max capacity and its current capacity. -uint64_t maxGrowCapacity(const MemoryPool& pool) { - return pool.maxCapacity() - pool.capacity(); -} - -// Returns the capacity of the memory pool with the specified growth target. 
-uint64_t capacityAfterGrowth(const MemoryPool& pool, uint64_t targetBytes) { - return pool.capacity() + targetBytes; -} - -std::string memoryPoolAbortMessage( - MemoryPool* victim, - MemoryPool* requestor, - size_t growBytes) { - std::stringstream out; - VELOX_CHECK(victim->isRoot()); - VELOX_CHECK(requestor->isRoot()); - if (requestor == victim) { - out << "\nFailed memory pool '" << victim->name() - << "' aborted by itself when tried to grow " << succinctBytes(growBytes) - << "\n"; - } else { - out << "\nFailed memory pool '" << victim->name() - << "' aborted when requestor '" << requestor->name() - << "' tried to grow " << succinctBytes(growBytes) << "\n"; +#define CHECKED_GROW(pool, growBytes, reservationBytes) \ + try { \ + checkedGrow(pool, growBytes, reservationBytes); \ + } catch (const VeloxRuntimeError& e) { \ + freeCapacity(growBytes); \ + throw; \ } - out << "Memory usage of the failed memory pool:\n" - << victim->treeMemoryUsage(); - return out.str(); -} template T getConfig( @@ -79,6 +63,15 @@ T getConfig( } return defaultValue; } + +#define VELOX_MEM_ARBITRATION_TIMEOUT(errorMessage) \ + _VELOX_THROW( \ + ::facebook::velox::VeloxRuntimeError, \ + ::facebook::velox::error_source::kErrorSourceRuntime.c_str(), \ + ::facebook::velox::error_code::kMemArbitrationTimeout.c_str(), \ + /* isRetriable */ true, \ + "{}", \ + errorMessage); } // namespace int64_t SharedArbitrator::ExtraConfig::reservedCapacity( @@ -137,6 +130,26 @@ double SharedArbitrator::ExtraConfig::memoryPoolMinFreeCapacityPct( kDefaultMemoryPoolMinFreeCapacityPct); } +uint64_t SharedArbitrator::ExtraConfig::memoryPoolMinReclaimBytes( + const std::unordered_map& configs) { + return config::toCapacity( + getConfig( + configs, + kMemoryPoolMinReclaimBytes, + std::string(kDefaultMemoryPoolMinReclaimBytes)), + config::CapacityUnit::BYTE); +} + +uint64_t SharedArbitrator::ExtraConfig::memoryPoolAbortCapacityLimit( + const std::unordered_map& configs) { + return config::toCapacity( + getConfig( + configs, + kMemoryPoolAbortCapacityLimit, + std::string(kDefaultMemoryPoolAbortCapacityLimit)), + config::CapacityUnit::BYTE); +} + bool SharedArbitrator::ExtraConfig::globalArbitrationEnabled( const std::unordered_map& configs) { return getConfig( @@ -164,62 +177,164 @@ double SharedArbitrator::ExtraConfig::slowCapacityGrowPct( configs, kSlowCapacityGrowPct, kDefaultSlowCapacityGrowPct); } +uint32_t SharedArbitrator::ExtraConfig::globalArbitrationMemoryReclaimPct( + const std::unordered_map& configs) { + return getConfig( + configs, + kGlobalArbitrationMemoryReclaimPct, + kDefaultGlobalMemoryArbitrationReclaimPct); +} + +double +SharedArbitrator::ExtraConfig::globalArbitrationReclaimThreadsHwMultiplier( + const std::unordered_map& configs) { + return getConfig( + configs, + kGlobalArbitrationReclaimThreadsHwMultiplier, + kDefaultGlobalArbitrationReclaimThreadsHwMultiplier); +} + SharedArbitrator::SharedArbitrator(const Config& config) : MemoryArbitrator(config), reservedCapacity_(ExtraConfig::reservedCapacity(config.extraConfigs)), - memoryPoolInitialCapacity_( - ExtraConfig::memoryPoolInitialCapacity(config.extraConfigs)), - memoryPoolReservedCapacity_( - ExtraConfig::memoryPoolReservedCapacity(config.extraConfigs)), - memoryReclaimWaitMs_( + maxArbitrationTimeMs_( ExtraConfig::memoryReclaimMaxWaitTimeMs(config.extraConfigs)), + participantConfig_( + ExtraConfig::memoryPoolInitialCapacity(config.extraConfigs), + ExtraConfig::memoryPoolReservedCapacity(config.extraConfigs), + 
ExtraConfig::fastExponentialGrowthCapacityLimitBytes( + config.extraConfigs), + ExtraConfig::slowCapacityGrowPct(config.extraConfigs), + ExtraConfig::memoryPoolMinFreeCapacity(config.extraConfigs), + ExtraConfig::memoryPoolMinFreeCapacityPct(config.extraConfigs), + ExtraConfig::memoryPoolMinReclaimBytes(config.extraConfigs), + ExtraConfig::memoryPoolAbortCapacityLimit(config.extraConfigs)), globalArbitrationEnabled_( ExtraConfig::globalArbitrationEnabled(config.extraConfigs)), - checkUsageLeak_(ExtraConfig::checkUsageLeak(config.extraConfigs)), - fastExponentialGrowthCapacityLimit_( - ExtraConfig::fastExponentialGrowthCapacityLimitBytes( + globalArbitrationMemoryReclaimPct_( + ExtraConfig::globalArbitrationMemoryReclaimPct(config.extraConfigs)), + globalArbitrationReclaimThreadsHwMultiplier_( + ExtraConfig::globalArbitrationReclaimThreadsHwMultiplier( config.extraConfigs)), - slowCapacityGrowPct_( - ExtraConfig::slowCapacityGrowPct(config.extraConfigs)), - memoryPoolMinFreeCapacity_( - ExtraConfig::memoryPoolMinFreeCapacity(config.extraConfigs)), - memoryPoolMinFreeCapacityPct_( - ExtraConfig::memoryPoolMinFreeCapacityPct(config.extraConfigs)), + checkUsageLeak_(ExtraConfig::checkUsageLeak(config.extraConfigs)), freeReservedCapacity_(reservedCapacity_), freeNonReservedCapacity_(capacity_ - freeReservedCapacity_) { VELOX_CHECK_EQ(kind_, config.kind); VELOX_CHECK_LE(reservedCapacity_, capacity_); - VELOX_CHECK_GE(slowCapacityGrowPct_, 0); - VELOX_CHECK_GE(memoryPoolMinFreeCapacityPct_, 0); - VELOX_CHECK_LE(memoryPoolMinFreeCapacityPct_, 1); - VELOX_CHECK_EQ( - fastExponentialGrowthCapacityLimit_ == 0, - slowCapacityGrowPct_ == 0, - "fastExponentialGrowthCapacityLimit_ {} and slowCapacityGrowPct_ {} " - "both need to be set (non-zero) at the same time to enable growth capacity " - "adjustment.", - fastExponentialGrowthCapacityLimit_, - slowCapacityGrowPct_); - VELOX_CHECK_EQ( - memoryPoolMinFreeCapacity_ == 0, - memoryPoolMinFreeCapacityPct_ == 0, - "memoryPoolMinFreeCapacity_ {} and memoryPoolMinFreeCapacityPct_ {} both " - "need to be set (non-zero) at the same time to enable shrink capacity " - "adjustment.", - memoryPoolMinFreeCapacity_, - memoryPoolMinFreeCapacityPct_); + VELOX_CHECK_GT( + maxArbitrationTimeMs_, 0, "maxArbitrationTimeMs can't be zero"); + + VELOX_CHECK_LE( + globalArbitrationMemoryReclaimPct_, + 100, + "Invalid globalArbitrationMemoryReclaimPct"); + VELOX_CHECK_GT( + globalArbitrationReclaimThreadsHwMultiplier_, + 0.0, + "globalArbitrationReclaimThreadsHwMultiplier_ needs to be positive"); + + setupGlobalArbitration(); + + VELOX_MEM_LOG(INFO) << "Shared arbitrator created with " + << succinctBytes(capacity_) << " capacity, " + << succinctBytes(reservedCapacity_) + << " reserved capacity"; + if (globalArbitrationEnabled_) { + VELOX_MEM_LOG(INFO) << "Arbitration config: max arbitration time " + << succinctMillis(maxArbitrationTimeMs_) + << ", global memory reclaim percentage " + << globalArbitrationMemoryReclaimPct_ + << ", global memory reclaim executor hw multiplier " + << globalArbitrationReclaimThreadsHwMultiplier_; + } + VELOX_MEM_LOG(INFO) << "Memory pool participant config: " + << participantConfig_.toString(); } -std::string SharedArbitrator::Candidate::toString() const { - return fmt::format( - "CANDIDATE[{}] RECLAIMABLE_BYTES[{}] FREE_BYTES[{}]]", - pool->name(), - succinctBytes(reclaimableBytes), - succinctBytes(freeBytes)); +void SharedArbitrator::setupGlobalArbitration() { + if (!globalArbitrationEnabled_) { + return; + } + 
VELOX_CHECK_NULL(globalArbitrationController_); + VELOX_CHECK_NULL(globalArbitrationExecutor_); + + const uint64_t minAbortCapacity = 32 << 20; + for (auto abortLimit = participantConfig_.abortCapacityLimit; abortLimit >= + std::max(minAbortCapacity, + folly::nextPowTwo(participantConfig_.minCapacity)); + abortLimit /= 2) { + globalArbitrationAbortCapacityLimits_.push_back(abortLimit); + } + globalArbitrationAbortCapacityLimits_.push_back(0); + + VELOX_MEM_LOG(INFO) << "Global arbitration abort capacity limits: " + << folly::join( + ",", globalArbitrationAbortCapacityLimits_); + + const uint64_t numArbitrationThreads = std::max( + 1, + std::thread::hardware_concurrency() * + globalArbitrationReclaimThreadsHwMultiplier_); + globalArbitrationExecutor_ = std::make_unique( + numArbitrationThreads, + std::make_shared("GlobalArbitrationReclaim")); + VELOX_MEM_LOG(INFO) << "Start global arbitration executor with " + << numArbitrationThreads << " threads"; + + globalArbitrationController_ = std::make_unique([&]() { + pthread_setname_np(pthread_self(), "GlobalArbitrationController"); + globalArbitrationMain(); + }); +} + +void SharedArbitrator::shutdownGlobalArbitration() { + if (!globalArbitrationEnabled_) { + VELOX_CHECK_NULL(globalArbitrationController_); + VELOX_CHECK_NULL(globalArbitrationExecutor_); + return; + } + + VELOX_CHECK(!globalArbitrationAbortCapacityLimits_.empty()); + VELOX_CHECK_NOT_NULL(globalArbitrationController_); + VELOX_CHECK_NOT_NULL(globalArbitrationExecutor_); + { + std::lock_guard l(stateLock_); + // We only expect stop global arbitration once during velox runtime + // shutdown. + VELOX_CHECK(!globalArbitrationStop_); + VELOX_CHECK(globalArbitrationWaiters_.empty()); + globalArbitrationStop_ = true; + } + + VELOX_MEM_LOG(INFO) << "Stopping global arbitration controller"; + globalArbitrationThreadCv_.notify_one(); + globalArbitrationController_->join(); + globalArbitrationController_.reset(); + VELOX_MEM_LOG(INFO) << "Global arbitration controller stopped"; + + VELOX_MEM_LOG(INFO) << "Stopping global arbitration executor '" + << globalArbitrationExecutor_->getName() << "': threads: " + << globalArbitrationExecutor_->numActiveThreads() << "/" + << globalArbitrationExecutor_->numThreads() + << ", task queue: " + << globalArbitrationExecutor_->getTaskQueueSize(); + globalArbitrationExecutor_.reset(); + VELOX_MEM_LOG(INFO) << "Global arbitration executor stopped"; +} + +void SharedArbitrator::wakeupGlobalArbitrationThread() { + VELOX_CHECK(globalArbitrationEnabled_); + VELOX_CHECK_NOT_NULL(globalArbitrationController_); + incrementGlobalArbitrationWaitCount(); + globalArbitrationThreadCv_.notify_one(); } SharedArbitrator::~SharedArbitrator() { - VELOX_CHECK(candidates_.empty()); + shutdownGlobalArbitration(); + + VELOX_CHECK_EQ( + participants_.size(), 0, "Unexpected alive participants on destruction"); + if (freeNonReservedCapacity_ + freeReservedCapacity_ != capacity_) { const std::string errMsg = fmt::format( "Unexpected free capacity leak in arbitrator: freeNonReservedCapacity_[{}] + freeReservedCapacity_[{}] != capacity_[{}])\\n{}", @@ -235,82 +350,147 @@ SharedArbitrator::~SharedArbitrator() { } } +void SharedArbitrator::startArbitration(ArbitrationOperation* op) { + updateArbitrationRequestStats(); + ++numRunning_; + op->start(); +} + +void SharedArbitrator::finishArbitration(ArbitrationOperation* op) { + VELOX_CHECK_GT(numRunning_, 0); + --numRunning_; + op->finish(); + + const auto stats = op->stats(); + if (stats.executionTimeMs != 0) { + 
RECORD_HISTOGRAM_METRIC_VALUE( + kMetricArbitratorOpExecTimeMs, stats.executionTimeMs); + addThreadLocalRuntimeStat( + kMemoryArbitrationWallNanos, + RuntimeCounter( + stats.executionTimeMs * 1'000 * 1'000, + RuntimeCounter::Unit::kNanos)); + } + + if (stats.localArbitrationWaitTimeMs != 0) { + addThreadLocalRuntimeStat( + kLocalArbitrationWaitWallNanos, + RuntimeCounter( + stats.localArbitrationWaitTimeMs * 1'000 * 1'000, + RuntimeCounter::Unit::kNanos)); + } + if (stats.localArbitrationExecTimeMs != 0) { + addThreadLocalRuntimeStat( + kLocalArbitrationExecutionWallNanos, + RuntimeCounter( + stats.localArbitrationExecTimeMs * 1'000 * 1'000, + RuntimeCounter::Unit::kNanos)); + } + if (stats.globalArbitrationWaitTimeMs != 0) { + addThreadLocalRuntimeStat( + kGlobalArbitrationWaitWallNanos, + RuntimeCounter( + stats.globalArbitrationWaitTimeMs * 1'000 * 1'000, + RuntimeCounter::Unit::kNanos)); + RECORD_HISTOGRAM_METRIC_VALUE( + kMetricArbitratorGlobalArbitrationWaitTimeMs, + stats.globalArbitrationWaitTimeMs); + } +} + void SharedArbitrator::addPool(const std::shared_ptr& pool) { VELOX_CHECK_EQ(pool->capacity(), 0); + + auto newParticipant = ArbitrationParticipant::create( + nextParticipantId_++, pool, &participantConfig_); { - std::unique_lock guard{poolLock_}; - VELOX_CHECK_EQ(candidates_.count(pool.get()), 0); - candidates_.emplace(pool.get(), pool); + std::unique_lock guard{participantLock_}; + VELOX_CHECK_EQ( + participants_.count(pool->name()), + 0, + "Memory pool {} already exists", + pool->name()); + participants_.emplace(newParticipant->name(), newParticipant); } - std::lock_guard l(stateLock_); - const uint64_t maxBytesToReserve = - std::min(maxGrowCapacity(*pool), memoryPoolInitialCapacity_); - const uint64_t minBytesToReserve = minGrowCapacity(*pool); - const uint64_t reservedBytes = - decrementFreeCapacityLocked(maxBytesToReserve, minBytesToReserve); - try { - checkedGrow(pool.get(), reservedBytes, 0); - } catch (const VeloxRuntimeError&) { - incrementFreeCapacityLocked(reservedBytes); + auto scopedParticipant = newParticipant->lock().value(); + std::vector arbitrationWaiters; + { + std::lock_guard l(stateLock_); + const uint64_t minBytesToReserve = std::min( + scopedParticipant->maxCapacity(), scopedParticipant->minCapacity()); + const uint64_t maxBytesToReserve = std::max( + minBytesToReserve, + std::min( + scopedParticipant->maxCapacity(), participantConfig_.initCapacity)); + const uint64_t allocatedBytes = allocateCapacityLocked( + scopedParticipant->id(), 0, maxBytesToReserve, minBytesToReserve); + if (allocatedBytes > 0) { + VELOX_CHECK_LE(allocatedBytes, maxBytesToReserve); + try { + checkedGrow(scopedParticipant, allocatedBytes, 0); + } catch (const VeloxRuntimeError& e) { + VELOX_MEM_LOG(ERROR) + << "Failed to allocate initial capacity " + << succinctBytes(allocatedBytes) + << " for memory pool: " << scopedParticipant->name() << "\n" + << e.what(); + freeCapacityLocked(allocatedBytes, arbitrationWaiters); + } + } + } + for (auto& waiter : arbitrationWaiters) { + waiter.setValue(); } } void SharedArbitrator::removePool(MemoryPool* pool) { VELOX_CHECK_EQ(pool->reservedBytes(), 0); - shrinkCapacity(pool); + const uint64_t freedBytes = shrinkPool(pool, 0); + VELOX_CHECK_EQ(pool->capacity(), 0); + freeCapacity(freedBytes); - std::unique_lock guard{poolLock_}; - const auto ret = candidates_.erase(pool); + std::unique_lock guard{participantLock_}; + const auto ret = participants_.erase(pool->name()); VELOX_CHECK_EQ(ret, 1); } -void SharedArbitrator::getCandidates( - 
ArbitrationOperation* op, +std::vector SharedArbitrator::getCandidates( bool freeCapacityOnly) { - op->candidates.clear(); - - std::shared_lock guard{poolLock_}; - op->candidates.reserve(candidates_.size()); - for (const auto& candidate : candidates_) { - const bool selfCandidate = op->requestPool == candidate.first; - std::shared_ptr pool = candidate.second.lock(); - if (pool == nullptr) { - VELOX_CHECK(!selfCandidate); + std::vector candidates; + std::shared_lock guard{participantLock_}; + candidates.reserve(participants_.size()); + for (const auto& entry : participants_) { + auto candidate = entry.second->lock(); + if (!candidate.has_value()) { continue; } - op->candidates.push_back( - {pool, - freeCapacityOnly ? 0 : reclaimableUsedCapacity(*pool, selfCandidate), - reclaimableFreeCapacity(*pool, selfCandidate), - pool->reservedBytes()}); + candidates.push_back({std::move(candidate.value()), freeCapacityOnly}); } - VELOX_CHECK(!op->candidates.empty()); + VELOX_CHECK(!candidates.empty()); + return candidates; } void SharedArbitrator::sortCandidatesByReclaimableFreeCapacity( - std::vector& candidates) { + std::vector& candidates) { std::sort( candidates.begin(), candidates.end(), - [&](const SharedArbitrator::Candidate& lhs, - const SharedArbitrator::Candidate& rhs) { - return lhs.freeBytes > rhs.freeBytes; + [&](const ArbitrationCandidate& lhs, const ArbitrationCandidate& rhs) { + return lhs.reclaimableFreeCapacity > rhs.reclaimableFreeCapacity; }); - TestValue::adjust( "facebook::velox::memory::SharedArbitrator::sortCandidatesByReclaimableFreeCapacity", &candidates); } void SharedArbitrator::sortCandidatesByReclaimableUsedCapacity( - std::vector& candidates) { + std::vector& candidates) { std::sort( candidates.begin(), candidates.end(), - [](const SharedArbitrator::Candidate& lhs, - const SharedArbitrator::Candidate& rhs) { - return lhs.reclaimableBytes > rhs.reclaimableBytes; + [](const ArbitrationCandidate& lhs, const ArbitrationCandidate& rhs) { + return lhs.reclaimableUsedCapacity > rhs.reclaimableUsedCapacity; }); TestValue::adjust( @@ -318,51 +498,57 @@ void SharedArbitrator::sortCandidatesByReclaimableUsedCapacity( &candidates); } -void SharedArbitrator::sortCandidatesByUsage( - std::vector& candidates) { - std::sort( - candidates.begin(), - candidates.end(), - [](const SharedArbitrator::Candidate& lhs, - const SharedArbitrator::Candidate& rhs) { - return lhs.reservedBytes > rhs.reservedBytes; - }); -} - -const SharedArbitrator::Candidate& -SharedArbitrator::findCandidateWithLargestCapacity( - MemoryPool* requestor, - uint64_t targetBytes, - const std::vector& candidates) { +std::optional SharedArbitrator::findAbortCandidate( + bool force) { + const auto candidates = getCandidates(); VELOX_CHECK(!candidates.empty()); - int32_t candidateIdx{-1}; - uint64_t maxCapacity{0}; - for (int32_t i = 0; i < candidates.size(); ++i) { - const bool isCandidate = candidates[i].pool.get() == requestor; - // For capacity comparison, the requestor's capacity should include both its - // current capacity and the capacity growth. - const uint64_t capacity = - candidates[i].pool->capacity() + (isCandidate ? 
targetBytes : 0); - if (i == 0) { - candidateIdx = 0; - maxCapacity = capacity; - continue; + for (uint64_t capacityLimit : globalArbitrationAbortCapacityLimits_) { + int32_t candidateIdx{-1}; + for (int32_t i = 0; i < candidates.size(); ++i) { + if (candidates[i].participant->aborted()) { + continue; + } + if (candidates[i].currentCapacity < capacityLimit || + candidates[i].currentCapacity == 0) { + continue; + } + if (candidateIdx == -1) { + candidateIdx = i; + continue; + } + // With the same capacity size bucket, we favor the old participant to let + // long running query proceed first. + if (candidates[candidateIdx].participant->id() < + candidates[i].participant->id()) { + candidateIdx = i; + } } - if (capacity < maxCapacity) { - continue; + if (candidateIdx != -1) { + return candidates[candidateIdx]; } - if (capacity > maxCapacity) { + } + + if (!force) { + VELOX_MEM_LOG(WARNING) << "Can't find an eligible abort victim"; + return std::nullopt; + } + + // Can't find an eligible abort candidate and then return the youngest + // candidate which has the largest participant id. + int32_t candidateIdx{-1}; + for (auto i = 0; i < candidates.size(); ++i) { + if (candidateIdx == -1) { candidateIdx = i; - maxCapacity = capacity; - continue; - } - // With the same amount of capacity, we prefer to kill the requestor itself - // without affecting the other query. - if (isCandidate) { + } else if ( + candidates[i].participant->id() > + candidates[candidateIdx].participant->id()) { candidateIdx = i; } } VELOX_CHECK_NE(candidateIdx, -1); + VELOX_MEM_LOG(WARNING) + << "Can't find an eligible abort victim and force to abort the youngest participant " + << candidates[candidateIdx].participant->name(); return candidates[candidateIdx]; } @@ -376,631 +562,710 @@ void SharedArbitrator::updateArbitrationFailureStats() { ++numFailures_; } -int64_t SharedArbitrator::maxReclaimableCapacity( - const MemoryPool& pool, - bool isSelfReclaim) const { - // Checks if a query memory pool has likely finished processing. It is likely - // this pool has finished when it has 0 current usage and non-0 past usage. If - // there is a high chance this pool finished, then we don't have to respect - // the memory pool reserved capacity limit check. - // - // NOTE: for query system like Prestissimo, it holds a finished query state in - // minutes for query stats fetch request from the Presto coordinator. 
- if (isSelfReclaim || (pool.reservedBytes() == 0 && pool.peakBytes() != 0)) { - return pool.capacity(); - } - return std::max(0, pool.capacity() - memoryPoolReservedCapacity_); +uint64_t SharedArbitrator::allocateCapacity( + uint64_t participantId, + uint64_t requestBytes, + uint64_t maxAllocateBytes, + uint64_t minAllocateBytes) { + std::lock_guard l(stateLock_); + return allocateCapacityLocked( + participantId, requestBytes, maxAllocateBytes, minAllocateBytes); } -int64_t SharedArbitrator::reclaimableFreeCapacity( - const MemoryPool& pool, - bool isSelfReclaim) const { - const auto freeBytes = pool.freeBytes(); - if (freeBytes == 0) { - return 0; +uint64_t SharedArbitrator::allocateCapacityLocked( + uint64_t participantId, + uint64_t requestBytes, + uint64_t maxAllocateBytes, + uint64_t minAllocateBytes) { + VELOX_CHECK_LE(requestBytes, maxAllocateBytes); + + if (FOLLY_UNLIKELY(!globalArbitrationWaiters_.empty())) { + if ((participantId > globalArbitrationWaiters_.begin()->first) && + (requestBytes > minAllocateBytes)) { + return 0; + } + maxAllocateBytes = std::max(requestBytes, minAllocateBytes); } - return std::min( - isSelfReclaim ? freeBytes : getCapacityShrinkTarget(pool, freeBytes), - maxReclaimableCapacity(pool, isSelfReclaim)); -} - -int64_t SharedArbitrator::reclaimableUsedCapacity( - const MemoryPool& pool, - bool isSelfReclaim) const { - const auto maxReclaimableBytes = maxReclaimableCapacity(pool, isSelfReclaim); - const auto reclaimableBytes = pool.reclaimableBytes(); - return std::min(maxReclaimableBytes, reclaimableBytes.value_or(0)); -} -int64_t SharedArbitrator::minGrowCapacity(const MemoryPool& pool) const { - return std::max( - 0, - std::min(pool.maxCapacity(), memoryPoolReservedCapacity_) - - pool.capacity()); -} + const uint64_t nonReservedBytes = + std::min(freeNonReservedCapacity_, maxAllocateBytes); + if (nonReservedBytes >= maxAllocateBytes) { + freeNonReservedCapacity_ -= nonReservedBytes; + return nonReservedBytes; + } -uint64_t SharedArbitrator::decrementFreeCapacity( - uint64_t maxBytesToReserve, - uint64_t minBytesToReserve) { uint64_t reservedBytes{0}; - { - std::lock_guard l(stateLock_); + if (nonReservedBytes < minAllocateBytes) { + const uint64_t freeReservedCapacity = freeReservedCapacity_; reservedBytes = - decrementFreeCapacityLocked(maxBytesToReserve, minBytesToReserve); - } - return reservedBytes; -} - -uint64_t SharedArbitrator::decrementFreeCapacityLocked( - uint64_t maxBytesToReserve, - uint64_t minBytesToReserve) { - uint64_t allocatedBytes = - std::min(freeNonReservedCapacity_, maxBytesToReserve); - freeNonReservedCapacity_ -= allocatedBytes; - if (allocatedBytes < minBytesToReserve) { - const uint64_t reservedBytes = std::min( - minBytesToReserve - allocatedBytes, freeReservedCapacity_); - freeReservedCapacity_ -= reservedBytes; - allocatedBytes += reservedBytes; + std::min(minAllocateBytes - nonReservedBytes, freeReservedCapacity); } - return allocatedBytes; -} - -uint64_t SharedArbitrator::getCapacityShrinkTarget( - const MemoryPool& pool, - uint64_t requestBytes) const { - VELOX_CHECK_NE(requestBytes, 0); - auto targetBytes = requestBytes; - if (memoryPoolMinFreeCapacity_ != 0) { - const auto minFreeBytes = std::min( - static_cast(pool.capacity() * memoryPoolMinFreeCapacityPct_), - memoryPoolMinFreeCapacity_); - const auto maxShrinkBytes = std::max( - 0LL, pool.freeBytes() - static_cast(minFreeBytes)); - targetBytes = std::min(targetBytes, static_cast(maxShrinkBytes)); + if (FOLLY_UNLIKELY(nonReservedBytes + reservedBytes < requestBytes)) 
{ + return 0; } - return targetBytes; + + freeNonReservedCapacity_ -= nonReservedBytes; + freeReservedCapacity_ -= reservedBytes; + return nonReservedBytes + reservedBytes; } uint64_t SharedArbitrator::shrinkCapacity( MemoryPool* pool, - uint64_t requestBytes) { - std::lock_guard l(stateLock_); - const uint64_t freedBytes = shrinkPool( - pool, - requestBytes == 0 ? 0 : getCapacityShrinkTarget(*pool, requestBytes)); - incrementFreeCapacityLocked(freedBytes); - return freedBytes; + uint64_t /*unused*/) { + VELOX_CHECK(pool->isRoot()); + auto participant = getParticipant(pool->name()); + VELOX_CHECK(participant.has_value()); + return shrink(participant.value(), /*reclaimAll=*/true); } uint64_t SharedArbitrator::shrinkCapacity( uint64_t requestBytes, bool allowSpill, bool allowAbort) { - incrementGlobalArbitrationCount(); const uint64_t targetBytes = requestBytes == 0 ? capacity_ : requestBytes; - ArbitrationOperation op(targetBytes); - ScopedArbitration scopedArbitration(this, &op); + ScopedMemoryArbitrationContext abitrationCtx{}; + const uint64_t startTimeMs = getCurrentTimeMs(); - std::lock_guard exclusiveLock(arbitrationLock_); - getCandidates(&op); - - uint64_t reclaimedBytes{0}; + uint64_t totalReclaimedBytes{0}; if (allowSpill) { - uint64_t freedBytes{0}; - reclaimUsedMemoryFromCandidatesBySpill(&op, freedBytes); - reclaimedBytes += freedBytes; - if (freedBytes > 0) { - incrementFreeCapacity(freedBytes); - } - if (reclaimedBytes >= op.requestBytes) { - return reclaimedBytes; - } - if (allowAbort) { - // Candidate stats may change after spilling. - getCandidates(&op); - } + totalReclaimedBytes += reclaimUsedMemoryBySpill(targetBytes); } - if (allowAbort) { - uint64_t freedBytes{0}; - reclaimUsedMemoryFromCandidatesByAbort(&op, freedBytes); - reclaimedBytes += freedBytes; - if (freedBytes > 0) { - incrementFreeCapacity(freedBytes); + if ((totalReclaimedBytes < targetBytes) && allowAbort) { + for (;;) { + const uint64_t reclaimedBytes = reclaimUsedMemoryByAbort(/*force=*/false); + if (reclaimedBytes == 0) { + break; + } + totalReclaimedBytes += reclaimedBytes; + if (totalReclaimedBytes >= targetBytes) { + break; + } } } - return reclaimedBytes; -} -void SharedArbitrator::testingFreeCapacity(uint64_t capacity) { - std::lock_guard l(stateLock_); - incrementFreeCapacityLocked(capacity); + const uint64_t reclaimTimeMs = getCurrentTimeMs() - startTimeMs; + VELOX_MEM_LOG(INFO) << "External shrink reclaimed " + << succinctBytes(totalReclaimedBytes) << ", spent " + << succinctMillis(reclaimTimeMs) << ", spill " + << (allowSpill ? "allowed" : "not allowed") << ", abort " + << (allowSpill ? 
"allowed" : "not allowed"); + updateGlobalArbitrationStats(reclaimTimeMs, totalReclaimedBytes); + return totalReclaimedBytes; } -uint64_t SharedArbitrator::testingNumRequests() const { - return numRequests_; -} +ArbitrationOperation SharedArbitrator::createArbitrationOperation( + MemoryPool* pool, + uint64_t requestBytes) { + VELOX_CHECK_NOT_NULL(pool); + VELOX_CHECK(pool->isRoot()); -uint64_t SharedArbitrator::getCapacityGrowthTarget( - const MemoryPool& pool, - uint64_t requestBytes) const { - if (fastExponentialGrowthCapacityLimit_ == 0 && slowCapacityGrowPct_ == 0) { - return requestBytes; - } - uint64_t targetBytes{0}; - const auto capacity = pool.capacity(); - if (capacity * 2 <= fastExponentialGrowthCapacityLimit_) { - targetBytes = capacity; - } else { - targetBytes = capacity * slowCapacityGrowPct_; - } - return std::max(requestBytes, targetBytes); + auto participant = getParticipant(pool->name()); + VELOX_CHECK(participant.has_value()); + return ArbitrationOperation( + std::move(participant.value()), requestBytes, maxArbitrationTimeMs_); } bool SharedArbitrator::growCapacity(MemoryPool* pool, uint64_t requestBytes) { - // NOTE: we shouldn't trigger the recursive memory capacity growth under - // memory arbitration context. - VELOX_CHECK(!underMemoryArbitration()); - - ArbitrationOperation op( - pool, requestBytes, getCapacityGrowthTarget(*pool, requestBytes)); + VELOX_CHECK(pool->isRoot()); + auto op = createArbitrationOperation(pool, requestBytes); ScopedArbitration scopedArbitration(this, &op); - bool needGlobalArbitration{false}; - if (!runLocalArbitration(&op, needGlobalArbitration)) { - return false; - } - if (!needGlobalArbitration) { - return true; - } - if (!globalArbitrationEnabled_) { - return false; + try { + const bool ret = growCapacity(op); + if (!ret) { + updateArbitrationFailureStats(); + } + return ret; + } catch (const std::exception&) { + updateArbitrationFailureStats(); + std::rethrow_exception(std::current_exception()); } - return runGlobalArbitration(&op); } -bool SharedArbitrator::runLocalArbitration( - ArbitrationOperation* op, - bool& needGlobalArbitration) { - needGlobalArbitration = false; - const std::chrono::steady_clock::time_point localArbitrationStartTime = - std::chrono::steady_clock::now(); - std::shared_lock sharedLock(arbitrationLock_); +bool SharedArbitrator::growCapacity(ArbitrationOperation& op) { TestValue::adjust( - "facebook::velox::memory::SharedArbitrator::runLocalArbitration", this); - op->localArbitrationLockWaitTimeUs = - std::chrono::duration_cast( - std::chrono::steady_clock::now() - localArbitrationStartTime) - .count(); - + "facebook::velox::memory::SharedArbitrator::growCapacity", this); checkIfAborted(op); + checkIfTimeout(op); - if (maybeGrowFromSelf(op)) { - return true; - } + RETURN_IF_TRUE(maybeGrowFromSelf(op)); if (!ensureCapacity(op)) { - updateArbitrationFailureStats(); - VELOX_MEM_LOG(ERROR) << "Can't grow " << op->requestPool->name() - << " capacity to " - << succinctBytes( - op->requestPool->capacity() + op->requestBytes) + VELOX_MEM_LOG(ERROR) << "Can't grow " << op.participant()->name() + << " capacity with " + << succinctBytes(op.requestBytes()) << " which exceeds its max capacity " - << succinctBytes(op->requestPool->maxCapacity()) + << succinctBytes(op.participant()->maxCapacity()) << ", current capacity " - << succinctBytes(op->requestPool->capacity()) - << ", request " << succinctBytes(op->requestBytes); + << succinctBytes(op.participant()->capacity()); return false; } - 
VELOX_CHECK(!op->requestPool->aborted()); - - if (maybeGrowFromSelf(op)) { - return true; + checkIfAborted(op); + checkIfTimeout(op); + + RETURN_IF_TRUE(maybeGrowFromSelf(op)); + + op.setGrowTargets(); + RETURN_IF_TRUE(growWithFreeCapacity(op)); + + reclaimUnusedCapacity(); + RETURN_IF_TRUE(growWithFreeCapacity(op)); + + if (!globalArbitrationEnabled_ && + op.participant()->reclaimableUsedCapacity() >= + participantConfig_.minReclaimBytes) { + // NOTE: if global memory arbitration is not enabled, we will try to reclaim + // from the participant itself before failing this operation. + reclaim( + op.participant(), + op.requestBytes(), + op.timeoutMs(), + /*localArbitration=*/true); + checkIfAborted(op); + RETURN_IF_TRUE(maybeGrowFromSelf(op)); + return growWithFreeCapacity(op); } + return startAndWaitGlobalArbitration(op); +} - uint64_t maxGrowTarget{0}; - uint64_t minGrowTarget{0}; - getGrowTargets(op, maxGrowTarget, minGrowTarget); +bool SharedArbitrator::startAndWaitGlobalArbitration(ArbitrationOperation& op) { + VELOX_CHECK(globalArbitrationEnabled_); + checkIfTimeout(op); - uint64_t freedBytes = decrementFreeCapacity(maxGrowTarget, minGrowTarget); - auto freeGuard = folly::makeGuard([&]() { - // Returns the unused freed memory capacity back to the arbitrator. - if (freedBytes > 0) { - incrementFreeCapacity(freedBytes); + std::unique_ptr arbitrationWait; + ContinueFuture arbitrationWaitFuture{ContinueFuture::makeEmpty()}; + uint64_t allocatedBytes{0}; + { + std::lock_guard l(stateLock_); + allocatedBytes = allocateCapacityLocked( + op.participant()->id(), + op.requestBytes(), + op.maxGrowBytes(), + op.minGrowBytes()); + if (allocatedBytes > 0) { + VELOX_CHECK_GE(allocatedBytes, op.requestBytes()); + } else { + arbitrationWait = std::make_unique( + &op, + ContinuePromise{fmt::format( + "{} wait for memory arbitration with {} request bytes", + op.participant()->name(), + succinctBytes(op.requestBytes()))}); + arbitrationWaitFuture = arbitrationWait->resumePromise.getSemiFuture(); + globalArbitrationWaiters_.emplace( + op.participant()->id(), arbitrationWait.get()); } - }); - if (freedBytes >= op->requestBytes) { - checkedGrow(op->requestPool, freedBytes, op->requestBytes); - freedBytes = 0; - return true; } - VELOX_CHECK_LT(freedBytes, maxGrowTarget); - - getCandidates(op, /*freeCapacityOnly=*/true); - freedBytes += - reclaimFreeMemoryFromCandidates(op, maxGrowTarget - freedBytes, true); - if (freedBytes >= op->requestBytes) { - const uint64_t bytesToGrow = std::min(maxGrowTarget, freedBytes); - checkedGrow(op->requestPool, bytesToGrow, op->requestBytes); - freedBytes -= bytesToGrow; - return true; - } - VELOX_CHECK_LT(freedBytes, maxGrowTarget); - if (!globalArbitrationEnabled_) { - freedBytes += reclaim(op->requestPool, maxGrowTarget - freedBytes, true); - } - checkIfAborted(op); + TestValue::adjust( + "facebook::velox::memory::SharedArbitrator::startAndWaitGlobalArbitration", + this); + + if (arbitrationWaitFuture.valid()) { + VELOX_CHECK_NOT_NULL(arbitrationWait); + op.recordGlobalArbitrationStartTime(); + wakeupGlobalArbitrationThread(); + + const bool timeout = !std::move(arbitrationWaitFuture) + .wait(std::chrono::milliseconds(op.timeoutMs())); + if (timeout) { + VELOX_MEM_LOG(ERROR) + << op.participant()->name() + << " wait for memory arbitration timed out after running " + << succinctMillis(op.executionTimeMs()); + removeGlobalArbitrationWaiter(op.participant()->id()); + } - if (freedBytes >= op->requestBytes) { - const uint64_t bytesToGrow = std::min(maxGrowTarget, freedBytes); - 
checkedGrow(op->requestPool, bytesToGrow, op->requestBytes); - freedBytes -= bytesToGrow; - return true; + allocatedBytes = arbitrationWait->allocatedBytes; + if (allocatedBytes == 0) { + checkIfAborted(op); + checkIfTimeout(op); + return false; + } } - - needGlobalArbitration = true; + VELOX_CHECK_GE(allocatedBytes, op.requestBytes()); + CHECKED_GROW(op.participant(), allocatedBytes, op.requestBytes()); return true; } -bool SharedArbitrator::runGlobalArbitration(ArbitrationOperation* op) { - incrementGlobalArbitrationCount(); - const std::chrono::steady_clock::time_point globalArbitrationStartTime = - std::chrono::steady_clock::now(); - std::lock_guard exclusiveLock(arbitrationLock_); +void SharedArbitrator::updateGlobalArbitrationStats( + uint64_t arbitrationTimeMs, + uint64_t arbitrationBytes) { + globalArbitrationTimeMs_ += arbitrationTimeMs; + ++globalArbitrationRuns_; + globalArbitrationBytes_ += arbitrationBytes; + RECORD_METRIC_VALUE(kMetricArbitratorGlobalArbitrationCount); + RECORD_HISTOGRAM_METRIC_VALUE( + kMetricArbitratorGlobalArbitrationBytes, arbitrationBytes); + RECORD_HISTOGRAM_METRIC_VALUE( + kMetricArbitratorGlobalArbitrationTimeMs, arbitrationTimeMs); +} + +void SharedArbitrator::globalArbitrationMain() { + VELOX_MEM_LOG(INFO) << "Global arbitration controller started"; + while (true) { + { + std::unique_lock l(stateLock_); + globalArbitrationThreadCv_.wait(l, [&] { + return globalArbitrationStop_ || !globalArbitrationWaiters_.empty(); + }); + if (globalArbitrationStop_) { + VELOX_CHECK(globalArbitrationWaiters_.empty()); + break; + } + } + GlobalArbitrationSection section{this}; + runGlobalArbitration(); + } + VELOX_MEM_LOG(INFO) << "Global arbitration controller stopped"; +} + +void SharedArbitrator::runGlobalArbitration() { TestValue::adjust( "facebook::velox::memory::SharedArbitrator::runGlobalArbitration", this); - op->globalArbitrationLockWaitTimeUs = - std::chrono::duration_cast( - std::chrono::steady_clock::now() - globalArbitrationStartTime) - .count(); - checkIfAborted(op); - if (maybeGrowFromSelf(op)) { - return true; - } + const uint64_t startTimeMs = getCurrentTimeMs(); + uint64_t totalReclaimedBytes{0}; + bool reclaimByAbort{false}; + uint64_t reclaimedBytes{0}; + std::unordered_set reclaimedParticipants; + std::unordered_set failedParticipants; + bool allParticipantsReclaimed{false}; - int32_t attempts = 0; - for (;; ++attempts) { - if (arbitrateMemory(op)) { - return true; - } - if (attempts > 0) { - break; - } - VELOX_CHECK(!op->requestPool->aborted()); - if (!handleOOM(op)) { - break; + size_t round{0}; + for (;; ++round) { + uint64_t arbitrationTimeUs{0}; + { + MicrosecondTimer timer(&arbitrationTimeUs); + const uint64_t targetBytes = getGlobalArbitrationTarget(); + if (targetBytes == 0) { + break; + } + + // Check if we need to abort participant to reclaim used memory to + // accelerate global arbitration. + // + // TODO: make the time based condition check configurable. 
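+      // In other words: abort-based reclaim is only attempted while we are
+      // still within the first half of 'maxArbitrationTimeMs_', and only once
+      // spilling has stopped making progress, i.e. every spillable participant
+      // has already been reclaimed by spill and the last round reclaimed
+      // nothing. Once the switch to abort happens, later rounds stay in abort
+      // mode for as long as the time condition still holds.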
+ reclaimByAbort = + (getCurrentTimeMs() - startTimeMs) < maxArbitrationTimeMs_ / 2 && + (reclaimByAbort || (allParticipantsReclaimed && reclaimedBytes == 0)); + if (!reclaimByAbort) { + reclaimedBytes = reclaimUsedMemoryBySpill( + targetBytes, + reclaimedParticipants, + failedParticipants, + allParticipantsReclaimed); + } else { + reclaimedBytes = reclaimUsedMemoryByAbort(/*force=*/true); + } + totalReclaimedBytes += reclaimedBytes; + reclaimUnusedCapacity(); } + + updateGlobalArbitrationStats(arbitrationTimeUs / 1'000, reclaimedBytes); } - VELOX_MEM_LOG(ERROR) - << "Failed to arbitrate sufficient memory for memory pool " - << op->requestPool->name() << ", request " - << succinctBytes(op->requestBytes) << " after " << attempts - << " attempts, Arbitrator state: " << toString(); - updateArbitrationFailureStats(); - return false; + VELOX_MEM_LOG(INFO) << "Global arbitration reclaimed " + << succinctBytes(totalReclaimedBytes) << " " + << reclaimedParticipants.size() << " victims, spent " + << succinctMillis(getCurrentTimeMs() - startTimeMs) + << " with " << round << " rounds"; +} + +uint64_t SharedArbitrator::getGlobalArbitrationTarget() { + uint64_t targetBytes{0}; + std::lock_guard l(stateLock_); + for (const auto& waiter : globalArbitrationWaiters_) { + targetBytes += waiter.second->op->maxGrowBytes(); + } + if (targetBytes == 0) { + return 0; + } + return std::max( + capacity_ * globalArbitrationMemoryReclaimPct_ / 100, targetBytes); } void SharedArbitrator::getGrowTargets( - ArbitrationOperation* op, + ArbitrationOperation& op, uint64_t& maxGrowTarget, uint64_t& minGrowTarget) { - VELOX_CHECK(op->targetBytes.has_value()); - maxGrowTarget = - std::min(maxGrowCapacity(*op->requestPool), op->targetBytes.value()); - minGrowTarget = minGrowCapacity(*op->requestPool); + op.participant()->getGrowTargets( + op.requestBytes(), maxGrowTarget, minGrowTarget); } -void SharedArbitrator::checkIfAborted(ArbitrationOperation* op) { - if (op->requestPool->aborted()) { - updateArbitrationFailureStats(); - VELOX_MEM_POOL_ABORTED("The requestor pool has been aborted"); +void SharedArbitrator::checkIfAborted(ArbitrationOperation& op) { + if (op.participant()->aborted()) { + VELOX_MEM_POOL_ABORTED( + fmt::format("Memory pool {} aborted", op.participant()->name())); } } -bool SharedArbitrator::maybeGrowFromSelf(ArbitrationOperation* op) { - if (op->requestPool->freeBytes() >= op->requestBytes) { - if (growPool(op->requestPool, 0, op->requestBytes)) { - return true; - } +void SharedArbitrator::checkIfTimeout(ArbitrationOperation& op) { + if (FOLLY_UNLIKELY(op.hasTimeout())) { + VELOX_MEM_ARBITRATION_TIMEOUT(fmt::format( + "Memory arbitration timed out on memory pool: {} after running {}", + op.participant()->name(), + succinctMillis(op.executionTimeMs()))); } - return false; } -bool SharedArbitrator::checkCapacityGrowth(ArbitrationOperation* op) const { - return (maxGrowCapacity(*op->requestPool) >= op->requestBytes) && - (capacityAfterGrowth(*op->requestPool, op->requestBytes) <= capacity_); +bool SharedArbitrator::maybeGrowFromSelf(ArbitrationOperation& op) { + return op.participant()->grow(0, op.requestBytes()); } -bool SharedArbitrator::ensureCapacity(ArbitrationOperation* op) { - if ((op->requestBytes > capacity_) || - (op->requestBytes > op->requestPool->maxCapacity())) { - return false; - } - if (checkCapacityGrowth(op)) { +bool SharedArbitrator::growWithFreeCapacity(ArbitrationOperation& op) { + const uint64_t allocatedBytes = allocateCapacity( + op.participant()->id(), + op.requestBytes(), + 
op.maxGrowBytes(), + op.minGrowBytes()); + if (allocatedBytes > 0) { + VELOX_CHECK_GE(allocatedBytes, op.requestBytes()); + CHECKED_GROW(op.participant(), allocatedBytes, op.requestBytes()); return true; } + return false; +} - const uint64_t reclaimedBytes = - reclaim(op->requestPool, op->requestBytes, true); - // NOTE: return the reclaimed bytes back to the arbitrator and let the memory - // arbitration process to grow the requestor's memory capacity accordingly. - incrementFreeCapacity(reclaimedBytes); - // Check if the requestor has been aborted in reclaim operation above. - if (op->requestPool->aborted()) { - updateArbitrationFailureStats(); - VELOX_MEM_POOL_ABORTED("The requestor pool has been aborted"); - } - return checkCapacityGrowth(op); +std::optional SharedArbitrator::getParticipant( + const std::string& name) const { + std::shared_lock guard{participantLock_}; + auto it = participants_.find(name); + VELOX_CHECK(it != participants_.end(), "Arbitration pool {} not found", name); + return it->second->lock(); } -bool SharedArbitrator::handleOOM(ArbitrationOperation* op) { - MemoryPool* victim = findCandidateWithLargestCapacity( - op->requestPool, op->requestBytes, op->candidates) - .pool.get(); - if (op->requestPool == victim) { - VELOX_MEM_LOG(ERROR) - << "Requestor memory pool " << op->requestPool->name() - << " is selected as victim memory pool so fail the memory arbitration"; +bool SharedArbitrator::checkCapacityGrowth(ArbitrationOperation& op) const { + if (!op.participant()->checkCapacityGrowth(op.requestBytes())) { return false; } - VELOX_MEM_LOG(WARNING) << "Aborting victim memory pool " << victim->name() - << " to free up memory for requestor " - << op->requestPool->name(); - try { - if (victim == op->requestPool) { - VELOX_MEM_POOL_CAP_EXCEEDED( - memoryPoolAbortMessage(victim, op->requestPool, op->requestBytes)); - } else { - VELOX_MEM_POOL_ABORTED( - memoryPoolAbortMessage(victim, op->requestPool, op->requestBytes)); - } - } catch (VeloxRuntimeError&) { - abort(victim, std::current_exception()); - } - // Free up all the unused capacity from the aborted memory pool and gives back - // to the arbitrator. - incrementFreeCapacity(shrinkPool(victim, 0)); - return true; + return (op.participant()->capacity() + op.requestBytes()) <= capacity_; } -void SharedArbitrator::checkedGrow( - MemoryPool* pool, - uint64_t growBytes, - uint64_t reservationBytes) { - const auto ret = growPool(pool, growBytes, reservationBytes); - VELOX_CHECK( - ret, - "Failed to grow pool {} with {} and commit {} used reservation", - pool->name(), - succinctBytes(growBytes), - succinctBytes(reservationBytes)); -} - -bool SharedArbitrator::arbitrateMemory(ArbitrationOperation* op) { - VELOX_CHECK(!op->requestPool->aborted()); - uint64_t maxGrowTarget{0}; - uint64_t minGrowTarget{0}; - getGrowTargets(op, maxGrowTarget, minGrowTarget); - - uint64_t freedBytes = decrementFreeCapacity(maxGrowTarget, minGrowTarget); - auto freeGuard = folly::makeGuard([&]() { - // Returns the unused freed memory capacity back to the arbitrator. 
- if (freedBytes > 0) { - incrementFreeCapacity(freedBytes); - } - }); - if (freedBytes >= op->requestBytes) { - checkedGrow(op->requestPool, freedBytes, op->requestBytes); - freedBytes = 0; - return true; +bool SharedArbitrator::ensureCapacity(ArbitrationOperation& op) { + if ((op.requestBytes() > capacity_) || + (op.requestBytes() > op.participant()->maxCapacity())) { + return false; } - VELOX_CHECK_LT(freedBytes, maxGrowTarget); - // Get refreshed stats before the global memory arbitration run. - getCandidates(op); + RETURN_IF_TRUE(checkCapacityGrowth(op)); - freedBytes += - reclaimFreeMemoryFromCandidates(op, maxGrowTarget - freedBytes, false); - if (freedBytes >= op->requestBytes) { - const uint64_t bytesToGrow = std::min(maxGrowTarget, freedBytes); - checkedGrow(op->requestPool, bytesToGrow, op->requestBytes); - freedBytes -= bytesToGrow; - return true; - } - VELOX_CHECK_LT(freedBytes, maxGrowTarget); + shrink(op.participant(), /*reclaimAll=*/true); + + RETURN_IF_TRUE(checkCapacityGrowth(op)); - reclaimUsedMemoryFromCandidatesBySpill(op, freedBytes); + reclaim( + op.participant(), + op.requestBytes(), + op.timeoutMs(), + /*localArbitration=*/true); + // Checks if the requestor has been aborted in reclaim above. checkIfAborted(op); - if (freedBytes < op->requestBytes) { - VELOX_MEM_LOG(WARNING) - << "Failed to arbitrate sufficient memory for memory pool " - << op->requestPool->name() << ", request " - << succinctBytes(op->requestBytes) << ", only " - << succinctBytes(freedBytes) - << " has been freed, Arbitrator state: " << toString(); - return false; - } + RETURN_IF_TRUE(checkCapacityGrowth(op)); - const uint64_t bytesToGrow = std::min(freedBytes, maxGrowTarget); - checkedGrow(op->requestPool, bytesToGrow, op->requestBytes); - freedBytes -= bytesToGrow; - return true; + shrink(op.participant(), /*reclaimAll=*/true); + return checkCapacityGrowth(op); } -uint64_t SharedArbitrator::reclaimFreeMemoryFromCandidates( - ArbitrationOperation* op, - uint64_t reclaimTargetBytes, - bool isLocalArbitration) { - // Sort candidate memory pools based on their reclaimable free capacity. - sortCandidatesByReclaimableFreeCapacity(op->candidates); +void SharedArbitrator::checkedGrow( + const ScopedArbitrationParticipant& participant, + uint64_t growBytes, + uint64_t reservationBytes) { + const auto ret = participant->grow(growBytes, reservationBytes); + if (!ret) { + VELOX_FAIL( + "Failed to grow memory pool {} with {} and commit {} used reservation, memory pool stats:\n{}\n{}", + participant->name(), + succinctBytes(growBytes), + succinctBytes(reservationBytes), + participant->pool()->toString(), + participant->pool()->treeMemoryUsage()); + } +} - std::lock_guard l(stateLock_); +uint64_t SharedArbitrator::reclaimUnusedCapacity() { + std::vector candidates = + getCandidates(/*freeCapacityOnly=*/true); uint64_t reclaimedBytes{0}; - for (const auto& candidate : op->candidates) { - VELOX_CHECK_LT(reclaimedBytes, reclaimTargetBytes); - if (candidate.freeBytes == 0) { - break; - } - if (isLocalArbitration && (candidate.pool.get() != op->requestPool) && - isUnderArbitrationLocked(candidate.pool.get())) { - // If the reclamation is for local arbitration and the candidate pool is - // also under arbitration processing, then we can't reclaim from the - // candidate pool as it might cause concurrent changes to the candidate - // pool's capacity. 
- continue; - } - const int64_t bytesToReclaim = std::min( - reclaimTargetBytes - reclaimedBytes, - reclaimableFreeCapacity( - *candidate.pool, candidate.pool.get() == op->requestPool)); - if (bytesToReclaim <= 0) { + SCOPE_EXIT { + freeCapacity(reclaimedBytes); + }; + for (const auto& candidate : candidates) { + if (candidate.reclaimableFreeCapacity == 0) { continue; } - reclaimedBytes += shrinkPool(candidate.pool.get(), bytesToReclaim); - if (reclaimedBytes >= reclaimTargetBytes) { - break; - } + reclaimedBytes += candidate.participant->shrink(/*reclaimAll=*/false); } reclaimedFreeBytes_ += reclaimedBytes; return reclaimedBytes; } -void SharedArbitrator::reclaimUsedMemoryFromCandidatesBySpill( - ArbitrationOperation* op, - uint64_t& freedBytes) { - // Sort candidate memory pools based on their reclaimable used capacity. - sortCandidatesByReclaimableUsedCapacity(op->candidates); +uint64_t SharedArbitrator::reclaimUsedMemoryBySpill(uint64_t targetBytes) { + std::unordered_set unusedReclaimedParticipants; + std::unordered_set failedParticipants; + bool unusedAllParticipantsReclaimed; + return reclaimUsedMemoryBySpill( + targetBytes, + unusedReclaimedParticipants, + failedParticipants, + unusedAllParticipantsReclaimed); +} - for (const auto& candidate : op->candidates) { - VELOX_CHECK_LT(freedBytes, op->requestBytes); - if (candidate.reclaimableBytes == 0) { +uint64_t SharedArbitrator::reclaimUsedMemoryBySpill( + uint64_t targetBytes, + std::unordered_set& reclaimedParticipants, + std::unordered_set& failedParticipants, + bool& allParticipantsReclaimed) { + allParticipantsReclaimed = true; + const uint64_t prevReclaimedBytes = reclaimedUsedBytes_; + auto candidates = getCandidates(); + sortCandidatesByReclaimableUsedCapacity(candidates); + + std::vector victims; + victims.reserve(candidates.size()); + uint64_t bytesToReclaim{0}; + for (auto& candidate : candidates) { + if (candidate.reclaimableUsedCapacity < + participantConfig_.minReclaimBytes) { break; } - freedBytes += - reclaim(candidate.pool.get(), op->requestBytes - freedBytes, false); - if ((freedBytes >= op->requestBytes) || - (op->requestPool != nullptr && op->requestPool->aborted())) { - break; + if (failedParticipants.count(candidate.participant->id()) != 0) { + VELOX_CHECK_EQ( + reclaimedParticipants.count(candidate.participant->id()), 1); + continue; + } + if (bytesToReclaim >= targetBytes) { + if (reclaimedParticipants.count(candidate.participant->id()) == 0) { + allParticipantsReclaimed = false; + } + continue; } + bytesToReclaim += candidate.reclaimableUsedCapacity; + reclaimedParticipants.insert(candidate.participant->id()); + victims.push_back(std::move(candidate)); + } + if (victims.empty()) { + FB_LOG_EVERY_MS(WARNING, 1'000) + << "No spill victim participant found with global arbitration target: " + << succinctBytes(targetBytes); + return 0; } -} - -void SharedArbitrator::reclaimUsedMemoryFromCandidatesByAbort( - ArbitrationOperation* op, - uint64_t& freedBytes) { - sortCandidatesByUsage(op->candidates); - for (const auto& candidate : op->candidates) { - VELOX_CHECK_LT(freedBytes, op->requestBytes); - if (candidate.pool->capacity() == 0) { - break; - } - try { - VELOX_MEM_POOL_ABORTED(fmt::format( - "Memory pool aborted to reclaim used memory, current usage {}, " - "memory pool details:\n{}\n{}", - succinctBytes(candidate.reservedBytes), - candidate.pool->toString(), - candidate.pool->treeMemoryUsage())); - } catch (VeloxRuntimeError&) { - abort(candidate.pool.get(), std::current_exception()); + 
RECORD_HISTOGRAM_METRIC_VALUE( + kMetricArbitratorGlobalArbitrationNumReclaimVictims, victims.size()); + + struct ReclaimResult { + uint64_t participantId{0}; + uint64_t reclaimedBytes{0}; + + explicit ReclaimResult(uint64_t _participantId, uint64_t _reclaimedBytes) + : participantId(_participantId), reclaimedBytes(_reclaimedBytes) {} + }; + std::vector>> reclaimTasks; + for (auto& victim : victims) { + reclaimTasks.push_back( + memory::createAsyncMemoryReclaimTask([this, victim]() { + const auto participant = victim.participant; + const uint64_t reclaimedBytes = reclaim( + participant, + victim.reclaimableUsedCapacity, + maxArbitrationTimeMs_, + /*localArbitration=*/false); + return std::make_unique( + participant->id(), reclaimedBytes); + })); + if (reclaimTasks.size() > 1) { + globalArbitrationExecutor_->add( + [source = reclaimTasks.back()]() { source->prepare(); }); } - freedBytes += shrinkPool(candidate.pool.get(), 0); - if (freedBytes >= op->requestBytes) { - break; + } + + // NOTE: reclaim task can never fail. + uint64_t reclaimedBytes{0}; + for (auto& reclaimTask : reclaimTasks) { + const auto reclaimResult = reclaimTask->move(); + if (reclaimResult->reclaimedBytes == 0) { + RECORD_METRIC_VALUE(kMetricArbitratorGlobalArbitrationFailedVictimCount); + VELOX_CHECK_EQ(failedParticipants.count(reclaimResult->participantId), 0); + failedParticipants.insert(reclaimResult->participantId); } + reclaimedBytes += reclaimResult->reclaimedBytes; } + VELOX_CHECK_LE(prevReclaimedBytes, reclaimedUsedBytes_); + // NOTE: there might be concurrent local spill or spill triggered by + // external shrink. + return std::max(reclaimedBytes, reclaimedUsedBytes_ - prevReclaimedBytes); } -uint64_t SharedArbitrator::reclaim( - MemoryPool* pool, - uint64_t targetBytes, - bool isLocalArbitration) noexcept { - int64_t bytesToReclaim = - std::min(targetBytes, maxReclaimableCapacity(*pool, true)); - if (bytesToReclaim == 0) { +uint64_t SharedArbitrator::reclaimUsedMemoryByAbort(bool force) { + const auto victimOpt = findAbortCandidate(force); + if (!victimOpt.has_value()) { return 0; } - uint64_t reclaimDurationUs{0}; - uint64_t reclaimedUsedBytes{0}; - uint64_t reclaimedFreeBytes{0}; - MemoryReclaimer::Stats reclaimerStats; + const auto& victim = victimOpt.value(); + try { + VELOX_MEM_POOL_ABORTED(fmt::format( + "Memory pool aborted to reclaim used memory, current capacity {}, " + "memory pool stats:\n{}\n{}", + succinctBytes(victim.participant->pool()->capacity()), + victim.participant->pool()->toString(), + victim.participant->pool()->treeMemoryUsage())); + } catch (VeloxRuntimeError&) { + return abort(victim.participant, std::current_exception()); + } +} + +uint64_t SharedArbitrator::shrink( + const ScopedArbitrationParticipant& participant, + bool reclaimAll) { + const uint64_t freedBytes = participant->shrink(reclaimAll); + freeCapacity(freedBytes); + reclaimedFreeBytes_ += freedBytes; + return freedBytes; +} + +uint64_t SharedArbitrator::reclaim( + const ScopedArbitrationParticipant& participant, + uint64_t targetBytes, + uint64_t timeoutMs, + bool localArbitration) noexcept { + uint64_t reclaimTimeUs{0}; + uint64_t reclaimedBytes{0}; + MemoryReclaimer::Stats stats; { - MicrosecondTimer reclaimTimer(&reclaimDurationUs); - try { - reclaimedFreeBytes = shrinkPool(pool, bytesToReclaim); - bytesToReclaim -= reclaimedFreeBytes; - VELOX_CHECK_GE(bytesToReclaim, 0); - if (bytesToReclaim > 0) { - if (isLocalArbitration) { - incrementLocalArbitrationCount(); - } - pool->reclaim(bytesToReclaim, memoryReclaimWaitMs_, 
reclaimerStats); - } - } catch (const std::exception& e) { - VELOX_MEM_LOG(ERROR) << "Failed to reclaim from memory pool " - << pool->name() << ", aborting it: " << e.what(); - abort(pool, std::current_exception()); - reclaimedUsedBytes = shrinkPool(pool, 0); - } - reclaimedUsedBytes += shrinkPool(pool, bytesToReclaim); + MicrosecondTimer reclaimTimer(&reclaimTimeUs); + reclaimedBytes = participant->reclaim(targetBytes, timeoutMs, stats); + } + // NOTE: if memory reclaim fails, then the participant is also aborted. If + // it happens, we shall first fail the arbitration operation from the + // aborted participant before returning the freed capacity. + if (participant->aborted()) { + removeGlobalArbitrationWaiter(participant->id()); } - reclaimedUsedBytes_ += reclaimedUsedBytes; - reclaimedFreeBytes_ += reclaimedFreeBytes; - numNonReclaimableAttempts_ += reclaimerStats.numNonReclaimableAttempts; - VELOX_MEM_LOG(INFO) << "Reclaimed from memory pool " << pool->name() + freeCapacity(reclaimedBytes); + + updateMemoryReclaimStats( + reclaimedBytes, reclaimTimeUs / 1'000, localArbitration, stats); + VELOX_MEM_LOG(INFO) << "Reclaimed from memory pool " << participant->name() << " with target of " << succinctBytes(targetBytes) - << ", actually reclaimed " - << succinctBytes(reclaimedFreeBytes) - << " free memory and " - << succinctBytes(reclaimedUsedBytes) - << " used memory, spent " - << succinctMicros(reclaimDurationUs) - << ", isLocalArbitration: " << isLocalArbitration; - return reclaimedUsedBytes + reclaimedFreeBytes; + << ", reclaimed " << succinctBytes(reclaimedBytes) + << ", spent " << succinctMicros(reclaimTimeUs) + << ", local arbitration: " << localArbitration + << " stats " << succinctBytes(stats.reclaimedBytes) + << " numNonReclaimableAttempts " + << stats.numNonReclaimableAttempts; + if (reclaimedBytes == 0) { + FB_LOG_EVERY_MS(WARNING, 1'000) << fmt::format( + "Nothing reclaimed from memory pool {} with reclaim target {}, memory pool stats:\n{}\n{}", + participant->name(), + succinctBytes(targetBytes), + participant->pool()->toString(), + participant->pool()->treeMemoryUsage()); + } + return reclaimedBytes; } -void SharedArbitrator::abort( - MemoryPool* pool, +void SharedArbitrator::updateMemoryReclaimStats( + uint64_t reclaimedBytes, + uint64_t reclaimTimeMs, + bool localArbitration, + const MemoryReclaimer::Stats& stats) { + if (localArbitration) { + incrementLocalArbitrationCount(); + } + reclaimedUsedBytes_ += reclaimedBytes; + numNonReclaimableAttempts_ += stats.numNonReclaimableAttempts; + RECORD_METRIC_VALUE(kMetricQueryMemoryReclaimCount); + RECORD_HISTOGRAM_METRIC_VALUE(kMetricQueryMemoryReclaimTimeMs, reclaimTimeMs); + RECORD_HISTOGRAM_METRIC_VALUE( + kMetricQueryMemoryReclaimedBytes, reclaimedBytes); +} + +uint64_t SharedArbitrator::abort( + const ScopedArbitrationParticipant& participant, const std::exception_ptr& error) { RECORD_METRIC_VALUE(kMetricArbitratorAbortedCount); ++numAborted_; - try { - pool->abort(error); - } catch (const std::exception& e) { - VELOX_MEM_LOG(WARNING) << "Failed to abort memory pool " << pool->toString() - << ", error: " << e.what(); - } - // NOTE: no matter memory pool abort throws or not, it should have been marked - // as aborted to prevent any new memory arbitration triggered from the aborted - // memory pool. 
- VELOX_CHECK(pool->aborted()); + const uint64_t freedBytes = participant->abort(error); + // NOTE: no matter memory pool abort throws or not, it should have been + // marked as aborted to prevent any new memory arbitration triggered from + // the aborted memory pool. + VELOX_CHECK(participant->aborted()); + reclaimedUsedBytes_ += freedBytes; + removeGlobalArbitrationWaiter(participant->id()); + freeCapacity(freedBytes); + return freedBytes; } -void SharedArbitrator::incrementFreeCapacity(uint64_t bytes) { - std::lock_guard l(stateLock_); - incrementFreeCapacityLocked(bytes); +void SharedArbitrator::freeCapacity(uint64_t bytes) { + if (FOLLY_UNLIKELY(bytes == 0)) { + return; + } + std::vector resumes; + { + std::lock_guard l(stateLock_); + freeCapacityLocked(bytes, resumes); + } + for (auto& resume : resumes) { + resume.setValue(); + } } -void SharedArbitrator::incrementFreeCapacityLocked(uint64_t bytes) { - incrementFreeReservedCapacityLocked(bytes); +void SharedArbitrator::freeCapacityLocked( + uint64_t bytes, + std::vector& resumes) { + freeReservedCapacityLocked(bytes); freeNonReservedCapacity_ += bytes; if (FOLLY_UNLIKELY( freeNonReservedCapacity_ + freeReservedCapacity_ > capacity_)) { VELOX_FAIL( - "The free capacity {}/{} is larger than the max capacity {}, {}", + "Free capacity {}/{} is larger than the max capacity {}, {}", succinctBytes(freeNonReservedCapacity_), succinctBytes(freeReservedCapacity_), - succinctBytes(capacity_), - toStringLocked()); + succinctBytes(capacity_)); + } + resumeGlobalArbitrationWaitersLocked(resumes); +} + +void SharedArbitrator::resumeGlobalArbitrationWaitersLocked( + std::vector& resumes) { + auto it = globalArbitrationWaiters_.begin(); + while (it != globalArbitrationWaiters_.end()) { + auto* op = it->second->op; + const uint64_t allocatedBytes = allocateCapacityLocked( + op->participant()->id(), + op->requestBytes(), + op->maxGrowBytes(), + op->minGrowBytes()); + if (allocatedBytes == 0) { + break; + } + VELOX_CHECK_GE(allocatedBytes, op->requestBytes()); + VELOX_CHECK_EQ(it->second->allocatedBytes, 0); + it->second->allocatedBytes = allocatedBytes; + resumes.push_back(std::move(it->second->resumePromise)); + it = globalArbitrationWaiters_.erase(it); } } -void SharedArbitrator::incrementFreeReservedCapacityLocked(uint64_t& bytes) { +void SharedArbitrator::removeGlobalArbitrationWaiter(uint64_t id) { + ContinuePromise resume = ContinuePromise::makeEmpty(); + { + std::lock_guard l(stateLock_); + auto it = globalArbitrationWaiters_.find(id); + if (it != globalArbitrationWaiters_.end()) { + VELOX_CHECK_EQ(it->second->allocatedBytes, 0); + resume = std::move(it->second->resumePromise); + globalArbitrationWaiters_.erase(it); + } + } + if (resume.valid()) { + resume.setValue(); + } +} + +void SharedArbitrator::freeReservedCapacityLocked(uint64_t& bytes) { VELOX_CHECK_LE(freeReservedCapacity_, reservedCapacity_); const uint64_t freedBytes = std::min(bytes, reservedCapacity_ - freeReservedCapacity_); @@ -1016,7 +1281,7 @@ MemoryArbitrator::Stats SharedArbitrator::stats() const { MemoryArbitrator::Stats SharedArbitrator::statsLocked() const { Stats stats; stats.numRequests = numRequests_; - stats.numRunning = numPending_; + stats.numRunning = numRunning_; stats.numAborted = numAborted_; stats.numFailures = numFailures_; stats.reclaimedFreeBytes = reclaimedFreeBytes_; @@ -1030,134 +1295,44 @@ MemoryArbitrator::Stats SharedArbitrator::statsLocked() const { std::string SharedArbitrator::toString() const { std::lock_guard l(stateLock_); - return 
toStringLocked(); -} - -std::string SharedArbitrator::toStringLocked() const { return fmt::format( - "ARBITRATOR[{} CAPACITY[{}] PENDING[{}] {}]", + "ARBITRATOR[{} CAPACITY[{}] {}]", kind_, succinctBytes(capacity_), - numPending_, statsLocked().toString()); } SharedArbitrator::ScopedArbitration::ScopedArbitration( SharedArbitrator* arbitrator, ArbitrationOperation* operation) - : operation_(operation), - arbitrator_(arbitrator), - arbitrationCtx_( - operation->requestPool == nullptr - ? std::make_unique() - : std::make_unique( - operation->requestPool)), + : arbitrator_(arbitrator), + operation_(operation), + arbitrationCtx_(operation->participant()->pool()), startTime_(std::chrono::steady_clock::now()) { VELOX_CHECK_NOT_NULL(arbitrator_); VELOX_CHECK_NOT_NULL(operation_); - if (arbitrator_->arbitrationStateCheckCb_ != nullptr && - operation_->requestPool != nullptr) { - arbitrator_->arbitrationStateCheckCb_(*operation_->requestPool); + if (arbitrator_->arbitrationStateCheckCb_ != nullptr) { + arbitrator_->arbitrationStateCheckCb_(*operation_->participant()->pool()); } arbitrator_->startArbitration(operation_); } SharedArbitrator::ScopedArbitration::~ScopedArbitration() { arbitrator_->finishArbitration(operation_); - - // Report arbitration operation stats. - const auto arbitrationTimeUs = - std::chrono::duration_cast( - std::chrono::steady_clock::now() - operation_->startTime) - .count(); - RECORD_HISTOGRAM_METRIC_VALUE( - kMetricArbitratorOpExecTimeMs, arbitrationTimeUs / 1'000); - addThreadLocalRuntimeStat( - kMemoryArbitrationWallNanos, - RuntimeCounter(arbitrationTimeUs * 1'000, RuntimeCounter::Unit::kNanos)); - if (operation_->localArbitrationQueueTimeUs != 0) { - addThreadLocalRuntimeStat( - kLocalArbitrationQueueWallNanos, - RuntimeCounter( - operation_->localArbitrationQueueTimeUs * 1'000, - RuntimeCounter::Unit::kNanos)); - } - if (operation_->localArbitrationLockWaitTimeUs != 0) { - addThreadLocalRuntimeStat( - kLocalArbitrationLockWaitWallNanos, - RuntimeCounter( - operation_->localArbitrationLockWaitTimeUs * 1'000, - RuntimeCounter::Unit::kNanos)); - } - if (operation_->globalArbitrationLockWaitTimeUs != 0) { - addThreadLocalRuntimeStat( - kGlobalArbitrationLockWaitWallNanos, - RuntimeCounter( - operation_->globalArbitrationLockWaitTimeUs * 1'000, - RuntimeCounter::Unit::kNanos)); - } } -void SharedArbitrator::startArbitration(ArbitrationOperation* op) { - updateArbitrationRequestStats(); - ContinueFuture waitPromise{ContinueFuture::makeEmpty()}; - { - std::lock_guard l(stateLock_); - ++numPending_; - if (op->requestPool != nullptr) { - auto it = arbitrationQueues_.find(op->requestPool); - if (it != arbitrationQueues_.end()) { - it->second->waitPromises.emplace_back( - fmt::format("Wait for arbitration {}", op->requestPool->name())); - waitPromise = it->second->waitPromises.back().getSemiFuture(); - } else { - arbitrationQueues_.emplace( - op->requestPool, std::make_unique(op)); - } - } - } - - TestValue::adjust( - "facebook::velox::memory::SharedArbitrator::startArbitration", this); - - if (waitPromise.valid()) { - uint64_t waitTimeUs{0}; - { - MicrosecondTimer timer(&waitTimeUs); - waitPromise.wait(); - } - op->localArbitrationQueueTimeUs += waitTimeUs; - } -} - -void SharedArbitrator::finishArbitration(ArbitrationOperation* op) { - ContinuePromise resumePromise{ContinuePromise::makeEmpty()}; - { - std::lock_guard l(stateLock_); - VELOX_CHECK_GT(numPending_, 0); - --numPending_; - if (op->requestPool != nullptr) { - auto it = arbitrationQueues_.find(op->requestPool); - 
VELOX_CHECK( - it != arbitrationQueues_.end(), - "{} not found", - op->requestPool->name()); - auto* runningArbitration = it->second.get(); - if (runningArbitration->waitPromises.empty()) { - arbitrationQueues_.erase(it); - } else { - resumePromise = std::move(runningArbitration->waitPromises.back()); - runningArbitration->waitPromises.pop_back(); - } - } - } - if (resumePromise.valid()) { - resumePromise.setValue(); - } +SharedArbitrator::GlobalArbitrationSection::GlobalArbitrationSection( + SharedArbitrator* arbitrator) + : arbitrator_(arbitrator) { + VELOX_CHECK_NOT_NULL(arbitrator_); + VELOX_CHECK(!arbitrator_->globalArbitrationRunning_); + arbitrator_->globalArbitrationRunning_ = true; } -bool SharedArbitrator::isUnderArbitrationLocked(MemoryPool* pool) const { - return arbitrationQueues_.count(pool) != 0; +SharedArbitrator::GlobalArbitrationSection::~GlobalArbitrationSection() { + VELOX_CHECK(arbitrator_->globalArbitrationRunning_); + arbitrator_->globalArbitrationRunning_ = false; + ; } std::string SharedArbitrator::kind() const { @@ -1175,10 +1350,11 @@ void SharedArbitrator::unregisterFactory() { MemoryArbitrator::unregisterFactory(kind_); } -void SharedArbitrator::incrementGlobalArbitrationCount() { - RECORD_METRIC_VALUE(kMetricArbitratorGlobalArbitrationCount); +void SharedArbitrator::incrementGlobalArbitrationWaitCount() { + RECORD_METRIC_VALUE(kMetricArbitratorGlobalArbitrationWaitCount); addThreadLocalRuntimeStat( - kGlobalArbitrationCount, RuntimeCounter(1, RuntimeCounter::Unit::kNone)); + kGlobalArbitrationWaitCount, + RuntimeCounter(1, RuntimeCounter::Unit::kNone)); } void SharedArbitrator::incrementLocalArbitrationCount() { diff --git a/velox/common/memory/SharedArbitrator.h b/velox/common/memory/SharedArbitrator.h index e52ff2094a49..00e5dbe3aa6f 100644 --- a/velox/common/memory/SharedArbitrator.h +++ b/velox/common/memory/SharedArbitrator.h @@ -18,23 +18,38 @@ #include +#include #include "velox/common/base/Counters.h" #include "velox/common/base/GTestMacros.h" #include "velox/common/base/StatsReporter.h" #include "velox/common/future/VeloxPromise.h" +#include "velox/common/memory/ArbitrationOperation.h" +#include "velox/common/memory/ArbitrationParticipant.h" #include "velox/common/memory/Memory.h" #include "velox/common/memory/MemoryArbitrator.h" namespace facebook::velox::memory { +namespace test { +class SharedArbitratorTestHelper; +} -/// Used to achieve dynamic memory sharing among running queries. When a +/// Used to achieve dynamic memory sharing among running queries. When a query /// memory pool exceeds its current memory capacity, the arbitrator tries to -/// grow its capacity by reclaim the overused memory from the query with -/// more memory usage. We can configure memory arbitrator the way to reclaim -/// memory. For Prestissimo, we can configure it to reclaim memory by -/// aborting a query. For Prestissimo-on-Spark, we can configure it to -/// reclaim from a running query through techniques such as disk-spilling, -/// partial aggregation or persistent shuffle data flushes. +/// grow its capacity through memory arbitration. If the query memory pool +/// exceeds its max memory capacity, then the arbitrator reclaims used memory +/// from the the query itself which is the local arbitration. If not, the +/// arbitrator tries to grow its capacity with the free unused capacity or +/// reclaim the unused memory from other running queries. 
If there is still
+/// not enough free capacity, the arbitrator kicks off global arbitration,
+/// which runs in the background to reclaim used memory from other running
+/// queries. The requesting query memory pool waits until global arbitration
+/// reclaims enough memory to grow its capacity, or fails if it exceeds the
+/// max arbitration time limit. The background global arbitration runs on a
+/// single thread while the actual memory reclaim is executed by a thread pool
+/// to parallelize memory reclamation from multiple running queries at the
+/// same time. Global arbitration first tries to reclaim memory by disk
+/// spilling and, if it can't quickly reclaim enough memory, it then switches
+/// to aborting the younger queries, which also have more memory usage.
 class SharedArbitrator : public memory::MemoryArbitrator {
  public:
   struct ExtraConfig {
@@ -68,7 +83,7 @@ class SharedArbitrator : public memory::MemoryArbitrator {
     /// timeout.
     static constexpr std::string_view kMemoryReclaimMaxWaitTime{
         "memory-reclaim-max-wait-time"};
-    static constexpr std::string_view kDefaultMemoryReclaimMaxWaitTime{"0ms"};
+    static constexpr std::string_view kDefaultMemoryReclaimMaxWaitTime{"5m"};
     static uint64_t memoryReclaimMaxWaitTimeMs(
         const std::unordered_map<std::string, std::string>& configs);
@@ -99,12 +114,33 @@ class SharedArbitrator : public memory::MemoryArbitrator {
     static double memoryPoolMinFreeCapacityPct(
         const std::unordered_map<std::string, std::string>& configs);
-    /// If true, it allows memory arbitrator to reclaim used memory cross query
-    /// memory pools.
-    static constexpr std::string_view kGlobalArbitrationEnabled{
-        "global-arbitration-enabled"};
-    static constexpr bool kDefaultGlobalArbitrationEnabled{false};
-    static bool globalArbitrationEnabled(
+    /// Specifies the minimum bytes to reclaim from a participant at a time.
+    /// Global arbitration also avoids reclaiming from a participant if its
+    /// reclaimable used capacity is less than this threshold. This prevents
+    /// inefficient memory reclaim operations on a participant with a small
+    /// reclaimable used capacity, which could produce a large number of small
+    /// spill files on disk.
+    static constexpr std::string_view kMemoryPoolMinReclaimBytes{
+        "memory-pool-min-reclaim-bytes"};
+    static constexpr std::string_view kDefaultMemoryPoolMinReclaimBytes{
+        "128MB"};
+    static uint64_t memoryPoolMinReclaimBytes(
+        const std::unordered_map<std::string, std::string>& configs);
+
+    /// Specifies the starting memory capacity limit for global arbitration to
+    /// search for a victim participant to reclaim used memory by abort. Among
+    /// participants with capacity larger than the limit, global arbitration
+    /// chooses to abort the youngest participant, i.e. the one with the
+    /// largest participant id. This helps let old queries run to completion.
+    /// The abort capacity limit is halved whenever no victim participant is
+    /// found, until it reaches zero.
+    ///
+    /// NOTE: the limit must be zero or a power of 2.
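+    /// For example, with the default limit of 1GB, the victim search is
+    /// expected to walk the limits 1GB, 512MB, 256MB, ... down to zero,
+    /// aborting at each step the youngest participant whose capacity exceeds
+    /// the current limit; once the limit reaches zero, any participant with
+    /// non-zero capacity becomes eligible.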
+    static constexpr std::string_view kMemoryPoolAbortCapacityLimit{
+        "memory-pool-abort-capacity-limit"};
+    static constexpr std::string_view kDefaultMemoryPoolAbortCapacityLimit{
+        "1GB"};
+    static uint64_t memoryPoolAbortCapacityLimit(
+        const std::unordered_map<std::string, std::string>& configs);
     /// When growing capacity, the growth bytes will be adjusted in the
@@ -138,6 +174,33 @@ class SharedArbitrator : public memory::MemoryArbitrator {
     static double slowCapacityGrowPct(
         const std::unordered_map<std::string, std::string>& configs);
+    /// If true, allows the memory arbitrator to reclaim used memory across
+    /// query memory pools.
+    static constexpr std::string_view kGlobalArbitrationEnabled{
+        "global-arbitration-enabled"};
+    static constexpr bool kDefaultGlobalArbitrationEnabled{true};
+    static bool globalArbitrationEnabled(
+        const std::unordered_map<std::string, std::string>& configs);
+
+    /// If not zero, specifies the minimum amount of memory to reclaim by
+    /// global memory arbitration as a percentage of the total arbitrator
+    /// memory capacity.
+    static constexpr std::string_view kGlobalArbitrationMemoryReclaimPct{
+        "global-arbitration-memory-reclaim-pct"};
+    static constexpr uint32_t kDefaultGlobalMemoryArbitrationReclaimPct{10};
+    static uint32_t globalArbitrationMemoryReclaimPct(
+        const std::unordered_map<std::string, std::string>& configs);
+
+    /// Floating point number used in calculating how many threads to use for
+    /// the global arbitration memory reclaim executor: hw_concurrency x
+    /// multiplier. The default is 0.5.
+    static constexpr std::string_view
+        kGlobalArbitrationReclaimThreadsHwMultiplier{
+            "global-arbitration-reclaim-threads-hw-multiplier"};
+    static constexpr double kDefaultGlobalArbitrationReclaimThreadsHwMultiplier{
+        0.5};
+    static double globalArbitrationReclaimThreadsHwMultiplier(
+        const std::unordered_map<std::string, std::string>& configs);
+
     /// If true, do sanity check on the arbitrator state on destruction.
     ///
     /// TODO: deprecate this flag after all the existing memory leak use cases
@@ -162,7 +225,8 @@ class SharedArbitrator : public memory::MemoryArbitrator {
   bool growCapacity(MemoryPool* pool, uint64_t requestBytes) final;
-  uint64_t shrinkCapacity(MemoryPool* pool, uint64_t requestBytes = 0) final;
+  /// NOTE: only supports shrinking away all the unused free capacity for now.
+  uint64_t shrinkCapacity(MemoryPool* pool, uint64_t requestBytes) final;
   uint64_t shrinkCapacity(
       uint64_t requestBytes,
@@ -175,145 +239,90 @@ class SharedArbitrator : public memory::MemoryArbitrator {
   std::string toString() const final;
-  /// Returns 'freeCapacity' back to the arbitrator for testing.
-  void testingFreeCapacity(uint64_t freeCapacity);
-
-  uint64_t testingNumRequests() const;
-
-  /// Enables/disables global arbitration accordingly.
-  void testingSetGlobalArbitration(bool enableGlobalArbitration) {
-    *const_cast<bool*>(&globalArbitrationEnabled_) = enableGlobalArbitration;
-  }
-
-  /// Operator level runtime stats that are reported during a shared arbitration
-  /// attempt.
+  /// Operator level runtime stats reported for an arbitration operation
+  /// execution.
static inline const std::string kMemoryArbitrationWallNanos{ "memoryArbitrationWallNanos"}; - static inline const std::string kGlobalArbitrationCount{ - "globalArbitrationCount"}; static inline const std::string kLocalArbitrationCount{ "localArbitrationCount"}; - static inline const std::string kLocalArbitrationQueueWallNanos{ - "localArbitrationQueueWallNanos"}; - static inline const std::string kLocalArbitrationLockWaitWallNanos{ - "localArbitrationLockWaitWallNanos"}; - static inline const std::string kGlobalArbitrationLockWaitWallNanos{ - "globalArbitrationLockWaitWallNanos"}; - - /// The candidate memory pool stats used by arbitration. - struct Candidate { - std::shared_ptr pool; - int64_t reclaimableBytes{0}; - int64_t freeBytes{0}; - int64_t reservedBytes{0}; - - std::string toString() const; - }; + static inline const std::string kLocalArbitrationWaitWallNanos{ + "localArbitrationWaitWallNanos"}; + static inline const std::string kLocalArbitrationExecutionWallNanos{ + "localArbitrationExecutionWallNanos"}; + static inline const std::string kGlobalArbitrationWaitCount{ + "globalArbitrationWaitCount"}; + static inline const std::string kGlobalArbitrationWaitWallNanos{ + "globalArbitrationWaitWallNanos"}; private: // The kind string of shared arbitrator. inline static const std::string kind_{"SHARED"}; - // Contains the execution state of an arbitration operation. - struct ArbitrationOperation { - MemoryPool* const requestPool; - const uint64_t requestBytes; - - // The adjusted grow bytes based on 'requestBytes'. This 'targetBytes' is a - // best effort target, and hence will not be guaranteed. The adjustment is - // based on 'SharedArbitrator::fastExponentialGrowthCapacityLimit_' - // 'SharedArbitrator::slowCapacityGrowPct_' - const std::optional targetBytes; - - // The start time of this arbitration operation. - const std::chrono::steady_clock::time_point startTime; - - // The candidate memory pools. - std::vector candidates; - - // The time that waits in local arbitration queue. - uint64_t localArbitrationQueueTimeUs{0}; - - // The time that waits to acquire the local arbitration lock. - uint64_t localArbitrationLockWaitTimeUs{0}; - - // The time that waits to acquire the global arbitration lock. - uint64_t globalArbitrationLockWaitTimeUs{0}; - - explicit ArbitrationOperation(uint64_t requestBytes) - : ArbitrationOperation(nullptr, requestBytes, std::nullopt) {} - - ArbitrationOperation( - MemoryPool* _requestor, - uint64_t _requestBytes, - std::optional _targetBytes) - : requestPool(_requestor), - requestBytes(_requestBytes), - targetBytes(_targetBytes), - startTime(std::chrono::steady_clock::now()) { - VELOX_CHECK(requestPool == nullptr || requestPool->isRoot()); - } - - uint64_t waitTimeUs() const { - return localArbitrationQueueTimeUs + localArbitrationLockWaitTimeUs + - globalArbitrationLockWaitTimeUs; - } - }; - - // Used to start and finish an arbitration operation initiated from a memory - // pool or memory capacity shrink request sent through shrinkPools() API. + // Used to manage an arbitration operation execution. It starts 'op' execution + // in ctor and finishes its exection in dtor. 
class ScopedArbitration { public: - ScopedArbitration(SharedArbitrator* arbitrator, ArbitrationOperation* op); + explicit ScopedArbitration( + SharedArbitrator* arbitrator, + ArbitrationOperation* op); ~ScopedArbitration(); private: - ArbitrationOperation* const operation_{nullptr}; SharedArbitrator* const arbitrator_; - const std::unique_ptr arbitrationCtx_; + ArbitrationOperation* const operation_; + const ScopedMemoryArbitrationContext arbitrationCtx_; const std::chrono::steady_clock::time_point startTime_; }; - // The arbitration running queue for arbitration requests from the same query - // pool. - struct ArbitrationQueue { - // Points to the current running arbitration. - ArbitrationOperation* current; - - // The promises of the arbitration requests from the same query pool waiting - // for the serial execution. - std::vector waitPromises; + class GlobalArbitrationSection { + public: + explicit GlobalArbitrationSection(SharedArbitrator* arbitrator); + ~GlobalArbitrationSection(); - explicit ArbitrationQueue(ArbitrationOperation* op) : current(op) { - VELOX_CHECK_NOT_NULL(current); - } + private: + SharedArbitrator* const arbitrator_; + const memory::ScopedMemoryArbitrationContext arbitrationCtx_{}; }; - // Invoked to check if the memory growth will exceed the memory pool's max - // capacity limit or the arbitrator's node capacity limit. - bool checkCapacityGrowth(ArbitrationOperation* op) const; + // Invoked to get the arbitration participant by 'name'. The function returns + // std::nullopt if the underlying query memory pool is destroyed. + std::optional getParticipant( + const std::string& name) const; - // Invoked to ensure the memory growth request won't exceed the request memory - // pool's max capacity as well as the arbitrator's node capacity. If it does, - // then we first need to reclaim the used memory from the request memory pool - // itself to ensure the memory growth won't exceed the capacity limit, and - // then proceed with the memory arbitration process across queries. - bool ensureCapacity(ArbitrationOperation* op); + // Invoked to create an operation for an arbitration request from given query + // memory 'pool'. + ArbitrationOperation createArbitrationOperation( + MemoryPool* pool, + uint64_t requestBytes); - // Invoked to reclaim the memory from the other query memory pools to grow the - // request memory pool's capacity. - bool arbitrateMemory(ArbitrationOperation* op); + // Run arbitration to grow capacity for 'op'. The function returns true on + // success. + bool growCapacity(ArbitrationOperation& op); + + // Gets the mim/max memory capacity growth targets for 'op' once after it + // starts to run. + void getGrowTargets( + ArbitrationOperation& op, + uint64_t& maxGrowTarget, + uint64_t& minGrowTarget); - // Invoked to start next memory arbitration request, and it will wait for - // the serialized execution if there is a running or other waiting - // arbitration requests. + // Invoked to start execution of 'op'. It waits for the serialized execution + // on the same arbitration participant and returns when 'op' is ready to run. void startArbitration(ArbitrationOperation* op); - // Invoked by a finished memory arbitration request to kick off the next - // arbitration request execution if there are any ones waiting. + // Invoked when 'op' has finished. The function kicks off the next arbitration + // operation waiting on the same participant to run if there is one. 
void finishArbitration(ArbitrationOperation* op); + // Invoked to check if the capacity growth exceeds the participant's max + // capacity limit or the arbitrator's capacity limit. + bool checkCapacityGrowth(ArbitrationOperation& op) const; + + // Invoked to ensure the capacity growth won't exceed the participant's max + // capacity limit by reclaiming used memory from the participant itself. + bool ensureCapacity(ArbitrationOperation& op); + // Invoked to run local arbitration on the request memory pool. It first // ensures the memory growth is within both memory pool and arbitrator // capacity limits. This step might reclaim the used memory from the request @@ -326,218 +335,274 @@ class SharedArbitrator : public memory::MemoryArbitrator { // returns false on failure. Otherwise, it needs to further check if // 'needGlobalArbitration' is true or not. If true, needs to proceed with the // global arbitration run. - bool runLocalArbitration( - ArbitrationOperation* op, - bool& needGlobalArbitration); + + // Invoked to initialize the global arbitration on arbitrator start-up. It + // starts the background threads to used memory from running queries + // on-demand. + void setupGlobalArbitration(); + + // Invoked to stop the global arbitration threads on shut-down. + void shutdownGlobalArbitration(); + + // The main function of the global arbitration control thread. + void globalArbitrationMain(); + + // Invoked by arbitration operation to wake up the global arbitration control + // thread to reclaim used memory when there is no free capacity in the system. + void wakeupGlobalArbitrationThread(); + + // Invoked by global arbitration control thread to run global arbitration. + void runGlobalArbitration(); + + // Invoked to get the global arbitration target in bytes. + uint64_t getGlobalArbitrationTarget(); // Invoked to run global arbitration to reclaim free or used memory from the // other queries. The global arbitration run is protected by the exclusive // lock of 'arbitrationLock_' for serial execution mode. The function returns // true on success, false on failure. - bool runGlobalArbitration(ArbitrationOperation* op); + bool startAndWaitGlobalArbitration(ArbitrationOperation& op); - // Gets the mim/max memory capacity growth targets for 'op'. The min and max - // targets are calculated based on memoryPoolReservedCapacity_ requirements - // and the pool's max capacity. - void getGrowTargets( - ArbitrationOperation* op, - uint64_t& maxGrowTarget, - uint64_t& minGrowTarget); + // Invoked to get stats of candidate participants for arbitration. If + // 'freeCapacityOnly' is true, then we only get reclaimable free capacity from + // each participant. + std::vector getCandidates( + bool freeCapacityOnly = false); - // Invoked to get or refresh the candidate memory pools for arbitration. If - // 'freeCapacityOnly' is true, then we only get free capacity stats for each - // candidate memory pool. - void getCandidates(ArbitrationOperation* op, bool freeCapacityOnly = false); + // Invoked to reclaim unused memory capacity from participants without + // actually freeing used memory. The function returns the actually reclaimed + // free capacity in bytes. + uint64_t reclaimUnusedCapacity(); // Sorts 'candidates' based on reclaimable free capacity in descending order. static void sortCandidatesByReclaimableFreeCapacity( - std::vector& candidates); + std::vector& candidates); + + // Invoked to reclaim the specified used memory capacity from one or more + // participants in parallel by spilling. 
'reclaimedParticipants' tracks the
+  // participants that have been reclaimed by spill across multiple global
+  // arbitration runs. 'failedParticipants' tracks the participants that have
+  // failed to reclaim any memory by spill. This could happen if there is an
+  // unknown bug or limitation in a specific spillable operator implementation.
+  // Correspondingly, global arbitration skips reclaiming from those
+  // participants in the next arbitration round. 'allParticipantsReclaimed'
+  // indicates if all participants have been reclaimed by spill so far. It is
+  // used by global arbitration to decide if it needs to switch to abort to
+  // reclaim used memory in the next arbitration round. The function returns
+  // the actually reclaimed used capacity in bytes.
+  //
+  // NOTE: the function sorts participants based on their reclaimable used
+  // memory capacity, and reclaims from participants with larger reclaimable
+  // used memory first.
+  uint64_t reclaimUsedMemoryBySpill(
+      uint64_t targetBytes,
+      std::unordered_set<uint64_t>& reclaimedParticipants,
+      std::unordered_set<uint64_t>& failedParticipants,
+      bool& allParticipantsReclaimed);
+
+  uint64_t reclaimUsedMemoryBySpill(uint64_t targetBytes);
   // Sorts 'candidates' based on reclaimable used capacity in descending order.
   static void sortCandidatesByReclaimableUsedCapacity(
-      std::vector<Candidate>& candidates);
-
-  // Sorts 'candidates' based on actual used memory in descending order.
-  static void sortCandidatesByUsage(std::vector<Candidate>& candidates);
-
-  // Finds the candidate with the largest capacity. For 'requestor', the
-  // capacity for comparison including its current capacity and the capacity to
-  // grow.
-  static const SharedArbitrator::Candidate& findCandidateWithLargestCapacity(
-      MemoryPool* requestor,
-      uint64_t targetBytes,
-      const std::vector<Candidate>& candidates);
-
-  // Invoked to reclaim free memory capacity from 'candidates' without
-  // actually freeing used memory.
-  //
-  // NOTE: the function might sort 'candidates' based on each candidate's free
-  // capacity internally.
-  uint64_t reclaimFreeMemoryFromCandidates(
-      ArbitrationOperation* op,
-      uint64_t reclaimTargetBytes,
-      bool isLocalArbitration);
-
-  // Invoked to reclaim used memory capacity from 'candidates' by spilling.
-  //
-  // NOTE: the function might sort 'candidates' based on each candidate's
-  // reclaimable memory internally.
-  void reclaimUsedMemoryFromCandidatesBySpill(
-      ArbitrationOperation* op,
-      uint64_t& freedBytes);
-
-  // Invoked to reclaim used memory capacity from 'candidates' by aborting the
-  // top memory users' queries.
-  void reclaimUsedMemoryFromCandidatesByAbort(
-      ArbitrationOperation* op,
-      uint64_t& freedBytes);
-
-  // Checks if request pool has been aborted or not.
-  void checkIfAborted(ArbitrationOperation* op);
-
-  // Checks if the request pool already has enough free capacity for the growth.
-  // This could happen if there are multiple arbitration operations from the
-  // same query. When the first served operation succeeds, it might have
-  // reserved enough capacity for the followup operations.
-  bool maybeGrowFromSelf(ArbitrationOperation* op);
-
-  // Invoked to grow 'pool' capacity by 'growBytes' and commit used reservation
-  // by 'reservationBytes'. The function throws if the growth fails.
- void - checkedGrow(MemoryPool* pool, uint64_t growBytes, uint64_t reservationBytes); - - // Invoked to reclaim used memory from 'targetPool' with specified + std::vector& candidates); + + // Invoked to reclaim the used memory capacity to abort the participant with + // the largest capacity to free up memory. The function returns the actually + // reclaimed capacity in bytes. The function returns zero if there is no + // eligible participant to abort. If 'force' is true, it picks up the youngest + // participant which has largest participant id to abort if there is no + // eligible one. + uint64_t reclaimUsedMemoryByAbort(bool force); + + // Finds the participant victim to abort to free used memory based on the + // participant's memory capacity and age. The function returns std::nullopt if + // there is no eligible candidate. If 'force' is true, it picks up the + // youngest participant to abort if there is no eligible one. + std::optional findAbortCandidate(bool force); + + // Invoked to use free capacity from arbitrator to grow participant's + // capacity. + bool growWithFreeCapacity(ArbitrationOperation& op); + + // Checks if the operation has been aborted or not. The function throws if + // aborted. + void checkIfAborted(ArbitrationOperation& op); + + // Checks if the operation has timed out or not. The function throws if timed + // out. + void checkIfTimeout(ArbitrationOperation& op); + + // Checks if the request participant already has enough free capacity for the + // growth. This could happen if there are multiple arbitration operations from + // the same participant. When the first served operation succeeds, it might + // have reserved enough capacity for the followup operations. + bool maybeGrowFromSelf(ArbitrationOperation& op); + + // Invoked to grow 'participant' capacity by 'growBytes' and commit used + // reservation by 'reservationBytes'. The function throws if the growth fails. + void checkedGrow( + const ScopedArbitrationParticipant& participant, + uint64_t growBytes, + uint64_t reservationBytes); + + // Invoked to reclaim used memory from 'participant' with specified // 'targetBytes'. The function returns the actually freed capacity. - // 'isLocalArbitration' is true when the reclaim attempt is within a local + // 'localArbitration' is true when the reclaim attempt is for a local // arbitration. uint64_t reclaim( - MemoryPool* targetPool, + const ScopedArbitrationParticipant& participant, uint64_t targetBytes, - bool isLocalArbitration) noexcept; - - // Invoked to abort memory 'pool'. - void abort(MemoryPool* pool, const std::exception_ptr& error); - - // Invoked to handle the memory arbitration failure to abort the memory pool - // with the largest capacity to free up memory. The function returns true on - // success and false if the requestor itself has been selected as the - // victim. We don't abort the requestor itself but just fails the - // arbitration to let the user decide to either proceed with the query or - // fail it. - bool handleOOM(ArbitrationOperation* op); - - // Decrements free capacity from the arbitrator with up to - // 'maxBytesToReserve'. The arbitrator might have less free available - // capacity. The function returns the actual decremented free capacity - // bytes. If 'minBytesToReserve' is not zero and there is less than - // 'minBytes' available in non-reserved capacity, then the arbitrator tries - // to decrement up to 'minBytes' from the reserved capacity. 
- uint64_t decrementFreeCapacity( - uint64_t maxBytesToReserve, - uint64_t minBytesToReserve); - uint64_t decrementFreeCapacityLocked( - uint64_t maxBytesToReserve, - uint64_t minBytesToReserve); - - // Increment free capacity by 'bytes'. - void incrementFreeCapacity(uint64_t bytes); - void incrementFreeCapacityLocked(uint64_t bytes); - // Increments the free reserved capacity up to 'bytes' until reaches to the - // reserved capacity limit. 'bytes' is updated accordingly. - void incrementFreeReservedCapacityLocked(uint64_t& bytes); - - void incrementGlobalArbitrationCount(); + uint64_t timeoutMs, + bool localArbitration) noexcept; + + uint64_t shrink( + const ScopedArbitrationParticipant& participant, + bool reclaimAll); + + // Invoked to abort 'participant' with 'error'. + uint64_t abort( + const ScopedArbitrationParticipant& participant, + const std::exception_ptr& error); + + // Allocates capacity for a given participant with 'requestBytes'. The + // arbitrator might allocate up to 'maxAllocateBytes'. If there is not enough + // capacity in non-reserved free capacity pool, then the arbitrator tries to + // allocate up to 'minAllocateBytes' from the reserved capacity pool. The + // function returns the allocated bytes. It is set to a value no less than + // 'requestBytes' on success and zero on failure. + uint64_t allocateCapacity( + uint64_t participantId, + uint64_t requestBytes, + uint64_t maxAllocateBytes, + uint64_t minAllocateBytes); + + uint64_t allocateCapacityLocked( + uint64_t participantId, + uint64_t requestBytes, + uint64_t maxAllocateBytes, + uint64_t minAllocateBytes); + + // Invoked to free capacity back to the arbitrator, and wake up the global + // arbitration waiters if there is sufficient free capacity. + void freeCapacity(uint64_t bytes); + + // 'resumes' contains the global arbitration waiters to resume. + void freeCapacityLocked( + uint64_t bytes, + std::vector& resumes); + + // Frees reserved capacity up to 'bytes' until reaches to the reserved + // capacity limit. 'bytes' is updated accordingly. + void freeReservedCapacityLocked(uint64_t& bytes); + + // Invoked by freeCapacity() to resume a set of oldest global arbitration + // waiters that could be fulfilled their global arbitration requests from + // current available free capacity. + void resumeGlobalArbitrationWaitersLocked( + std::vector& resumes); + + // Removes the arbitration operation with 'id' from the global arbitration + // wait list. It is invoked by participant abort or global arbitration wait + // time out. + void removeGlobalArbitrationWaiter(uint64_t id); + + // Increments the global arbitration wait count in both arbitrator and the + // corresponding operator's runtime stats. + void incrementGlobalArbitrationWaitCount(); + + // Increments the local arbitration count in both arbitrator and the + // corresponding operator's runtime stats. void incrementLocalArbitrationCount(); - std::string toStringLocked() const; + size_t numParticipants() const { + std::shared_lock l(participantLock_); + return participants_.size(); + } Stats statsLocked() const; - // Returns the max reclaimable capacity from 'pool' which includes both used - // and free capacities. If 'isSelfReclaim' true, we reclaim memory from the - // request pool itself so that we can bypass the reserved free capacity - // reclaim restriction. - int64_t maxReclaimableCapacity(const MemoryPool& pool, bool isSelfReclaim) - const; - - // Returns the free memory capacity that can be reclaimed from 'pool' by - // shrink. 
If 'isSelfReclaim' true, we reclaim memory from the request pool - // itself so that we can bypass the reserved free capacity reclaim - // restriction. - int64_t reclaimableFreeCapacity(const MemoryPool& pool, bool isSelfReclaim) - const; - - // Returns the used memory capacity that can be reclaimed from 'pool' by - // disk spill. If 'isSelfReclaim' true, we reclaim memory from the request - // pool itself so that we can bypass the reserved free capacity reclaim - // restriction. - int64_t reclaimableUsedCapacity(const MemoryPool& pool, bool isSelfReclaim) - const; - - // Returns the minimal amount of memory capacity to grow for 'pool' to have - // the reserved capacity as specified by 'memoryPoolReservedCapacity_'. - int64_t minGrowCapacity(const MemoryPool& pool) const; - - // The capacity growth target is set to have a coarser granularity. It can - // help to reduce the number of future grow calls, and hence reducing the - // number of unnecessary memory arbitration requests. - uint64_t getCapacityGrowthTarget( - const MemoryPool& pool, - uint64_t requestBytes) const; - - // The capacity shrink target is adjusted from request shrink bytes to give - // the memory pool more headroom free capacity after shrink. It can help to - // reduce the number of future grow calls, and hence reducing the number of - // unnecessary memory arbitration requests. - uint64_t getCapacityShrinkTarget( - const MemoryPool& pool, - uint64_t requestBytes) const; - - // Returns true if 'pool' is under memory arbitration. - bool isUnderArbitrationLocked(MemoryPool* pool) const; + void updateMemoryReclaimStats( + uint64_t reclaimedBytes, + uint64_t reclaimTimeMs, + bool localArbitration, + const MemoryReclaimer::Stats& stats); void updateArbitrationRequestStats(); void updateArbitrationFailureStats(); + void updateGlobalArbitrationStats( + uint64_t arbitrationTimeMs, + uint64_t arbitrationBytes); + const uint64_t reservedCapacity_; - const uint64_t memoryPoolInitialCapacity_; - const uint64_t memoryPoolReservedCapacity_; - const uint64_t memoryReclaimWaitMs_; + const uint64_t maxArbitrationTimeMs_; + const ArbitrationParticipant::Config participantConfig_; const bool globalArbitrationEnabled_; + const uint32_t globalArbitrationMemoryReclaimPct_; + const double globalArbitrationReclaimThreadsHwMultiplier_; const bool checkUsageLeak_; - const uint64_t fastExponentialGrowthCapacityLimit_; - const double slowCapacityGrowPct_; - const uint64_t memoryPoolMinFreeCapacity_; - const double memoryPoolMinFreeCapacityPct_; - - mutable folly::SharedMutex poolLock_; - std::unordered_map> candidates_; + std::atomic_uint64_t nextParticipantId_{0}; + mutable folly::SharedMutex participantLock_; + std::unordered_map> + participants_; - // Lock used to protect the arbitrator state. + // Lock used to protect the arbitrator's internal state. mutable std::mutex stateLock_; + tsan_atomic freeReservedCapacity_{0}; tsan_atomic freeNonReservedCapacity_{0}; - // Contains the arbitration running queues with one per each query memory - // pool. - std::unordered_map> - arbitrationQueues_; + bool globalArbitrationStop_{false}; + // Indicates if the global arbitration is currently running or not. + tsan_atomic globalArbitrationRunning_{false}; + + // The abort capacity limits, listed in descending order. They are used by + // global arbitration to choose the victim to abort: it starts with the + // largest limit and aborts the youngest participant whose capacity is larger + // than the limit.
+ // If there is no such participant, it goes to the next limit and so on. + std::vector globalArbitrationAbortCapacityLimits_; + // The executor used to reclaim memory from multiple participants in parallel + // in the background for global arbitration. + std::unique_ptr globalArbitrationExecutor_; + // The global arbitration control thread, which runs global arbitration in + // the background, dispatches the actual memory reclaim work on different + // participants to 'globalArbitrationExecutor_', and collects the results. + std::unique_ptr globalArbitrationController_; + // Signal used to wake up 'globalArbitrationController_' to run global + // arbitration on demand. + std::condition_variable globalArbitrationThreadCv_; + + // Records an arbitration operation waiting for global memory arbitration. + struct ArbitrationWait { + ArbitrationOperation* op; + ContinuePromise resumePromise; + uint64_t allocatedBytes{0}; + + ArbitrationWait(ArbitrationOperation* _op, ContinuePromise&& _resumePromise) + : op(_op), resumePromise(std::move(_resumePromise)) {} + }; + + // The map of global arbitration waiters. The key is the arbitration operation + // id, which is set to the id of the corresponding arbitration participant. + // This ensures that arbitration requests are satisfied in the order of the + // participants' age, with older participants served first. + std::map globalArbitrationWaiters_; - // R/W lock used to control local and global arbitration runs. A local - // arbitration run needs to hold a shared lock while the latter needs to hold - // an exclusive lock. Hence, multiple local arbitration runs from different - // query memory pools can run in parallel but the global ones has to run with - // one at a time. - mutable std::shared_mutex arbitrationLock_; + tsan_atomic globalArbitrationRuns_{0}; + tsan_atomic globalArbitrationTimeMs_{0}; + tsan_atomic globalArbitrationBytes_{0}; std::atomic_uint64_t numRequests_{0}; - std::atomic_uint32_t numPending_{0}; - tsan_atomic numAborted_{0}; + std::atomic_uint32_t numRunning_{0}; + std::atomic_uint64_t numAborted_{0}; std::atomic_uint64_t numFailures_{0}; - tsan_atomic reclaimedFreeBytes_{0}; - tsan_atomic reclaimedUsedBytes_{0}; - tsan_atomic numNonReclaimableAttempts_{0}; + std::atomic_uint64_t reclaimedFreeBytes_{0}; + std::atomic_uint64_t reclaimedUsedBytes_{0}; + std::atomic_uint64_t numNonReclaimableAttempts_{0}; + + friend class GlobalArbitrationSection; + friend class test::SharedArbitratorTestHelper; }; } // namespace facebook::velox::memory diff --git a/velox/common/memory/tests/ArbitrationParticipantTest.cpp b/velox/common/memory/tests/ArbitrationParticipantTest.cpp index cb2242f854d7..44d8629c3677 100644 --- a/velox/common/memory/tests/ArbitrationParticipantTest.cpp +++ b/velox/common/memory/tests/ArbitrationParticipantTest.cpp @@ -100,6 +100,8 @@ constexpr uint64_t kMemoryPoolReservedCapacity = 64 * MB; constexpr uint64_t kMemoryPoolMinFreeCapacity = 32 * MB; constexpr double kMemoryPoolMinFreeCapacityRatio = 0.25; constexpr uint64_t kFastExponentialGrowthCapacityLimit = 256 * MB; +constexpr uint64_t kMemoryPoolMinReclaimBytes = 0; +constexpr uint64_t kMemoryPoolAbortCapacityLimit = 0; constexpr double kSlowCapacityGrowRatio = 0.25; class MemoryReclaimer; @@ -407,14 +409,18 @@ static ArbitrationParticipant::Config arbitrationConfig( kFastExponentialGrowthCapacityLimit, double slowCapacityGrowRatio = kSlowCapacityGrowRatio, uint64_t minFreeCapacity = kMemoryPoolMinFreeCapacity, - double minFreeCapacityRatio = 
kMemoryPoolMinFreeCapacityRatio) { + double minFreeCapacityRatio = kMemoryPoolMinFreeCapacityRatio, + uint64_t minReclaimBytes = kMemoryPoolMinReclaimBytes, + uint64_t abortCapacityLimit = kMemoryPoolAbortCapacityLimit) { return ArbitrationParticipant::Config{ 0, minCapacity, fastExponentialGrowthCapacityLimit, slowCapacityGrowRatio, minFreeCapacity, - minFreeCapacityRatio}; + minFreeCapacityRatio, + minReclaimBytes, + abortCapacityLimit}; } TEST_F(ArbitrationParticipantTest, config) { @@ -425,18 +431,22 @@ TEST_F(ArbitrationParticipantTest, config) { double slowCapacityGrowRatio; uint64_t minFreeCapacity; double minFreeCapacityRatio; + uint64_t minReclaimBytes; + uint64_t abortCapacityLimit; bool expectedError; std::string expectedToString; std::string debugString() const { return fmt::format( - "initCapacity {}, minCapacity {}, fastExponentialGrowthCapacityLimit: {}, slowCapacityGrowRatio: {}, minFreeCapacity: {}, minFreeCapacityRatio: {}, expectedError: {}, expectedToString: {}", + "initCapacity {}, minCapacity {}, fastExponentialGrowthCapacityLimit {}, slowCapacityGrowRatio {}, minFreeCapacity {}, minFreeCapacityRatio {}, minReclaimBytes {}, abortCapacityLimit {}, expectedError {}, expectedToString: {}", succinctBytes(initCapacity), succinctBytes(minCapacity), succinctBytes(fastExponentialGrowthCapacityLimit), slowCapacityGrowRatio, succinctBytes(minFreeCapacity), minFreeCapacityRatio, + succinctBytes(minReclaimBytes), + succinctBytes(abortCapacityLimit), expectedError, expectedToString); } @@ -447,16 +457,20 @@ TEST_F(ArbitrationParticipantTest, config) { 0.1, 1, 0.1, + 1, + 2, false, - "initCapacity 1B, minCapacity 1B, fastExponentialGrowthCapacityLimit 1B, slowCapacityGrowRatio 0.1, minFreeCapacity 1B, minFreeCapacityRatio 0.1"}, + "initCapacity 1B, minCapacity 1B, fastExponentialGrowthCapacityLimit 1B, slowCapacityGrowRatio 0.1, minFreeCapacity 1B, minFreeCapacityRatio 0.1, minReclaimBytes 1B, abortCapacityLimit 2B"}, {0, 1, 0, 0, 1, 0.1, + 1, + 0, false, - "initCapacity 0B, minCapacity 1B, fastExponentialGrowthCapacityLimit 0B, slowCapacityGrowRatio 0, minFreeCapacity 1B, minFreeCapacityRatio 0.1"}, + "initCapacity 0B, minCapacity 1B, fastExponentialGrowthCapacityLimit 0B, slowCapacityGrowRatio 0, minFreeCapacity 1B, minFreeCapacityRatio 0.1, minReclaimBytes 1B, abortCapacityLimit 0B"}, {0, 1, 0, @@ -464,61 +478,77 @@ TEST_F(ArbitrationParticipantTest, config) { 0, 0, false, - "initCapacity 0B, minCapacity 1B, fastExponentialGrowthCapacityLimit 0B, slowCapacityGrowRatio 0, minFreeCapacity 0B, minFreeCapacityRatio 0"}, + 1, + 0, + "initCapacity 0B, minCapacity 1B, fastExponentialGrowthCapacityLimit 0B, slowCapacityGrowRatio 0, minFreeCapacity 0B, minFreeCapacityRatio 0, minReclaimBytes 0B, abortCapacityLimit 1B"}, {1, 1, 0, 0, 1, 0.1, + 1, + 0, false, - "initCapacity 1B, minCapacity 1B, fastExponentialGrowthCapacityLimit 0B, slowCapacityGrowRatio 0, minFreeCapacity 1B, minFreeCapacityRatio 0.1"}, + "initCapacity 1B, minCapacity 1B, fastExponentialGrowthCapacityLimit 0B, slowCapacityGrowRatio 0, minFreeCapacity 1B, minFreeCapacityRatio 0.1, minReclaimBytes 1B, abortCapacityLimit 0B"}, {1, 0, 1, 0.1, 1, 0.1, + 1, + 0, false, - "initCapacity 1B, minCapacity 0B, fastExponentialGrowthCapacityLimit 1B, slowCapacityGrowRatio 0.1, minFreeCapacity 1B, minFreeCapacityRatio 0.1"}, + "initCapacity 1B, minCapacity 0B, fastExponentialGrowthCapacityLimit 1B, slowCapacityGrowRatio 0.1, minFreeCapacity 1B, minFreeCapacityRatio 0.1, minReclaimBytes 1B, abortCapacityLimit 0B"}, {1, 0, 0, 0, 1, 0.1, + 
0, + 1, false, - "initCapacity 1B, minCapacity 0B, fastExponentialGrowthCapacityLimit 0B, slowCapacityGrowRatio 0, minFreeCapacity 1B, minFreeCapacityRatio 0.1"}, + "initCapacity 1B, minCapacity 0B, fastExponentialGrowthCapacityLimit 0B, slowCapacityGrowRatio 0, minFreeCapacity 1B, minFreeCapacityRatio 0.1, minReclaimBytes 0B, abortCapacityLimit 1B"}, {0, 0, 0, 0, 0, 0, + 1, + 0, false, - "initCapacity 0B, minCapacity 0B, fastExponentialGrowthCapacityLimit 0B, slowCapacityGrowRatio 0, minFreeCapacity 0B, minFreeCapacityRatio 0"}, + "initCapacity 0B, minCapacity 0B, fastExponentialGrowthCapacityLimit 0B, slowCapacityGrowRatio 0, minFreeCapacity 0B, minFreeCapacityRatio 0, minReclaimBytes 1B, abortCapacityLimit 0B"}, {0, 0, 0, 0, 1, 0.1, + 1, + 0, false, - "initCapacity 0B, minCapacity 0B, fastExponentialGrowthCapacityLimit 0B, slowCapacityGrowRatio 0, minFreeCapacity 1B, minFreeCapacityRatio 0.1"}, - {0, 1, 0, 0.1, 1, 0.1, true, ""}, - {0, 1, 1, 0.1, 0, 0.1, true, ""}, - {0, 1, 1, 0.1, 1, 0, true, ""}, + "initCapacity 0B, minCapacity 0B, fastExponentialGrowthCapacityLimit 0B, slowCapacityGrowRatio 0, minFreeCapacity 1B, minFreeCapacityRatio 0.1, minReclaimBytes 1B, abortCapacityLimit 0B"}, + {0, 1, 0, 0.1, 1, 0.1, 1, 2, true, ""}, + {0, 1, 1, 0.1, 0, 0.1, 1, 2, true, ""}, + {0, 1, 1, 0.1, 1, 0, 1, 2, true, ""}, {1, 1, 1, 2, 1, 0.1, + 0, + 0, false, - "initCapacity 1B, minCapacity 1B, fastExponentialGrowthCapacityLimit 1B, slowCapacityGrowRatio 2, minFreeCapacity 1B, minFreeCapacityRatio 0.1"}, - {0, 1, 1, -1, 1, 0.1, true, ""}, - {0, 1, 1, 0.1, 1, 2, true, ""}, - {0, 1, 1, 0.1, 1, -1, true, ""}}; + "initCapacity 1B, minCapacity 1B, fastExponentialGrowthCapacityLimit 1B, slowCapacityGrowRatio 2, minFreeCapacity 1B, minFreeCapacityRatio 0.1, minReclaimBytes 0B, abortCapacityLimit 0B"}, + {0, 1, 1, -1, 1, 0.1, 1, 0, true, ""}, + {0, 1, 1, 0.1, 1, 2, 1, 0, true, ""}, + {0, 1, 1, 0.1, 1, -1, 1, 0, true, ""}, + {0, 0, 0, 0, 1, 0.1, 0, 3, true, ""}, + {0, 0, 0, 0, 1, 0.1, 1, 3, true, ""}}; for (const auto& testData : testSettings) { SCOPED_TRACE(testData.debugString()); @@ -530,7 +560,9 @@ TEST_F(ArbitrationParticipantTest, config) { testData.fastExponentialGrowthCapacityLimit, testData.slowCapacityGrowRatio, testData.minFreeCapacity, - testData.minFreeCapacityRatio), + testData.minFreeCapacityRatio, + testData.minReclaimBytes, + testData.abortCapacityLimit), ""); continue; } @@ -540,7 +572,9 @@ TEST_F(ArbitrationParticipantTest, config) { testData.fastExponentialGrowthCapacityLimit, testData.slowCapacityGrowRatio, testData.minFreeCapacity, - testData.minFreeCapacityRatio); + testData.minFreeCapacityRatio, + testData.minReclaimBytes, + testData.abortCapacityLimit); ASSERT_EQ(testData.initCapacity, config.initCapacity); ASSERT_EQ(testData.minCapacity, config.minCapacity); ASSERT_EQ( @@ -549,6 +583,8 @@ TEST_F(ArbitrationParticipantTest, config) { ASSERT_EQ(testData.slowCapacityGrowRatio, config.slowCapacityGrowRatio); ASSERT_EQ(testData.minFreeCapacity, config.minFreeCapacity); ASSERT_EQ(testData.minFreeCapacityRatio, config.minFreeCapacityRatio); + ASSERT_EQ(testData.minReclaimBytes, config.minReclaimBytes); + ASSERT_EQ(testData.abortCapacityLimit, config.abortCapacityLimit); ASSERT_EQ(config.toString(), testData.expectedToString); } } @@ -867,6 +903,8 @@ TEST_F(ArbitrationParticipantTest, reclaimableFreeCapacityAndShrink) { } } +TEST_F(ArbitrationParticipantTest, minReclaimBytesStats) {} + TEST_F(ArbitrationParticipantTest, reclaimableUsedCapacityAndReclaim) { struct { uint64_t minCapacity; @@ 
-896,7 +934,7 @@ TEST_F(ArbitrationParticipantTest, reclaimableUsedCapacityAndReclaim) { {128 << 20, 0, 0.0, 128 << 20, 0, 0, 0, 0, 0}, {128 << 20, 0, 0.0, 128 << 20, 0, 32 << 20, 0, 0, 0}, {128 << 20, 0, 0.0, 128 << 20, 32 << 20, 0, 0, 0, 32 << 20}, - {64 << 20, 0, 0.0, 128 << 20, 96 << 20, 0, 64 << 20, 96 << 20, 32 << 20}, + {64 << 20, 0, 0.0, 128 << 20, 96 << 20, 0, 64 << 20, 64 << 20, 32 << 20}, {64 << 20, 0, 0.0, 128 << 20, 128 << 20, 0, 64 << 20, 64 << 20, 64 << 20}, {0, 32 << 20, 0.25, 128 << 20, 0, 0, 0, 0}, {0, 64 << 20, 0.25, 128 << 20, 0, 0, 0, 0}, @@ -940,7 +978,7 @@ TEST_F(ArbitrationParticipantTest, reclaimableUsedCapacityAndReclaim) { 64 << 20, 64 << 20, 32 << 20, - 96 << 20, + 32 << 20, 32 << 20}, {32 << 20, 32 << 20, @@ -949,7 +987,7 @@ TEST_F(ArbitrationParticipantTest, reclaimableUsedCapacityAndReclaim) { 256 << 20, 0, 224 << 20, - 224 << 20, + 192 << 20, 32 << 20}, {32 << 20, 64 << 20, @@ -958,7 +996,7 @@ TEST_F(ArbitrationParticipantTest, reclaimableUsedCapacityAndReclaim) { 256 << 20, 0, 224 << 20, - 224 << 20, + 192 << 20, 32 << 20}}; for (const auto& testData : testSettings) { SCOPED_TRACE(testData.debugString()); @@ -969,7 +1007,8 @@ TEST_F(ArbitrationParticipantTest, reclaimableUsedCapacityAndReclaim) { 0, 0.0, testData.minFreeCapacity, - testData.minFreeCapacityRatio); + testData.minFreeCapacityRatio, + 0); auto participant = ArbitrationParticipant::create(10, task->pool(), &config); auto scopedParticipant = participant->lock().value(); @@ -998,8 +1037,9 @@ TEST_F(ArbitrationParticipantTest, reclaimableUsedCapacityAndReclaim) { const auto targetBytes = scopedParticipant->reclaimableUsedCapacity(); const uint64_t prevReclaimedBytes = scopedParticipant->stats().reclaimedBytes; + memory::MemoryReclaimer::Stats stats; ASSERT_EQ( - scopedParticipant->reclaim(targetBytes, 1'000'000), + scopedParticipant->reclaim(targetBytes, 1'000'000, stats), testData.expectedActualReclaimedBytes); ASSERT_EQ( scopedParticipant->pool()->usedBytes(), testData.expectedUsedBytes); @@ -1317,7 +1357,8 @@ TEST_F(ArbitrationParticipantTest, abort) { ASSERT_TRUE(scopedParticipant->aborted()); ASSERT_EQ(scopedParticipant->capacity(), 0); - ASSERT_EQ(scopedParticipant->reclaim(MB, 1'000'000), 0); + memory::MemoryReclaimer::Stats stats; + ASSERT_EQ(scopedParticipant->reclaim(MB, 1'000'000, stats), 0); ASSERT_EQ(scopedParticipant->stats().numReclaims, prevNumReclaims + 1); ASSERT_EQ(scopedParticipant->stats().numShrinks, prevNumShrunks + 2); } @@ -1375,7 +1416,9 @@ DEBUG_ONLY_TEST_F(ArbitrationParticipantTest, reclaimLock) { std::atomic_bool reclaim1CompletedFlag{false}; folly::EventCount reclaim1CompletedWait; std::thread reclaimThread1([&]() { - ASSERT_EQ(scopedParticipant->reclaim(MB, 1'000'000), 0); + memory::MemoryReclaimer::Stats stats; + ASSERT_EQ(scopedParticipant->reclaim(MB, 1'000'000, stats), 0); + ASSERT_EQ(stats.numNonReclaimableAttempts, 0); reclaim1CompletedFlag = true; reclaim1CompletedWait.notifyAll(); }); @@ -1408,7 +1451,9 @@ DEBUG_ONLY_TEST_F(ArbitrationParticipantTest, reclaimLock) { std::atomic_bool reclaim2CompletedFlag{false}; folly::EventCount reclaim2CompletedWait; std::thread reclaimThread2([&]() { - ASSERT_EQ(scopedParticipant->reclaim(MB, 1'000'000), 0); + memory::MemoryReclaimer::Stats stats; + ASSERT_EQ(scopedParticipant->reclaim(MB, 1'000'000, stats), 0); + ASSERT_EQ(stats.numNonReclaimableAttempts, 0); reclaim2CompletedFlag = true; reclaim2CompletedWait.notifyAll(); }); @@ -1492,7 +1537,8 @@ DEBUG_ONLY_TEST_F(ArbitrationParticipantTest, waitForReclaimOrAbort) { 
std::thread reclaimThread([&]() { if (testData.pendingReclaim) { - ASSERT_EQ(scopedParticipant->reclaim(MB, 1'000'000), MB); + memory::MemoryReclaimer::Stats stats; + ASSERT_EQ(scopedParticipant->reclaim(MB, 1'000'000, stats), MB); } else { const std::string abortReason = "test abort"; try { @@ -1510,6 +1556,56 @@ DEBUG_ONLY_TEST_F(ArbitrationParticipantTest, waitForReclaimOrAbort) { } } +// This test verifies the aborted returns true until the participant has been +// aborted. +DEBUG_ONLY_TEST_F(ArbitrationParticipantTest, abortedCheck) { + std::atomic_bool abortWaitFlag{true}; + folly::EventCount abortWait; + SCOPED_TESTVALUE_SET( + "facebook::velox::memory::ArbitrationParticipant::abortLocked", + std::function( + ([&](ArbitrationParticipant* /*unused*/) { + if (!abortWaitFlag) { + return; + } + abortWait.await([&]() { return !abortWaitFlag.load(); }); + }))); + + auto task = createTask(kMemoryCapacity); + const auto config = arbitrationConfig(); + auto participant = ArbitrationParticipant::create(10, task->pool(), &config); + task->allocate(MB); + auto scopedParticipant = participant->lock().value(); + + std::thread abortThread1([&]() { + const std::string abortReason = "test abort1"; + try { + VELOX_FAIL(abortReason); + } catch (const VeloxRuntimeError& e) { + ASSERT_EQ(scopedParticipant->abort(std::current_exception()), MB); + } + }); + std::thread abortThread2([&]() { + std::this_thread::sleep_for(std::chrono::seconds(2)); // NOLINT + const std::string abortReason = "test abort2"; + try { + VELOX_FAIL(abortReason); + } catch (const VeloxRuntimeError& e) { + ASSERT_EQ(scopedParticipant->abort(std::current_exception()), 0); + } + }); + ASSERT_FALSE(scopedParticipant->aborted()); + std::this_thread::sleep_for(std::chrono::seconds(1)); // NOLINT + ASSERT_FALSE(scopedParticipant->aborted()); + abortWaitFlag = false; + abortWait.notifyAll(); + abortThread1.join(); + ASSERT_TRUE(scopedParticipant->aborted()); + abortThread2.join(); + ASSERT_TRUE(scopedParticipant->aborted()); + VELOX_ASSERT_THROW(task->allocate(MB), "test abort1"); +} + TEST_F(ArbitrationParticipantTest, capacityCheck) { auto task = createTask(256 << 20); const auto config = arbitrationConfig(512 << 20); @@ -1602,8 +1698,8 @@ TEST_F(ArbitrationParticipantTest, arbitrationOperation) { ASSERT_EQ(op.state(), ArbitrationOperation::State::kRunning); std::this_thread::sleep_for(std::chrono::milliseconds(200)); // NOLINT - op.startGlobalArbitration(); - VELOX_ASSERT_THROW(op.startGlobalArbitration(), ""); + op.recordGlobalArbitrationStartTime(); + VELOX_ASSERT_THROW(op.recordGlobalArbitrationStartTime(), ""); VELOX_ASSERT_THROW(op.stats(), "(running vs. 
finished)"); std::this_thread::sleep_for(std::chrono::milliseconds(200)); // NOLINT @@ -1611,7 +1707,7 @@ TEST_F(ArbitrationParticipantTest, arbitrationOperation) { ASSERT_EQ(op.state(), ArbitrationOperation::State::kFinished); ASSERT_FALSE(scopedParticipant->hasRunningOp()); ASSERT_EQ(scopedParticipant->numWaitingOps(), 0); - VELOX_ASSERT_THROW(op.startGlobalArbitration(), ""); + VELOX_ASSERT_THROW(op.recordGlobalArbitrationStartTime(), ""); ASSERT_FALSE(op.hasTimeout()); const auto execTimeMs = op.executionTimeMs(); std::this_thread::sleep_for(std::chrono::milliseconds(200)); // NOLINT @@ -1683,7 +1779,7 @@ TEST_F(ArbitrationParticipantTest, arbitrationOperationStats) { std::this_thread::sleep_for(std::chrono::milliseconds(200)); // NOLINT op.start(); std::this_thread::sleep_for(std::chrono::milliseconds(200)); // NOLINT - op.startGlobalArbitration(); + op.recordGlobalArbitrationStartTime(); std::this_thread::sleep_for(std::chrono::milliseconds(200)); // NOLINT op.finish(); const auto stats = op.stats(); diff --git a/velox/common/memory/tests/MemoryArbitratorTest.cpp b/velox/common/memory/tests/MemoryArbitratorTest.cpp index 20ec49ab0005..b4c790c7b856 100644 --- a/velox/common/memory/tests/MemoryArbitratorTest.cpp +++ b/velox/common/memory/tests/MemoryArbitratorTest.cpp @@ -148,18 +148,14 @@ TEST_F(MemoryArbitrationTest, queryMemoryCapacity) { "arbitration. Requestor pool name 'leaf-1.0', request size 7.00MB, " "memory pool capacity 4.00MB, memory pool max capacity 8.00MB"); ASSERT_EQ(manager.arbitrator()->shrinkCapacity(rootPool.get(), 0), 0); - ASSERT_EQ(manager.arbitrator()->shrinkCapacity(leafPool.get(), 0), 0); - ASSERT_EQ(manager.arbitrator()->shrinkCapacity(leafPool.get(), 1), 0); + VELOX_ASSERT_THROW( + manager.arbitrator()->shrinkCapacity(leafPool.get(), 0), ""); ASSERT_EQ(manager.arbitrator()->shrinkCapacity(rootPool.get(), 1), 0); ASSERT_EQ(rootPool->capacity(), 4 << 20); static_cast(rootPool.get())->testingSetReservation(0); ASSERT_EQ( - manager.arbitrator()->shrinkCapacity(leafPool.get(), 1 << 20), 1 << 20); - ASSERT_EQ( - manager.arbitrator()->shrinkCapacity(rootPool.get(), 1 << 20), 1 << 20); - ASSERT_EQ(rootPool->capacity(), 2 << 20); - ASSERT_EQ(leafPool->capacity(), 2 << 20); - ASSERT_EQ(manager.arbitrator()->shrinkCapacity(leafPool.get(), 0), 2 << 20); + manager.arbitrator()->shrinkCapacity(rootPool.get(), 1 << 20), 4 << 20); + ASSERT_EQ(manager.arbitrator()->shrinkCapacity(rootPool.get(), 1 << 20), 0); ASSERT_EQ(rootPool->capacity(), 0); ASSERT_EQ(leafPool->capacity(), 0); } diff --git a/velox/common/memory/tests/MemoryManagerTest.cpp b/velox/common/memory/tests/MemoryManagerTest.cpp index e139a2cc1858..0b01e1cc71f0 100644 --- a/velox/common/memory/tests/MemoryManagerTest.cpp +++ b/velox/common/memory/tests/MemoryManagerTest.cpp @@ -98,13 +98,7 @@ TEST_F(MemoryManagerTest, ctor) { ASSERT_EQ(arbitrator->stats().maxCapacityBytes, kCapacity); ASSERT_EQ( manager.toString(), - "Memory Manager[capacity 4.00GB alignment 64B usedBytes 0B number of " - "pools 2\nList of root pools:\n\t__sys_root__\n" - "Memory Allocator[MALLOC capacity 4.00GB allocated bytes 0 " - "allocated pages 0 mapped pages 0]\n" - "ARBITRATOR[SHARED CAPACITY[4.00GB] PENDING[0] " - "numRequests 0 numRunning 0 numSucceded 0 numAborted 0 numFailures 0 numNonReclaimableAttempts 0 " - "reclaimedFreeCapacity 0B reclaimedUsedCapacity 0B maxCapacity 4.00GB freeCapacity 4.00GB freeReservedCapacity 0B]]"); + "Memory Manager[capacity 4.00GB alignment 64B usedBytes 0B number of pools 2\nList of root 
pools:\n\t__sys_root__\nMemory Allocator[MALLOC capacity 4.00GB allocated bytes 0 allocated pages 0 mapped pages 0]\nARBITRATOR[SHARED CAPACITY[4.00GB] numRequests 0 numRunning 0 numSucceded 0 numAborted 0 numFailures 0 numNonReclaimableAttempts 0 reclaimedFreeCapacity 0B reclaimedUsedCapacity 0B maxCapacity 4.00GB freeCapacity 4.00GB freeReservedCapacity 0B]]"); } } @@ -652,14 +646,23 @@ TEST_F(MemoryManagerTest, disableMemoryPoolTracking) { options.allocatorCapacity = 64LL << 20; options.arbitratorCapacity = 64LL << 20; std::vector arbitratorKinds{kNoopKind, kSharedKind}; - for (auto arbitratorKind : arbitratorKinds) { + for (const auto& arbitratorKind : arbitratorKinds) { options.arbitratorKind = arbitratorKind; MemoryManager manager{options}; auto root0 = manager.addRootPool("root_0", 35LL << 20); auto leaf0 = root0->addLeafChild("leaf_0"); - // Not throwing since there is no duplicate check. - auto root0Dup = manager.addRootPool("root_0", 35LL << 20); + std::shared_ptr root0Dup; + if (arbitratorKind == kSharedKind) { + // NOTE: shared arbitrator has duplicate check inside. + VELOX_ASSERT_THROW( + manager.addRootPool("root_0", 35LL << 20), + "Memory pool root_0 already exists"); + continue; + } else { + // Not throwing since there is no duplicate check. + root0Dup = manager.addRootPool("root_0", 35LL << 20); + } // 1TB capacity is allowed since there is no capacity check. auto root1 = manager.addRootPool("root_1", 1LL << 40); diff --git a/velox/common/memory/tests/MemoryPoolTest.cpp b/velox/common/memory/tests/MemoryPoolTest.cpp index 22807ff96a75..aa263e94b4dd 100644 --- a/velox/common/memory/tests/MemoryPoolTest.cpp +++ b/velox/common/memory/tests/MemoryPoolTest.cpp @@ -3585,8 +3585,19 @@ TEST_P(MemoryPoolTest, abortAPI) { { auto rootPool = manager.addRootPool("abortAPI", capacity); ASSERT_FALSE(rootPool->aborted()); - VELOX_ASSERT_THROW(abortPool(rootPool.get()), ""); - ASSERT_FALSE(rootPool->aborted()); + abortPool(rootPool.get()); + ASSERT_TRUE(rootPool->aborted()); + auto leafPool = rootPool->addLeafChild("leafAbortAPI", true); + ASSERT_TRUE(leafPool->aborted()); + ASSERT_EQ(leafPool->capacity(), capacity); + if (capacity != kMaxMemory) { + VELOX_ASSERT_THROW( + leafPool->allocate(leafPool->capacity() + 1), + "Manual MemoryPool Abortion"); + } + VELOX_ASSERT_THROW( + abortPool(rootPool.get()), + "Trying to set another abort error on an already aborted pool."); } // The root memory pool with no child pool and default memory reclaimer. { @@ -3824,16 +3835,19 @@ TEST_P(MemoryPoolTest, abort) { // Abort the pool. 
ContinueFuture future; if (!hasReclaimer) { - VELOX_ASSERT_THROW(abortPool(leafPool.get()), ""); - VELOX_ASSERT_THROW(abortPool(aggregatePool.get()), ""); - VELOX_ASSERT_THROW(abortPool(rootPool.get()), ""); - ASSERT_FALSE(leafPool->aborted()); - ASSERT_FALSE(aggregatePool->aborted()); - ASSERT_FALSE(rootPool->aborted()); - leafPool->free(buf1, 128); - buf1 = leafPool->allocate(capacity / 2); - leafPool->free(buf1, capacity / 2); - continue; + abortPool(leafPool.get()); + ASSERT_TRUE(leafPool->aborted()); + VELOX_ASSERT_THROW( + abortPool(aggregatePool.get()), + "Trying to set another abort error on an already aborted pool."); + VELOX_ASSERT_THROW( + abortPool(rootPool.get()), + "Trying to set another abort error on an already aborted pool."); + ASSERT_TRUE(leafPool->aborted()); + ASSERT_TRUE(aggregatePool->aborted()); + ASSERT_TRUE(rootPool->aborted()); + VELOX_ASSERT_THROW( + leafPool->allocate(capacity / 2), "Manual MemoryPool Abortion"); } else { abortPool(leafPool.get()); } diff --git a/velox/common/memory/tests/MockSharedArbitratorTest.cpp b/velox/common/memory/tests/MockSharedArbitratorTest.cpp index dddeae735538..6111957a1bef 100644 --- a/velox/common/memory/tests/MockSharedArbitratorTest.cpp +++ b/velox/common/memory/tests/MockSharedArbitratorTest.cpp @@ -17,6 +17,7 @@ #include #include +#include #include #include #include @@ -27,6 +28,7 @@ #include "velox/common/memory/Memory.h" #include "velox/common/memory/MemoryArbitrator.h" #include "velox/common/memory/SharedArbitrator.h" +#include "velox/common/memory/tests/SharedArbitratorTestUtil.h" #include "velox/common/testutil/TestValue.h" #include "velox/exec/OperatorUtils.h" #include "velox/exec/tests/utils/PlanBuilder.h" @@ -69,12 +71,16 @@ constexpr uint64_t kFastExponentialGrowthCapacityLimit = 32 * MB; constexpr double kSlowCapacityGrowPct = 0.25; constexpr uint64_t kMemoryPoolMinFreeCapacity = 8 * MB; constexpr double kMemoryPoolMinFreeCapacityPct = 0.25; +// constexpr uint64_t kMemoryPoolMinReclaimBytes = 8 * MB; +// constexpr uint64_t kMemoryPoolAbortCapacityLimit = 16 * MB; +constexpr double kGlobalArbitrationReclaimPct = 10; +constexpr double kGlobalArbitrationReclaimThreadsHwMultiplier = 0.5; class MemoryReclaimer; class MockMemoryOperator; using ReclaimInjectionCallback = - std::function; + std::function; using ArbitrationInjectionCallback = std::function; struct Allocation { @@ -125,6 +131,10 @@ class MockTask : public std::enable_shared_from_this { return root_->capacity(); } + uint64_t usedBytes() const { + return root_->usedBytes(); + } + MockMemoryOperator* addMemoryOp( bool isReclaimable = true, ReclaimInjectionCallback reclaimInjectCb = nullptr, @@ -195,7 +205,10 @@ class MockMemoryOperator { return 0; } if (reclaimInjectCb_ != nullptr) { - reclaimInjectCb_(pool, targetBytes); + uint64_t injectedReclaimedBytes{0}; + if (!reclaimInjectCb_(pool, targetBytes)) { + return 0; + } } reclaimTargetBytes_.push_back(targetBytes); auto reclaimBytes = op_->reclaim(pool, targetBytes); @@ -423,16 +436,21 @@ class MockSharedArbitrationTest : public testing::Test { void setupMemory( int64_t memoryCapacity = kMemoryCapacity, - int64_t reservedMemoryCapacity = kReservedMemoryCapacity, - uint64_t memoryPoolInitCapacity = kMemoryPoolInitCapacity, - uint64_t memoryPoolReserveCapacity = kMemoryPoolReservedCapacity, - uint64_t fastExponentialGrowthCapacityLimit = - kFastExponentialGrowthCapacityLimit, - double slowCapacityGrowPct = kSlowCapacityGrowPct, - uint64_t memoryPoolMinFreeCapacity = kMemoryPoolMinFreeCapacity, - double 
memoryPoolMinFreeCapacityPct = kMemoryPoolMinFreeCapacityPct, + int64_t reservedMemoryCapacity = 0, + uint64_t memoryPoolInitCapacity = 0, + uint64_t memoryPoolReserveCapacity = 0, + uint64_t fastExponentialGrowthCapacityLimit = 0, + double slowCapacityGrowPct = 0, + uint64_t memoryPoolMinFreeCapacity = 0, + double memoryPoolMinFreeCapacityPct = 0, + uint64_t memoryPoolMinReclaimBytes = 0, + uint64_t memoryPoolAbortCapacityLimit = 0, + double globalArbitrationReclaimPct = 0, + double globalArbitrationReclaimThreadsHwMultiplier = + kGlobalArbitrationReclaimThreadsHwMultiplier, std::function arbitrationStateCheckCb = nullptr, - bool globalArtbitrationEnabled = true) { + bool globalArtbitrationEnabled = true, + uint64_t arbitrationTimeoutMs = 5 * 60 * 1'000) { MemoryManagerOptions options; options.allocatorCapacity = memoryCapacity; std::string arbitratorKind = "SHARED"; @@ -454,9 +472,18 @@ class MockSharedArbitrationTest : public testing::Test { folly::to(memoryPoolMinFreeCapacity) + "B"}, {std::string(ExtraConfig::kMemoryPoolMinFreeCapacityPct), folly::to(memoryPoolMinFreeCapacityPct)}, + {std::string(ExtraConfig::kMemoryPoolMinReclaimBytes), + folly::to(memoryPoolMinReclaimBytes) + "B"}, + {std::string(ExtraConfig::kMemoryPoolAbortCapacityLimit), + folly::to(memoryPoolAbortCapacityLimit) + "B"}, + {std::string(ExtraConfig::kGlobalArbitrationMemoryReclaimPct), + folly::to(globalArbitrationReclaimPct)}, + {std::string(ExtraConfig::kGlobalArbitrationReclaimThreadsHwMultiplier), + folly::to(globalArbitrationReclaimThreadsHwMultiplier)}, + {std::string(ExtraConfig::kMemoryReclaimMaxWaitTime), + folly::to(arbitrationTimeoutMs) + "ms"}, {std::string(ExtraConfig::kGlobalArbitrationEnabled), folly::to(globalArtbitrationEnabled)}}; - options.arbitrationStateCheckCb = std::move(arbitrationStateCheckCb); options.checkUsageLeak = true; manager_ = std::make_unique(options); @@ -547,13 +574,28 @@ TEST_F(MockSharedArbitrationTest, extraConfigs) { 256 << 20); ASSERT_EQ( SharedArbitrator::ExtraConfig::memoryReclaimMaxWaitTimeMs(emptyConfigs), - 0); + 300'000); ASSERT_EQ( SharedArbitrator::ExtraConfig::globalArbitrationEnabled(emptyConfigs), SharedArbitrator::ExtraConfig::kDefaultGlobalArbitrationEnabled); ASSERT_EQ( SharedArbitrator::ExtraConfig::checkUsageLeak(emptyConfigs), SharedArbitrator::ExtraConfig::kDefaultCheckUsageLeak); + ASSERT_EQ( + SharedArbitrator::ExtraConfig::memoryPoolMinReclaimBytes(emptyConfigs), + 128 << 20); + ASSERT_EQ( + SharedArbitrator::ExtraConfig::memoryPoolAbortCapacityLimit(emptyConfigs), + 1LL << 30); + ASSERT_EQ( + SharedArbitrator::ExtraConfig::globalArbitrationMemoryReclaimPct( + emptyConfigs), + SharedArbitrator::ExtraConfig::kDefaultGlobalMemoryArbitrationReclaimPct); + ASSERT_EQ( + SharedArbitrator::ExtraConfig:: + globalArbitrationReclaimThreadsHwMultiplier(emptyConfigs), + SharedArbitrator::ExtraConfig:: + kDefaultGlobalArbitrationReclaimThreadsHwMultiplier); // Testing custom values std::unordered_map configs; @@ -569,6 +611,16 @@ TEST_F(MockSharedArbitrationTest, extraConfigs) { SharedArbitrator::ExtraConfig::kGlobalArbitrationEnabled)] = "true"; configs[std::string(SharedArbitrator::ExtraConfig::kCheckUsageLeak)] = "false"; + configs[std::string( + SharedArbitrator::ExtraConfig::kMemoryPoolMinReclaimBytes)] = "64mb"; + configs[std::string( + SharedArbitrator::ExtraConfig::kMemoryPoolAbortCapacityLimit)] = "256mb"; + configs[std::string( + SharedArbitrator::ExtraConfig::kGlobalArbitrationMemoryReclaimPct)] = + "30"; + 
configs[std::string(SharedArbitrator::ExtraConfig:: + kGlobalArbitrationReclaimThreadsHwMultiplier)] = + "1.0"; ASSERT_EQ(SharedArbitrator::ExtraConfig::reservedCapacity(configs), 100); ASSERT_EQ( SharedArbitrator::ExtraConfig::memoryPoolInitialCapacity(configs), @@ -579,6 +631,19 @@ TEST_F(MockSharedArbitrationTest, extraConfigs) { SharedArbitrator::ExtraConfig::memoryReclaimMaxWaitTimeMs(configs), 5000); ASSERT_TRUE(SharedArbitrator::ExtraConfig::globalArbitrationEnabled(configs)); ASSERT_FALSE(SharedArbitrator::ExtraConfig::checkUsageLeak(configs)); + ASSERT_EQ( + SharedArbitrator::ExtraConfig::memoryPoolMinReclaimBytes(configs), + 64 << 20); + ASSERT_EQ( + SharedArbitrator::ExtraConfig::memoryPoolAbortCapacityLimit(configs), + 256 << 20); + ASSERT_EQ( + SharedArbitrator::ExtraConfig::globalArbitrationMemoryReclaimPct(configs), + 30); + ASSERT_EQ( + SharedArbitrator::ExtraConfig:: + globalArbitrationReclaimThreadsHwMultiplier(configs), + 1.0); // Testing invalid values configs[std::string(SharedArbitrator::ExtraConfig::kReservedCapacity)] = @@ -593,6 +658,18 @@ TEST_F(MockSharedArbitrationTest, extraConfigs) { SharedArbitrator::ExtraConfig::kGlobalArbitrationEnabled)] = "invalid"; configs[std::string(SharedArbitrator::ExtraConfig::kCheckUsageLeak)] = "invalid"; + configs[std::string( + SharedArbitrator::ExtraConfig::kMemoryPoolMinReclaimBytes)] = "invalid"; + configs[std::string( + SharedArbitrator::ExtraConfig::kMemoryPoolAbortCapacityLimit)] = + "invalid"; + configs[std::string( + SharedArbitrator::ExtraConfig::kGlobalArbitrationMemoryReclaimPct)] = + "invalid"; + configs[std::string(SharedArbitrator::ExtraConfig:: + kGlobalArbitrationReclaimThreadsHwMultiplier)] = + "invalid"; + VELOX_ASSERT_THROW( SharedArbitrator::ExtraConfig::reservedCapacity(configs), "Invalid capacity string 'invalid'"); @@ -611,9 +688,43 @@ TEST_F(MockSharedArbitrationTest, extraConfigs) { VELOX_ASSERT_THROW( SharedArbitrator::ExtraConfig::checkUsageLeak(configs), "Failed while parsing SharedArbitrator configs"); + VELOX_ASSERT_THROW( + SharedArbitrator::ExtraConfig::memoryPoolMinReclaimBytes(configs), + "Invalid capacity string 'invalid'"); + VELOX_ASSERT_THROW( + SharedArbitrator::ExtraConfig::memoryPoolMinReclaimBytes(configs), + "Invalid capacity string 'invalid'"); + VELOX_ASSERT_THROW( + SharedArbitrator::ExtraConfig::memoryPoolAbortCapacityLimit(configs), + "Invalid capacity string 'invalid'"); + VELOX_ASSERT_THROW( + SharedArbitrator::ExtraConfig::globalArbitrationMemoryReclaimPct(configs), + "Failed while parsing SharedArbitrator configs"); + VELOX_ASSERT_THROW( + SharedArbitrator::ExtraConfig:: + globalArbitrationReclaimThreadsHwMultiplier(configs), + "Failed while parsing SharedArbitrator configs"); + // Invalid global arbitration reclaim executor hw multiplier. + VELOX_ASSERT_THROW( + setupMemory(kMemoryCapacity, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1), + "globalArbitrationReclaimThreadsHwMultiplier_ needs to be positive"); + // Invalid global arbitration reclaim pct. + VELOX_ASSERT_THROW( + setupMemory(kMemoryCapacity, 0, 0, 0, 0, 0, 0, 0, 0, 0, 200), + "(200 vs. 100) Invalid globalArbitrationMemoryReclaimPct"); + // Invalid max memory arbitration time. + VELOX_ASSERT_THROW( + setupMemory( + kMemoryCapacity, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, nullptr, false, 0), + "(0 vs. 
0) maxArbitrationTimeMs can't be zero"); } TEST_F(MockSharedArbitrationTest, constructor) { + setupMemory( + kMemoryCapacity, + kReservedMemoryCapacity, + kMemoryPoolInitCapacity, + kMemoryPoolReservedCapacity); const int reservedCapacity = arbitrator_->stats().freeReservedCapacityBytes; const int nonReservedCapacity = arbitrator_->stats().freeCapacityBytes - reservedCapacity; @@ -650,7 +761,7 @@ TEST_F(MockSharedArbitrationTest, arbitrationStateCheck) { ASSERT_TRUE(RE2::FullMatch(pool.name(), re)) << pool.name(); ++checkCount; }; - setupMemory(memCapacity, 0, 0, 0, 0, 0, 0, 0, checkCountCb); + setupMemory(memCapacity, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1.0, checkCountCb); const int numTasks{5}; std::vector> tasks; @@ -675,7 +786,7 @@ TEST_F(MockSharedArbitrationTest, arbitrationStateCheck) { MemoryArbitrationStateCheckCB badCheckCb = [&](MemoryPool& /*unused*/) { VELOX_FAIL("bad check"); }; - setupMemory(memCapacity, 0, 0, 0, 0, 0, 0, 0, badCheckCb); + setupMemory(memCapacity, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1.0, badCheckCb); std::shared_ptr task = addTask(kMemoryCapacity); ASSERT_EQ(task->capacity(), 0); MockMemoryOperator* memOp = task->addMemoryOp(); @@ -689,8 +800,8 @@ TEST_F(MockSharedArbitrationTest, asyncArbitrationWork) { std::atomic_int reclaimedCount{0}; std::shared_ptr task = addTask(poolCapacity); - MockMemoryOperator* memoryOp = - addMemoryOp(task, true, [&](MemoryPool* pool, uint64_t /*unsed*/) { + MockMemoryOperator* memoryOp = addMemoryOp( + task, true, [&](MemoryPool* pool, uint64_t /*unsed*/) -> bool { struct Result { bool succeeded{true}; @@ -703,78 +814,87 @@ TEST_F(MockSharedArbitrationTest, asyncArbitrationWork) { executor_->add([&]() { asyncReclaimTask->prepare(); }); std::this_thread::sleep_for(std::chrono::seconds(1)); // NOLINT const auto result = asyncReclaimTask->move(); - ASSERT_TRUE(result->succeeded); + VELOX_CHECK(result->succeeded); memoryOp->freeAll(); ++reclaimedCount; + return true; }); memoryOp->allocate(poolCapacity); memoryOp->allocate(poolCapacity); ASSERT_EQ(reclaimedCount, 1); } -TEST_F(MockSharedArbitrationTest, arbitrationFailsTask) { - auto nonReclaimTask = addTask(328 * MB); - auto* nonReclaimOp = nonReclaimTask->addMemoryOp(false); - auto* buf = nonReclaimOp->allocate(320 * MB); - - // growTask is (192 + 128) = 320MB which is less than nonReclaimTask 384MB. - // This makes sure nonReclaimTask gets picked as the victim during - // handleOOM(). - auto growTask = addTask(328 * MB); - auto* growOp = growTask->addMemoryOp(false); - auto* bufGrow1 = growOp->allocate(64 * MB); - auto* bufGrow2 = growOp->allocate(128 * MB); - ASSERT_NE(nonReclaimTask->error(), nullptr); - try { - std::rethrow_exception(nonReclaimTask->error()); - } catch (const VeloxRuntimeError& e) { - ASSERT_EQ(velox::error_code::kMemAborted, e.errorCode()); - ASSERT_TRUE( - std::string(e.what()).find("aborted when requestor") != - std::string::npos); - } catch (...) { - FAIL(); - } - nonReclaimOp->freeAll(); - growOp->freeAll(); +// Test different kinds of arbitraton failures. +TEST_F(MockSharedArbitrationTest, arbitrationFailures) { + // Local arbitration failure with exceeded capacity limit. + { + auto task = addTask(64 * MB); + auto* op = task->addMemoryOp(false); + op->allocate(32 * MB); + VELOX_ASSERT_THROW( + op->allocate(64 * MB), + "Exceeded memory pool capacity after attempt to grow capacity"); + } + + // Global arbitration failure. 
+ { + auto task1 = addTask(kMemoryCapacity / 2); + auto* op1 = task1->addMemoryOp(false); + op1->allocate(kMemoryCapacity / 2); + + auto task2 = addTask(kMemoryCapacity / 2); + auto* op2 = task2->addMemoryOp(false); + op2->allocate(kMemoryCapacity / 4); + + auto task3 = addTask(kMemoryCapacity / 2); + auto* op3 = task3->addMemoryOp(false); + op3->allocate(kMemoryCapacity / 4); + VELOX_ASSERT_THROW(op3->allocate(kMemoryCapacity / 4), "aborted"); + try { + std::rethrow_exception(task3->error()); + } catch (const VeloxRuntimeError& e) { + ASSERT_EQ(velox::error_code::kMemAborted, e.errorCode()); + ASSERT_TRUE( + std::string(e.what()).find( + "Memory pool aborted to reclaim used memory") != + std::string::npos) + << e.what(); + } catch (...) { + FAIL(); + } + } } TEST_F(MockSharedArbitrationTest, shrinkPools) { - const int64_t memoryCapacity = 32 << 20; - const int64_t reservedMemoryCapacity = 8 << 20; - const uint64_t memoryPoolInitCapacity = 8 << 20; - const uint64_t memoryPoolReserveCapacity = 2 << 20; - setupMemory( - memoryCapacity, - reservedMemoryCapacity, - memoryPoolInitCapacity, - memoryPoolReserveCapacity); + const int64_t memoryCapacity = 256 << 20; + const int64_t memoryPoolCapacity = 64 << 20; struct TestTask { uint64_t capacity{0}; bool reclaimable{false}; uint64_t allocateBytes{0}; - uint64_t expectedInitialCapacity{0}; + uint64_t expectedCapacityAfterShrink; + uint64_t expectedUsagedAfterShrink; bool expectedAbortAfterShrink{false}; std::string debugString() const { return fmt::format( - "capacity: {}, reclaimable: {}, allocateBytes: {}, expectedInitialCapacity: {}, expectedAbortAfterShrink: {}", + "capacity: {}, reclaimable: {}, allocateBytes: {}, expectedCapacityAfterShrink: {}, expectedUsagedAfterShrink: {}, expectedAbortAfterShrink: {}", succinctBytes(capacity), reclaimable, succinctBytes(allocateBytes), - succinctBytes(expectedInitialCapacity), + succinctBytes(expectedCapacityAfterShrink), + succinctBytes(expectedUsagedAfterShrink), expectedAbortAfterShrink); } }; struct { std::vector testTasks; + uint64_t memoryPoolInitCapacity; uint64_t targetBytes; - uint64_t expectedFreedBytes; - uint64_t expectedFreeCapacity; - uint64_t expectedReservedFreeCapacity; + uint64_t expectedReclaimedUsedBytes; bool allowSpill; bool allowAbort; @@ -786,266 +906,228 @@ TEST_F(MockSharedArbitrationTest, shrinkPools) { tasksOss << "], \n"; } return fmt::format( - "testTasks: \n[{}], \ntargetBytes: {}, \nexpectedFreedBytes: {}, " - "\nexpectedFreeCapacity: {}, \nexpectedReservedFreeCapacity: {}, \n" - "allowSpill: {}, \nallowAbort: {}", + "testTasks: \n[{}], \ntargetBytes: {}, expectedReclaimedUsedBytes: {}, " + "allowSpill: {}, allowAbort: {}", tasksOss.str(), succinctBytes(targetBytes), - succinctBytes(expectedFreedBytes), - succinctBytes(expectedFreeCapacity), - succinctBytes(expectedReservedFreeCapacity), + succinctBytes(expectedReclaimedUsedBytes), allowSpill, allowAbort); } } testSettings[] = { - {{{memoryPoolInitCapacity, + {{{memoryPoolCapacity, false, - memoryPoolInitCapacity, - memoryPoolInitCapacity, + memoryPoolCapacity, + memoryPoolCapacity, + memoryPoolCapacity, false}, - {memoryPoolInitCapacity, true, 0, memoryPoolInitCapacity, false}, - {memoryPoolInitCapacity, false, 0, memoryPoolInitCapacity, false}, - {memoryPoolInitCapacity, - true, - memoryPoolReserveCapacity, - memoryPoolReserveCapacity, + {memoryPoolCapacity, + false, + memoryPoolCapacity, + memoryPoolCapacity, + memoryPoolCapacity, + false}, + {memoryPoolCapacity, + false, + memoryPoolCapacity, + memoryPoolCapacity, + 
memoryPoolCapacity, + false}, + {memoryPoolCapacity, + false, + memoryPoolCapacity, + memoryPoolCapacity, + memoryPoolCapacity, false}}, + memoryPoolCapacity, 0, 0, - 6 << 20, - 6 << 20, true, false}, - - {{{memoryPoolInitCapacity, - true, - memoryPoolInitCapacity, - memoryPoolInitCapacity, - false}, - {memoryPoolInitCapacity, true, 0, memoryPoolInitCapacity, false}, - {memoryPoolInitCapacity, false, 0, memoryPoolInitCapacity, false}, - {memoryPoolInitCapacity, - true, - memoryPoolReserveCapacity, - memoryPoolReserveCapacity, - false}}, + {{{memoryPoolCapacity, true, memoryPoolCapacity, 0, 0, false}, + {memoryPoolCapacity, true, memoryPoolCapacity, 0, 0, false}, + {memoryPoolCapacity, true, memoryPoolCapacity, 0, 0, false}, + {memoryPoolCapacity, true, memoryPoolCapacity, 0, 0, false}}, + memoryPoolCapacity, 0, - 8 << 20, - 14 << 20, - reservedMemoryCapacity, + memoryCapacity, true, false}, - - {{{memoryPoolInitCapacity, - true, - memoryPoolInitCapacity, - memoryPoolInitCapacity, + {{{memoryPoolCapacity, true, memoryPoolCapacity, 0, 0, false}, + {memoryPoolCapacity, + false, + memoryPoolCapacity, + memoryPoolCapacity, + memoryPoolCapacity, false}, - {memoryPoolInitCapacity, true, 0, memoryPoolInitCapacity, false}, - {memoryPoolInitCapacity, false, 0, memoryPoolInitCapacity, false}, - {memoryPoolInitCapacity, - true, - memoryPoolReserveCapacity, - memoryPoolReserveCapacity, + {memoryPoolCapacity, true, memoryPoolCapacity, 0, 0, false}, + {memoryPoolCapacity, + false, + memoryPoolCapacity, + memoryPoolCapacity, + memoryPoolCapacity, false}}, + memoryPoolCapacity, 0, - 0, - 6 << 20, - 6 << 20, - false, + memoryCapacity / 2, + true, false}, - - {{{memoryPoolInitCapacity, + {{{memoryPoolCapacity, true, memoryPoolCapacity / 2, 0, 0, false}, + {memoryPoolCapacity, false, - memoryPoolInitCapacity, - memoryPoolInitCapacity, + memoryPoolCapacity, + memoryPoolCapacity, + memoryPoolCapacity, false}, - {memoryPoolInitCapacity, true, 0, memoryPoolInitCapacity, false}, - {memoryPoolInitCapacity, false, 0, memoryPoolInitCapacity, false}, - {memoryPoolInitCapacity, - true, - memoryPoolReserveCapacity, - memoryPoolReserveCapacity, + {memoryPoolCapacity, true, memoryPoolCapacity, 0, 0, false}, + {memoryPoolCapacity, + false, + memoryPoolCapacity, + memoryPoolCapacity, + memoryPoolCapacity, false}}, + memoryPoolCapacity, 0, - 0, - 6 << 20, - 6 << 20, + memoryCapacity / 2, true, false}, - - {{{memoryPoolInitCapacity, true, 0, memoryPoolInitCapacity, true}, - {memoryPoolInitCapacity, true, 0, memoryPoolInitCapacity, true}, - {memoryPoolInitCapacity, false, 0, memoryPoolInitCapacity, true}, - {memoryPoolInitCapacity, true, 0, memoryPoolReserveCapacity, true}}, - 0, - 26 << 20, + {{{memoryPoolCapacity, true, memoryPoolCapacity, 0, 0, false}, + {memoryPoolCapacity, true, memoryPoolCapacity, 0, 0, false}, + {memoryPoolCapacity, true, memoryPoolCapacity / 2, 0, 0, false}, + {memoryPoolCapacity, true, memoryPoolCapacity / 2, 0, 0, false}}, + memoryPoolCapacity, memoryCapacity, - reservedMemoryCapacity, - false, - true}, - - {{{memoryPoolInitCapacity, - true, - memoryPoolInitCapacity, - memoryPoolInitCapacity, - false}, - {memoryPoolInitCapacity, true, 0, memoryPoolInitCapacity, true}, - {memoryPoolInitCapacity, false, 0, memoryPoolInitCapacity, true}, - {memoryPoolInitCapacity, - true, - memoryPoolReserveCapacity, - memoryPoolReserveCapacity, - true}}, - 0, - 26 << 20, memoryCapacity, - reservedMemoryCapacity, true, - true}, - - {{{memoryPoolInitCapacity, true, 0, memoryPoolInitCapacity, false}, - 
{memoryPoolInitCapacity, true, 0, memoryPoolInitCapacity, false}, - {memoryPoolInitCapacity, false, 0, memoryPoolInitCapacity, false}, - {memoryPoolInitCapacity, true, 0, memoryPoolReserveCapacity, false}}, - 16 << 20, - 0, - 6 << 20, - 6 << 20, - false, false}, - - {{{memoryPoolInitCapacity, true, 0, memoryPoolInitCapacity, false}, - {memoryPoolInitCapacity, false, 1 << 10, memoryPoolInitCapacity, true}, - {memoryPoolInitCapacity, false, 1 << 10, memoryPoolInitCapacity, true}, - {memoryPoolInitCapacity, true, 0, memoryPoolReserveCapacity, false}}, - 16 << 20, - 16 << 20, - 22 << 20, - reservedMemoryCapacity, - true, - true}, - - {{{memoryPoolInitCapacity, true, 0, memoryPoolInitCapacity, false}, - {memoryPoolInitCapacity, false, 1 << 10, memoryPoolInitCapacity, true}, - {memoryPoolInitCapacity, false, 1 << 10, memoryPoolInitCapacity, true}, - {memoryPoolInitCapacity, true, 0, memoryPoolReserveCapacity, false}}, - 14 << 20, - 16 << 20, - 22 << 20, - reservedMemoryCapacity, - true, - true}, - - {{{memoryPoolInitCapacity, - true, - memoryPoolInitCapacity, - memoryPoolInitCapacity, - false}, - {memoryPoolInitCapacity, + {{{memoryPoolCapacity, true, memoryPoolCapacity, 0, 0, false}, + {memoryPoolCapacity, true, memoryPoolCapacity, 0, 0, false}, + {memoryPoolCapacity, true, - memoryPoolInitCapacity, - memoryPoolInitCapacity, + memoryPoolCapacity / 2, + memoryPoolCapacity, + memoryPoolCapacity / 2, false}, - {memoryPoolInitCapacity, - false, - memoryPoolInitCapacity, - memoryPoolInitCapacity, - false}, - {memoryPoolInitCapacity, + {memoryPoolCapacity, true, - memoryPoolReserveCapacity, - memoryPoolReserveCapacity, + memoryPoolCapacity / 2, + memoryPoolCapacity, + memoryPoolCapacity / 2, false}}, - 12 << 20, - 12 << 20, - 18 << 20, - reservedMemoryCapacity, + memoryPoolCapacity, + memoryCapacity / 2, + memoryCapacity / 2, true, false}, - - {{{memoryPoolInitCapacity, - true, - memoryPoolInitCapacity, - memoryPoolInitCapacity, - true}, - {memoryPoolInitCapacity, + {{{memoryPoolCapacity, true, memoryPoolCapacity, 0, 0, false}, + {memoryPoolCapacity, true, memoryPoolCapacity, 0, 0, false}, + {memoryPoolCapacity, true, - memoryPoolInitCapacity, - memoryPoolInitCapacity, - true}, - {memoryPoolInitCapacity, - false, - memoryPoolInitCapacity, - memoryPoolInitCapacity, - true}, - {memoryPoolInitCapacity, + memoryPoolCapacity / 2, + memoryPoolCapacity, + memoryPoolCapacity / 2, + false}, + {memoryPoolCapacity, true, - memoryPoolReserveCapacity, - memoryPoolReserveCapacity, + memoryPoolCapacity / 2, + memoryPoolCapacity, + memoryPoolCapacity / 2, false}}, - 24 << 20, - 24 << 20, - 30 << 20, - reservedMemoryCapacity, - false, + memoryPoolCapacity, + memoryCapacity / 2, + memoryCapacity / 2, + true, true}, - - {{{memoryPoolInitCapacity, - false, - memoryPoolInitCapacity, - memoryPoolInitCapacity, - false}, - {memoryPoolInitCapacity, + {{{memoryPoolCapacity, true, memoryPoolCapacity, 0, 0, false}, + {memoryPoolCapacity, true, memoryPoolCapacity, 0, 0, false}, + {memoryPoolCapacity, true, memoryPoolCapacity, 0, 0, false}, + {memoryPoolCapacity, true, memoryPoolCapacity, 0, 0, false}}, + memoryPoolCapacity, + 0, + memoryCapacity, + true, + true}, + {{{memoryPoolCapacity, false, - memoryPoolInitCapacity, - memoryPoolInitCapacity, + memoryPoolCapacity, + memoryPoolCapacity, + memoryPoolCapacity, false}, - {memoryPoolInitCapacity, + {memoryPoolCapacity, false, - memoryPoolInitCapacity, - memoryPoolInitCapacity, + memoryPoolCapacity, + memoryPoolCapacity, + memoryPoolCapacity, false}, - {memoryPoolInitCapacity, - 
false, - memoryPoolReserveCapacity, - memoryPoolReserveCapacity, - false}}, - 14 << 20, - 0, - 6 << 20, - 6 << 20, + {memoryPoolCapacity, true, memoryPoolCapacity / 2, 0, 0, false}, + {memoryPoolCapacity, true, memoryPoolCapacity / 2, 0, 0, false}}, + memoryPoolCapacity, + memoryCapacity / 2, + memoryCapacity / 2, true, - false}, - - {{{memoryPoolInitCapacity, - false, - memoryPoolInitCapacity, - memoryPoolInitCapacity, - false}, - {memoryPoolInitCapacity, + true}, + {{{memoryPoolCapacity, false, - memoryPoolInitCapacity, - memoryPoolInitCapacity, + memoryPoolCapacity, + memoryPoolCapacity, + memoryPoolCapacity, false}, - {memoryPoolInitCapacity, + // Global arbitration choose to abort the younger participant with same + // capacity bucket. + {memoryPoolCapacity, false, memoryPoolCapacity, 0, 0, true}, + {memoryPoolCapacity, true, memoryPoolCapacity / 2, 0, 0, false}, + {memoryPoolCapacity, true, memoryPoolCapacity / 2, 0, 0, false}}, + memoryPoolCapacity, + memoryCapacity / 2 + memoryPoolCapacity, + memoryCapacity / 2 + memoryPoolCapacity, + true, + true}, + {{{memoryPoolCapacity, true, memoryPoolCapacity, 0, 0, false}, + {memoryPoolCapacity, true, memoryPoolCapacity, 0, 0, false}, + {memoryPoolCapacity, false, - memoryPoolInitCapacity, - memoryPoolInitCapacity, + memoryPoolCapacity / 2, + memoryPoolCapacity, + memoryPoolCapacity / 2, false}, - {memoryPoolInitCapacity, + // Global arbitration choose to abort the younger participant with same + // capacity bucket. + {memoryPoolCapacity, false, memoryPoolCapacity / 2, 0, 0, true}}, + memoryPoolCapacity, + memoryCapacity / 2 + memoryPoolCapacity / 2, + memoryCapacity / 2 + memoryPoolCapacity, + true, + true}, + + {{{memoryPoolCapacity, true, - memoryPoolReserveCapacity, - memoryPoolReserveCapacity, - false}}, - 14 << 20, + memoryPoolCapacity, + memoryPoolCapacity, + memoryPoolCapacity, + false}, + {memoryPoolCapacity, true, memoryPoolCapacity, 0, 0, true}, + {memoryPoolCapacity, false, memoryPoolCapacity / 2, 0, 0, true}, + // Global arbitration choose to abort the younger participant with same + // capacity bucket. + {memoryPoolCapacity, false, memoryPoolCapacity / 2, 0, 0, true}}, + memoryPoolCapacity, + memoryCapacity / 2 + memoryPoolCapacity / 2, + memoryCapacity / 2 + memoryPoolCapacity, + false, + true}, + {{{memoryPoolCapacity, true, memoryPoolCapacity, 0, 0, true}, + {memoryPoolCapacity, true, memoryPoolCapacity, 0, 0, true}, + {memoryPoolCapacity, false, memoryPoolCapacity / 2, 0, 0, true}, + // Global arbitration choose to abort the younger participant with same + // capacity bucket. + {memoryPoolCapacity, false, memoryPoolCapacity / 2, 0, 0, true}}, + memoryPoolCapacity, 0, - 6 << 20, - 6 << 20, - true, - false}}; + memoryCapacity, + false, + true}}; - struct MockTaskContainer { + struct TestTaskContainer { std::shared_ptr task; MockMemoryOperator* op; TestTask testTask; @@ -1060,28 +1142,34 @@ TEST_F(MockSharedArbitrationTest, shrinkPools) { ASSERT_NE(task->error(), nullptr); VELOX_ASSERT_THROW( std::rethrow_exception(task->error()), - "Memory pool aborted to reclaim used memory, current usage"); + "Memory pool aborted to reclaim used memory, current capacity"); }; for (const auto& testData : testSettings) { SCOPED_TRACE(testData.debugString()); - std::vector taskContainers; + // Make simple settings to focus shrink capacity logic testing. 
+ setupMemory(memoryCapacity, 0, testData.memoryPoolInitCapacity); + std::vector taskContainers; for (const auto& testTask : testData.testTasks) { auto task = addTask(testTask.capacity); auto* op = addMemoryOp(task, testTask.reclaimable); - ASSERT_EQ(op->capacity(), testTask.expectedInitialCapacity); + ASSERT_EQ(op->capacity(), testTask.capacity); if (testTask.allocateBytes != 0) { op->allocate(testTask.allocateBytes); } - ASSERT_LE(op->capacity(), testTask.capacity); + ASSERT_EQ(task->capacity(), testTask.capacity); + ASSERT_LE(task->usedBytes(), testTask.capacity); taskContainers.push_back({task, op, testTask}); } ASSERT_EQ( manager_->shrinkPools( testData.targetBytes, testData.allowSpill, testData.allowAbort), - testData.expectedFreedBytes); + testData.expectedReclaimedUsedBytes); + ASSERT_EQ( + arbitrator_->stats().reclaimedUsedBytes, + testData.expectedReclaimedUsedBytes); for (const auto& taskContainer : taskContainers) { checkTaskException( @@ -1089,29 +1177,22 @@ TEST_F(MockSharedArbitrationTest, shrinkPools) { taskContainer.testTask.expectedAbortAfterShrink); } - uint64_t totalCapacity{0}; for (const auto& taskContainer : taskContainers) { - totalCapacity += taskContainer.task->capacity(); + ASSERT_EQ( + taskContainer.task->pool()->capacity(), + taskContainer.testTask.expectedCapacityAfterShrink); + ASSERT_EQ( + taskContainer.task->pool()->usedBytes(), + taskContainer.testTask.expectedUsagedAfterShrink); } - ASSERT_EQ( - arbitrator_->stats().freeCapacityBytes, testData.expectedFreeCapacity); - ASSERT_EQ( - arbitrator_->stats().freeReservedCapacityBytes, - testData.expectedReservedFreeCapacity); - ASSERT_EQ( - totalCapacity + arbitrator_->stats().freeCapacityBytes, - arbitrator_->capacity()); } } -// This test verifies local arbitration runs from the same query has to wait for +// This test verifies arbitration operations from the same query has to wait for // serial execution mode. -DEBUG_ONLY_TEST_F( - MockSharedArbitrationTest, - localArbitrationRunsFromSameQuery) { - const int64_t memoryCapacity = 512 << 20; - const uint64_t memoryPoolInitCapacity = memoryCapacity / 4; - setupMemory(memoryCapacity, 0, memoryPoolInitCapacity, 0); +DEBUG_ONLY_TEST_F(MockSharedArbitrationTest, localArbitrationsFromSameQuery) { + const int64_t memoryCapacity = 256 << 20; + setupMemory(memoryCapacity); auto runTask = addTask(memoryCapacity); auto* runPool = runTask->addMemoryOp(true); auto* waitPool = runTask->addMemoryOp(true); @@ -1121,10 +1202,12 @@ DEBUG_ONLY_TEST_F( std::atomic_bool localArbitrationWaitFlag{true}; folly::EventCount localArbitrationWait; SCOPED_TESTVALUE_SET( - "facebook::velox::memory::SharedArbitrator::runLocalArbitration", + "facebook::velox::memory::SharedArbitrator::growCapacity", std::function( ([&](const SharedArbitrator* /*unused*/) { if (!allocationWaitFlag.exchange(false)) { + // Let the first allocation go through from 'runPool'. 
+ std::this_thread::sleep_for(std::chrono::seconds(1)); // NOLINT return; } allocationWait.notifyAll(); @@ -1142,7 +1225,8 @@ DEBUG_ONLY_TEST_F( runtimeStats[SharedArbitrator::kMemoryArbitrationWallNanos].count, 1); ASSERT_GT( runtimeStats[SharedArbitrator::kMemoryArbitrationWallNanos].sum, 0); - ASSERT_EQ(runtimeStats[SharedArbitrator::kGlobalArbitrationCount].count, 0); + ASSERT_EQ( + runtimeStats[SharedArbitrator::kGlobalArbitrationWaitCount].count, 0); ASSERT_EQ(runtimeStats[SharedArbitrator::kLocalArbitrationCount].count, 0); ++allocationCount; }); @@ -1152,12 +1236,13 @@ DEBUG_ONLY_TEST_F( std::unordered_map runtimeStats; auto statsWriter = std::make_unique(runtimeStats); setThreadLocalRunTimeStatWriter(statsWriter.get()); - waitPool->allocate(memoryCapacity / 2); + waitPool->allocate(memoryCapacity / 2 + MB); ASSERT_EQ( runtimeStats[SharedArbitrator::kMemoryArbitrationWallNanos].count, 1); ASSERT_GT( runtimeStats[SharedArbitrator::kMemoryArbitrationWallNanos].sum, 0); - ASSERT_EQ(runtimeStats[SharedArbitrator::kGlobalArbitrationCount].count, 0); + ASSERT_EQ( + runtimeStats[SharedArbitrator::kGlobalArbitrationWaitCount].count, 0); ASSERT_EQ(runtimeStats[SharedArbitrator::kLocalArbitrationCount].count, 1); ASSERT_EQ(runtimeStats[SharedArbitrator::kLocalArbitrationCount].sum, 1); ++allocationCount; @@ -1166,6 +1251,11 @@ DEBUG_ONLY_TEST_F( allocationWait.await([&]() { return !allocationWaitFlag.load(); }); std::this_thread::sleep_for(std::chrono::seconds(2)); // NOLINT ASSERT_EQ(allocationCount, 0); + test::SharedArbitratorTestHelper arbitratorHelper(arbitrator_); + test::ArbitrationParticipantTestHelper participantHelper( + arbitratorHelper.getParticipant(runTask->pool()->name()).get()); + ASSERT_TRUE(participantHelper.runningOp() != nullptr); + ASSERT_EQ(participantHelper.waitingOps().size(), 1); localArbitrationWaitFlag = false; localArbitrationWait.notifyAll(); @@ -1175,241 +1265,419 @@ DEBUG_ONLY_TEST_F( ASSERT_EQ(allocationCount, 2); } -// This test verifies local arbitration runs from different queries don't have -// to block waiting each other. +// This test verifies arbitration operations from different queries can run in +// parallel.
DEBUG_ONLY_TEST_F( MockSharedArbitrationTest, - localArbitrationRunsFromDifferentQueries) { + localArbitrationsFromDifferentQueries) { const int64_t memoryCapacity = 512 << 20; - const uint64_t memoryPoolInitCapacity = memoryCapacity / 4; - setupMemory( - memoryCapacity, - 0, - memoryPoolInitCapacity, - 0, - kFastExponentialGrowthCapacityLimit, - kSlowCapacityGrowPct, - 0, - 0); - auto runTask = addTask(memoryCapacity); - auto* runPool = runTask->addMemoryOp(true); - auto waitTask = addTask(memoryCapacity); - auto* waitPool = waitTask->addMemoryOp(true); + const uint64_t memoryPoolCapacity = memoryCapacity / 2; + setupMemory(memoryCapacity); - std::atomic_bool allocationWaitFlag{true}; - folly::EventCount allocationWait; - std::atomic_bool localArbitrationWaitFlag{true}; - folly::EventCount localArbitrationWait; + auto task1 = addTask(memoryPoolCapacity); + auto* op1 = task1->addMemoryOp(true); + op1->allocate(memoryPoolCapacity); + ASSERT_EQ(task1->capacity(), memoryPoolCapacity); + + auto task2 = addTask(memoryPoolCapacity); + auto* op2 = task2->addMemoryOp(true); + op2->allocate(memoryPoolCapacity); + ASSERT_EQ(task2->capacity(), memoryPoolCapacity); + + ASSERT_EQ(arbitrator_->stats().freeCapacityBytes, 0); + + std::atomic_bool reclaimWaitFlag{true}; + folly::EventCount reclaimWait; + std::atomic_int reclaimWaitCount{0}; SCOPED_TESTVALUE_SET( - "facebook::velox::memory::SharedArbitrator::runLocalArbitration", + "facebook::velox::memory::ArbitrationParticipant::reclaim", std::function( ([&](const SharedArbitrator* /*unused*/) { - if (!allocationWaitFlag.exchange(false)) { - return; - } - allocationWait.notifyAll(); - localArbitrationWait.await( - [&]() { return !localArbitrationWaitFlag.load(); }); + ++reclaimWaitCount; + reclaimWait.await([&]() { return !reclaimWaitFlag.load(); }); }))); std::atomic_int allocationCount{0}; - auto runThread = std::thread([&]() { + auto taskThread1 = std::thread([&]() { std::unordered_map runtimeStats; auto statsWriter = std::make_unique(runtimeStats); setThreadLocalRunTimeStatWriter(statsWriter.get()); - runPool->allocate(memoryCapacity / 2); + op1->allocate(MB); + ASSERT_EQ(task1->capacity(), 8 * MB); ASSERT_EQ( runtimeStats[SharedArbitrator::kMemoryArbitrationWallNanos].count, 1); ASSERT_GT( runtimeStats[SharedArbitrator::kMemoryArbitrationWallNanos].sum, 0); - ASSERT_EQ(runtimeStats[SharedArbitrator::kGlobalArbitrationCount].count, 0); - ASSERT_EQ(runtimeStats[SharedArbitrator::kLocalArbitrationCount].count, 0); + ASSERT_EQ( + runtimeStats[SharedArbitrator::kGlobalArbitrationWaitCount].count, 0); + ASSERT_EQ(runtimeStats[SharedArbitrator::kLocalArbitrationCount].count, 1); ++allocationCount; }); - auto waitThread = std::thread([&]() { - allocationWait.await([&]() { return !allocationWaitFlag.load(); }); + auto taskThread2 = std::thread([&]() { std::unordered_map runtimeStats; auto statsWriter = std::make_unique(runtimeStats); setThreadLocalRunTimeStatWriter(statsWriter.get()); - waitPool->allocate(memoryCapacity / 2); + op2->allocate(MB); + ASSERT_EQ(task2->capacity(), 8 * MB); ASSERT_EQ( runtimeStats[SharedArbitrator::kMemoryArbitrationWallNanos].count, 1); ASSERT_GT( runtimeStats[SharedArbitrator::kMemoryArbitrationWallNanos].sum, 0); - ASSERT_EQ(runtimeStats[SharedArbitrator::kGlobalArbitrationCount].count, 0); - ASSERT_EQ(runtimeStats[SharedArbitrator::kLocalArbitrationCount].count, 0); + ASSERT_EQ( + runtimeStats[SharedArbitrator::kGlobalArbitrationWaitCount].count, 0); + ASSERT_EQ(runtimeStats[SharedArbitrator::kLocalArbitrationCount].count, 1); 
++allocationCount; }); - allocationWait.await([&]() { return !allocationWaitFlag.load(); }); - waitThread.join(); - ASSERT_EQ(allocationCount, 1); + while (reclaimWaitCount != 2) { + std::this_thread::sleep_for(std::chrono::milliseconds(200)); // NOLINT + } + ASSERT_EQ(allocationCount, 0); - localArbitrationWaitFlag = false; - localArbitrationWait.notifyAll(); + reclaimWaitFlag = false; + reclaimWait.notifyAll(); - runThread.join(); + taskThread1.join(); + taskThread2.join(); ASSERT_EQ(allocationCount, 2); } -// This test verifies local arbitration runs can run in parallel with free -// memory reclamation. -DEBUG_ONLY_TEST_F( - MockSharedArbitrationTest, - localArbitrationRunsWithFreeMemoryReclamation) { - const int64_t memoryCapacity = 512 << 20; - const uint64_t memoryPoolInitCapacity = memoryCapacity / 4; - setupMemory( - memoryCapacity, - 0, - memoryPoolInitCapacity, - 0, - kFastExponentialGrowthCapacityLimit, - kSlowCapacityGrowPct, - 0, - 0); - auto runTask = addTask(memoryCapacity); - auto* runPool = runTask->addMemoryOp(true); - auto waitTask = addTask(memoryCapacity); - auto* waitPool = waitTask->addMemoryOp(true); - auto reclaimedTask = addTask(memoryCapacity); - auto* reclaimedPool = reclaimedTask->addMemoryOp(true); - reclaimedPool->allocate(memoryCapacity / 4); - reclaimedPool->allocate(memoryCapacity / 4); - reclaimedPool->freeAll(); +// This test verifies that global arbitration can switch to reclaim the other +// query or abort when one query claims to be reclaimable but can't actually +// reclaim. +TEST_F(MockSharedArbitrationTest, badNonReclaimableQuery) { + const int64_t memoryCapacity = 256 << 20; + const ReclaimInjectionCallback badReclaimInjectCallback = + [&](MemoryPool* pool, uint64_t /*unused*/) -> bool { return false; }; - std::atomic_bool allocationWaitFlag{true}; - folly::EventCount allocationWait; - std::atomic_bool localArbitrationWaitFlag{true}; - folly::EventCount localArbitrationWait; - std::atomic_int allocationCount{0}; - SCOPED_TESTVALUE_SET( - "facebook::velox::memory::SharedArbitrator::runLocalArbitration", - std::function( - ([&](const SharedArbitrator* /*unused*/) { - if (!allocationWaitFlag.exchange(false)) { - return; - } - allocationWait.notifyAll(); - while (allocationCount != 1) { - std::this_thread::sleep_for( - std::chrono::milliseconds(200)); // NOLINT - } - }))); + struct TestTask { + bool reclaimable; + bool badQuery; + uint64_t allocateBytes{0}; - auto runThread = std::thread([&]() { - std::unordered_map runtimeStats; - auto statsWriter = std::make_unique(runtimeStats); - setThreadLocalRunTimeStatWriter(statsWriter.get()); - runPool->allocate(memoryCapacity / 2); - ASSERT_EQ( - runtimeStats[SharedArbitrator::kMemoryArbitrationWallNanos].count, 1); - ASSERT_GT( - runtimeStats[SharedArbitrator::kMemoryArbitrationWallNanos].sum, 0); - ASSERT_EQ(runtimeStats[SharedArbitrator::kGlobalArbitrationCount].count, 0); - ASSERT_EQ(runtimeStats[SharedArbitrator::kLocalArbitrationCount].count, 0); - ++allocationCount; - }); + uint64_t expectedCapacityAfterArbitration; + uint64_t expectedUsagedAfterArbitration; + bool expectedAbortAfterArbitration; - auto waitThread = std::thread([&]() { - allocationWait.await([&]() { return !allocationWaitFlag.load(); }); - std::unordered_map runtimeStats; - auto statsWriter = std::make_unique(runtimeStats); - setThreadLocalRunTimeStatWriter(statsWriter.get()); - waitPool->allocate(memoryCapacity / 2); - ASSERT_EQ( - runtimeStats[SharedArbitrator::kMemoryArbitrationWallNanos].count, 1); - ASSERT_GT( -
runtimeStats[SharedArbitrator::kMemoryArbitrationWallNanos].sum, 0); - ASSERT_EQ(runtimeStats[SharedArbitrator::kGlobalArbitrationCount].count, 0); - ASSERT_EQ(runtimeStats[SharedArbitrator::kLocalArbitrationCount].count, 0); - ++allocationCount; - }); + std::string debugString() const { + return fmt::format( + "reclaimable: {}, badQuery: {}, allocateBytes: {}, expectedCapacityAfterArbitration: {}, expectedUsagedAfterArbitration: {}, expectedAbortAfterArbitration: {}", + reclaimable, + badQuery, + succinctBytes(allocateBytes), + succinctBytes(expectedCapacityAfterArbitration), + succinctBytes(expectedUsagedAfterArbitration), + expectedAbortAfterArbitration); + } + }; - allocationWait.await([&]() { return !allocationWaitFlag.load(); }); - waitThread.join(); - ASSERT_EQ(allocationCount, 1); - runThread.join(); - ASSERT_EQ(allocationCount, 2); + struct TestTaskContainer { + std::shared_ptr task; + MockMemoryOperator* op; + TestTask testTask; + }; + + struct { + std::vector testTasks; + + std::string debugString() const { + std::stringstream tasksOss; + for (const auto& testTask : testTasks) { + tasksOss << "["; + tasksOss << testTask.debugString(); + tasksOss << "], \n"; + } + return fmt::format("testTasks: \n{}", tasksOss.str()); + } + } testSettings[] = { + {{{true, + true, + memoryCapacity / 2, + memoryCapacity / 2, + memoryCapacity / 2, + false}, + {true, + false, + memoryCapacity / 4, + memoryCapacity / 4, + memoryCapacity / 4, + false}, + {true, false, memoryCapacity / 4, 0, 0, false}}}, + {{{true, + true, + memoryCapacity / 2, + memoryCapacity / 2, + memoryCapacity / 2, + false}, + {true, + true, + memoryCapacity / 4, + memoryCapacity / 4, + memoryCapacity / 4, + false}, + {true, + true, + memoryCapacity / 4 - memoryCapacity / 8, + memoryCapacity / 4 - memoryCapacity / 8, + memoryCapacity / 4 - memoryCapacity / 8, + false}, + {true, false, memoryCapacity / 8, 0, 0, false}}}, + {{ + {true, + true, + memoryCapacity / 2, + memoryCapacity / 2, + memoryCapacity / 2, + false}, + {false, + true, + memoryCapacity / 4, + memoryCapacity / 4, + memoryCapacity / 4, + false}, + // The newest participant is chosen to abort. + {false, true, memoryCapacity / 4, 0, 0, true}, + }}, + {{ + {false, + true, + memoryCapacity / 4, + memoryCapacity / 4, + memoryCapacity / 4, + false}, + {false, + true, + memoryCapacity / 4, + memoryCapacity / 4, + memoryCapacity / 4, + false}, + // The newest participant is chosen to abort. + {true, true, memoryCapacity / 2, 0, 0, true}, + }}, + {{ + {true, + true, + memoryCapacity / 2, + memoryCapacity / 2, + memoryCapacity / 2, + false}, + {true, + true, + memoryCapacity / 4, + memoryCapacity / 4, + memoryCapacity / 4, + false}, + // The newest participant is chosen to abort. + {true, true, memoryCapacity / 4, 0, 0, true}, + }}, + }; + for (const auto& testData : testSettings) { + SCOPED_TRACE(testData.debugString()); + + // Make simple settings to focus shrink capacity logic testing. + setupMemory(memoryCapacity); + std::vector taskContainers; + for (const auto& testTask : testData.testTasks) { + auto task = addTask(memoryCapacity); + auto* op = addMemoryOp( + task, + testTask.reclaimable, + testTask.badQuery ? 
badReclaimInjectCallback : nullptr); + ASSERT_EQ(op->capacity(), 0); + if (testTask.allocateBytes != 0) { + op->allocate(testTask.allocateBytes); + } + ASSERT_EQ(task->capacity(), testTask.allocateBytes); + ASSERT_LE(task->usedBytes(), testTask.allocateBytes); + taskContainers.push_back({task, op, testTask}); + } + auto arbitrationTriggerTask = addTask(memoryCapacity); + auto* arbitrationTriggerOp = addMemoryOp(arbitrationTriggerTask, false); + ASSERT_EQ(arbitrationTriggerTask->capacity(), 0); + arbitrationTriggerOp->allocate(MB); + ASSERT_EQ(arbitrationTriggerTask->capacity(), MB); + ASSERT_EQ(arbitrationTriggerTask->usedBytes(), MB); + + for (const auto& taskContainer : taskContainers) { + ASSERT_EQ( + taskContainer.task->pool()->capacity(), + taskContainer.testTask.expectedCapacityAfterArbitration); + ASSERT_EQ( + taskContainer.task->pool()->usedBytes(), + taskContainer.testTask.expectedUsagedAfterArbitration); + ASSERT_EQ( + taskContainer.task->pool()->aborted(), + taskContainer.testTask.expectedAbortAfterArbitration); + } + } } -// This test verifies local arbitration run can't reclaim free memory from -// memory pool which is also under memory arbitration. +// This test verifies memory pool can allocate reserve memory during global +// arbitration. DEBUG_ONLY_TEST_F( MockSharedArbitrationTest, - localArbitrationRunFreeMemoryReclamationCheck) { - const int64_t memoryCapacity = 512 << 20; - const uint64_t memoryPoolInitCapacity = memoryCapacity / 4; - setupMemory(memoryCapacity, 0, memoryPoolInitCapacity, 0); - auto runTask = addTask(memoryCapacity); - auto* runPool = runTask->addMemoryOp(true); - runPool->allocate(memoryCapacity / 4); - runPool->allocate(memoryCapacity / 4); - auto waitTask = addTask(memoryCapacity); - auto* waitPool = waitTask->addMemoryOp(true); - waitPool->allocate(memoryCapacity / 4); + allocationFromFreeReservedMemoryDuringGlobalArbitration) { + const int64_t memoryCapacity = 256 << 20; + const uint64_t memoryPoolCapacity = 64 << 20; + const uint64_t memoryPoolReservedCapacity = 8 << 20; + const uint64_t reservedMemoryCapacity = 64 << 20; + setupMemory( + memoryCapacity, reservedMemoryCapacity, 0, memoryPoolReservedCapacity); - std::atomic_bool allocationWaitFlag{true}; - folly::EventCount allocationWait; - std::atomic_bool localArbitrationWaitFlag{true}; - folly::EventCount localArbitrationWait; - std::atomic_int allocationCount{0}; + auto globalArbitrationTriggerThread = std::thread([&]() { + std::unordered_map runtimeStats; + auto statsWriter = std::make_unique(runtimeStats); + setThreadLocalRunTimeStatWriter(statsWriter.get()); + + std::vector> tasks; + std::vector ops; + ops.reserve(4); + tasks.reserve(4); + for (int i = 0; i < 4; ++i) { + tasks.push_back(addTask(memoryPoolCapacity)); + ops.push_back(tasks.back()->addMemoryOp(true)); + } + for (int i = 0; i < 4; ++i) { + ops[i]->allocate(memoryPoolCapacity); + } + // We expect global arbitration has been triggered. 
+ ASSERT_GE( + runtimeStats[SharedArbitrator::kMemoryArbitrationWallNanos].count, 1); + ASSERT_GT( + runtimeStats[SharedArbitrator::kMemoryArbitrationWallNanos].sum, 0); + ASSERT_EQ( + runtimeStats[SharedArbitrator::kGlobalArbitrationWaitCount].count, 1); + ASSERT_EQ( + runtimeStats[SharedArbitrator::kGlobalArbitrationWaitCount].sum, 1); + ASSERT_EQ(runtimeStats[SharedArbitrator::kLocalArbitrationCount].count, 0); + ASSERT_EQ(runtimeStats[SharedArbitrator::kLocalArbitrationCount].sum, 0); + ASSERT_EQ( + runtimeStats[SharedArbitrator::kGlobalArbitrationWaitWallNanos].count, + 1); + ASSERT_GT( + runtimeStats[SharedArbitrator::kGlobalArbitrationWaitWallNanos].sum, + 1'000'000'000); + }); + + std::atomic_bool globalArbitrationStarted{false}; + folly::EventCount globalArbitrationStartWait; + std::atomic_bool globalArbitrationWaitFlag{true}; + folly::EventCount globalArbitrationWait; SCOPED_TESTVALUE_SET( - "facebook::velox::memory::SharedArbitrator::runLocalArbitration", + "facebook::velox::memory::SharedArbitrator::runGlobalArbitration", std::function( ([&](const SharedArbitrator* /*unused*/) { - if (!allocationWaitFlag.exchange(false)) { + if (globalArbitrationStarted.exchange(true)) { return; } - allocationWait.notifyAll(); + globalArbitrationStartWait.notifyAll(); - localArbitrationWait.await( - [&]() { return !localArbitrationWaitFlag.load(); }); + globalArbitrationWait.await( + [&]() { return !globalArbitrationWaitFlag.load(); }); }))); - auto runThread = std::thread([&]() { + globalArbitrationStartWait.await( + [&]() { return globalArbitrationStarted.load(); }); + + auto nonBlockingTask = addTask(memoryPoolCapacity); + auto* nonBlockingOp = nonBlockingTask->addMemoryOp(true); + nonBlockingOp->allocate(memoryPoolReservedCapacity); + // Inject some delay for global arbitration. + std::this_thread::sleep_for(std::chrono::seconds(1)); // NOLINT + globalArbitrationWaitFlag = false; + globalArbitrationWait.notifyAll(); + + globalArbitrationTriggerThread.join(); + ASSERT_EQ(nonBlockingTask->capacity(), memoryPoolReservedCapacity); +} + +DEBUG_ONLY_TEST_F( + MockSharedArbitrationTest, + localArbitrationRunInParallelWithGlobalArbitration) { + const int64_t memoryCapacity = 256 << 20; + const uint64_t reservedMemoryCapacity = 64 << 20; + const uint64_t memoryPoolCapacity = 64 << 20; + const uint64_t memoryPoolReservedCapacity = 8 << 20; + setupMemory( + memoryCapacity, reservedMemoryCapacity, 0, memoryPoolReservedCapacity); + + auto localArbitrationTask = addTask(memoryPoolCapacity); + auto* localArbitrationOp = localArbitrationTask->addMemoryOp(true); + localArbitrationOp->allocate(memoryPoolCapacity); + + auto globalArbitrationTriggerThread = std::thread([&]() { std::unordered_map runtimeStats; auto statsWriter = std::make_unique(runtimeStats); setThreadLocalRunTimeStatWriter(statsWriter.get()); - runPool->allocate(memoryCapacity / 4); - ASSERT_EQ( + + std::vector> tasks; + std::vector ops; + ops.reserve(3); + tasks.reserve(3); + for (int i = 0; i < 3; ++i) { + tasks.push_back(addTask(memoryPoolCapacity)); + ops.push_back(tasks.back()->addMemoryOp(true)); + } + for (int i = 0; i < 3; ++i) { + ops[i]->allocate(memoryPoolCapacity); + } + // We expect global arbitration has been triggered. 
+ ASSERT_GE( runtimeStats[SharedArbitrator::kMemoryArbitrationWallNanos].count, 1); ASSERT_GT( runtimeStats[SharedArbitrator::kMemoryArbitrationWallNanos].sum, 0); - ASSERT_EQ(runtimeStats[SharedArbitrator::kGlobalArbitrationCount].count, 0); + ASSERT_EQ( + runtimeStats[SharedArbitrator::kGlobalArbitrationWaitCount].count, 2); + ASSERT_EQ( + runtimeStats[SharedArbitrator::kGlobalArbitrationWaitCount].sum, 2); ASSERT_EQ(runtimeStats[SharedArbitrator::kLocalArbitrationCount].count, 0); - ++allocationCount; - }); - - auto waitThread = std::thread([&]() { - allocationWait.await([&]() { return !allocationWaitFlag.load(); }); - std::unordered_map runtimeStats; - auto statsWriter = std::make_unique(runtimeStats); - setThreadLocalRunTimeStatWriter(statsWriter.get()); - waitPool->allocate(memoryCapacity / 2); + ASSERT_EQ(runtimeStats[SharedArbitrator::kLocalArbitrationCount].sum, 0); ASSERT_EQ( - runtimeStats[SharedArbitrator::kMemoryArbitrationWallNanos].count, 1); + runtimeStats[SharedArbitrator::kGlobalArbitrationWaitWallNanos].count, + 2); ASSERT_GT( - runtimeStats[SharedArbitrator::kMemoryArbitrationWallNanos].sum, 0); - ASSERT_EQ(runtimeStats[SharedArbitrator::kGlobalArbitrationCount].count, 1); - ASSERT_EQ(runtimeStats[SharedArbitrator::kGlobalArbitrationCount].sum, 1); - ASSERT_EQ(runtimeStats[SharedArbitrator::kLocalArbitrationCount].count, 0); - ++allocationCount; + runtimeStats[SharedArbitrator::kGlobalArbitrationWaitWallNanos].sum, + 1'000'000'000); }); - allocationWait.await([&]() { return !allocationWaitFlag.load(); }); - std::this_thread::sleep_for(std::chrono::seconds(2)); // NOLINT - ASSERT_EQ(allocationCount, 0); + std::atomic_bool globalArbitrationStarted{false}; + folly::EventCount globalArbitrationStartWait; + std::atomic_bool globalArbitrationWaitFlag{true}; + folly::EventCount globalArbitrationWait; + SCOPED_TESTVALUE_SET( + "facebook::velox::memory::SharedArbitrator::runGlobalArbitration", + std::function( + ([&](const SharedArbitrator* /*unused*/) { + if (globalArbitrationStarted.exchange(true)) { + return; + } + globalArbitrationStartWait.notifyAll(); - localArbitrationWaitFlag = false; - localArbitrationWait.notifyAll(); + globalArbitrationWait.await( + [&]() { return !globalArbitrationWaitFlag.load(); }); + }))); - runThread.join(); - waitThread.join(); - ASSERT_EQ(allocationCount, 2); - ASSERT_EQ(runTask->capacity(), memoryCapacity / 4); - ASSERT_EQ(waitTask->capacity(), memoryCapacity / 4 + memoryCapacity / 2); + globalArbitrationStartWait.await( + [&]() { return globalArbitrationStarted.load(); }); + + std::unordered_map runtimeStats; + auto statsWriter = std::make_unique(runtimeStats); + setThreadLocalRunTimeStatWriter(statsWriter.get()); + + localArbitrationOp->allocate(memoryPoolReservedCapacity); + // Inject some delay for global arbitration. 
+ std::this_thread::sleep_for(std::chrono::seconds(1)); // NOLINT + globalArbitrationWaitFlag = false; + globalArbitrationWait.notifyAll(); + + globalArbitrationTriggerThread.join(); + ASSERT_EQ(localArbitrationOp->capacity(), memoryPoolReservedCapacity); + ASSERT_EQ( + runtimeStats[SharedArbitrator::kGlobalArbitrationWaitCount].count, 0); + ASSERT_EQ(runtimeStats[SharedArbitrator::kGlobalArbitrationWaitCount].sum, 0); + ASSERT_EQ(runtimeStats[SharedArbitrator::kLocalArbitrationCount].count, 1); + ASSERT_EQ(runtimeStats[SharedArbitrator::kLocalArbitrationCount].sum, 1); } DEBUG_ONLY_TEST_F(MockSharedArbitrationTest, multipleGlobalRuns) { @@ -1451,8 +1719,10 @@ DEBUG_ONLY_TEST_F(MockSharedArbitrationTest, multipleGlobalRuns) { runtimeStats[SharedArbitrator::kMemoryArbitrationWallNanos].count, 1); ASSERT_GT( runtimeStats[SharedArbitrator::kMemoryArbitrationWallNanos].sum, 0); - ASSERT_EQ(runtimeStats[SharedArbitrator::kGlobalArbitrationCount].count, 1); - ASSERT_EQ(runtimeStats[SharedArbitrator::kGlobalArbitrationCount].sum, 1); + ASSERT_EQ( + runtimeStats[SharedArbitrator::kGlobalArbitrationWaitCount].count, 1); + ASSERT_EQ( + runtimeStats[SharedArbitrator::kGlobalArbitrationWaitCount].sum, 1); ASSERT_EQ(runtimeStats[SharedArbitrator::kLocalArbitrationCount].count, 0); ++allocations; }); @@ -1466,8 +1736,10 @@ DEBUG_ONLY_TEST_F(MockSharedArbitrationTest, multipleGlobalRuns) { runtimeStats[SharedArbitrator::kMemoryArbitrationWallNanos].count, 1); ASSERT_GT( runtimeStats[SharedArbitrator::kMemoryArbitrationWallNanos].sum, 0); - ASSERT_EQ(runtimeStats[SharedArbitrator::kGlobalArbitrationCount].count, 1); - ASSERT_EQ(runtimeStats[SharedArbitrator::kGlobalArbitrationCount].sum, 1); + ASSERT_EQ( + runtimeStats[SharedArbitrator::kGlobalArbitrationWaitCount].count, 1); + ASSERT_EQ( + runtimeStats[SharedArbitrator::kGlobalArbitrationWaitCount].sum, 1); ASSERT_EQ(runtimeStats[SharedArbitrator::kLocalArbitrationCount].count, 0); ++allocations; }); @@ -1475,6 +1747,9 @@ DEBUG_ONLY_TEST_F(MockSharedArbitrationTest, multipleGlobalRuns) { allocationWait.await([&]() { return !allocationWaitFlag.load(); }); std::this_thread::sleep_for(std::chrono::seconds(2)); // NOLINT ASSERT_EQ(allocations, 0); + test::SharedArbitratorTestHelper arbitratorHelper(arbitrator_); + ASSERT_EQ(arbitratorHelper.numGlobalArbitrationWaiters(), 2); + ASSERT_EQ(arbitrator_->stats().numRunning, 2); globalArbitrationWaitFlag = false; globalArbitrationWait.notifyAll(); @@ -1486,7 +1761,7 @@ DEBUG_ONLY_TEST_F(MockSharedArbitrationTest, multipleGlobalRuns) { ASSERT_EQ(waitTask->capacity(), memoryCapacity / 2); } -DEBUG_ONLY_TEST_F(MockSharedArbitrationTest, globalArbitrationEnableCheck) { +TEST_F(MockSharedArbitrationTest, globalArbitrationEnableCheck) { for (bool globalArbitrationEnabled : {false, true}) { SCOPED_TRACE( fmt::format("globalArbitrationEnabled: {}", globalArbitrationEnabled)); @@ -1501,9 +1776,21 @@ DEBUG_ONLY_TEST_F(MockSharedArbitrationTest, globalArbitrationEnableCheck) { kSlowCapacityGrowPct, kMemoryPoolMinFreeCapacity, kMemoryPoolMinFreeCapacityPct, + 0, + 0, + kGlobalArbitrationReclaimPct, + kGlobalArbitrationReclaimThreadsHwMultiplier, nullptr, globalArbitrationEnabled); + test::SharedArbitratorTestHelper arbitratorHelper(arbitrator_); + ASSERT_EQ( + arbitratorHelper.globalArbitrationController() != nullptr, + globalArbitrationEnabled); + ASSERT_EQ( + arbitratorHelper.globalArbitrationExecutor() != nullptr, + globalArbitrationEnabled); + auto reclaimedTask = addTask(memoryCapacity); auto* reclaimedPool = 
reclaimedTask->addMemoryOp(true); reclaimedPool->allocate(memoryCapacity / 2); @@ -1520,237 +1807,75 @@ DEBUG_ONLY_TEST_F(MockSharedArbitrationTest, globalArbitrationEnableCheck) { } } -// This test verifies when a global arbitration is running, the local -// arbitration run has to wait for the current running global arbitration run -// to complete. -DEBUG_ONLY_TEST_F( - MockSharedArbitrationTest, - localArbitrationWaitForGlobalArbitration) { - const int64_t memoryCapacity = 512 << 20; - const uint64_t memoryPoolInitCapacity = memoryCapacity / 2; - setupMemory( - memoryCapacity, - 0, - memoryPoolInitCapacity, - 0, - kFastExponentialGrowthCapacityLimit, - kSlowCapacityGrowPct, - 0, - 0); - auto runTask = addTask(memoryCapacity); - auto* runPool = runTask->addMemoryOp(true); - runPool->allocate(memoryCapacity / 2); - auto waitTask = addTask(memoryCapacity); - auto* waitPool = waitTask->addMemoryOp(true); - waitPool->allocate(memoryCapacity / 4); +TEST_F(MockSharedArbitrationTest, singlePoolShrinkWithoutArbitration) { + const int64_t memoryCapacity = 512 * MB; + struct TestParam { + uint64_t memoryPoolReservedBytes; + uint64_t memoryPoolMinFreeCapacity; + double memoryPoolMinFreeCapacityPct; + uint64_t requestBytes; + bool expectThrow; + uint64_t expectedCapacity; + std::string debugString() const { + return fmt::format( + "memoryPoolReservedBytes {}, " + "memoryPoolMinFreeCapacity {}, " + "memoryPoolMinFreeCapacityPct {}, " + "requestBytes {}, expectThrow {}, expectedCapacity, {}", + succinctBytes(memoryPoolReservedBytes), + succinctBytes(memoryPoolMinFreeCapacity), + memoryPoolMinFreeCapacityPct, + succinctBytes(requestBytes), + expectThrow, + succinctBytes(expectedCapacity)); + } + } testParams[] = { + {0, 128 * MB, 0, 256 * MB, true, 0}, + {0, 0, 0.1, 256 * MB, true, 0}, + {256 * MB, 128 * MB, 0.5, 256 * MB, false, 256 * MB}, + {256 * MB, 128 * MB, 0.125, 256 * MB, false, 256 * MB}, + {0, 128 * MB, 0.25, 0 * MB, false, 0}, + {256 * MB, 128 * MB, 0.125, 0 * MB, false, 256 * MB}, + {256 * MB, 128 * MB, 0.125, 512 * MB, false, 256 * MB}}; - std::atomic_bool allocationWaitFlag{true}; - folly::EventCount allocationWait; + for (const auto& testParam : testParams) { + SCOPED_TRACE(testParam.debugString()); + if (testParam.expectThrow) { + VELOX_ASSERT_THROW( + setupMemory( + memoryCapacity, + 0, + memoryCapacity, + 0, + 0, + 0, + testParam.memoryPoolMinFreeCapacity, + testParam.memoryPoolMinFreeCapacityPct), + "both need to be set (non-zero) at the same time to enable shrink " + "capacity adjustment."); + continue; + } else { + setupMemory( + memoryCapacity, + 0, + memoryCapacity, + 0, + 0, + 0, + testParam.memoryPoolMinFreeCapacity, + testParam.memoryPoolMinFreeCapacityPct); + } - std::atomic_bool globalArbitrationWaitFlag{true}; - folly::EventCount globalArbitrationWait; - SCOPED_TESTVALUE_SET( - "facebook::velox::memory::SharedArbitrator::runGlobalArbitration", - std::function( - ([&](const SharedArbitrator* /*unused*/) { - if (!allocationWaitFlag.exchange(false)) { - return; - } - allocationWait.notifyAll(); - globalArbitrationWait.await( - [&]() { return !globalArbitrationWaitFlag.load(); }); - }))); + auto task = addTask(); + auto* memOp = task->addMemoryOp(); + memOp->allocate(testParam.memoryPoolReservedBytes); - std::atomic_int allocations{0}; - auto waitThread = std::thread([&]() { - allocationWait.await([&]() { return !allocationWaitFlag.load(); }); - std::unordered_map runtimeStats; - auto statsWriter = std::make_unique(runtimeStats); - 
setThreadLocalRunTimeStatWriter(statsWriter.get()); - waitPool->allocate(memoryCapacity / 4); - ASSERT_EQ( - runtimeStats[SharedArbitrator::kMemoryArbitrationWallNanos].count, 1); - ASSERT_GT( - runtimeStats[SharedArbitrator::kMemoryArbitrationWallNanos].sum, 0); - ASSERT_EQ(runtimeStats[SharedArbitrator::kGlobalArbitrationCount].count, 0); - ASSERT_EQ(runtimeStats[SharedArbitrator::kLocalArbitrationCount].count, 0); - ++allocations; - }); - - auto runThread = std::thread([&]() { - std::unordered_map runtimeStats; - auto statsWriter = std::make_unique(runtimeStats); - setThreadLocalRunTimeStatWriter(statsWriter.get()); - runPool->allocate(memoryCapacity / 2); - ASSERT_EQ( - runtimeStats[SharedArbitrator::kMemoryArbitrationWallNanos].count, 1); - ASSERT_GT( - runtimeStats[SharedArbitrator::kMemoryArbitrationWallNanos].sum, 0); - ASSERT_EQ(runtimeStats[SharedArbitrator::kGlobalArbitrationCount].count, 1); - ASSERT_EQ(runtimeStats[SharedArbitrator::kGlobalArbitrationCount].sum, 1); - ASSERT_EQ(runtimeStats[SharedArbitrator::kLocalArbitrationCount].count, 0); - ++allocations; - }); - - allocationWait.await([&]() { return !allocationWaitFlag.load(); }); - std::this_thread::sleep_for(std::chrono::seconds(2)); // NOLINT - ASSERT_EQ(allocations, 0); - - globalArbitrationWaitFlag = false; - globalArbitrationWait.notifyAll(); - - runThread.join(); - waitThread.join(); - ASSERT_EQ(allocations, 2); - ASSERT_EQ(runTask->capacity(), memoryCapacity / 2); - ASSERT_EQ(waitTask->capacity(), memoryCapacity / 2); -} - -// This test verifies when a local arbitration is running, the global -// arbitration run have to wait for the current running global arbitration run -// to complete. -DEBUG_ONLY_TEST_F( - MockSharedArbitrationTest, - globalArbitrationWaitForLocalArbitration) { - const int64_t memoryCapacity = 512 << 20; - const uint64_t memoryPoolInitCapacity = memoryCapacity / 4; - setupMemory(memoryCapacity, 0, memoryPoolInitCapacity, 0); - auto runTask = addTask(memoryCapacity / 2); - auto* runPool = runTask->addMemoryOp(true); - runPool->allocate(memoryCapacity / 4); - auto waitTask = addTask(memoryCapacity); - auto* waitPool = waitTask->addMemoryOp(true); - waitPool->allocate(memoryCapacity / 4); - waitPool->allocate(memoryCapacity / 4); - - std::atomic_bool allocationWaitFlag{true}; - folly::EventCount allocationWait; - std::atomic_bool localArbitrationWaitFlag{true}; - folly::EventCount localArbitrationWait; - std::atomic_int allocationCount{0}; - SCOPED_TESTVALUE_SET( - "facebook::velox::memory::SharedArbitrator::runLocalArbitration", - std::function( - ([&](const SharedArbitrator* /*unused*/) { - if (!allocationWaitFlag.exchange(false)) { - return; - } - allocationWait.notifyAll(); - - localArbitrationWait.await( - [&]() { return !localArbitrationWaitFlag.load(); }); - }))); - - auto runThread = std::thread([&]() { - std::unordered_map runtimeStats; - auto statsWriter = std::make_unique(runtimeStats); - setThreadLocalRunTimeStatWriter(statsWriter.get()); - runPool->allocate(memoryCapacity / 4); - ASSERT_EQ( - runtimeStats[SharedArbitrator::kMemoryArbitrationWallNanos].count, 1); - ASSERT_GT( - runtimeStats[SharedArbitrator::kMemoryArbitrationWallNanos].sum, 0); - ASSERT_EQ(runtimeStats[SharedArbitrator::kGlobalArbitrationCount].count, 0); - ASSERT_EQ(runtimeStats[SharedArbitrator::kLocalArbitrationCount].count, 0); - ++allocationCount; - }); - - auto waitThread = std::thread([&]() { - allocationWait.await([&]() { return !allocationWaitFlag.load(); }); - std::unordered_map runtimeStats; - auto 
statsWriter = std::make_unique(runtimeStats); - setThreadLocalRunTimeStatWriter(statsWriter.get()); - waitPool->allocate(memoryCapacity / 2); - ASSERT_EQ( - runtimeStats[SharedArbitrator::kMemoryArbitrationWallNanos].count, 1); - ASSERT_GT( - runtimeStats[SharedArbitrator::kMemoryArbitrationWallNanos].sum, 0); - ASSERT_EQ(runtimeStats[SharedArbitrator::kGlobalArbitrationCount].count, 1); - ASSERT_EQ(runtimeStats[SharedArbitrator::kLocalArbitrationCount].count, 0); - ++allocationCount; - }); - - allocationWait.await([&]() { return !allocationWaitFlag.load(); }); - std::this_thread::sleep_for(std::chrono::seconds(2)); // NOLINT - ASSERT_EQ(allocationCount, 0); - - localArbitrationWaitFlag = false; - localArbitrationWait.notifyAll(); - - runThread.join(); - waitThread.join(); - ASSERT_EQ(allocationCount, 2); - ASSERT_EQ(runTask->capacity(), memoryCapacity / 2); - ASSERT_EQ(waitTask->capacity(), memoryCapacity / 2); -} - -TEST_F(MockSharedArbitrationTest, singlePoolShrinkWithoutArbitration) { - const int64_t memoryCapacity = 512 * MB; - struct TestParam { - uint64_t memoryPoolReservedBytes; - uint64_t memoryPoolMinFreeCapacity; - double memoryPoolMinFreeCapacityPct; - uint64_t requestBytes; - bool expectThrow; - uint64_t expectedCapacity; - std::string debugString() const { - return fmt::format( - "memoryPoolReservedBytes {}, " - "memoryPoolMinFreeCapacity {}, " - "memoryPoolMinFreeCapacityPct {}, " - "requestBytes {}, ", - succinctBytes(memoryPoolReservedBytes), - succinctBytes(memoryPoolMinFreeCapacity), - memoryPoolMinFreeCapacityPct, - succinctBytes(requestBytes)); - } - } testParams[] = { - {0, 128 * MB, 0, 256 * MB, true, 0}, - {0, 0, 0.1, 256 * MB, true, 0}, - {256 * MB, 128 * MB, 0.5, 256 * MB, false, 384 * MB}, - {256 * MB, 128 * MB, 0.125, 256 * MB, false, 320 * MB}, - {0, 128 * MB, 0.25, 0 * MB, false, 0}, - {256 * MB, 128 * MB, 0.125, 0 * MB, false, 256 * MB}, - {256 * MB, 128 * MB, 0.125, 512 * MB, false, 320 * MB}}; - - for (const auto& testParam : testParams) { - SCOPED_TRACE(testParam.debugString()); - if (testParam.expectThrow) { - VELOX_ASSERT_THROW( - setupMemory( - memoryCapacity, - 0, - memoryCapacity, - 0, - 0, - 0, - testParam.memoryPoolMinFreeCapacity, - testParam.memoryPoolMinFreeCapacityPct), - "both need to be set (non-zero) at the same time to enable shrink " - "capacity adjustment."); - continue; - } else { - setupMemory( - memoryCapacity, - 0, - memoryCapacity, - 0, - 0, - 0, - testParam.memoryPoolMinFreeCapacity, - testParam.memoryPoolMinFreeCapacityPct); - } - - auto* memOp = addMemoryOp(); - memOp->allocate(testParam.memoryPoolReservedBytes); - - ASSERT_EQ( - memOp->pool()->reservedBytes(), testParam.memoryPoolReservedBytes); - arbitrator_->shrinkCapacity(memOp->pool(), testParam.requestBytes); - ASSERT_EQ(memOp->pool()->capacity(), testParam.expectedCapacity); - clearTasks(); - } -} + ASSERT_EQ(task->pool()->reservedBytes(), testParam.memoryPoolReservedBytes); + arbitrator_->shrinkCapacity(task->pool(), testParam.requestBytes); + ASSERT_EQ(task->capacity(), testParam.expectedCapacity); + clearTasks(); + } +} TEST_F(MockSharedArbitrationTest, singlePoolGrowWithoutArbitration) { const int64_t memoryCapacity = 512 << 20; @@ -1825,7 +1950,6 @@ TEST_F(MockSharedArbitrationTest, singlePoolGrowWithoutArbitration) { TEST_F(MockSharedArbitrationTest, maxCapacityReserve) { const int memCapacity = 256 * MB; - const int minPoolCapacity = 32 * MB; struct { uint64_t memCapacity; uint64_t reservedCapacity; @@ -1833,26 +1957,28 @@ TEST_F(MockSharedArbitrationTest, 
maxCapacityReserve) { uint64_t poolReservedCapacity; uint64_t poolMaxCapacity; uint64_t expectedPoolInitCapacity; + bool expectedError; std::string debugString() const { return fmt::format( - "memCapacity {}, reservedCapacity {}, poolInitCapacity {}, poolReservedCapacity {}, poolMaxCapacity {}, expectedPoolInitCapacity {}", + "memCapacity {}, reservedCapacity {}, poolInitCapacity {}, poolReservedCapacity {}, poolMaxCapacity {}, expectedPoolInitCapacity {}, expectedError {}", succinctBytes(memCapacity), succinctBytes(reservedCapacity), succinctBytes(poolInitCapacity), succinctBytes(poolReservedCapacity), succinctBytes(poolMaxCapacity), - succinctBytes(expectedPoolInitCapacity)); + succinctBytes(expectedPoolInitCapacity), + expectedError); } } testSettings[] = { - {256 << 20, 256 << 20, 128 << 20, 64 << 20, 256 << 20, 64 << 20}, - {256 << 20, 0, 128 << 20, 64 << 20, 256 << 20, 128 << 20}, - {256 << 20, 0, 512 << 20, 64 << 20, 256 << 20, 256 << 20}, - {256 << 20, 0, 128 << 20, 64 << 20, 256 << 20, 128 << 20}, - {256 << 20, 128 << 20, 128 << 20, 64 << 20, 256 << 20, 128 << 20}, - {256 << 20, 128 << 20, 256 << 20, 64 << 20, 256 << 20, 128 << 20}, - {256 << 20, 128 << 20, 256 << 20, 256 << 20, 256 << 20, 256 << 20}, - {256 << 20, 128 << 20, 256 << 20, 256 << 20, 128 << 20, 128 << 20}}; + {256 << 20, 256 << 20, 128 << 20, 64 << 20, 256 << 20, 64 << 20, false}, + {256 << 20, 0, 128 << 20, 64 << 20, 256 << 20, 128 << 20, false}, + {256 << 20, 0, 512 << 20, 64 << 20, 256 << 20, 256 << 20, false}, + {256 << 20, 0, 128 << 20, 64 << 20, 256 << 20, 128 << 20, false}, + {256 << 20, 128 << 20, 128 << 20, 64 << 20, 256 << 20, 128 << 20, false}, + {256 << 20, 128 << 20, 256 << 20, 64 << 20, 256 << 20, 128 << 20, false}, + {256 << 20, 128 << 20, 256 << 20, 256 << 20, 256 << 20, 256 << 20, false}, + {256 << 20, 128 << 20, 256 << 20, 256 << 20, 128 << 20, 128 << 20, true}}; for (const auto& testData : testSettings) { SCOPED_TRACE(testData.debugString()); @@ -1861,6 +1987,11 @@ TEST_F(MockSharedArbitrationTest, maxCapacityReserve) { testData.reservedCapacity, testData.poolInitCapacity, testData.poolReservedCapacity); + if (testData.expectedError) { + VELOX_ASSERT_THROW(addTask(testData.poolMaxCapacity), ""); + continue; + } + auto task = addTask(testData.poolMaxCapacity); ASSERT_EQ(task->pool()->maxCapacity(), testData.poolMaxCapacity); ASSERT_EQ(task->pool()->capacity(), testData.expectedPoolInitCapacity); @@ -2024,80 +2155,746 @@ TEST_F(MockSharedArbitrationTest, ensureMemoryPoolMaxCapacity) { } else if (testData.hasOtherTask) { ASSERT_EQ(otherOp->reclaimer()->stats().numReclaims, 0); } - if (testData.expectedSuccess && - (((testData.allocatedBytes + testData.requestBytes) > - testData.poolMaxCapacity) || - testData.hasOtherTask)) { - ASSERT_GT(arbitrator_->stats().reclaimedUsedBytes, 0); - } else { - ASSERT_EQ(arbitrator_->stats().reclaimedUsedBytes, 0); + if (testData.expectedSuccess && + (((testData.allocatedBytes + testData.requestBytes) > + testData.poolMaxCapacity) || + testData.hasOtherTask)) { + ASSERT_GT(arbitrator_->stats().reclaimedUsedBytes, 0); + } else { + ASSERT_EQ(arbitrator_->stats().reclaimedUsedBytes, 0); + } + ASSERT_EQ(arbitrator_->stats().numRequests, numRequests + 1); + } +} + +TEST_F(MockSharedArbitrationTest, ensureNodeMaxCapacity) { + struct { + uint64_t nodeCapacity; + uint64_t poolMaxCapacity; + bool isReclaimable; + uint64_t allocatedBytes; + uint64_t requestBytes; + bool expectedSuccess; + bool expectedReclaimedBytes; + + std::string debugString() const { + return fmt::format( + 
"nodeCapacity {} poolMaxCapacity {} isReclaimable {} " + "allocatedBytes {} requestBytes {} expectedSuccess {} " + "expectedReclaimedBytes {}", + succinctBytes(nodeCapacity), + succinctBytes(poolMaxCapacity), + isReclaimable, + succinctBytes(allocatedBytes), + succinctBytes(requestBytes), + expectedSuccess, + expectedReclaimedBytes); + } + } testSettings[] = { + {256 * MB, 256 * MB, true, 128 * MB, 256 * MB, true, true}, + {256 * MB, 256 * MB, false, 128 * MB, 256 * MB, false, false}, + {256 * MB, 512 * MB, true, 128 * MB, 256 * MB, true, true}, + {256 * MB, 512 * MB, false, 128 * MB, 256 * MB, false, false}, + {256 * MB, 128 * MB, false, 128 * MB, 256 * MB, false, false}, + {256 * MB, 128 * MB, true, 128 * MB, 256 * MB, false, false}, + {256 * MB, 128 * MB, true, 128 * MB, 512 * MB, false, false}, + {256 * MB, 128 * MB, false, 128 * MB, 512 * MB, false, false}}; + + for (const auto& testData : testSettings) { + SCOPED_TRACE(testData.debugString()); + setupMemory(testData.nodeCapacity, 0, 0, 0); + + auto requestor = addTask(testData.poolMaxCapacity); + auto* requestorOp = addMemoryOp(requestor, testData.isReclaimable); + requestorOp->allocate(testData.allocatedBytes); + const auto numRequests = arbitrator_->stats().numRequests; + if (testData.expectedSuccess) { + requestorOp->allocate(testData.requestBytes); + } else { + VELOX_ASSERT_THROW( + requestorOp->allocate(testData.requestBytes), + "Exceeded memory pool cap"); + } + if (testData.expectedSuccess) { + ASSERT_GT(arbitrator_->stats().reclaimedUsedBytes, 0); + } else { + ASSERT_EQ(arbitrator_->stats().reclaimedUsedBytes, 0); + } + ASSERT_EQ(arbitrator_->stats().numRequests, numRequests + 1); + } +} + +TEST_F(MockSharedArbitrationTest, arbitrationAbort) { + uint64_t memoryCapacity = 256 * MB; + setupMemory( + memoryCapacity, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1.0, nullptr, true, 1'000); + std::shared_ptr task1 = addTask(memoryCapacity); + auto* op1 = + task1->addMemoryOp(true, [&](MemoryPool* /*unsed*/, uint64_t /*unsed*/) { + VELOX_FAIL("throw reclaim exception"); + return false; + }); + op1->allocate(memoryCapacity / 2); + ASSERT_EQ(task1->capacity(), memoryCapacity / 2); + + std::shared_ptr task2 = addTask(memoryCapacity); + auto* op2 = task2->addMemoryOp(true); + op2->allocate(memoryCapacity / 4); + ASSERT_EQ(task2->capacity(), memoryCapacity / 4); + + std::shared_ptr task3 = addTask(memoryCapacity); + auto* op3 = task3->addMemoryOp(true); + op3->allocate(memoryCapacity / 4); + ASSERT_EQ(task3->capacity(), memoryCapacity / 4); + + folly::EventCount globalArbitrationWait; + std::atomic_bool globalArbitrationWaitFlag{true}; + SCOPED_TESTVALUE_SET( + "facebook::velox::memory::SharedArbitrator::runGlobalArbitration", + std::function( + ([&](const SharedArbitrator* arbitrator) { + test::SharedArbitratorTestHelper arbitratorHelper( + const_cast(arbitrator)); + ASSERT_EQ(arbitratorHelper.numGlobalArbitrationWaiters(), 1); + }))); + try { + op1->allocate(memoryCapacity / 4); + } catch (const VeloxException& ex) { + ASSERT_EQ(ex.errorCode(), error_code::kMemAborted); + ASSERT_THAT(ex.what(), testing::HasSubstr("aborted")); + } + + // Task1 has been aborted, + ASSERT_EQ(task1->capacity(), 0); + ASSERT_TRUE(task1->pool()->aborted()); + auto arbitratorHelper = test::SharedArbitratorTestHelper(arbitrator_); + ASSERT_TRUE( + arbitratorHelper.getParticipant(task1->pool()->name())->aborted()); + ASSERT_EQ(task2->capacity(), memoryCapacity / 4); + ASSERT_EQ(task3->capacity(), memoryCapacity / 4); +} + +TEST_F(MockSharedArbitrationTest, 
memoryPoolAbortCapacityLimit) { + const int64_t memoryCapacity = 256 << 20; + + struct TestTask { + uint64_t capacity; + bool expectedAbort{false}; + + std::string debugString() const { + return fmt::format( + "capacity: {}, expectedAbort: {}", + succinctBytes(capacity), + expectedAbort); + } + }; + + struct { + std::vector testTasks; + uint64_t memoryPoolAbortCapacityLimit; + uint64_t targetBytes; + uint64_t expectedReclaimedUsedBytes; + + std::string debugString() const { + std::stringstream tasksOss; + for (const auto& testTask : testTasks) { + tasksOss << "["; + tasksOss << testTask.debugString(); + tasksOss << "], \n"; + } + return fmt::format( + "testTasks: \n[{}]\nmemoryPoolAbortCapacityLimit: {}, targetBytes: {}, expectedReclaimedUsedBytes: {}", + tasksOss.str(), + succinctBytes(memoryPoolAbortCapacityLimit), + succinctBytes(targetBytes), + succinctBytes(expectedReclaimedUsedBytes)); + } + } testSettings[] = { + {{{64 << 20, false}, + {128 << 20, false}, + // Young participant is chosen to abort first with the same bucket. + {64 << 20, true}}, + 64 << 20, + 32 << 20, + 64 << 20}, + {{{64 << 20, false}, {128 << 20, true}, {32 << 20, false}}, + 64 << 20, + 32 << 20, + 128 << 20}, + {{{128 << 20, false}, {64 << 20, true}, {32 << 20, false}}, + 64 << 20, + 32 << 20, + 64 << 20}, + {{{128 << 20, true}, {64 << 20, true}, {32 << 20, false}}, + 64 << 20, + 128 << 20, + 192 << 20}, + {{{32 << 20, true}, {0, false}}, 64 << 20, 128 << 20, 32 << 20}, + {{{0, false}, {0, false}}, 64 << 20, 128 << 20, 0}, + {{{128 << 20, false}, {64 << 20, false}, {32 << 20, true}}, + 32 << 20, + 16 << 20, + 32 << 20}, + {{{64 << 20, true}, + {16 << 20, false}, + {32 << 20, true}, + {32 << 20, true}}, + 64 << 20, + 128 << 20, + 128 << 20}, + {{{8 << 20, true}, + {16 << 20, true}, + {7 << 20, true}, + {32 << 20, true}, + {128 << 20, true}}, + 64 << 20, + 0, + 191 << 20}}; + + struct TestTaskContainer { + std::shared_ptr task; + MockMemoryOperator* op; + TestTask testTask; + }; + + std::function checkTaskException = + [](MockTask* task, bool expectedAbort) { + if (!expectedAbort) { + ASSERT_EQ(task->error(), nullptr); + return; + } + ASSERT_NE(task->error(), nullptr); + VELOX_ASSERT_THROW( + std::rethrow_exception(task->error()), + "Memory pool aborted to reclaim used memory, current capacity"); + }; + + for (const auto& testData : testSettings) { + SCOPED_TRACE(testData.debugString()); + setupMemory( + memoryCapacity, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + testData.memoryPoolAbortCapacityLimit); + + std::vector taskContainers; + for (const auto& testTask : testData.testTasks) { + auto task = addTask(); + auto* op = addMemoryOp(task, true); + ASSERT_EQ(op->capacity(), 0); + if (testTask.capacity != 0) { + op->allocate(testTask.capacity); + } + ASSERT_EQ(task->capacity(), testTask.capacity); + ASSERT_LE(task->usedBytes(), testTask.capacity); + taskContainers.push_back({task, op, testTask}); + } + + ASSERT_EQ( + manager_->shrinkPools(testData.targetBytes, false, true), + testData.expectedReclaimedUsedBytes); + ASSERT_EQ( + arbitrator_->stats().reclaimedUsedBytes, + testData.expectedReclaimedUsedBytes); + + for (const auto& taskContainer : taskContainers) { + checkTaskException( + taskContainer.task.get(), taskContainer.testTask.expectedAbort); + } + } +} + +TEST_F( + MockSharedArbitrationTest, + globalArbitrationWaitReturnEarlyWithFreeCapacity) { + uint64_t memoryCapacity = 256 * MB; + setupMemory(memoryCapacity); + std::shared_ptr task1 = addTask(memoryCapacity); + auto* op1 = task1->addMemoryOp(true); + 
op1->allocate(memoryCapacity / 2); + ASSERT_EQ(task1->capacity(), memoryCapacity / 2); + + std::shared_ptr task2 = addTask(memoryCapacity); + auto* op2 = task2->addMemoryOp(true); + op2->allocate(memoryCapacity / 2); + ASSERT_EQ(task2->capacity(), memoryCapacity / 2); + + folly::EventCount globalArbitrationStarted; + std::atomic_bool globalArbitrationStartedFlag{false}; + folly::EventCount globalArbitrationWait; + std::atomic_bool globalArbitrationWaitFlag{true}; + SCOPED_TESTVALUE_SET( + "facebook::velox::memory::SharedArbitrator::runGlobalArbitration", + std::function( + ([&](const SharedArbitrator* arbitrator) { + test::SharedArbitratorTestHelper arbitratorHelper( + const_cast(arbitrator)); + ASSERT_EQ(arbitratorHelper.numGlobalArbitrationWaiters(), 1); + globalArbitrationStartedFlag = true; + globalArbitrationStarted.notifyAll(); + globalArbitrationWait.await( + [&]() { return !globalArbitrationWaitFlag.load(); }); + }))); + std::thread allocationThread([&]() { op1->allocate(memoryCapacity / 4); }); + globalArbitrationStarted.await( + [&]() { return globalArbitrationStartedFlag.load(); }); + + op2->freeAll(); + task2.reset(); + allocationThread.join(); + + ASSERT_EQ(task1->capacity(), memoryCapacity / 2 + memoryCapacity / 4); + test::SharedArbitratorTestHelper arbitratorHelper( + const_cast(arbitrator_)); + ASSERT_TRUE(arbitratorHelper.globalArbitrationRunning()); + + globalArbitrationWaitFlag = false; + globalArbitrationWait.notifyAll(); + + ASSERT_EQ( + arbitratorHelper.getParticipant(task1->pool()->name()) + ->stats() + .numReclaims, + 0); + arbitratorHelper.waitForGlobalArbitrationToFinish(); +} + +DEBUG_ONLY_TEST_F(MockSharedArbitrationTest, globalArbitrationTimeout) { + uint64_t memoryCapacity = 256 * MB; + setupMemory( + memoryCapacity, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1.0, nullptr, true, 1'000); + std::shared_ptr task1 = addTask(memoryCapacity); + auto* op1 = task1->addMemoryOp(true); + op1->allocate(memoryCapacity / 2); + ASSERT_EQ(task1->capacity(), memoryCapacity / 2); + + std::shared_ptr task2 = addTask(memoryCapacity); + auto* op2 = task2->addMemoryOp(true); + ASSERT_EQ(task2->capacity(), 0); + + folly::EventCount globalArbitrationWait; + std::atomic_bool globalArbitrationWaitFlag{true}; + std::atomic_bool globalArbitrationExecuted{false}; + SCOPED_TESTVALUE_SET( + "facebook::velox::memory::SharedArbitrator::runGlobalArbitration", + std::function( + ([&](const SharedArbitrator* /*unused*/) { + globalArbitrationWait.await( + [&]() { return !globalArbitrationWaitFlag.load(); }); + globalArbitrationExecuted = true; + }))); + try { + op2->allocate(memoryCapacity / 2 + memoryCapacity / 4); + } catch (const VeloxException& ex) { + ASSERT_EQ(ex.errorCode(), error_code::kMemArbitrationTimeout); + ASSERT_THAT( + ex.what(), + testing::HasSubstr("Memory arbitration timed out on memory pool")); + } + globalArbitrationWaitFlag = false; + globalArbitrationWait.notifyAll(); + + // Nothing needs to reclaim as the arbitration has timed out. 
+ ASSERT_EQ(task1->capacity(), memoryCapacity / 2); + ASSERT_EQ(task2->capacity(), 0); + test::SharedArbitratorTestHelper arbitratorHelper(arbitrator_); + arbitratorHelper.waitForGlobalArbitrationToFinish(); + ASSERT_TRUE(globalArbitrationExecuted); +} + +DEBUG_ONLY_TEST_F(MockSharedArbitrationTest, localArbitrationTimeout) { + uint64_t memoryCapacity = 256 * MB; + setupMemory( + memoryCapacity, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1.0, nullptr, true, 1'000); + std::shared_ptr task = addTask(memoryCapacity); + ASSERT_EQ(task->capacity(), 0); + auto* op = task->addMemoryOp(true); + op->allocate(memoryCapacity / 2); + + SCOPED_TESTVALUE_SET( + "facebook::velox::memory::ArbitrationParticipant::reclaim", + std::function( + ([&](const ArbitrationParticipant* /*unused*/) { + std::this_thread::sleep_for(std::chrono::seconds(2)); // NOLINT + }))); + try { + op->allocate(memoryCapacity); + } catch (const VeloxException& ex) { + ASSERT_EQ(ex.errorCode(), error_code::kMemArbitrationTimeout); + ASSERT_THAT( + ex.what(), + testing::HasSubstr("Memory arbitration timed out on memory pool")); + } + + // Reclaim happened before timeout check. + ASSERT_EQ(task->capacity(), 0); +} + +DEBUG_ONLY_TEST_F(MockSharedArbitrationTest, localArbitrationQueueTimeout) { + uint64_t memoryCapacity = 256 * MB; + setupMemory( + memoryCapacity, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1.0, nullptr, true, 1'000); + std::shared_ptr task = addTask(memoryCapacity); + ASSERT_EQ(task->capacity(), 0); + auto* op = task->addMemoryOp(true); + + SCOPED_TESTVALUE_SET( + "facebook::velox::memory::SharedArbitrator::growCapacity", + std::function( + ([&](const SharedArbitrator* arbitrator) { + test::SharedArbitratorTestHelper arbitratorHelper( + const_cast(arbitrator)); + ASSERT_EQ(arbitratorHelper.maxArbitrationTimeMs(), 1'000); + std::this_thread::sleep_for(std::chrono::seconds(2)); // NOLINT + }))); + try { + op->allocate(memoryCapacity); + } catch (const VeloxException& ex) { + ASSERT_EQ(ex.errorCode(), error_code::kMemArbitrationTimeout); + ASSERT_THAT( + ex.what(), + testing::HasSubstr("Memory arbitration timed out on memory pool")); + } + + // Nothing needs to reclaim as the arbitration has timed out. 
+ ASSERT_EQ(task->capacity(), 0); +} + +TEST_F(MockSharedArbitrationTest, minReclaimBytes) { + const int64_t memoryCapacity = 256 << 20; + + struct TestTask { + uint64_t capacity{0}; + bool reclaimable{false}; + + uint64_t expectedCapacityAfterReclaim; + uint64_t expectedUsagedAfterReclaim; + bool expectedAbortAfterReclaim{false}; + + std::string debugString() const { + return fmt::format( + "capacity: {}, expectedCapacityAfterReclaim: {}, expectedUsagedAfterReclaim: {}, expectedAbortAfterReclaim: {}", + succinctBytes(capacity), + succinctBytes(expectedCapacityAfterReclaim), + succinctBytes(expectedUsagedAfterReclaim), + expectedAbortAfterReclaim); + } + }; + + struct { + std::vector testTasks; + uint64_t minReclaimBytes; + uint64_t targetBytes; + + std::string debugString() const { + std::stringstream tasksOss; + for (const auto& testTask : testTasks) { + tasksOss << "["; + tasksOss << testTask.debugString(); + tasksOss << "], \n"; + } + return fmt::format( + "testTasks: \n[{}]\ntargetBytes: {}", + tasksOss.str(), + succinctBytes(minReclaimBytes), + succinctBytes(targetBytes)); + } + } testSettings[] = { + {{{memoryCapacity / 4, + true, + memoryCapacity / 4, + memoryCapacity / 4, + false}, + {memoryCapacity / 2, true, 0, 0, false}, + {memoryCapacity / 4, + true, + memoryCapacity / 4, + memoryCapacity / 4, + false}}, + memoryCapacity / 4, + MB}, + + {{{memoryCapacity / 4, + true, + memoryCapacity / 4, + memoryCapacity / 4, + false}, + {memoryCapacity / 2, true, 0, 0, false}, + {memoryCapacity / 4, + true, + memoryCapacity / 4, + memoryCapacity / 4, + false}}, + memoryCapacity / 2, + MB}, + + {{{memoryCapacity / 4, + true, + memoryCapacity / 4, + memoryCapacity / 4, + false}, + {memoryCapacity / 4, + true, + memoryCapacity / 4, + memoryCapacity / 4, + false}, + {memoryCapacity / 4, + true, + memoryCapacity / 4, + memoryCapacity / 4, + false}, + {memoryCapacity / 4, true, 0, 0, true}}, + memoryCapacity / 2, + MB}, + + {{{memoryCapacity / 4, + true, + memoryCapacity / 4, + memoryCapacity / 4, + false}, + {memoryCapacity / 4, + true, + memoryCapacity / 4, + memoryCapacity / 4, + false}, + {memoryCapacity / 4, true, 0, 0, true}, + {memoryCapacity / 4, true, 0, 0, true}}, + memoryCapacity / 2, + memoryCapacity / 2}, + + {{{memoryCapacity / 4, + false, + memoryCapacity / 4, + memoryCapacity / 4, + false}, + {memoryCapacity / 4, + false, + memoryCapacity / 4, + memoryCapacity / 4, + false}, + {memoryCapacity / 4, false, 0, 0, true}, + {memoryCapacity / 4, false, 0, 0, true}}, + memoryCapacity / 8, + memoryCapacity / 2}, + + {{{memoryCapacity / 4, + false, + memoryCapacity / 4, + memoryCapacity / 4, + false}, + {memoryCapacity / 4, + false, + memoryCapacity / 4, + memoryCapacity / 4, + false}, + {memoryCapacity / 4, + false, + memoryCapacity / 4, + memoryCapacity / 4, + false}, + {memoryCapacity / 4, false, 0, 0, true}}, + memoryCapacity / 8, + MB}}; + + struct TestTaskContainer { + std::shared_ptr task; + MockMemoryOperator* op; + TestTask testTask; + }; + + std::function checkTaskException = + [](MockTask* task, bool expectedAbort) { + if (!expectedAbort) { + ASSERT_EQ(task->error(), nullptr); + return; + } + ASSERT_NE(task->error(), nullptr); + VELOX_ASSERT_THROW( + std::rethrow_exception(task->error()), + "Memory pool aborted to reclaim used memory, current capacity"); + }; + + for (const auto& testData : testSettings) { + SCOPED_TRACE(testData.debugString()); + + // Make simple settings to focus shrink capacity logic testing. 
+ setupMemory(memoryCapacity, 0, 0, 0, 0, 0, 0, 0, testData.minReclaimBytes); + std::vector taskContainers; + for (const auto& testTask : testData.testTasks) { + auto task = addTask(); + auto* op = addMemoryOp(task, testTask.reclaimable); + ASSERT_EQ(op->capacity(), 0); + if (testTask.capacity != 0) { + op->allocate(testTask.capacity); + } + ASSERT_EQ(task->capacity(), testTask.capacity); + ASSERT_LE(task->usedBytes(), testTask.capacity); + taskContainers.push_back({task, op, testTask}); + } + + auto arbitrationTask = addTask(); + auto* arbitrationOp = arbitrationTask->addMemoryOp(true); + arbitrationOp->allocate(testData.targetBytes); + test::SharedArbitratorTestHelper arbitratorHelper(arbitrator_); + arbitratorHelper.waitForGlobalArbitrationToFinish(); + + for (const auto& taskContainer : taskContainers) { + checkTaskException( + taskContainer.task.get(), + taskContainer.testTask.expectedAbortAfterReclaim); + } + + for (const auto& taskContainer : taskContainers) { + ASSERT_EQ( + taskContainer.task->pool()->capacity(), + taskContainer.testTask.expectedCapacityAfterReclaim); + ASSERT_EQ( + taskContainer.task->pool()->usedBytes(), + taskContainer.testTask.expectedCapacityAfterReclaim); } - ASSERT_EQ(arbitrator_->stats().numRequests, numRequests + 1); } } -TEST_F(MockSharedArbitrationTest, ensureNodeMaxCapacity) { +TEST_F(MockSharedArbitrationTest, globalArbitrationReclaimPct) { + const int64_t memoryCapacity = 256 << 20; + const int64_t memoryPoolCapacity = 64 << 20; + + struct TestTask { + uint64_t capacity{0}; + + uint64_t expectedCapacityAfterReclaim; + uint64_t expectedUsagedAfterReclaim; + + std::string debugString() const { + return fmt::format( + "capacity: {}, expectedCapacityAfterReclaim: {}, expectedUsagedAfterReclaim: {}", + succinctBytes(capacity), + succinctBytes(expectedCapacityAfterReclaim), + succinctBytes(expectedUsagedAfterReclaim)); + } + }; + struct { - uint64_t nodeCapacity; - uint64_t poolMaxCapacity; - bool isReclaimable; - uint64_t allocatedBytes; - uint64_t requestBytes; - bool expectedSuccess; - bool expectedReclaimedBytes; + std::vector testTasks; + double reclaimPct; + uint64_t targetBytes; std::string debugString() const { + std::stringstream tasksOss; + for (const auto& testTask : testTasks) { + tasksOss << "["; + tasksOss << testTask.debugString(); + tasksOss << "], \n"; + } return fmt::format( - "nodeCapacity {} poolMaxCapacity {} isReclaimable {} " - "allocatedBytes {} requestBytes {} expectedSuccess {} " - "expectedReclaimedBytes {}", - succinctBytes(nodeCapacity), - succinctBytes(poolMaxCapacity), - isReclaimable, - succinctBytes(allocatedBytes), - succinctBytes(requestBytes), - expectedSuccess, - expectedReclaimedBytes); + "testTasks: \n[{}], reclaimPct: {}, targetBytes: {}", + tasksOss.str(), + reclaimPct, + succinctBytes(targetBytes)); } } testSettings[] = { - {256 * MB, 256 * MB, true, 128 * MB, 256 * MB, true, true}, - {256 * MB, 256 * MB, false, 128 * MB, 256 * MB, false, false}, - {256 * MB, 512 * MB, true, 128 * MB, 256 * MB, true, true}, - {256 * MB, 512 * MB, false, 128 * MB, 256 * MB, false, false}, - {256 * MB, 128 * MB, false, 128 * MB, 256 * MB, false, false}, - {256 * MB, 128 * MB, true, 128 * MB, 256 * MB, false, false}, - {256 * MB, 128 * MB, true, 128 * MB, 512 * MB, false, false}, - {256 * MB, 128 * MB, false, 128 * MB, 512 * MB, false, false}}; + {{{memoryCapacity / 2, 0, 0}, + {memoryCapacity / 4, memoryCapacity / 4, memoryCapacity / 4}, + {memoryCapacity / 4, memoryCapacity / 4, memoryCapacity / 4}}, + 1, + MB}, + {{{memoryCapacity
/ 4, 0, 0}, + {memoryCapacity / 8, memoryCapacity / 8, memoryCapacity / 8}, + {memoryCapacity / 8, memoryCapacity / 8, memoryCapacity / 8}, + {memoryCapacity / 8, memoryCapacity / 8, memoryCapacity / 8}, + {memoryCapacity / 8, memoryCapacity / 8, memoryCapacity / 8}, + {memoryCapacity / 8, memoryCapacity / 8, memoryCapacity / 8}, + {memoryCapacity / 8, memoryCapacity / 8, memoryCapacity / 8}}, + 1, + MB}, + {{{memoryCapacity / 2, 0, 0}, + {memoryCapacity / 4, memoryCapacity / 4, memoryCapacity / 4}, + {memoryCapacity / 4, memoryCapacity / 4, memoryCapacity / 4}}, + 0, + MB}, + {{{memoryCapacity / 4, 0, 0}, + {memoryCapacity / 8, memoryCapacity / 8, memoryCapacity / 8}, + {memoryCapacity / 8, memoryCapacity / 8, memoryCapacity / 8}, + {memoryCapacity / 8, memoryCapacity / 8, memoryCapacity / 8}, + {memoryCapacity / 8, memoryCapacity / 8, memoryCapacity / 8}, + {memoryCapacity / 8, memoryCapacity / 8, memoryCapacity / 8}, + {memoryCapacity / 8, memoryCapacity / 8, memoryCapacity / 8}}, + 0, + MB}, + {{{memoryCapacity / 2, 0, 0}, + {memoryCapacity / 4, 0, 0}, + {memoryCapacity / 4, 0, 0}}, + 100, + MB}, + {{{memoryCapacity / 2, 0, 0}, {memoryCapacity / 2, 0, 0}}, 60, MB}, + {{{memoryCapacity / 2, 0, 0}, + {memoryCapacity / 4, memoryCapacity / 4, memoryCapacity / 4}, + {memoryCapacity / 4, memoryCapacity / 4, memoryCapacity / 4}}, + 50, + MB}, + }; + + struct TestTaskContainer { + std::shared_ptr task; + MockMemoryOperator* op; + TestTask testTask; + }; for (const auto& testData : testSettings) { SCOPED_TRACE(testData.debugString()); - setupMemory(testData.nodeCapacity, 0, 0, 0); - auto requestor = addTask(testData.poolMaxCapacity); - auto* requestorOp = addMemoryOp(requestor, testData.isReclaimable); - requestorOp->allocate(testData.allocatedBytes); - const auto numRequests = arbitrator_->stats().numRequests; - if (testData.expectedSuccess) { - requestorOp->allocate(testData.requestBytes); - } else { - VELOX_ASSERT_THROW( - requestorOp->allocate(testData.requestBytes), - "Exceeded memory pool cap"); + setupMemory(memoryCapacity, 0, 0, 0, 0, 0, 0, 0, 0, 0, testData.reclaimPct); + std::vector taskContainers; + for (const auto& testTask : testData.testTasks) { + auto task = addTask(); + auto* op = addMemoryOp(task, true); + ASSERT_EQ(op->capacity(), 0); + if (testTask.capacity != 0) { + op->allocate(testTask.capacity); + } + ASSERT_EQ(task->capacity(), testTask.capacity); + ASSERT_LE(task->usedBytes(), testTask.capacity); + taskContainers.push_back({task, op, testTask}); } - if (testData.expectedSuccess) { - ASSERT_GT(arbitrator_->stats().reclaimedUsedBytes, 0); - } else { - ASSERT_EQ(arbitrator_->stats().reclaimedUsedBytes, 0); + + auto arbitrationTask = addTask(); + auto* arbitrationOp = arbitrationTask->addMemoryOp(true); + arbitrationOp->allocate(testData.targetBytes); + test::SharedArbitratorTestHelper arbitratorHelper(arbitrator_); + arbitratorHelper.waitForGlobalArbitrationToFinish(); + + for (const auto& taskContainer : taskContainers) { + ASSERT_EQ( + taskContainer.task->pool()->capacity(), + taskContainer.testTask.expectedCapacityAfterReclaim); + ASSERT_EQ( + taskContainer.task->pool()->usedBytes(), + taskContainer.testTask.expectedCapacityAfterReclaim); } - ASSERT_EQ(arbitrator_->stats().numRequests, numRequests + 1); } } -TEST_F(MockSharedArbitrationTest, failedArbitration) { +TEST_F(MockSharedArbitrationTest, noEligibleAbortCandidate) { + uint64_t memoryCapacity = 256 * MB; + setupMemory(memoryCapacity, memoryCapacity / 2, 0, memoryCapacity / 4); + std::shared_ptr task = 
addTask(memoryCapacity); + ASSERT_EQ(task->capacity(), memoryCapacity / 4); + auto* op = task->addMemoryOp(true); + VELOX_ASSERT_THROW(op->allocate(memoryCapacity), "aborted"); + ASSERT_TRUE(task->pool()->aborted()); +} + +TEST_F(MockSharedArbitrationTest, growWithArbitrationAbort) { const int memCapacity = 256 * MB; const int minPoolCapacity = 8 * MB; setupMemory(memCapacity, 0, minPoolCapacity, 0); - auto* reclaimableOp = addMemoryOp(); + auto* reclaimableOp = addMemoryOp(nullptr, true); ASSERT_EQ(reclaimableOp->capacity(), minPoolCapacity); auto* nonReclaimableOp = addMemoryOp(nullptr, false); ASSERT_EQ(nonReclaimableOp->capacity(), minPoolCapacity); @@ -2108,73 +2905,96 @@ TEST_F(MockSharedArbitrationTest, failedArbitration) { ASSERT_EQ(reclaimableOp->capacity(), minPoolCapacity); nonReclaimableOp->allocate(minPoolCapacity); ASSERT_EQ(nonReclaimableOp->capacity(), minPoolCapacity); - VELOX_ASSERT_THROW( - arbitrateOp->allocate(memCapacity), "Exceeded memory pool cap"); + arbitrateOp->allocate(memCapacity); + ASSERT_TRUE(nonReclaimableOp->pool()->aborted()); verifyReclaimerStats(nonReclaimableOp->reclaimer()->stats()); verifyReclaimerStats(reclaimableOp->reclaimer()->stats(), 1); verifyReclaimerStats(arbitrateOp->reclaimer()->stats(), 0, 1); verifyArbitratorStats( - arbitrator_->stats(), memCapacity, 260046848, 0, 1, 1, 8388608, 8388608); + arbitrator_->stats(), + memCapacity, + 0, + 0, + 1, + 0, + minPoolCapacity * 2, + 8388608); } TEST_F(MockSharedArbitrationTest, singlePoolGrowCapacityWithArbitration) { - const std::vector isLeafReclaimables = {true, false}; + const std::vector isLeafReclaimables = {false, true}; + const uint64_t memoryCapacity = 128 * MB; for (const auto isLeafReclaimable : isLeafReclaimables) { SCOPED_TRACE(fmt::format("isLeafReclaimable {}", isLeafReclaimable)); - setupMemory(); - auto op = addMemoryOp(nullptr, isLeafReclaimable); - const int allocateSize = MB; - while (op->pool()->usedBytes() < - kMemoryCapacity - kReservedMemoryCapacity) { - op->allocate(allocateSize); - } - verifyArbitratorStats( - arbitrator_->stats(), - kMemoryCapacity, - kReservedMemoryCapacity, - kReservedMemoryCapacity, - 13); - verifyReclaimerStats(op->reclaimer()->stats(), 0, 13); + setupMemory(memoryCapacity); + auto* op = addMemoryOp(nullptr, isLeafReclaimable); + op->allocate(memoryCapacity); + verifyArbitratorStats(arbitrator_->stats(), memoryCapacity, 0, 0, 1); + verifyReclaimerStats(op->reclaimer()->stats(), 0, 1); if (!isLeafReclaimable) { VELOX_ASSERT_THROW( - op->allocate(allocateSize), "Exceeded memory pool cap"); - verifyArbitratorStats( - arbitrator_->stats(), - kMemoryCapacity, - kReservedMemoryCapacity, - kReservedMemoryCapacity, - 14, - 1); - verifyReclaimerStats(op->reclaimer()->stats(), 0, 14); + op->allocate(memoryCapacity), "Exceeded memory pool cap"); + verifyArbitratorStats(arbitrator_->stats(), memoryCapacity, 0, 0, 2, 1); + verifyReclaimerStats(op->reclaimer()->stats(), 0, 2); + clearTasks(); continue; } // Do more allocations to trigger arbitration. 
- op->allocate( - op->pool()->capacity() - op->pool()->root()->reservedBytes() + MB); + op->allocate(memoryCapacity); verifyArbitratorStats( - arbitrator_->stats(), - kMemoryCapacity, - kReservedMemoryCapacity, - kReservedMemoryCapacity, - 14, - 0, - 8388608); - verifyReclaimerStats(op->reclaimer()->stats(), 1, 14); + arbitrator_->stats(), memoryCapacity, 0, 0, 2, 0, memoryCapacity); + verifyReclaimerStats(op->reclaimer()->stats(), 1, 2); clearTasks(); verifyArbitratorStats( arbitrator_->stats(), - kMemoryCapacity, - kMemoryCapacity, - kReservedMemoryCapacity, - 14, + memoryCapacity, + memoryCapacity, + 0, + 2, 0, - 8388608); + memoryCapacity); } } +// This test verifies if a single memory pool fails to grow capacity because of +// reserved capacity. +// TODO: add reserved capacity check in ensure capacity. +TEST_F(MockSharedArbitrationTest, singlePoolGrowCapacityFailedWithAbort) { + const uint64_t memoryCapacity = 128 * MB; + const uint64_t reservedMemoryCapacity = 64 * MB; + const uint64_t memoryPoolReservedCapacity = 64 * MB; + setupMemory( + memoryCapacity, reservedMemoryCapacity, 0, memoryPoolReservedCapacity); + auto* op = addMemoryOp(nullptr, true); + op->allocate(memoryCapacity - reservedMemoryCapacity); + verifyArbitratorStats( + arbitrator_->stats(), + memoryCapacity, + reservedMemoryCapacity, + reservedMemoryCapacity, + 0); + verifyReclaimerStats(op->reclaimer()->stats(), 0, 0); + + // Do more allocations to trigger arbitration. + try { + op->allocate(memoryCapacity); + } catch (const VeloxRuntimeError& ex) { + ASSERT_EQ(ex.errorCode(), error_code::kMemAborted.c_str()); + } + verifyArbitratorStats( + arbitrator_->stats(), + memoryCapacity, + memoryCapacity, + reservedMemoryCapacity, + 1, + 1, + 64 * MB); + verifyReclaimerStats(op->reclaimer()->stats(), 1, 1); +} + TEST_F(MockSharedArbitrationTest, arbitrateWithCapacityShrink) { const std::vector isLeafReclaimables = {true, false}; for (const auto isLeafReclaimable : isLeafReclaimables) { @@ -2201,8 +3021,8 @@ TEST_F(MockSharedArbitrationTest, arbitrateWithCapacityShrink) { ASSERT_GT(arbitratorStats.reclaimedFreeBytes, 0); ASSERT_EQ(arbitratorStats.reclaimedUsedBytes, 0); - verifyReclaimerStats(reclaimedOp->reclaimer()->stats(), 0, 8); - verifyReclaimerStats(arbitrateOp->reclaimer()->stats(), 0, 1); + verifyReclaimerStats(reclaimedOp->reclaimer()->stats(), 0, 11); + verifyReclaimerStats(arbitrateOp->reclaimer()->stats(), 0, 6); clearTasks(); } @@ -2211,22 +3031,26 @@ TEST_F(MockSharedArbitrationTest, arbitrateWithCapacityShrink) { TEST_F(MockSharedArbitrationTest, arbitrateWithMemoryReclaim) { const uint64_t memoryCapacity = 256 * MB; const uint64_t reservedMemoryCapacity = 128 * MB; - const uint64_t initPoolCapacity = 8 * MB; const uint64_t reservedPoolCapacity = 8 * MB; + const uint64_t memoryPoolAbortCapacityLimit = 32 * MB; const std::vector isLeafReclaimables = {true, false}; for (const auto isLeafReclaimable : isLeafReclaimables) { SCOPED_TRACE(fmt::format("isLeafReclaimable {}", isLeafReclaimable)); setupMemory( memoryCapacity, reservedMemoryCapacity, - initPoolCapacity, - reservedPoolCapacity); + 0, + reservedPoolCapacity, + 0, + 0, + 0, + 0, + 0, + memoryPoolAbortCapacityLimit); auto* reclaimedOp = addMemoryOp(nullptr, isLeafReclaimable); - const int allocateSize = 8 * MB; - while (reclaimedOp->pool()->usedBytes() < - memoryCapacity - reservedMemoryCapacity) { - reclaimedOp->allocate(allocateSize); - } + reclaimedOp->allocate( + memoryCapacity - reservedMemoryCapacity - reservedPoolCapacity); + auto* arbitrateOp = 
addMemoryOp(); if (!isLeafReclaimable) { auto leafTask = tasks().front(); @@ -2234,23 +3058,12 @@ TEST_F(MockSharedArbitrationTest, arbitrateWithMemoryReclaim) { ASSERT_NE(leafTask->error(), nullptr); ASSERT_EQ(arbitrator_->stats().numFailures, 0); + clearTasks(); continue; } - arbitrateOp->allocate(reservedMemoryCapacity / 2); - - verifyArbitratorStats( - arbitrator_->stats(), - memoryCapacity, - kReservedMemoryCapacity - reservedPoolCapacity, - kReservedMemoryCapacity - reservedPoolCapacity, - 10, - 0, - 58720256, - 10559488); - + arbitrateOp->allocate(reservedMemoryCapacity - reservedPoolCapacity); verifyReclaimerStats(arbitrateOp->reclaimer()->stats(), 0, 1, 0); - - verifyReclaimerStats(reclaimedOp->reclaimer()->stats(), 1, 9, 0); + verifyReclaimerStats(reclaimedOp->reclaimer()->stats(), 1, 1, 0); clearTasks(); } } @@ -2270,8 +3083,7 @@ TEST_F(MockSharedArbitrationTest, arbitrateBySelfMemoryReclaim) { while (memOp->pool()->usedBytes() < memCapacity / 2) { memOp->allocate(allocateSize); } - // Extra free bytes due to fast/slow grow strategy - ASSERT_EQ(memOp->pool()->freeBytes(), 14811136); + ASSERT_EQ(memOp->pool()->freeBytes(), 0); const int oldNumRequests = arbitrator_->stats().numRequests; // Allocate a large chunk of memory to trigger arbitration. if (!isLeafReclaimable) { @@ -2335,21 +3147,22 @@ TEST_F(MockSharedArbitrationTest, noAbortOnRequestWhenArbitrationFails) { DEBUG_ONLY_TEST_F(MockSharedArbitrationTest, orderedArbitration) { SCOPED_TESTVALUE_SET( "facebook::velox::memory::SharedArbitrator::sortCandidatesByReclaimableFreeCapacity", - std::function*)>( - ([&](const std::vector* candidates) { + std::function*)>( + ([&](const std::vector* candidates) { for (int i = 1; i < candidates->size(); ++i) { ASSERT_LE( - (*candidates)[i].freeBytes, (*candidates)[i - 1].freeBytes); + (*candidates)[i].reclaimableFreeCapacity, + (*candidates)[i - 1].reclaimableFreeCapacity); } }))); SCOPED_TESTVALUE_SET( "facebook::velox::memory::SharedArbitrator::sortCandidatesByReclaimableUsedCapacity", - std::function*)>( - ([&](const std::vector* candidates) { + std::function*)>( + ([&](const std::vector* candidates) { for (int i = 1; i < candidates->size(); ++i) { ASSERT_LE( - (*candidates)[i].reclaimableBytes, - (*candidates)[i - 1].reclaimableBytes); + (*candidates)[i].reclaimableUsedCapacity, + (*candidates)[i - 1].reclaimableUsedCapacity); } }))); @@ -2432,17 +3245,20 @@ TEST_F(MockSharedArbitrationTest, enterArbitrationException) { ASSERT_EQ(failedArbitrateOp->capacity(), 0); auto* arbitrateOp = addMemoryOp(); arbitrateOp->allocate(allocationSize); + + test::SharedArbitratorTestHelper arbitratorHelper(arbitrator_); + arbitratorHelper.waitForGlobalArbitrationToFinish(); ASSERT_EQ(arbitrateOp->capacity(), allocationSize); verifyReclaimerStats(arbitrateOp->reclaimer()->stats(), 0, 1); verifyReclaimerStats(reclaimedOp->reclaimer()->stats(), 1); - ASSERT_EQ(arbitrator_->stats().reclaimedUsedBytes, allocationSize); + ASSERT_EQ(arbitrator_->stats().reclaimedUsedBytes, memCapacity); ASSERT_EQ(arbitrator_->stats().numRequests, 1); ASSERT_EQ(arbitrator_->stats().numFailures, 0); } TEST_F(MockSharedArbitrationTest, noArbitratiognFromAbortedPool) { auto* reclaimedOp = addMemoryOp(); - ASSERT_EQ(reclaimedOp->capacity(), kMemoryPoolInitCapacity); + ASSERT_EQ(reclaimedOp->capacity(), 0); reclaimedOp->allocate(128); try { VELOX_MEM_POOL_ABORTED("Manual abort pool"); @@ -2453,408 +3269,25 @@ TEST_F(MockSharedArbitrationTest, noArbitratiognFromAbortedPool) { ASSERT_TRUE(reclaimedOp->pool()->aborted()); const int 
largeAllocationSize = 2 * kMemoryPoolInitCapacity; VELOX_ASSERT_THROW(reclaimedOp->allocate(largeAllocationSize), ""); - ASSERT_EQ(arbitrator_->stats().numRequests, 0); + ASSERT_EQ(arbitrator_->stats().numRequests, 1); ASSERT_EQ(arbitrator_->stats().numAborted, 0); ASSERT_EQ(arbitrator_->stats().numFailures, 0); // Check we don't allow memory reservation increase or trigger memory // arbitration at root memory pool. - ASSERT_EQ(reclaimedOp->pool()->capacity(), kMemoryPoolInitCapacity); + ASSERT_EQ(reclaimedOp->pool()->capacity(), MB); ASSERT_EQ(reclaimedOp->pool()->usedBytes(), 0); VELOX_ASSERT_THROW(reclaimedOp->allocate(128), ""); ASSERT_EQ(reclaimedOp->pool()->usedBytes(), 0); - ASSERT_EQ(reclaimedOp->pool()->capacity(), kMemoryPoolInitCapacity); - VELOX_ASSERT_THROW(reclaimedOp->allocate(kMemoryPoolInitCapacity * 2), ""); - ASSERT_EQ(reclaimedOp->pool()->capacity(), kMemoryPoolInitCapacity); + ASSERT_EQ(reclaimedOp->pool()->capacity(), MB); + VELOX_ASSERT_THROW(reclaimedOp->allocate(MB), "Manual abort pool"); + ASSERT_EQ(reclaimedOp->pool()->capacity(), MB); ASSERT_EQ(reclaimedOp->pool()->usedBytes(), 0); - ASSERT_EQ(arbitrator_->stats().numRequests, 0); + ASSERT_EQ(arbitrator_->stats().numRequests, 1); ASSERT_EQ(arbitrator_->stats().numAborted, 0); ASSERT_EQ(arbitrator_->stats().numFailures, 0); } -DEBUG_ONLY_TEST_F(MockSharedArbitrationTest, failedToReclaimFromRequestor) { - const int numOtherTasks = 4; - const int otherTaskMemoryCapacity = kMemoryCapacity / 8; - const int failedTaskMemoryCapacity = kMemoryCapacity / 2; - struct { - bool hasAllocationFromFailedTaskAfterAbort; - bool hasAllocationFromOtherTaskAfterAbort; - int64_t expectedFailedTaskMemoryCapacity; - int64_t expectedFailedTaskMemoryUsage; - int64_t expectedOtherTaskMemoryCapacity; - int64_t expectedOtherTaskMemoryUsage; - int64_t expectedFreeCapacity; - - std::string debugString() const { - return fmt::format( - "hasAllocationFromFailedTaskAfterAbort {}, hasAllocationFromOtherTaskAfterAbort {} expectedFailedTaskMemoryCapacity {} expectedFailedTaskMemoryUsage {} expectedOtherTaskMemoryCapacity {} expectedOtherTaskMemoryUsage {} expectedFreeCapacity{}", - hasAllocationFromFailedTaskAfterAbort, - hasAllocationFromOtherTaskAfterAbort, - expectedFailedTaskMemoryCapacity, - expectedFailedTaskMemoryUsage, - expectedOtherTaskMemoryCapacity, - expectedOtherTaskMemoryUsage, - expectedFreeCapacity); - } - } testSettings[] = { - {false, - false, - 0, - 0, - otherTaskMemoryCapacity, - otherTaskMemoryCapacity, - failedTaskMemoryCapacity}, - {true, - false, - 0, - 0, - otherTaskMemoryCapacity, - otherTaskMemoryCapacity, - failedTaskMemoryCapacity}, - {true, - true, - 0, - 0, - otherTaskMemoryCapacity * 2, - otherTaskMemoryCapacity * 2, - 0}, - {false, - true, - 0, - 0, - otherTaskMemoryCapacity * 2, - otherTaskMemoryCapacity * 2, - 0}}; - for (const auto& testData : testSettings) { - SCOPED_TRACE(testData.debugString()); - setupMemory(kMemoryCapacity, 0, kMemoryPoolInitCapacity, 0, 0, 0, 0, 0); - - std::vector> otherTasks; - std::vector otherTaskOps; - for (int i = 0; i < numOtherTasks; ++i) { - otherTasks.push_back(addTask()); - otherTaskOps.push_back(addMemoryOp(otherTasks.back(), false)); - otherTaskOps.back()->allocate(otherTaskMemoryCapacity); - ASSERT_EQ( - otherTasks.back()->pool()->usedBytes(), otherTaskMemoryCapacity); - } - std::shared_ptr failedTask = addTask(); - MockMemoryOperator* failedTaskOp = addMemoryOp( - failedTask, true, [&](MemoryPool* /*unsed*/, uint64_t /*unsed*/) { - VELOX_FAIL("throw reclaim exception"); - 
}); - failedTaskOp->allocate(failedTaskMemoryCapacity); - for (int i = 0; i < numOtherTasks; ++i) { - ASSERT_EQ(otherTaskOps[0]->pool()->capacity(), otherTaskMemoryCapacity); - } - ASSERT_EQ(failedTaskOp->capacity(), failedTaskMemoryCapacity); - - const auto oldStats = arbitrator_->stats(); - ASSERT_EQ(oldStats.numFailures, 0); - ASSERT_EQ(oldStats.numAborted, 0); - - const int numFailedTaskAllocationsAfterAbort = - testData.hasAllocationFromFailedTaskAfterAbort ? 3 : 0; - // If 'hasAllocationFromOtherTaskAfterAbort' is true, then one allocation - // from each of the other tasks. - const int numOtherAllocationsAfterAbort = - testData.hasAllocationFromOtherTaskAfterAbort ? numOtherTasks : 0; - - // One barrier count is for the initial allocation from the failed task to - // trigger memory arbitration. - folly::futures::Barrier arbitrationStartBarrier( - numFailedTaskAllocationsAfterAbort + numOtherAllocationsAfterAbort + 1); - folly::futures::Barrier arbitrationBarrier( - numFailedTaskAllocationsAfterAbort + numOtherAllocationsAfterAbort + 1); - std::atomic_int testInjectionCount{0}; - std::atomic_bool arbitrationStarted{false}; - SCOPED_TESTVALUE_SET( - "facebook::velox::memory::SharedArbitrator::startArbitration", - std::function( - ([&](const SharedArbitrator* /*unused*/) { - if (!arbitrationStarted) { - return; - } - if (++testInjectionCount <= numFailedTaskAllocationsAfterAbort + - numOtherAllocationsAfterAbort + 1) { - arbitrationBarrier.wait().wait(); - } - }))); - - SCOPED_TESTVALUE_SET( - "facebook::velox::memory::SharedArbitrator::sortCandidatesByReclaimableFreeCapacity", - std::function*)>( - ([&](const std::vector* /*unused*/) { - if (!arbitrationStarted.exchange(true)) { - arbitrationStartBarrier.wait().wait(); - } - if (++testInjectionCount <= numFailedTaskAllocationsAfterAbort + - numOtherAllocationsAfterAbort + 1) { - arbitrationBarrier.wait().wait(); - } - }))); - - std::vector allocationThreadsAfterAbort; - for (int i = 0; - i < numFailedTaskAllocationsAfterAbort + numOtherAllocationsAfterAbort; - ++i) { - allocationThreadsAfterAbort.emplace_back([&, i]() { - arbitrationStartBarrier.wait().wait(); - if (i < numFailedTaskAllocationsAfterAbort) { - VELOX_ASSERT_THROW( - failedTaskOp->allocate(failedTaskMemoryCapacity), - "The requestor pool has been aborted"); - } else { - otherTaskOps[i - numFailedTaskAllocationsAfterAbort]->allocate( - otherTaskMemoryCapacity); - } - }); - } - - // Trigger memory arbitration to reclaim from itself which throws. - VELOX_ASSERT_THROW( - failedTaskOp->allocate(failedTaskMemoryCapacity), - "The requestor pool has been aborted"); - // Wait for all the allocation threads to complete. 
- for (auto& allocationThread : allocationThreadsAfterAbort) { - allocationThread.join(); - } - ASSERT_TRUE(failedTaskOp->pool()->aborted()); - ASSERT_EQ( - failedTaskOp->pool()->usedBytes(), - testData.expectedFailedTaskMemoryCapacity); - ASSERT_EQ( - failedTaskOp->pool()->capacity(), - testData.expectedFailedTaskMemoryUsage); - ASSERT_EQ(failedTaskOp->reclaimer()->stats().numAborts, 1); - ASSERT_EQ(failedTaskOp->reclaimer()->stats().numReclaims, 1); - - const auto newStats = arbitrator_->stats(); - ASSERT_EQ( - newStats.numRequests, - oldStats.numRequests + 1 + numFailedTaskAllocationsAfterAbort + - numOtherAllocationsAfterAbort); - ASSERT_EQ(newStats.numAborted, 1); - ASSERT_EQ(newStats.freeCapacityBytes, testData.expectedFreeCapacity); - ASSERT_EQ(newStats.numFailures, numFailedTaskAllocationsAfterAbort + 1); - ASSERT_EQ(newStats.maxCapacityBytes, kMemoryCapacity); - // Check if memory pools have been aborted or not as expected. - for (const auto* taskOp : otherTaskOps) { - ASSERT_FALSE(taskOp->pool()->aborted()); - ASSERT_EQ(taskOp->reclaimer()->stats().numAborts, 0); - ASSERT_EQ(taskOp->reclaimer()->stats().numReclaims, 0); - ASSERT_EQ( - taskOp->pool()->capacity(), testData.expectedOtherTaskMemoryCapacity); - ASSERT_EQ( - taskOp->pool()->usedBytes(), testData.expectedOtherTaskMemoryUsage); - } - - VELOX_ASSERT_THROW(failedTaskOp->allocate(failedTaskMemoryCapacity), ""); - ASSERT_EQ(arbitrator_->stats().numRequests, newStats.numRequests); - ASSERT_EQ(arbitrator_->stats().numAborted, 1); - } -} - -DEBUG_ONLY_TEST_F(MockSharedArbitrationTest, failedToReclaimFromOtherTask) { - const int numNonFailedTasks = 3; - const int nonFailTaskMemoryCapacity = kMemoryCapacity / 8; - const int failedTaskMemoryCapacity = - kMemoryCapacity / 2 + nonFailTaskMemoryCapacity; - struct { - bool hasAllocationFromFailedTaskAfterAbort; - bool hasAllocationFromNonFailedTaskAfterAbort; - int64_t expectedFailedTaskMemoryCapacity; - int64_t expectedFailedTaskMemoryUsage; - int64_t expectedNonFailedTaskMemoryCapacity; - int64_t expectedNonFailedTaskMemoryUsage; - int64_t expectedFreeCapacity; - - std::string debugString() const { - return fmt::format( - "hasAllocationFromFailedTaskAfterAbort {}, hasAllocationFromNonFailedTaskAfterAbort {} expectedFailedTaskMemoryCapacity {} expectedFailedTaskMemoryUsage {} expectedNonFailedTaskMemoryCapacity {} expectedNonFailedTaskMemoryUsage {} expectedFreeCapacity {}", - hasAllocationFromFailedTaskAfterAbort, - hasAllocationFromNonFailedTaskAfterAbort, - expectedFailedTaskMemoryCapacity, - expectedFailedTaskMemoryUsage, - expectedNonFailedTaskMemoryCapacity, - expectedNonFailedTaskMemoryUsage, - expectedFreeCapacity); - } - } testSettings[] = { - {false, - false, - 0, - 0, - nonFailTaskMemoryCapacity, - nonFailTaskMemoryCapacity, - failedTaskMemoryCapacity - nonFailTaskMemoryCapacity}, - {true, - false, - 0, - 0, - nonFailTaskMemoryCapacity, - nonFailTaskMemoryCapacity, - failedTaskMemoryCapacity - nonFailTaskMemoryCapacity}, - {true, - true, - 0, - 0, - nonFailTaskMemoryCapacity * 2, - nonFailTaskMemoryCapacity * 2, - nonFailTaskMemoryCapacity}, - {false, - true, - 0, - 0, - nonFailTaskMemoryCapacity * 2, - nonFailTaskMemoryCapacity * 2, - nonFailTaskMemoryCapacity}}; - for (const auto& testData : testSettings) { - SCOPED_TRACE(testData.debugString()); - setupMemory(kMemoryCapacity, 0, kMemoryPoolInitCapacity, 0, 0, 0, 0, 0); - - std::vector> nonFailedTasks; - std::vector nonFailedTaskOps; - for (int i = 0; i < numNonFailedTasks; ++i) { - nonFailedTasks.push_back(addTask()); - 
nonFailedTaskOps.push_back(addMemoryOp(nonFailedTasks.back(), false)); - nonFailedTaskOps.back()->allocate(nonFailTaskMemoryCapacity); - ASSERT_EQ( - nonFailedTasks.back()->pool()->usedBytes(), - nonFailTaskMemoryCapacity); - } - std::shared_ptr failedTask = addTask(); - MockMemoryOperator* failedTaskOp = addMemoryOp( - failedTask, true, [&](MemoryPool* /*unsed*/, uint64_t /*unsed*/) { - VELOX_FAIL("throw reclaim exception"); - }); - failedTaskOp->allocate(failedTaskMemoryCapacity); - for (int i = 0; i < numNonFailedTasks; ++i) { - ASSERT_EQ( - nonFailedTasks[0]->pool()->capacity(), nonFailTaskMemoryCapacity) - << i; - } - ASSERT_EQ(failedTaskOp->capacity(), failedTaskMemoryCapacity); - - const auto oldStats = arbitrator_->stats(); - ASSERT_EQ(oldStats.numFailures, 0); - ASSERT_EQ(oldStats.numAborted, 0); - - const int numFailedTaskAllocationsAfterAbort = - testData.hasAllocationFromFailedTaskAfterAbort ? 3 : 0; - // If 'hasAllocationFromOtherTaskAfterAbort' is true, then one allocation - // from each of the other tasks. - const int numNonFailedAllocationsAfterAbort = - testData.hasAllocationFromNonFailedTaskAfterAbort ? numNonFailedTasks - : 0; - // One barrier count is for the initial allocation from the failed task to - // trigger memory arbitration. - folly::futures::Barrier arbitrationStartBarrier( - numFailedTaskAllocationsAfterAbort + numNonFailedAllocationsAfterAbort + - 1); - folly::futures::Barrier arbitrationBarrier( - numFailedTaskAllocationsAfterAbort + numNonFailedAllocationsAfterAbort + - 1); - std::atomic testInjectionCount{0}; - std::atomic arbitrationStarted{false}; - SCOPED_TESTVALUE_SET( - "facebook::velox::memory::SharedArbitrator::startArbitration", - std::function( - ([&](const SharedArbitrator* /*unsed*/) { - if (!arbitrationStarted) { - return; - } - if (++testInjectionCount <= numFailedTaskAllocationsAfterAbort + - numNonFailedAllocationsAfterAbort + 1) { - arbitrationBarrier.wait().wait(); - } - }))); - - SCOPED_TESTVALUE_SET( - "facebook::velox::memory::SharedArbitrator::sortCandidatesByReclaimableFreeCapacity", - std::function*)>( - ([&](const std::vector* /*unused*/) { - if (!arbitrationStarted.exchange(true)) { - arbitrationStartBarrier.wait().wait(); - } - if (++testInjectionCount <= numFailedTaskAllocationsAfterAbort + - numNonFailedAllocationsAfterAbort + 1) { - arbitrationBarrier.wait().wait(); - } - }))); - - std::vector allocationThreadsAfterAbort; - for (int i = 0; i < - numFailedTaskAllocationsAfterAbort + numNonFailedAllocationsAfterAbort; - ++i) { - allocationThreadsAfterAbort.emplace_back([&, i]() { - arbitrationStartBarrier.wait().wait(); - if (i < numFailedTaskAllocationsAfterAbort) { - VELOX_ASSERT_THROW( - failedTaskOp->allocate(failedTaskMemoryCapacity), ""); - } else { - nonFailedTaskOps[i - numFailedTaskAllocationsAfterAbort]->allocate( - nonFailTaskMemoryCapacity); - } - }); - } - - // Trigger memory arbitration to reclaim from failedTask which throws. - nonFailedTaskOps[0]->allocate(nonFailTaskMemoryCapacity); - // Wait for all the allocation threads to complete. 
- for (auto& allocationThread : allocationThreadsAfterAbort) { - allocationThread.join(); - } - ASSERT_TRUE(failedTaskOp->pool()->aborted()); - ASSERT_EQ( - failedTaskOp->pool()->usedBytes(), - testData.expectedFailedTaskMemoryCapacity); - ASSERT_EQ( - failedTaskOp->pool()->capacity(), - testData.expectedFailedTaskMemoryUsage); - ASSERT_EQ(failedTaskOp->reclaimer()->stats().numAborts, 1); - ASSERT_EQ(failedTaskOp->reclaimer()->stats().numReclaims, 1); - - const auto newStats = arbitrator_->stats(); - ASSERT_EQ( - newStats.numRequests, - oldStats.numRequests + 1 + numFailedTaskAllocationsAfterAbort + - numNonFailedAllocationsAfterAbort); - ASSERT_EQ(newStats.numAborted, 1); - ASSERT_EQ(newStats.freeCapacityBytes, testData.expectedFreeCapacity); - ASSERT_EQ(newStats.numFailures, numFailedTaskAllocationsAfterAbort); - ASSERT_EQ(newStats.maxCapacityBytes, kMemoryCapacity); - // Check if memory pools have been aborted or not as expected. - for (int i = 0; i < nonFailedTaskOps.size(); ++i) { - auto* taskOp = nonFailedTaskOps[i]; - ASSERT_FALSE(taskOp->pool()->aborted()); - ASSERT_EQ(taskOp->reclaimer()->stats().numAborts, 0); - ASSERT_EQ(taskOp->reclaimer()->stats().numReclaims, 0); - if (i == 0) { - ASSERT_EQ( - taskOp->pool()->capacity(), - testData.expectedNonFailedTaskMemoryCapacity + - nonFailTaskMemoryCapacity); - ASSERT_EQ( - taskOp->pool()->usedBytes(), - testData.expectedNonFailedTaskMemoryUsage + - nonFailTaskMemoryCapacity); - } else { - ASSERT_EQ( - taskOp->pool()->capacity(), - testData.expectedNonFailedTaskMemoryCapacity); - ASSERT_EQ( - taskOp->pool()->usedBytes(), - testData.expectedNonFailedTaskMemoryUsage); - } - } - - VELOX_ASSERT_THROW(failedTaskOp->allocate(failedTaskMemoryCapacity), ""); - ASSERT_EQ(arbitrator_->stats().numRequests, newStats.numRequests); - ASSERT_EQ(arbitrator_->stats().numAborted, 1); - } -} - -TEST_F(MockSharedArbitrationTest, memoryPoolAbortThrow) { +TEST_F(MockSharedArbitrationTest, memoryReclaimeFailureTriggeredAbort) { setupMemory( kMemoryCapacity, 0, @@ -2878,6 +3311,7 @@ TEST_F(MockSharedArbitrationTest, memoryPoolAbortThrow) { MockMemoryOperator* largeTaskOp = addMemoryOp( largeTask, true, [&](MemoryPool* /*unsed*/, uint64_t /*unsed*/) { VELOX_FAIL("throw reclaim exception"); + return false; }); largeTaskOp->allocate(largeTaskMemoryCapacity); const auto oldStats = arbitrator_->stats(); @@ -2885,12 +3319,12 @@ TEST_F(MockSharedArbitrationTest, memoryPoolAbortThrow) { ASSERT_EQ(oldStats.numAborted, 0); // Trigger memory arbitration to reclaim from itself which throws. - VELOX_ASSERT_THROW( - largeTaskOp->allocate(largeTaskMemoryCapacity), - "The requestor pool has been aborted"); + VELOX_ASSERT_THROW(largeTaskOp->allocate(largeTaskMemoryCapacity), "aborted"); + test::SharedArbitratorTestHelper arbitratorHelper(arbitrator_); + arbitratorHelper.waitForGlobalArbitrationToFinish(); const auto newStats = arbitrator_->stats(); ASSERT_EQ(newStats.numRequests, oldStats.numRequests + 1); - ASSERT_EQ(newStats.numAborted, 1); + ASSERT_EQ(newStats.numAborted, 0); ASSERT_EQ(newStats.freeCapacityBytes, largeTaskMemoryCapacity); ASSERT_EQ(newStats.maxCapacityBytes, kMemoryCapacity); // Check if memory pools have been aborted or not as expected. 
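[Editor's note, not part of the patch] The hunks above repeatedly rely on the same synchronization pattern: since global arbitration now runs on a background thread, a test first issues an allocation that queues an arbitration request and then waits for the in-flight background run to drain before asserting on arbitrator stats. A minimal sketch of that pattern follows, written against the mock fixture helpers already used above (setupMemory, addTask, addMemoryOp, arbitrator_, MB); the test name and byte values are illustrative only.

TEST_F(MockSharedArbitrationTest, waitForGlobalArbitrationSketch) {
  const uint64_t memoryCapacity = 256 * MB;
  setupMemory(memoryCapacity);

  // Reclaimable victim that holds most of the arbitrator capacity.
  auto victimTask = addTask();
  auto* victimOp = addMemoryOp(victimTask, /*isReclaimable=*/true);
  victimOp->allocate(3 * memoryCapacity / 4);

  // This allocation cannot be served from free capacity, so it queues a
  // global arbitration request that is processed by the background thread.
  auto requestorTask = addTask();
  auto* requestorOp = addMemoryOp(requestorTask, /*isReclaimable=*/true);
  requestorOp->allocate(memoryCapacity / 2);

  // Asserting immediately would race with the background reclaim; wait for
  // the running global arbitration to finish first, as the tests above do.
  test::SharedArbitratorTestHelper arbitratorHelper(arbitrator_);
  arbitratorHelper.waitForGlobalArbitrationToFinish();
  ASSERT_GT(arbitrator_->stats().reclaimedUsedBytes, 0);
}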
@@ -2904,35 +3338,36 @@ TEST_F(MockSharedArbitrationTest, memoryPoolAbortThrow) { ASSERT_EQ(largeTaskOp->reclaimer()->stats().numReclaims, 1); VELOX_ASSERT_THROW(largeTaskOp->allocate(largeTaskMemoryCapacity), ""); ASSERT_EQ(arbitrator_->stats().numRequests, newStats.numRequests); - ASSERT_EQ(arbitrator_->stats().numAborted, 1); + ASSERT_EQ(arbitrator_->stats().numAborted, 0); } // This test makes sure the memory capacity grows as expected. DEBUG_ONLY_TEST_F(MockSharedArbitrationTest, concurrentArbitrationRequests) { - setupMemory(kMemoryCapacity, 0, 0, 0); + setupMemory(kMemoryCapacity); std::shared_ptr task = addTask(); MockMemoryOperator* op1 = addMemoryOp(task); MockMemoryOperator* op2 = addMemoryOp(task); - std::atomic_bool arbitrationWaitFlag{true}; - folly::EventCount arbitrationWait; std::atomic_bool injectOnce{true}; SCOPED_TESTVALUE_SET( - "facebook::velox::memory::SharedArbitrator::startArbitration", + "facebook::velox::memory::SharedArbitrator::growCapacity", std::function( ([&](const SharedArbitrator* arbitrator) { if (!injectOnce.exchange(false)) { return; } - arbitrationWaitFlag = false; - arbitrationWait.notifyAll(); - while (arbitrator->testingNumRequests() != 2) { + test::SharedArbitratorTestHelper arbitratorHelper( + const_cast(arbitrator)); + auto participant = + arbitratorHelper.getParticipant(task->pool()->name()); + test::ArbitrationParticipantTestHelper participantHelper( + participant.get()); + while (participantHelper.numOps() != 2) { std::this_thread::sleep_for(std::chrono::seconds(5)); // NOLINT } }))); std::thread firstArbitrationThread([&]() { op1->allocate(64 << 20); }); - std::thread secondArbitrationThread([&]() { op2->allocate(64 << 20); }); firstArbitrationThread.join(); @@ -2944,7 +3379,7 @@ DEBUG_ONLY_TEST_F(MockSharedArbitrationTest, concurrentArbitrationRequests) { DEBUG_ONLY_TEST_F( MockSharedArbitrationTest, freeUnusedCapacityWhenReclaimMemoryPool) { - setupMemory(kMemoryCapacity, 0, 0, 0); + setupMemory(kMemoryCapacity); const int allocationSize = kMemoryCapacity / 4; std::shared_ptr reclaimedTask = addTask(); MockMemoryOperator* reclaimedTaskOp = addMemoryOp(reclaimedTask); @@ -2955,14 +3390,15 @@ DEBUG_ONLY_TEST_F( std::shared_ptr arbitrationTask = addTask(); MockMemoryOperator* arbitrationTaskOp = addMemoryOp(arbitrationTask); folly::EventCount reclaimWait; - auto reclaimWaitKey = reclaimWait.prepareWait(); + std::atomic_bool reclaimWaitFlag{true}; folly::EventCount reclaimBlock; - auto reclaimBlockKey = reclaimBlock.prepareWait(); + std::atomic_bool reclaimBlockFlag{true}; SCOPED_TESTVALUE_SET( "facebook::velox::memory::SharedArbitrator::sortCandidatesByReclaimableUsedCapacity", std::function(([&](const MemoryPool* /*unsed*/) { - reclaimWait.notify(); - reclaimBlock.wait(reclaimBlockKey); + reclaimWaitFlag = false; + reclaimWait.notifyAll(); + reclaimBlock.await([&]() { return !reclaimBlockFlag.load(); }); }))); const auto oldStats = arbitrator_->stats(); @@ -2972,58 +3408,21 @@ DEBUG_ONLY_TEST_F( arbitrationTaskOp->allocate(allocationSize); }); - reclaimWait.wait(reclaimWaitKey); + reclaimWait.await([&]() { return !reclaimWaitFlag.load(); }); reclaimedTaskOp->free(bufferToFree); - reclaimBlock.notify(); + reclaimBlockFlag = false; + reclaimBlock.notifyAll(); + allocThread.join(); const auto stats = arbitrator_->stats(); ASSERT_EQ(stats.numFailures, 0); ASSERT_EQ(stats.numAborted, 0); ASSERT_EQ(stats.numRequests, oldStats.numRequests + 1); - // We count the freed capacity in reclaimed bytes. 
- ASSERT_EQ(stats.reclaimedUsedBytes, 0); - ASSERT_EQ(reclaimedTaskOp->capacity(), kMemoryCapacity - allocationSize); + ASSERT_EQ(stats.reclaimedUsedBytes, kMemoryCapacity); + ASSERT_EQ(reclaimedTaskOp->capacity(), 0); ASSERT_EQ(arbitrationTaskOp->capacity(), allocationSize); } -DEBUG_ONLY_TEST_F( - MockSharedArbitrationTest, - raceBetweenInitialReservationAndArbitration) { - std::shared_ptr arbitrationTask = addTask(kMemoryCapacity); - MockMemoryOperator* arbitrationTaskOp = addMemoryOp(arbitrationTask); - ASSERT_EQ(arbitrationTask->pool()->capacity(), kMemoryPoolInitCapacity); - - folly::EventCount arbitrationRun; - auto arbitrationRunKey = arbitrationRun.prepareWait(); - folly::EventCount arbitrationBlock; - auto arbitrationBlockKey = arbitrationBlock.prepareWait(); - - SCOPED_TESTVALUE_SET( - "facebook::velox::memory::SharedArbitrator::startArbitration", - std::function( - ([&](const SharedArbitrator* /*unsed*/) { - arbitrationRun.notify(); - arbitrationBlock.wait(arbitrationBlockKey); - }))); - - std::thread allocThread([&]() { - // Allocate more than its capacity to trigger arbitration which is blocked - // by the arbitration testvalue injection above. - arbitrationTaskOp->allocate(2 * kMemoryPoolInitCapacity); - }); - - arbitrationRun.wait(arbitrationRunKey); - - // Allocate a new root memory pool and check it has its initial capacity - // allocated. - std::shared_ptr skipTask = addTask(kMemoryCapacity); - MockMemoryOperator* skipTaskOp = addMemoryOp(skipTask); - ASSERT_EQ(skipTaskOp->pool()->capacity(), kMemoryPoolInitCapacity); - - arbitrationBlock.notify(); - allocThread.join(); -} - TEST_F(MockSharedArbitrationTest, arbitrationFailure) { int64_t maxCapacity = 128 * MB; int64_t initialCapacity = 0 * MB; @@ -3044,10 +3443,11 @@ TEST_F(MockSharedArbitrationTest, arbitrationFailure) { expectedRequestorAborted); } } testSettings[] = { - {64 * MB, 64 * MB, 32 * MB, false, false}, - {64 * MB, 48 * MB, 32 * MB, false, false}, - {32 * MB, 64 * MB, 64 * MB, false, false}, - {32 * MB, 32 * MB, 96 * MB, true, false}}; + {64 * MB, 64 * MB, 32 * MB, true, false}, + {64 * MB, 48 * MB, 32 * MB, true, false}, + {32 * MB, 64 * MB, 64 * MB, true, false}, + {32 * MB, 32 * MB, 96 * MB, true, false}, + {64 * MB, 96 * MB, 32 * MB, false, false}}; for (const auto& testData : testSettings) { SCOPED_TRACE(testData.debugString()); @@ -3057,6 +3457,7 @@ TEST_F(MockSharedArbitrationTest, arbitrationFailure) { MockMemoryOperator* requestorOp = addMemoryOp(requestorTask, false); requestorOp->allocate(testData.requestorCapacity); ASSERT_EQ(requestorOp->capacity(), testData.requestorCapacity); + std::shared_ptr otherTask = addTask(); MockMemoryOperator* otherOp = addMemoryOp(otherTask, false); otherOp->allocate(testData.otherCapacity); @@ -3073,7 +3474,8 @@ TEST_F(MockSharedArbitrationTest, arbitrationFailure) { ASSERT_TRUE(otherOp->pool()->aborted()); } else { VELOX_ASSERT_THROW( - requestorOp->allocate(testData.requestorRequestBytes), ""); + requestorOp->allocate(testData.requestorRequestBytes), + "Exceeded memory pool capacity after attempt"); ASSERT_FALSE(requestorOp->pool()->aborted()); ASSERT_FALSE(otherOp->pool()->aborted()); } @@ -3107,7 +3509,9 @@ TEST_F(MockSharedArbitrationTest, concurrentArbitrations) { [&](MemoryPool* /*unused*/, uint64_t /*unused*/) { if (folly::Random::oneIn(10)) { VELOX_FAIL(injectReclaimErrorMessage); + return false; } + return true; }, [&]() { if (folly::Random::oneIn(10)) { @@ -3229,7 +3633,9 @@ TEST_F(MockSharedArbitrationTest, concurrentArbitrationWithTransientRoots) { 
[&](MemoryPool* /*unused*/, uint64_t /*unused*/) { if (folly::Random::oneIn(10)) { VELOX_FAIL(injectReclaimErrorMessage); + return false; } + return true; }, [&]() { if (folly::Random::oneIn(10)) { diff --git a/velox/common/memory/tests/SharedArbitratorTest.cpp b/velox/common/memory/tests/SharedArbitratorTest.cpp index bd25c27e4c1f..d1bfe1f4cc0c 100644 --- a/velox/common/memory/tests/SharedArbitratorTest.cpp +++ b/velox/common/memory/tests/SharedArbitratorTest.cpp @@ -264,7 +264,6 @@ class SharedArbitrationTest : public testing::WithParamInterface, fuzzerOpts_.stringLength = 1024; fuzzerOpts_.allowLazyVector = false; vector_ = makeRowVector(rowType_, fuzzerOpts_); - numAddedPools_ = 0; isSerialExecutionMode_ = GetParam().isSerialExecutionMode; if (isSerialExecutionMode_) { executor_ = nullptr; @@ -286,7 +285,6 @@ class SharedArbitrationTest : public testing::WithParamInterface, createMemoryManager(memoryCapacity, memoryPoolInitCapacity); ASSERT_EQ(memoryManager_->arbitrator()->kind(), "SHARED"); arbitrator_ = static_cast(memoryManager_->arbitrator()); - numAddedPools_ = 0; } void checkOperatorStatsForArbitration( @@ -294,10 +292,12 @@ class SharedArbitrationTest : public testing::WithParamInterface, bool expectGlobalArbitration) { if (expectGlobalArbitration) { VELOX_CHECK_EQ( - stats.customStats.count(SharedArbitrator::kGlobalArbitrationCount), + stats.customStats.count( + SharedArbitrator::kGlobalArbitrationWaitCount), 1); VELOX_CHECK_GE( - stats.customStats.at(SharedArbitrator::kGlobalArbitrationCount).sum, + stats.customStats.at(SharedArbitrator::kGlobalArbitrationWaitCount) + .sum, 1); VELOX_CHECK_EQ( stats.customStats.count(SharedArbitrator::kLocalArbitrationCount), 0); @@ -308,7 +308,8 @@ class SharedArbitrationTest : public testing::WithParamInterface, stats.customStats.at(SharedArbitrator::kLocalArbitrationCount).sum, 1); VELOX_CHECK_EQ( - stats.customStats.count(SharedArbitrator::kGlobalArbitrationCount), + stats.customStats.count( + SharedArbitrator::kGlobalArbitrationWaitCount), 0); } } @@ -331,7 +332,6 @@ class SharedArbitrationTest : public testing::WithParamInterface, RowTypePtr rowType_; VectorFuzzer::Options fuzzerOpts_; RowVectorPtr vector_; - std::atomic_uint64_t numAddedPools_{0}; bool isSerialExecutionMode_{false}; }; @@ -507,7 +507,7 @@ DEBUG_ONLY_TEST_P( }); while (!blockedPartialAggregation || !blockedAggregation) { - std::this_thread::sleep_for(std::chrono::milliseconds(100)); + std::this_thread::sleep_for(std::chrono::milliseconds(100)); // NOLINT } testingRunArbitration(); @@ -540,14 +540,12 @@ DEBUG_ONLY_TEST_P(SharedArbitrationTestWithThreadingModes, reclaimToOrderBy) { const auto oldStats = arbitrator_->stats(); std::shared_ptr fakeMemoryQueryCtx = newQueryCtx(memoryManager_.get(), executor_.get(), kMemoryCapacity); - ++numAddedPools_; std::shared_ptr orderByQueryCtx; if (sameQuery) { orderByQueryCtx = fakeMemoryQueryCtx; } else { orderByQueryCtx = newQueryCtx(memoryManager_.get(), executor_.get(), kMemoryCapacity); - ++numAddedPools_; } folly::EventCount orderByWait; @@ -624,7 +622,6 @@ DEBUG_ONLY_TEST_P(SharedArbitrationTestWithThreadingModes, reclaimToOrderBy) { waitForAllTasksToBeDeleted(); const auto newStats = arbitrator_->stats(); ASSERT_GT(newStats.reclaimedUsedBytes, oldStats.reclaimedUsedBytes); - ASSERT_GT(orderByQueryCtx->pool()->stats().numCapacityGrowths, 0); } } @@ -644,14 +641,12 @@ DEBUG_ONLY_TEST_P( const auto oldStats = arbitrator_->stats(); std::shared_ptr fakeMemoryQueryCtx = newQueryCtx(memoryManager_.get(), executor_.get(), 
kMemoryCapacity); - ++numAddedPools_; std::shared_ptr aggregationQueryCtx; if (sameQuery) { aggregationQueryCtx = fakeMemoryQueryCtx; } else { aggregationQueryCtx = newQueryCtx(memoryManager_.get(), executor_.get(), kMemoryCapacity); - ++numAddedPools_; } folly::EventCount aggregationWait; @@ -748,14 +743,12 @@ DEBUG_ONLY_TEST_P( const auto oldStats = arbitrator_->stats(); std::shared_ptr fakeMemoryQueryCtx = newQueryCtx(memoryManager_.get(), executor_.get(), kMemoryCapacity); - ++numAddedPools_; std::shared_ptr joinQueryCtx; if (sameQuery) { joinQueryCtx = fakeMemoryQueryCtx; } else { joinQueryCtx = newQueryCtx(memoryManager_.get(), executor_.get(), kMemoryCapacity); - ++numAddedPools_; } folly::EventCount joinWait; diff --git a/velox/common/memory/tests/SharedArbitratorTestUtil.h b/velox/common/memory/tests/SharedArbitratorTestUtil.h new file mode 100644 index 000000000000..3d8b6062d847 --- /dev/null +++ b/velox/common/memory/tests/SharedArbitratorTestUtil.h @@ -0,0 +1,92 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "velox/common/memory/ArbitrationParticipant.h" +#include "velox/common/memory/SharedArbitrator.h" + +namespace facebook::velox::memory::test { + +class SharedArbitratorTestHelper { + public: + explicit SharedArbitratorTestHelper(SharedArbitrator* arbitrator) + : arbitrator_(arbitrator) {} + + ScopedArbitrationParticipant getParticipant(const std::string& name) const { + return arbitrator_->getParticipant(name).value(); + } + + size_t numGlobalArbitrationWaiters() const { + std::lock_guard l(arbitrator_->stateLock_); + return arbitrator_->globalArbitrationWaiters_.size(); + } + + bool globalArbitrationRunning() const { + std::lock_guard l(arbitrator_->stateLock_); + return arbitrator_->globalArbitrationRunning_; + } + + void waitForGlobalArbitrationToFinish() const { + while (globalArbitrationRunning()) { + std::this_thread::sleep_for(std::chrono::milliseconds(100)); // NOLINT + } + } + + uint64_t maxArbitrationTimeMs() const { + return arbitrator_->maxArbitrationTimeMs_; + } + + folly::CPUThreadPoolExecutor* globalArbitrationExecutor() const { + return arbitrator_->globalArbitrationExecutor_.get(); + } + + std::thread* globalArbitrationController() const { + return arbitrator_->globalArbitrationController_.get(); + } + + private: + SharedArbitrator* const arbitrator_; +}; + +class ArbitrationParticipantTestHelper { + public: + explicit ArbitrationParticipantTestHelper(ArbitrationParticipant* participant) + : participant_(participant) {} + + size_t numOps() const { + std::lock_guard l(participant_->stateLock_); + return !!(participant_->runningOp_ != nullptr) + + participant_->waitOps_.size(); + } + + ArbitrationOperation* runningOp() const { + std::lock_guard l(participant_->stateLock_); + return participant_->runningOp_; + } + + std::vector waitingOps() const { + std::vector ops; + std::lock_guard l(participant_->stateLock_); + ops.reserve(participant_->waitOps_.size()); + for (const 
auto& waitOp : participant_->waitOps_) { + ops.push_back(waitOp.op); + } + return ops; + } + + private: + ArbitrationParticipant* const participant_; +}; +} // namespace facebook::velox::memory::test diff --git a/velox/core/PlanNode.h b/velox/core/PlanNode.h index 1b168b56304f..a73396528707 100644 --- a/velox/core/PlanNode.h +++ b/velox/core/PlanNode.h @@ -1276,7 +1276,7 @@ class PartitionedOutputNode : public PlanNode { partitionFunctionSpec_(std::move(partitionFunctionSpec)), outputType_(std::move(outputType)) { VELOX_USER_CHECK_GT(numPartitions, 0); - if (numPartitions == 1) { + if (numPartitions_ == 1) { VELOX_USER_CHECK( keys_.empty(), "Non-empty partitioning keys require more than one partition"); diff --git a/velox/docs/monitoring/metrics.rst b/velox/docs/monitoring/metrics.rst index b61d95d43c7a..4d80a25229ac 100644 --- a/velox/docs/monitoring/metrics.rst +++ b/velox/docs/monitoring/metrics.rst @@ -156,6 +156,14 @@ Memory Management initiate the memory arbitration request. This indicates the velox runtime doesn't have enough memory to run all the queries at their peak memory usage. We have to trigger spilling to let them run through completion. + * - arbitrator_global_arbitration_num_reclaim_victims + - Histogram + - The distribution of the number of query memory pools selected to reclaim memory by one + global memory arbitration round in range of [0, 32] with 32 buckets. It is configured to + report latency at P50, P90, P99, and P100 percentiles. + * - arbitrator_global_arbitration_failed_victim_count + - Count + - The number of victim query memory pool having nothing to spill. * - arbitrator_aborted_count - Count - The number of times a query level memory pool is aborted as a result of @@ -549,5 +557,3 @@ Hive Connector - The distribution of hive sort writer finish processing time slice in range of[0, 120s] with 60 buckets. It is configured to report latency at P50, P90, P99, and P100 percentiles. - - diff --git a/velox/exec/Task.cpp b/velox/exec/Task.cpp index c8acf566e086..85cba2b9d893 100644 --- a/velox/exec/Task.cpp +++ b/velox/exec/Task.cpp @@ -1675,7 +1675,7 @@ int Task::getOutputPipelineId() const { } } - VELOX_FAIL("Output pipeline not found"); + VELOX_FAIL("Output pipeline not found for task {}", taskId_); } void Task::setAllOutputConsumed() { diff --git a/velox/exec/fuzzer/MemoryArbitrationFuzzer.cpp b/velox/exec/fuzzer/MemoryArbitrationFuzzer.cpp index 86290b6717e0..b6eb0634c486 100644 --- a/velox/exec/fuzzer/MemoryArbitrationFuzzer.cpp +++ b/velox/exec/fuzzer/MemoryArbitrationFuzzer.cpp @@ -22,7 +22,9 @@ #include "velox/common/memory/SharedArbitrator.h" #include "velox/connectors/hive/HiveConnector.h" #include "velox/connectors/hive/HiveConnectorSplit.h" +#include "velox/dwio/dwrf/RegisterDwrfReader.h" #include "velox/dwio/dwrf/RegisterDwrfReader.h" // @manual +#include "velox/dwio/dwrf/RegisterDwrfWriter.h" #include "velox/dwio/dwrf/RegisterDwrfWriter.h" // @manual #include "velox/exec/MemoryReclaimer.h" #include "velox/exec/TableWriter.h" @@ -92,8 +94,8 @@ class MemoryArbitrationFuzzer { void print() const { std::stringstream ss; - ss << "Success count = " << successCount << ". 
OOM count = " << oomCount - << " Abort count = " << abortCount; + ss << "success count = " << successCount << ", oom count = " << oomCount + << ", abort count = " << abortCount; LOG(INFO) << ss.str(); } }; @@ -233,6 +235,7 @@ MemoryArbitrationFuzzer::MemoryArbitrationFuzzer(size_t initialSeed) connector::registerConnector(hiveConnector); dwrf::registerDwrfReaderFactory(); dwrf::registerDwrfWriterFactory(); + seed(initialSeed); } @@ -718,7 +721,7 @@ void MemoryArbitrationFuzzer::verify() { } else if (e.errorCode() == error_code::kMemAborted.c_str()) { ++lockedStats->abortCount; } else { - LOG(ERROR) << "Unexpected exception: " << e.what(); + LOG(ERROR) << "Unexpected exception:\n" << e.what(); std::rethrow_exception(std::current_exception()); } } diff --git a/velox/exec/tests/AggregationTest.cpp b/velox/exec/tests/AggregationTest.cpp index cf15d69d06c1..c48e01fb518b 100644 --- a/velox/exec/tests/AggregationTest.cpp +++ b/velox/exec/tests/AggregationTest.cpp @@ -21,6 +21,7 @@ #include "folly/experimental/EventCount.h" #include "velox/common/base/tests/GTestUtils.h" #include "velox/common/file/FileSystems.h" +#include "velox/common/memory/SharedArbitrator.h" #include "velox/common/testutil/TestValue.h" #include "velox/dwio/common/tests/utils/BatchMaker.h" #include "velox/exec/Aggregate.h" @@ -3065,7 +3066,8 @@ DEBUG_ONLY_TEST_F(AggregationTest, reclaimEmptyInput) { auto* driver = values->testingOperatorCtx()->driver(); auto task = values->testingOperatorCtx()->task(); // Shrink all the capacity before reclaim. - memory::memoryManager()->arbitrator()->shrinkCapacity(task->pool(), 0); + memory::memoryManager()->arbitrator()->shrinkCapacity( + task->pool()->root(), 0); { MemoryReclaimer::Stats stats; SuspendedSection suspendedSection(driver); @@ -3134,7 +3136,8 @@ DEBUG_ONLY_TEST_F(AggregationTest, reclaimEmptyOutput) { auto* driver = op->testingOperatorCtx()->driver(); auto task = op->testingOperatorCtx()->task(); // Shrink all the capacity before reclaim. - memory::memoryManager()->arbitrator()->shrinkCapacity(task->pool(), 0); + memory::memoryManager()->arbitrator()->shrinkCapacity( + task->pool()->root(), 0); { MemoryReclaimer::Stats stats; SuspendedSection suspendedSection(driver); @@ -3264,7 +3267,13 @@ DEBUG_ONLY_TEST_F(AggregationTest, reclaimFromAggregation) { auto taskStats = exec::toPlanStats(task->taskStats()); auto& planStats = taskStats.at(aggrNodeId); ASSERT_GT(planStats.spilledBytes, 0); - ASSERT_GT(planStats.customStats["memoryArbitrationWallNanos"].sum, 0); + // The actual ime resolution is millisecond so we might see zero nanos + // reporting in unit test. 
+ ASSERT_GE( + planStats + .customStats[memory::SharedArbitrator::kMemoryArbitrationWallNanos] + .sum, + 0); task.reset(); waitForAllTasksToBeDeleted(); } @@ -3318,81 +3327,38 @@ DEBUG_ONLY_TEST_F(AggregationTest, reclaimFromDistinctAggregation) { DEBUG_ONLY_TEST_F(AggregationTest, reclaimFromAggregationOnNoMoreInput) { std::vector vectors = createVectors(8, rowType_, fuzzerOpts_); createDuckDbTable(vectors); - std::vector sameQueries = {false, true}; - for (bool sameQuery : sameQueries) { - SCOPED_TRACE(fmt::format("sameQuery {}", sameQuery)); - const auto spillDirectory = exec::test::TempDirectoryPath::create(); - std::shared_ptr fakeQueryCtx = - core::QueryCtx::create(executor_.get()); - std::shared_ptr aggregationQueryCtx; - if (sameQuery) { - aggregationQueryCtx = fakeQueryCtx; - } else { - aggregationQueryCtx = core::QueryCtx::create(executor_.get()); - } - - folly::EventCount arbitrationWait; - std::atomic_bool arbitrationWaitFlag{true}; - folly::EventCount taskPauseWait; - std::atomic_bool taskPauseWaitFlag{true}; - std::atomic injectedPool{nullptr}; - - std::atomic injectNoMoreInputOnce{true}; - SCOPED_TESTVALUE_SET( - "facebook::velox::exec::Driver::runInternal::noMoreInput", - std::function(([&](Operator* op) { - if (op->operatorType() != "Aggregation") { - return; - } - - if (!injectNoMoreInputOnce.exchange(false)) { - return; - } - - injectedPool = op->pool(); - arbitrationWaitFlag = false; - arbitrationWait.notifyAll(); + const auto spillDirectory = exec::test::TempDirectoryPath::create(); - // Wait for task pause to be triggered. - taskPauseWait.await([&] { return !taskPauseWaitFlag.load(); }); - }))); - - SCOPED_TESTVALUE_SET( - "facebook::velox::exec::Task::requestPauseLocked", - std::function(([&](Task* /*unused*/) { - taskPauseWaitFlag = false; - taskPauseWait.notifyAll(); - }))); - - std::thread aggregationThread([&]() { - auto task = - AssertQueryBuilder(duckDbQueryRunner_) - .spillDirectory(spillDirectory->getPath()) - .config(core::QueryConfig::kSpillEnabled, true) - .config(core::QueryConfig::kAggregationSpillEnabled, true) - .queryCtx(aggregationQueryCtx) - .maxDrivers(1) - .plan(PlanBuilder() - .values(vectors) - .singleAggregation({"c0", "c1"}, {"array_agg(c2)"}) - .planNode()) - .assertResults( - "SELECT c0, c1, array_agg(c2) FROM tmp GROUP BY c0, c1"); - auto stats = task->taskStats().pipelineStats; - ASSERT_GT(stats[0].operatorStats[1].spilledBytes, 0); - }); - - arbitrationWait.await([&] { return !arbitrationWaitFlag.load(); }); - ASSERT_TRUE(injectedPool != nullptr); - - auto fakePool = fakeQueryCtx->pool()->addLeafChild( - "fakePool", true, exec::MemoryReclaimer::create()); - fakePool->maybeReserve(memory::memoryManager()->arbitrator()->capacity()); - - aggregationThread.join(); + std::atomic injectNoMoreInputOnce{true}; + SCOPED_TESTVALUE_SET( + "facebook::velox::exec::Driver::runInternal::noMoreInput", + std::function(([&](Operator* op) { + if (op->operatorType() != "Aggregation") { + return; + } + if (!injectNoMoreInputOnce.exchange(false)) { + return; + } + testingRunArbitration(op->pool()); + }))); - waitForAllTasksToBeDeleted(); + { + auto task = + AssertQueryBuilder(duckDbQueryRunner_) + .spillDirectory(spillDirectory->getPath()) + .config(core::QueryConfig::kSpillEnabled, true) + .config(core::QueryConfig::kAggregationSpillEnabled, true) + .maxDrivers(1) + .plan(PlanBuilder() + .values(vectors) + .singleAggregation({"c0", "c1"}, {"array_agg(c2)"}) + .planNode()) + .assertResults( + "SELECT c0, c1, array_agg(c2) FROM tmp GROUP BY c0, c1"); + auto 
stats = task->taskStats().pipelineStats; + ASSERT_GT(stats[0].operatorStats[1].spilledBytes, 0); } + waitForAllTasksToBeDeleted(); } DEBUG_ONLY_TEST_F(AggregationTest, reclaimFromAggregationDuringOutput) { @@ -3406,124 +3372,65 @@ DEBUG_ONLY_TEST_F(AggregationTest, reclaimFromAggregationDuringOutput) { } createDuckDbTable(vectors); - std::vector sameQueries = {false, true}; - for (bool sameQuery : sameQueries) { - SCOPED_TRACE(fmt::format("sameQuery {}", sameQuery)); - const auto spillDirectory = exec::test::TempDirectoryPath::create(); - std::shared_ptr fakeQueryCtx = - core::QueryCtx::create(executor_.get()); - std::shared_ptr aggregationQueryCtx; - if (sameQuery) { - aggregationQueryCtx = fakeQueryCtx; - } else { - aggregationQueryCtx = core::QueryCtx::create(executor_.get()); - } - - folly::EventCount arbitrationWait; - std::atomic_bool arbitrationWaitFlag{true}; - folly::EventCount taskPauseWait; - std::atomic_bool taskPauseWaitFlag{true}; - - std::atomic_int numInputs{0}; - SCOPED_TESTVALUE_SET( - "facebook::velox::exec::Driver::runInternal::getOutput", - std::function(([&](Operator* op) { - if (op->operatorType() != "Aggregation") { - return; - } - if (++numInputs != 5) { - return; - } - arbitrationWaitFlag = false; - arbitrationWait.notifyAll(); - - // Wait for task pause to be triggered. - taskPauseWait.await([&] { return !taskPauseWaitFlag.load(); }); - }))); - - SCOPED_TESTVALUE_SET( - "facebook::velox::exec::Task::requestPauseLocked", - std::function(([&](Task* /*unused*/) { - taskPauseWaitFlag = false; - taskPauseWait.notifyAll(); - }))); - - std::thread aggregationThread([&]() { - auto task = - AssertQueryBuilder(duckDbQueryRunner_) - .spillDirectory(spillDirectory->getPath()) - .config(core::QueryConfig::kSpillEnabled, true) - .config(core::QueryConfig::kAggregationSpillEnabled, true) - .config( - core::QueryConfig::kPreferredOutputBatchRows, numRows / 10) - .maxDrivers(1) - .queryCtx(aggregationQueryCtx) - .plan(PlanBuilder() - .values(vectors) - .singleAggregation({"c0", "c1"}, {"array_agg(c2)"}) - .planNode()) - .assertResults( - "SELECT c0, c1, array_agg(c2) FROM tmp GROUP BY c0, c1"); - auto stats = task->taskStats().pipelineStats; - ASSERT_GT(stats[0].operatorStats[1].spilledBytes, 0); - }); - - arbitrationWait.await([&] { return !arbitrationWaitFlag.load(); }); - - auto fakePool = fakeQueryCtx->pool()->addLeafChild( - "fakePool", true, exec::MemoryReclaimer::create()); - fakePool->maybeReserve(memory::memoryManager()->arbitrator()->capacity()); - - aggregationThread.join(); - - waitForAllTasksToBeDeleted(); + const auto spillDirectory = exec::test::TempDirectoryPath::create(); + std::atomic_int numInputs{0}; + SCOPED_TESTVALUE_SET( + "facebook::velox::exec::Driver::runInternal::getOutput", + std::function(([&](Operator* op) { + if (op->operatorType() != "Aggregation") { + return; + } + if (++numInputs != 5) { + return; + } + testingRunArbitration(op->pool()); + }))); + { + auto task = + AssertQueryBuilder(duckDbQueryRunner_) + .spillDirectory(spillDirectory->getPath()) + .config(core::QueryConfig::kSpillEnabled, true) + .config(core::QueryConfig::kAggregationSpillEnabled, true) + .config(core::QueryConfig::kPreferredOutputBatchRows, numRows / 10) + .maxDrivers(1) + //.queryCtx(aggregationQueryCtx) + .plan(PlanBuilder() + .values(vectors) + .singleAggregation({"c0", "c1"}, {"array_agg(c2)"}) + .planNode()) + .assertResults( + "SELECT c0, c1, array_agg(c2) FROM tmp GROUP BY c0, c1"); + auto stats = task->taskStats().pipelineStats; + 
ASSERT_GT(stats[0].operatorStats[1].spilledBytes, 0); } + waitForAllTasksToBeDeleted(); } TEST_F(AggregationTest, reclaimFromCompletedAggregation) { std::vector vectors = createVectors(8, rowType_, fuzzerOpts_); createDuckDbTable(vectors); - std::vector sameQueries = {false, true}; - for (bool sameQuery : sameQueries) { - SCOPED_TRACE(fmt::format("sameQuery {}", sameQuery)); - const auto spillDirectory = exec::test::TempDirectoryPath::create(); - std::shared_ptr fakeQueryCtx = - core::QueryCtx::create(executor_.get()); - std::shared_ptr aggregationQueryCtx; - if (sameQuery) { - aggregationQueryCtx = fakeQueryCtx; - } else { - aggregationQueryCtx = core::QueryCtx::create(executor_.get()); - } - - folly::EventCount arbitrationWait; - std::atomic_bool arbitrationWaitFlag{true}; - - std::thread aggregationThread([&]() { - auto task = - AssertQueryBuilder(duckDbQueryRunner_) - .queryCtx(aggregationQueryCtx) - .plan(PlanBuilder() - .values(vectors) - .singleAggregation({"c0", "c1"}, {"array_agg(c2)"}) - .planNode()) - .assertResults( - "SELECT c0, c1, array_agg(c2) FROM tmp GROUP BY c0, c1"); - waitForTaskCompletion(task.get()); - arbitrationWaitFlag = false; - arbitrationWait.notifyAll(); - }); - - arbitrationWait.await([&] { return !arbitrationWaitFlag.load(); }); - - auto fakePool = fakeQueryCtx->pool()->addLeafChild( - "fakePool", true, exec::MemoryReclaimer::create()); - fakePool->maybeReserve(memory::memoryManager()->arbitrator()->capacity()); + const auto spillDirectory = exec::test::TempDirectoryPath::create(); - aggregationThread.join(); + folly::EventCount arbitrationWait; + std::atomic_bool arbitrationWaitFlag{true}; + std::thread aggregationThread([&]() { + auto task = + AssertQueryBuilder(duckDbQueryRunner_) + .plan(PlanBuilder() + .values(vectors) + .singleAggregation({"c0", "c1"}, {"array_agg(c2)"}) + .planNode()) + .assertResults( + "SELECT c0, c1, array_agg(c2) FROM tmp GROUP BY c0, c1"); + waitForTaskCompletion(task.get()); + arbitrationWaitFlag = false; + arbitrationWait.notifyAll(); + }); + arbitrationWait.await([&] { return !arbitrationWaitFlag.load(); }); - waitForAllTasksToBeDeleted(); - } + memory::testingRunArbitration(); + aggregationThread.join(); + waitForAllTasksToBeDeleted(); } TEST_F(AggregationTest, ignoreNullKeys) { diff --git a/velox/exec/tests/HashJoinTest.cpp b/velox/exec/tests/HashJoinTest.cpp index b612afc31928..123b145bb253 100644 --- a/velox/exec/tests/HashJoinTest.cpp +++ b/velox/exec/tests/HashJoinTest.cpp @@ -7516,8 +7516,9 @@ DEBUG_ONLY_TEST_F(HashJoinTest, taskWaitTimeout) { const auto expectedResult = runHashJoinTask(vectors, nullptr, false, numDrivers, pool(), false).data; - for (uint64_t timeoutMs : {0, 1'000, 30'000}) { + for (uint64_t timeoutMs : {1'000, 30'000}) { SCOPED_TRACE(fmt::format("timeout {}", succinctMillis(timeoutMs))); + LOG(ERROR) << "timeout " << succinctMillis(timeoutMs); auto memoryManager = createMemoryManager(512 << 20, 0, timeoutMs); auto queryCtx = newQueryCtx(memoryManager.get(), executor_.get(), queryMemoryCapacity); diff --git a/velox/exec/tests/utils/ArbitratorTestUtil.h b/velox/exec/tests/utils/ArbitratorTestUtil.h index 3c1cb6191b98..82f034edd7a0 100644 --- a/velox/exec/tests/utils/ArbitratorTestUtil.h +++ b/velox/exec/tests/utils/ArbitratorTestUtil.h @@ -94,7 +94,7 @@ std::shared_ptr newQueryCtx( std::unique_ptr createMemoryManager( int64_t arbitratorCapacity = kMemoryCapacity, uint64_t memoryPoolInitCapacity = kMemoryPoolInitCapacity, - uint64_t maxReclaimWaitMs = 0, + uint64_t maxReclaimWaitMs = 5 * 60 * 1'000, 
uint64_t fastExponentialGrowthCapacityLimit = 0, double slowCapacityGrowPct = 0); diff --git a/velox/exec/tests/utils/OperatorTestBase.cpp b/velox/exec/tests/utils/OperatorTestBase.cpp index a29468e03f2f..0fbf9017aa63 100644 --- a/velox/exec/tests/utils/OperatorTestBase.cpp +++ b/velox/exec/tests/utils/OperatorTestBase.cpp @@ -82,7 +82,9 @@ void OperatorTestBase::setupMemory( int64_t arbitratorCapacity, int64_t arbitratorReservedCapacity, int64_t memoryPoolInitCapacity, - int64_t memoryPoolReservedCapacity) { + int64_t memoryPoolReservedCapacity, + int64_t memoryPoolMinReclaimBytes, + int64_t memoryPoolAbortCapacityLimit) { if (asyncDataCache_ != nullptr) { asyncDataCache_->clear(); asyncDataCache_.reset(); @@ -102,6 +104,10 @@ void OperatorTestBase::setupMemory( folly::to(memoryPoolInitCapacity) + "B"}, {std::string(ExtraConfig::kMemoryPoolReservedCapacity), folly::to(memoryPoolReservedCapacity) + "B"}, + {std::string(ExtraConfig::kMemoryPoolMinReclaimBytes), + folly::to(memoryPoolMinReclaimBytes) + "B"}, + {std::string(ExtraConfig::kMemoryPoolAbortCapacityLimit), + folly::to(memoryPoolAbortCapacityLimit) + "B"}, {std::string(ExtraConfig::kGlobalArbitrationEnabled), "true"}, }; @@ -112,7 +118,7 @@ void OperatorTestBase::setupMemory( } void OperatorTestBase::resetMemory() { - OperatorTestBase::setupMemory(8L << 30, 6L << 30, 0, 512 << 20, 0); + OperatorTestBase::setupMemory(8L << 30, 6L << 30, 0, 512 << 20, 0, 0, 0); } void OperatorTestBase::SetUp() { diff --git a/velox/exec/tests/utils/OperatorTestBase.h b/velox/exec/tests/utils/OperatorTestBase.h index 15141fbc0938..4e5def101cfa 100644 --- a/velox/exec/tests/utils/OperatorTestBase.h +++ b/velox/exec/tests/utils/OperatorTestBase.h @@ -49,7 +49,9 @@ class OperatorTestBase : public testing::Test, int64_t arbitratorCapacity, int64_t arbitratorReservedCapacity, int64_t memoryPoolInitCapacity, - int64_t memoryPoolReservedCapacity); + int64_t memoryPoolReservedCapacity, + int64_t memoryPoolMinReclaimBytes, + int64_t memoryPoolAbortCapacityLimit); static void resetMemory();
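[Editor's note, not part of the patch] For reference, the two new setupMemory() parameters added above (memoryPoolMinReclaimBytes and memoryPoolAbortCapacityLimit) reach the shared arbitrator as byte-string entries in its extra-config map, following the same pattern as the existing init/reserved capacity knobs. The sketch below assumes ExtraConfig is the nested SharedArbitrator::ExtraConfig constant holder referenced in the hunk above; the 32MB and 256MB values are illustrative only.

#include <string>
#include <unordered_map>

#include <folly/Conv.h>
#include "velox/common/memory/SharedArbitrator.h"

namespace {
using facebook::velox::memory::SharedArbitrator;

// Builds the extra arbitrator config entries the new knobs map to. Values are
// serialized as "<bytes>B" strings, matching the OperatorTestBase hunk above.
std::unordered_map<std::string, std::string> makeExtraArbitratorConfigs() {
  return {
      {std::string(SharedArbitrator::ExtraConfig::kMemoryPoolMinReclaimBytes),
       folly::to<std::string>(32 << 20) + "B"},
      {std::string(
           SharedArbitrator::ExtraConfig::kMemoryPoolAbortCapacityLimit),
       folly::to<std::string>(256 << 20) + "B"},
      {std::string(SharedArbitrator::ExtraConfig::kGlobalArbitrationEnabled),
       "true"}};
}
} // namespace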