Add a perf level for measuring user thread block time #12368

Closed · wants to merge 2 commits
6 changes: 3 additions & 3 deletions db/db_impl/db_impl_write.cc
@@ -954,7 +954,7 @@ Status DBImpl::WriteImplWALOnly(
}
} else {
PERF_TIMER_STOP(write_pre_and_post_process_time);
PERF_TIMER_GUARD(write_delay_time);
PERF_TIMER_FOR_WAIT_GUARD(write_delay_time);
InstrumentedMutexLock lock(&mutex_);
Status status =
DelayWrite(/*num_bytes=*/0ull, *write_thread, write_options);
@@ -1228,7 +1228,7 @@ Status DBImpl::PreprocessWrite(const WriteOptions& write_options,
if (UNLIKELY(status.ok() && (write_controller_.IsStopped() ||
write_controller_.NeedsDelay()))) {
PERF_TIMER_STOP(write_pre_and_post_process_time);
PERF_TIMER_GUARD(write_delay_time);
PERF_TIMER_FOR_WAIT_GUARD(write_delay_time);
// We don't know the size of the current batch, so we always use the size
// of the previous one. It might create a fairness issue in that expiration
// might happen for smaller writes but larger writes can go through.
@@ -2000,7 +2000,7 @@ Status DBImpl::ThrottleLowPriWritesIfNeeded(const WriteOptions& write_options,
// is that in case the write is heavy, low pri writes may never have
// a chance to run. Now we guarantee we are still slowly making
// progress.
PERF_TIMER_GUARD(write_delay_time);
PERF_TIMER_FOR_WAIT_GUARD(write_delay_time);
auto data_size = my_batch->GetDataSize();
while (data_size > 0) {
size_t allowed = write_controller_.low_pri_rate_limiter()->RequestToken(
41 changes: 41 additions & 0 deletions db/db_write_test.cc
@@ -269,6 +269,47 @@ TEST_P(DBWriteTest, WriteThreadHangOnWriteStall) {
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
}

TEST_P(DBWriteTest, WriteThreadWaitNanosCounter) {
Options options = GetOptions();
std::vector<port::Thread> threads;

Reopen(options);

std::function<void()> write_func = [&]() {
PerfContext* perf_ctx = get_perf_context();
SetPerfLevel(PerfLevel::kEnableWait);
perf_ctx->Reset();
TEST_SYNC_POINT("DBWriteTest::WriteThreadWaitNanosCounter:WriteFunc");
ASSERT_OK(dbfull()->Put(WriteOptions(), "bar", "val2"));
ASSERT_GT(perf_ctx->write_thread_wait_nanos, 1000000000U);
};

std::function<void()> sleep_func = [&]() {
TEST_SYNC_POINT("DBWriteTest::WriteThreadWaitNanosCounter:SleepFunc:1");
sleep(2);
TEST_SYNC_POINT("DBWriteTest::WriteThreadWaitNanosCounter:SleepFunc:2");
};

ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
{{"WriteThread::EnterAsBatchGroupLeader:End",
"DBWriteTest::WriteThreadWaitNanosCounter:WriteFunc"},
{"WriteThread::AwaitState:BlockingWaiting",
"DBWriteTest::WriteThreadWaitNanosCounter:SleepFunc:1"},
{"DBWriteTest::WriteThreadWaitNanosCounter:SleepFunc:2",
"WriteThread::ExitAsBatchGroupLeader:Start"}});
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();

threads.emplace_back(sleep_func);
threads.emplace_back(write_func);

ASSERT_OK(dbfull()->Put(WriteOptions(), "foo", "val1"));

for (auto& t : threads) {
t.join();
}
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
}

TEST_P(DBWriteTest, IOErrorOnWALWritePropagateToWriteThreadFollower) {
constexpr int kNumThreads = 5;
std::unique_ptr<FaultInjectionTestEnv> mock_env(
2 changes: 1 addition & 1 deletion db/write_thread.cc
@@ -83,7 +83,7 @@ uint8_t WriteThread::AwaitState(Writer* w, uint8_t goal_mask,

// This is below the fast path, so that the stat is zero when all writes are
// from the same thread.
PERF_TIMER_GUARD(write_thread_wait_nanos);
PERF_TIMER_FOR_WAIT_GUARD(write_thread_wait_nanos);

// If we're only going to end up waiting a short period of time,
// it can be a lot more efficient to call std::this_thread::yield()
11 changes: 7 additions & 4 deletions include/rocksdb/perf_level.h
@@ -18,13 +18,16 @@ enum PerfLevel : unsigned char {
kUninitialized = 0, // unknown setting
kDisable = 1, // disable perf stats
kEnableCount = 2, // enable only count stats
kEnableTimeExceptForMutex = 3, // Other than count stats, also enable time
kEnableWait = 3, // measure time spent by user threads
// blocked in RocksDB, and not external
// resources such as mutexes and IO
kEnableTimeExceptForMutex = 4, // Other than count stats, also enable time
// stats except for mutexes
// Other than time, also measure CPU time counters. Still don't measure
// time (neither wall time nor CPU time) for mutexes.
kEnableTimeAndCPUTimeExceptForMutex = 4,
kEnableTime = 5, // enable count and time stats
kOutOfBounds = 6 // N.B. Must always be the last value!
kEnableTimeAndCPUTimeExceptForMutex = 5,
kEnableTime = 6, // enable count and time stats
kOutOfBounds = 7 // N.B. Must always be the last value!
};

// set the perf stats level for current thread
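One practical consequence of inserting `kEnableWait` at value 3 is that every existing level above it shifts up by one (`kEnableTimeExceptForMutex` moves from 3 to 4, and so on). The hypothetical caller code below, which is not part of this PR, illustrates why comparisons should use the named enumerators rather than hard-coded integers:

```cpp
#include "rocksdb/perf_level.h"

void CheckLevel() {
  // Hard-coded value: before this change, 3 meant kEnableTimeExceptForMutex;
  // afterwards, 3 is kEnableWait, so this comparison silently changes meaning.
  if (ROCKSDB_NAMESPACE::GetPerfLevel() >= 3) {
    // ... previously "time stats except mutex", now "wait stats" and above
  }

  // Named enumerator: the intent stays stable across the renumbering.
  if (ROCKSDB_NAMESPACE::GetPerfLevel() >=
      ROCKSDB_NAMESPACE::PerfLevel::kEnableTimeExceptForMutex) {
    // ... time stats except mutex, as before
  }
}
```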
6 changes: 6 additions & 0 deletions monitoring/perf_context_imp.h
@@ -29,6 +29,7 @@ extern thread_local PerfContext perf_context;
#define PERF_CPU_TIMER_GUARD(metric, clock)
#define PERF_CONDITIONAL_TIMER_FOR_MUTEX_GUARD(metric, condition, stats, \
ticker_type)
#define PERF_TIMER_FOR_WAIT_GUARD(metric)
#define PERF_TIMER_MEASURE(metric)
#define PERF_COUNTER_ADD(metric, value)
#define PERF_COUNTER_BY_LEVEL_ADD(metric, value, level)
@@ -66,6 +67,11 @@ extern thread_local PerfContext perf_context;
perf_step_timer_##metric.Start(); \
}

#define PERF_TIMER_FOR_WAIT_GUARD(metric) \
PerfStepTimer perf_step_timer_##metric(&(perf_context.metric), nullptr, \
false, PerfLevel::kEnableWait); \
perf_step_timer_##metric.Start();

// Update metric with time elapsed since last START. start time is reset
// to current timestamp.
#define PERF_TIMER_MEASURE(metric) perf_step_timer_##metric.Measure();
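For intuition, here is a simplified sketch of the gating idea the new macro relies on; it is not RocksDB's actual `PerfStepTimer`, and the names below are illustrative. A scoped timer constructed with an enable level only accumulates elapsed time when the calling thread's perf level is at least that level, so a wait timer becomes active from `kEnableWait` upward while ordinary timers stay off until a higher level.

```cpp
#include <chrono>
#include <cstdint>

// Illustrative stand-ins; the real definitions live in RocksDB headers.
enum PerfLevel : unsigned char {
  kUninitialized = 0, kDisable = 1, kEnableCount = 2, kEnableWait = 3,
  kEnableTimeExceptForMutex = 4, kEnableTimeAndCPUTimeExceptForMutex = 5,
  kEnableTime = 6, kOutOfBounds = 7
};
thread_local PerfLevel tls_perf_level = kDisable;

// Scoped timer that adds elapsed nanoseconds to *metric on destruction,
// but only if the thread's perf level meets the timer's enable level.
class WaitTimerSketch {
 public:
  WaitTimerSketch(uint64_t* metric, PerfLevel enable_level)
      : metric_(metric), enabled_(tls_perf_level >= enable_level) {
    if (enabled_) {
      start_ = std::chrono::steady_clock::now();
    }
  }
  ~WaitTimerSketch() {
    if (enabled_) {
      *metric_ += std::chrono::duration_cast<std::chrono::nanoseconds>(
                      std::chrono::steady_clock::now() - start_)
                      .count();
    }
  }

 private:
  uint64_t* metric_;
  bool enabled_;
  std::chrono::steady_clock::time_point start_;
};
```

Under this gate, `PERF_TIMER_FOR_WAIT_GUARD(write_thread_wait_nanos)` records from `kEnableWait` upward, while the plain `PERF_TIMER_GUARD` timers it replaces use the default, higher enable level and therefore stay off until `kEnableTimeExceptForMutex`.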
@@ -0,0 +1 @@
Defined a new PerfLevel `kEnableWait` to measure time spent by user threads blocked in RocksDB, excluding mutex wait time: for example, a write thread waiting to be added to a write group, or a write thread being delayed or stalled.
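For reference, a minimal usage sketch of the new level (hypothetical caller code, not part of this PR; it relies on the public `SetPerfLevel`/`get_perf_context` API exercised in the test above, and on `write_thread_wait_nanos` and `write_delay_time` both being gated by `kEnableWait` after the changes to `db_impl_write.cc` and `write_thread.cc`):

```cpp
#include <cstdint>

#include "rocksdb/db.h"
#include "rocksdb/perf_context.h"
#include "rocksdb/perf_level.h"

using ROCKSDB_NAMESPACE::DB;
using ROCKSDB_NAMESPACE::PerfContext;
using ROCKSDB_NAMESPACE::PerfLevel;
using ROCKSDB_NAMESPACE::WriteOptions;

// Returns the nanoseconds this thread spent blocked inside RocksDB's write
// path while performing a single Put.
uint64_t BlockedNanosForPut(DB* db) {
  // kEnableWait records block/wait time only; mutex and I/O timing stay off.
  ROCKSDB_NAMESPACE::SetPerfLevel(PerfLevel::kEnableWait);
  PerfContext* ctx = ROCKSDB_NAMESPACE::get_perf_context();
  ctx->Reset();

  // May block behind a write-group leader, a write delay, or a write stall.
  ROCKSDB_NAMESPACE::Status s = db->Put(WriteOptions(), "key", "value");
  if (!s.ok()) {
    return 0;  // write failed; nothing meaningful to report
  }

  uint64_t blocked = ctx->write_thread_wait_nanos + ctx->write_delay_time;

  ROCKSDB_NAMESPACE::SetPerfLevel(PerfLevel::kDisable);
  return blocked;
}
```

Because `kEnableWait` sits below `kEnableTimeExceptForMutex`, this measurement avoids timing every mutex and I/O operation while still showing how long the thread was blocked in the write path.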