
Commit 22529db

romange authored and kostasrim committed
chore: Make snapshotting more responsive (#4910)

This should improve the situation around #4787 - maybe not solve it completely, but improve it significantly. On my tests, when doing snapshotting under read traffic with master (memtier_benchmark --ratio 0:1 -d 256 --test-time=400 --distinct-client-seed --key-maximum=2000000 -c 5 -t 2 --pipeline=3), I got a drop from 250K qps to 8K qps during the full sync phase. With this PR, the throughput went up to 70-80K qps.

Signed-off-by: Roman Gershman <roman@dragonflydb.io>
1 parent f147b37 commit 22529db
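
The core of the change, as the diffs below show, is to time-slice the snapshot fiber: it yields once it has run for roughly a jiffy (~15 microseconds) without preempting, and after each buffer flush it sleeps for about half of the CPU time the flush consumed, capped at 2 ms. A minimal standalone sketch of that throttling pattern follows; it is not Dragonfly code (plain threads and std::chrono stand in for helio fibers and the cycle clock), and all names in it (SnapshotLoopSketch, SerializeSomeBuckets, FlushChunkToSink) are illustrative.

#include <algorithm>
#include <chrono>
#include <thread>

using Clock = std::chrono::steady_clock;
using std::chrono::microseconds;

constexpr microseconds kJiffy{15};       // roughly CycleClock::Frequency() >> 16, expressed as time
constexpr microseconds kMaxSleep{2000};  // hard cap on the compensating sleep

// Stubs standing in for the real work done by the snapshot fiber.
bool SerializeSomeBuckets() { return false; }                 // false == nothing left to serialize
microseconds FlushChunkToSink() { return microseconds{0}; }   // CPU time spent flushing this chunk

void SnapshotLoopSketch() {
  auto slice_start = Clock::now();
  for (bool more = true; more;) {
    more = SerializeSomeBuckets();
    microseconds flush_cost = FlushChunkToSink();

    if (flush_cost > microseconds::zero()) {
      // An expensive flush is counter-balanced by sleeping ~half the time it took, capped at 2 ms.
      std::this_thread::sleep_for(std::min(flush_cost / 2, kMaxSleep));
      slice_start = Clock::now();  // sleeping is a scheduling point, so restart the time slice
    } else if (Clock::now() - slice_start > kJiffy) {
      // Nothing was flushed and the loop has been hogging the CPU: give it up voluntarily.
      std::this_thread::yield();
      slice_start = Clock::now();
    }
  }
}

int main() { SnapshotLoopSketch(); }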

File tree

8 files changed, +51 -96 lines changed

src/server/journal/journal.cc

Lines changed: 0 additions & 1 deletion
@@ -14,7 +14,6 @@
 namespace dfly {
 namespace journal {
 
-namespace fs = std::filesystem;
 using namespace std;
 using namespace util;

src/server/journal/journal_slice.cc

Lines changed: 0 additions & 35 deletions
@@ -23,38 +23,11 @@ namespace dfly {
 namespace journal {
 using namespace std;
 using namespace util;
-namespace fs = std::filesystem;
-
-namespace {
-
-/*
-string ShardName(std::string_view base, unsigned index) {
-  return absl::StrCat(base, "-", absl::Dec(index, absl::kZeroPad4), ".log");
-}
-
-uint32_t NextPowerOf2(uint32_t x) {
-  if (x < 2) {
-    return 1;
-  }
-  int log = 32 - __builtin_clz(x - 1);
-  return 1 << log;
-}
-
-*/
-
-}  // namespace
-
-#define CHECK_EC(x)                                                                 \
-  do {                                                                              \
-    auto __ec$ = (x);                                                               \
-    CHECK(!__ec$) << "Error: " << __ec$ << " " << __ec$.message() << " for " << #x; \
-  } while (false)
 
 JournalSlice::JournalSlice() {
 }
 
 JournalSlice::~JournalSlice() {
-  // CHECK(!shard_file_);
 }
 
 void JournalSlice::Init(unsigned index) {

@@ -175,14 +148,6 @@ void JournalSlice::AddLogRecord(const Entry& entry) {
     VLOG(2) << "Writing item [" << item.lsn << "]: " << entry.ToString();
   }
 
-#if 0
-  if (shard_file_) {
-    string line = absl::StrCat(item.lsn, " ", entry.txid, " ", entry.opcode, "\n");
-    error_code ec = shard_file_->Write(io::Buffer(line), file_offset_, 0);
-    CHECK_EC(ec);
-    file_offset_ += line.size();
-  }
-#endif
   CallOnChange(item);
 }

src/server/rdb_save.cc

Lines changed: 2 additions & 2 deletions
@@ -383,7 +383,7 @@ error_code RdbSerializer::SaveListObject(const PrimeValue& pv) {
             << "/" << node->sz;
 
     // Use listpack encoding
-    SaveLen(node->container);
+    RETURN_ON_ERR(SaveLen(node->container));
     if (quicklistNodeIsCompressed(node)) {
      void* data;
      size_t compress_len = quicklistGetLzf(node, &data);

@@ -910,7 +910,7 @@ size_t SerializerBase::SerializedLen() const {
 io::Bytes SerializerBase::PrepareFlush(SerializerBase::FlushState flush_state) {
   size_t sz = mem_buf_.InputLen();
   if (sz == 0)
-    return mem_buf_.InputBuffer();
+    return {};
 
   bool is_last_chunk = flush_state == FlushState::kFlushEndEntry;
   VLOG(2) << "PrepareFlush:" << is_last_chunk << " " << number_of_chunks_;

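The SaveLen fix above routes the call through RETURN_ON_ERR, so a serialization error now propagates out of SaveListObject instead of being silently dropped, and PrepareFlush now returns a default-constructed (empty) byte span when there is nothing to flush. The macro itself is defined elsewhere in the codebase and is not part of this diff; an early-return helper of this kind typically has roughly the following shape (a hedged sketch only, the real RETURN_ON_ERR may differ):

// Sketch of an early-return error-code macro; the actual RETURN_ON_ERR used by
// rdb_save.cc lives elsewhere in the Dragonfly sources and may differ in detail.
// Assumes <system_error> is included.
#define RETURN_ON_ERR(expr)         \
  do {                              \
    std::error_code __ec = (expr);  \
    if (__ec)                       \
      return __ec;                  \
  } while (false)
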
src/server/snapshot.cc

Lines changed: 45 additions & 51 deletions
@@ -7,8 +7,7 @@
 #include <absl/strings/match.h>
 #include <absl/strings/str_cat.h>
 
-#include <mutex>
-
+#include "base/cycle_clock.h"
 #include "base/flags.h"
 #include "base/logging.h"
 #include "core/heap_size.h"

@@ -27,12 +26,13 @@ using namespace std;
 using namespace util;
 using namespace chrono_literals;
 
-using facade::operator""_MB;
 using facade::operator""_KB;
 namespace {
 thread_local absl::flat_hash_set<SliceSnapshot*> tl_slice_snapshots;
 
-constexpr size_t kMinBlobSize = 32_KB;
+// Controls the chunks size for pushing serialized data. The larger the chunk the more CPU
+// it may require (especially with compression), and less responsive the server may be.
+constexpr size_t kMinBlobSize = 8_KB;
 
 }  // namespace
 

@@ -98,7 +98,8 @@ void SliceSnapshot::Start(bool stream_journal, SnapshotFlush allow_flush) {
 
   VLOG(1) << "DbSaver::Start - saving entries with version less than " << snapshot_version_;
 
-  snapshot_fb_ = fb2::Fiber("snapshot", [this, stream_journal] {
+  string fb_name = absl::StrCat("SliceSnapshot-", ProactorBase::me()->GetPoolIndex());
+  snapshot_fb_ = fb2::Fiber(fb_name, [this, stream_journal] {
     this->IterateBucketsFb(stream_journal);
     db_slice_->UnregisterOnChange(snapshot_version_);
     consumer_->Finalize();

@@ -114,7 +115,7 @@ void SliceSnapshot::StartIncremental(LSN start_lsn) {
 
 // Called only for replication use-case.
 void SliceSnapshot::FinalizeJournalStream(bool cancel) {
-  VLOG(1) << "Finalize Snapshot";
+  VLOG(1) << "FinalizeJournalStream";
   DCHECK(db_slice_->shard_owner()->IsMyThread());
   if (!journal_cb_id_) {  // Finalize only once.
     return;

@@ -129,7 +130,8 @@ void SliceSnapshot::FinalizeJournalStream(bool cancel) {
 
   journal->UnregisterOnChange(cb_id);
   if (!cancel) {
-    serializer_->SendJournalOffset(journal->GetLsn());
+    // always succeeds because serializer_ flushes to string.
+    std::ignore = serializer_->SendJournalOffset(journal->GetLsn());
     PushSerialized(true);
   }
 }

@@ -147,27 +149,23 @@ void SliceSnapshot::FinalizeJournalStream(bool cancel) {
 
 // Serializes all the entries with version less than snapshot_version_.
 void SliceSnapshot::IterateBucketsFb(bool send_full_sync_cut) {
-  {
-    auto fiber_name = absl::StrCat("SliceSnapshot-", ProactorBase::me()->GetPoolIndex());
-    ThisFiber::SetName(std::move(fiber_name));
-  }
-
   PrimeTable::Cursor cursor;
   for (DbIndex db_indx = 0; db_indx < db_array_.size(); ++db_indx) {
     stats_.keys_total += db_slice_->DbSize(db_indx);
   }
 
+  const uint64_t kCyclesPerJiffy = base::CycleClock::Frequency() >> 16;  // ~15usec.
+
   for (DbIndex db_indx = 0; db_indx < db_array_.size(); ++db_indx) {
     if (!cntx_->IsRunning())
       return;
 
     if (!db_array_[db_indx])
       continue;
 
-    uint64_t last_yield = 0;
     PrimeTable* pt = &db_array_[db_indx]->prime;
-
     VLOG(1) << "Start traversing " << pt->size() << " items for index " << db_indx;
+
     do {
       if (!cntx_->IsRunning()) {
         return;

@@ -176,17 +174,13 @@ void SliceSnapshot::IterateBucketsFb(bool send_full_sync_cut) {
       PrimeTable::Cursor next = pt->TraverseBuckets(
           cursor, [this, &db_indx](auto it) { return BucketSaveCb(db_indx, it); });
       cursor = next;
-      PushSerialized(false);
-
-      if (stats_.loop_serialized >= last_yield + 100) {
-        DVLOG(2) << "Before sleep " << ThisFiber::GetName();
-        ThisFiber::Yield();
-        DVLOG(2) << "After sleep";
 
-        last_yield = stats_.loop_serialized;
-        // Push in case other fibers (writes commands that pushed previous values)
-        // filled the buffer.
-        PushSerialized(false);
+      // If we do not flush the data, and have not preempted,
+      // we may need to yield to other fibers to avoid grabbing CPU for too long.
+      if (!PushSerialized(false)) {
+        if (ThisFiber::GetRunningTimeCycles() > kCyclesPerJiffy) {
+          ThisFiber::Yield();
+        }
       }
     } while (cursor);
@@ -214,7 +208,7 @@ void SliceSnapshot::SwitchIncrementalFb(LSN lsn) {
 
   // The replica sends the LSN of the next entry is wants to receive.
   while (cntx_->IsRunning() && journal->IsLSNInBuffer(lsn)) {
-    serializer_->WriteJournalEntry(journal->GetEntry(lsn));
+    std::ignore = serializer_->WriteJournalEntry(journal->GetEntry(lsn));
     PushSerialized(false);
     lsn++;
   }

@@ -231,10 +225,8 @@ void SliceSnapshot::SwitchIncrementalFb(LSN lsn) {
 
   // GetLsn() is always the next lsn that we expect to create.
   if (journal->GetLsn() == lsn) {
-    {
-      FiberAtomicGuard fg;
-      serializer_->SendFullSyncCut();
-    }
+    std::ignore = serializer_->SendFullSyncCut();
+
     auto journal_cb = [this](const journal::JournalItem& item, bool await) {
       OnJournalEntry(item, await);
     };

@@ -255,29 +247,22 @@ bool SliceSnapshot::BucketSaveCb(DbIndex db_index, PrimeTable::bucket_iterator i
 
   ++stats_.savecb_calls;
 
-  auto check = [&](uint64_t v) {
-    if (v >= snapshot_version_) {
-      // either has been already serialized or added after snapshotting started.
-      DVLOG(3) << "Skipped " << it.segment_id() << ":" << it.bucket_id() << " at " << v;
-      ++stats_.skipped;
-      return false;
-    }
-    return true;
-  };
-
-  if (!check(it.GetVersion())) {
+  if (it.GetVersion() >= snapshot_version_) {
+    // either has been already serialized or added after snapshotting started.
+    DVLOG(3) << "Skipped " << it.segment_id() << ":" << it.bucket_id() << " at " << it.GetVersion();
+    ++stats_.skipped;
     return false;
   }
 
   db_slice_->FlushChangeToEarlierCallbacks(db_index, DbSlice::Iterator::FromPrime(it),
                                            snapshot_version_);
 
-  auto* blocking_counter = db_slice_->GetLatch();
+  auto* latch = db_slice_->GetLatch();
 
   // Locking this never preempts. We merely just increment the underline counter such that
   // if SerializeBucket preempts, Heartbeat() won't run because the blocking counter is not
   // zero.
-  std::lock_guard blocking_counter_guard(*blocking_counter);
+  std::lock_guard latch_guard(*latch);
 
   stats_.loop_serialized += SerializeBucket(db_index, it);
 

@@ -324,7 +309,8 @@ void SliceSnapshot::SerializeEntry(DbIndex db_indx, const PrimeKey& pk, const Pr
 
 size_t SliceSnapshot::FlushSerialized(SerializerBase::FlushState flush_state) {
   io::StringFile sfile;
-  serializer_->FlushToSink(&sfile, flush_state);
+  error_code ec = serializer_->FlushToSink(&sfile, flush_state);
+  CHECK(!ec);  // always succeeds
 
   size_t serialized = sfile.val.size();
   if (serialized == 0)

@@ -333,6 +319,8 @@ size_t SliceSnapshot::FlushSerialized(SerializerBase::FlushState flush_state) {
   uint64_t id = rec_id_++;
   DVLOG(2) << "Pushing " << id;
 
+  uint64_t running_cycles = ThisFiber::GetRunningTimeCycles();
+
   fb2::NoOpLock lk;
 
   // We create a critical section here that ensures that records are pushed in sequential order.

@@ -351,6 +339,12 @@ size_t SliceSnapshot::FlushSerialized(SerializerBase::FlushState flush_state) {
 
   VLOG(2) << "Pushed with Serialize() " << serialized;
 
+  // FlushToSink can be quite slow for large values or due compression, therefore
+  // we counter-balance CPU over-usage by forcing sleep.
+  // We measure running_cycles before the preemption points, because they reset the counter.
+  uint64_t sleep_usec = (running_cycles * 1000'000 / base::CycleClock::Frequency()) / 2;
+  ThisFiber::SleepFor(chrono::microseconds(std::min(sleep_usec, 2000ul)));
+
   return serialized;
 }
@@ -419,19 +413,19 @@ void SliceSnapshot::OnDbChange(DbIndex db_index, const DbSlice::ChangeReq& req)
 // value. This is guaranteed by the fact that OnJournalEntry runs always after OnDbChange, and
 // no database switch can be performed between those two calls, because they are part of one
 // transaction.
-void SliceSnapshot::OnJournalEntry(const journal::JournalItem& item, bool await) {
-  // To enable journal flushing to sync after non auto journal command is executed we call
-  // TriggerJournalWriteToSink. This call uses the NOOP opcode with await=true. Since there is no
-  // additional journal change to serialize, it simply invokes PushSerialized.
+// allow_flush is controlled by Journal::SetFlushMode
+// (usually it's true unless we are in the middle of a critical section that can not preempt).
+void SliceSnapshot::OnJournalEntry(const journal::JournalItem& item, bool allow_flush) {
   {
-    // We should release the lock after we preempt
-    std::lock_guard guard(big_value_mu_);
+    // We grab the lock in case we are in the middle of serializing a bucket, so it serves as a
+    // barrier here for atomic serialization.
+    std::lock_guard barrier(big_value_mu_);
     if (item.opcode != journal::Op::NOOP) {
-      serializer_->WriteJournalEntry(item.data);
+      std::ignore = serializer_->WriteJournalEntry(item.data);
     }
   }
 
-  if (await) {
+  if (allow_flush) {
     // This is the only place that flushes in streaming mode
     // once the iterate buckets fiber finished.
     PushSerialized(false);

src/server/snapshot.h

Lines changed: 1 addition & 1 deletion
@@ -125,7 +125,7 @@ class SliceSnapshot {
   void OnDbChange(DbIndex db_index, const DbSlice::ChangeReq& req);
 
   // Journal listener
-  void OnJournalEntry(const journal::JournalItem& item, bool allow_await);
+  void OnJournalEntry(const journal::JournalItem& item, bool allow_flush);
 
   // Push serializer's internal buffer.
   // Push regardless of buffer size if force is true.

src/server/tx_base.h

Lines changed: 0 additions & 4 deletions
@@ -227,8 +227,4 @@ void RecordJournal(const OpArgs& op_args, std::string_view cmd, ArgSlice args,
 // Might block the calling fiber unless Journal::SetFlushMode(false) is called.
 void RecordExpiryBlocking(DbIndex dbid, std::string_view key);
 
-// Trigger journal write to sink, no journal record will be added to journal.
-// Must be called from shard thread of journal to sink.
-void TriggerJournalWriteToSink();
-
 }  // namespace dfly

tests/dragonfly/replication_test.py

Lines changed: 2 additions & 1 deletion
@@ -167,6 +167,7 @@ async def check_all_replicas_finished(c_replicas, c_master, timeout=20):
     start = time.time()
     while (time.time() - start) < timeout:
         if not waiting_for:
+            logging.debug("All replicas finished after %s seconds", time.time() - start)
             return
         await asyncio.sleep(0.2)
         m_offset = await c_master.execute_command("DFLY REPLICAOFFSET")

@@ -2715,7 +2716,7 @@ async def test_replication_timeout_on_full_sync_heartbeat_expiry(
 
     await asyncio.sleep(1)  # replica will start resync
 
-    await check_all_replicas_finished([c_replica], c_master)
+    await check_all_replicas_finished([c_replica], c_master, 60)
     await assert_replica_reconnections(replica, 0)

0 commit comments
