From 45509f0f530ad370863876fc1ee95ccf85bfe96d Mon Sep 17 00:00:00 2001 From: Connor Date: Tue, 26 Mar 2024 11:13:55 +0800 Subject: [PATCH] Ensure writes to WAL tail during `FlushWAL(true /* sync */)` will be synced (#10560) (#357) Summary: WAL append and switch can both happen between `FlushWAL(true /* sync */)`'s sync operations and its call to `MarkLogsSynced()`. We permit this since locks need to be released for the sync operations. Such an appended/switched WAL is both inactive and incompletely synced at the time `MarkLogsSynced()` processes it. Prior to this PR, `MarkLogsSynced()` assumed all inactive WALs were fully synced and removed them from consideration for future syncs. That was wrong in the scenario described above and led to the latest append(s) never being synced. This PR changes `MarkLogsSynced()` to only remove inactive WALs from consideration for which all flushed data has been synced. Pull Request resolved: https://github.com/facebook/rocksdb/pull/10560 Test Plan: repro unit test for the scenario described above. Without this PR, it fails on "key2" not found Reviewed By: riversand963 Differential Revision: D38957391 Pulled By: ajkr fbshipit-source-id: da77175eba97ff251a4219b227b3bb2d4843ed26 Co-authored-by: Andrew Kryczka --- HISTORY.md | 2 ++ db/db_impl/db_impl.cc | 18 ++++++++++++----- db/db_write_test.cc | 45 +++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 60 insertions(+), 5 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index d6ec32e290d..8646f5c845f 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -1,6 +1,8 @@ # Rocksdb Change Log ## Unreleased +### Bug Fixes +* Fixed bug where `FlushWAL(true /* sync */)` (used by `GetLiveFilesStorageInfo()`, which is used by checkpoint and backup) could cause parallel writes at the tail of a WAL file to never be synced. ### Bug Fixes diff --git a/db/db_impl/db_impl.cc b/db/db_impl/db_impl.cc index 4c0798531e7..0c0df39b5b0 100644 --- a/db/db_impl/db_impl.cc +++ b/db/db_impl/db_impl.cc @@ -1488,20 +1488,28 @@ void DBImpl::MarkLogsSynced(uint64_t up_to, bool synced_dir, auto& wal = *it; assert(wal.IsSyncing()); - if (logs_.size() > 1) { + if (wal.number < logs_.back().number) { + // Inactive WAL if (immutable_db_options_.track_and_verify_wals_in_manifest && wal.GetPreSyncSize() > 0) { synced_wals->AddWal(wal.number, WalMetadata(wal.GetPreSyncSize())); } - logs_to_free_.push_back(wal.ReleaseWriter()); - it = logs_.erase(it); + if (wal.GetPreSyncSize() == wal.writer->file()->GetFlushedSize()) { + // Fully synced + logs_to_free_.push_back(wal.ReleaseWriter()); + it = logs_.erase(it); + } else { + assert(wal.GetPreSyncSize() < wal.writer->file()->GetFlushedSize()); + wal.FinishSync(); + ++it; + } } else { + assert(wal.number == logs_.back().number); + // Active WAL wal.FinishSync(); ++it; } } - assert(logs_.empty() || logs_[0].number > up_to || - (logs_.size() == 1 && !logs_[0].IsSyncing())); log_sync_cv_.SignalAll(); } diff --git a/db/db_write_test.cc b/db/db_write_test.cc index 21461bdcbf9..c13e8b9b9fd 100644 --- a/db/db_write_test.cc +++ b/db/db_write_test.cc @@ -386,6 +386,51 @@ TEST_P(DBWriteTest, UnflushedPutRaceWithTrackedWalSync) { Close(); } +TEST_P(DBWriteTest, InactiveWalFullySyncedBeforeUntracked) { + // Repro bug where a WAL is appended and switched after + // `FlushWAL(true /* sync */)`'s sync finishes and before it untracks fully + // synced inactive logs. Previously such a WAL would be wrongly untracked + // so the final append would never be synced. + Options options = GetOptions(); + std::unique_ptr fault_env( + new FaultInjectionTestEnv(env_)); + options.env = fault_env.get(); + Reopen(options); + + ASSERT_OK(Put("key1", "val1")); + + SyncPoint::GetInstance()->SetCallBack( + "DBImpl::SyncWAL:BeforeMarkLogsSynced:1", [this](void* /* arg */) { + ASSERT_OK(Put("key2", "val2")); + ASSERT_OK(dbfull()->TEST_SwitchMemtable()); + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + ASSERT_OK(db_->FlushWAL(true /* sync */)); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); + + ASSERT_OK(Put("key3", "val3")); + + ASSERT_OK(db_->FlushWAL(true /* sync */)); + + Close(); + + // Simulate full loss of unsynced data. This should drop nothing since we did + // `FlushWAL(true /* sync */)` before `Close()`. + fault_env->DropUnsyncedFileData(); + + Reopen(options); + + ASSERT_EQ("val1", Get("key1")); + ASSERT_EQ("val2", Get("key2")); + ASSERT_EQ("val3", Get("key3")); + + // Need to close before `fault_env` goes out of scope. + Close(); +} + TEST_P(DBWriteTest, IOErrorOnWALWriteTriggersReadOnlyMode) { std::unique_ptr mock_env( new FaultInjectionTestEnv(env_));