Skip to content

Commit bfa3931

Browse files
authored
WAITAOF: Update fsynced_reploff_pending just before starting the initial AOFRW fork (redis#12620)
If we set `fsynced_reploff_pending` in `startAppendOnly`, and the fork doesn't start immediately (e.g. there's another fork active at the time), any subsequent commands will increment `server.master_repl_offset`, but will not cause an fsync (given that they were executed before the fork started, they just end up in the RDB part of it). Therefore, any WAITAOF will wait on the new master_repl_offset, but it will time out because no fsync will be executed. Release notes: ``` WAITAOF could time out in the absence of write traffic in case a new AOF is created and an AOFRW can't immediately start. This can happen when the appendonly config is changed at runtime, but also after FLUSHALL and after a replica full sync. ```
1 parent f924beb commit bfa3931

File tree

2 files changed

+47
-12
lines changed

2 files changed

+47
-12
lines changed

src/aof.c

Lines changed: 16 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -976,18 +976,6 @@ void stopAppendOnly(void) {
976976
int startAppendOnly(void) {
977977
serverAssert(server.aof_state == AOF_OFF);
978978

979-
/* Wait for all bio jobs related to AOF to drain. This prevents a race
980-
* between updates to `fsynced_reploff_pending` of the worker thread, belonging
981-
* to the previous AOF, and the new one. This concern is specific for a full
982-
* sync scenario where we don't wanna risk the ACKed replication offset
983-
* jumping backwards or forward when switching to a different master. */
984-
bioDrainWorker(BIO_AOF_FSYNC);
985-
986-
/* Set the initial repl_offset, which will be applied to fsynced_reploff
987-
* when AOFRW finishes (after possibly being updated by a bio thread) */
988-
atomicSet(server.fsynced_reploff_pending, server.master_repl_offset);
989-
server.fsynced_reploff = 0;
990-
991979
server.aof_state = AOF_WAIT_REWRITE;
992980
if (hasActiveChildProcess() && server.child_type != CHILD_TYPE_AOF) {
993981
server.aof_rewrite_scheduled = 1;
@@ -2454,7 +2442,23 @@ int rewriteAppendOnlyFileBackground(void) {
24542442
server.aof_lastbgrewrite_status = C_ERR;
24552443
return C_ERR;
24562444
}
2445+
2446+
if (server.aof_state == AOF_WAIT_REWRITE) {
2447+
/* Wait for all bio jobs related to AOF to drain. This prevents a race
2448+
* between updates to `fsynced_reploff_pending` of the worker thread, belonging
2449+
* to the previous AOF, and the new one. This concern is specific for a full
2450+
* sync scenario where we don't wanna risk the ACKed replication offset
2451+
* jumping backwards or forward when switching to a different master. */
2452+
bioDrainWorker(BIO_AOF_FSYNC);
2453+
2454+
/* Set the initial repl_offset, which will be applied to fsynced_reploff
2455+
* when AOFRW finishes (after possibly being updated by a bio thread) */
2456+
atomicSet(server.fsynced_reploff_pending, server.master_repl_offset);
2457+
server.fsynced_reploff = 0;
2458+
}
2459+
24572460
server.stat_aof_rewrites++;
2461+
24582462
if ((childpid = redisFork(CHILD_TYPE_AOF)) == 0) {
24592463
char tmpfile[256];
24602464

tests/unit/wait.tcl

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -140,6 +140,37 @@ tags {"wait aof network external:skip"} {
140140
assert_error {ERR WAITAOF cannot be used when numlocal is set but appendonly is disabled.} {$master waitaof 1 0 0}
141141
}
142142

143+
test {WAITAOF local if AOFRW was postponed} {
144+
r config set appendfsync everysec
145+
146+
# turn off AOF
147+
r config set appendonly no
148+
149+
# create an RDB child that takes a lot of time to run
150+
r set x y
151+
r config set rdb-key-save-delay 100000000 ;# 100 seconds
152+
r bgsave
153+
assert_equal [s rdb_bgsave_in_progress] 1
154+
155+
# turn on AOF
156+
r config set appendonly yes
157+
assert_equal [s aof_rewrite_scheduled] 1
158+
159+
# create a write command (to increment master_repl_offset)
160+
r set x y
161+
162+
# reset save_delay and kill RDB child
163+
r config set rdb-key-save-delay 0
164+
catch {exec kill -9 [get_child_pid 0]}
165+
166+
# wait for AOF (will unblock after AOFRW finishes)
167+
assert_equal [r waitaof 1 0 10000] {1 0}
168+
169+
# make sure AOFRW finished
170+
assert_equal [s aof_rewrite_in_progress] 0
171+
assert_equal [s aof_rewrite_scheduled] 0
172+
}
173+
143174
$master config set appendonly yes
144175
waitForBgrewriteaof $master
145176

0 commit comments

Comments
 (0)