Skip to content

Commit c2a4b78

Browse files
authored
WAITAOF: Update fsynced_reploff_pending even if there's nothing to fsync (redis#12622)
The problem is that WAITAOF could hang if commands were propagated only to replicas. This can happen if a module uses RM_Call with the REDISMODULE_ARGV_NO_AOF flag. In that case, master_repl_offset would increase, but there would be nothing to fsync, so in the absence of other traffic, fsynced_reploff_pending would stay static, and WAITAOF could hang. This commit updates fsynced_reploff_pending to the latest offset in flushAppendOnlyFile when there is nothing to fsync; i.e., if it is behind because of the above-mentioned case, it will be refreshed and release the WAITAOF. Other changes: Fix a race in wait.tcl (client getting blocked vs. the fsync thread).
1 parent bfa3931 commit c2a4b78

File tree

3 files changed

+40
-1
lines changed

3 files changed

+40
-1
lines changed

src/aof.c

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1087,6 +1087,13 @@ void flushAppendOnlyFile(int force) {
10871087
{
10881088
goto try_fsync;
10891089
} else {
1090+
/* All data is fsync'd already: Update fsynced_reploff_pending just in case.
1091+
* This is needed to avoid a WAITAOF hang in case a module used RM_Call with the NO_AOF flag,
1092+
* in which case master_repl_offset will increase but fsynced_reploff_pending won't be updated
1093+
* (because there's no reason, from the AOF POV, to call fsync) and then WAITAOF may wait on
1094+
* the higher offset (which contains data that was only propagated to replicas, and not to AOF) */
1095+
if (!sync_in_progress && server.aof_fsync != AOF_FSYNC_NO)
1096+
atomicSet(server.fsynced_reploff_pending, server.master_repl_offset);
10901097
return;
10911098
}
10921099
}

tests/unit/moduleapi/usercall.tcl

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -133,4 +133,36 @@ start_server {tags {"modules usercall"}} {
133133
assert_equal [dict get $entry reason] {command}
134134
assert_match {*cmd=usercall.call_with_user_flag*} [dict get $entry client-info]
135135
}
136+
137+
start_server {tags {"wait aof network external:skip"}} {
138+
set slave [srv 0 client]
139+
set slave_host [srv 0 host]
140+
set slave_port [srv 0 port]
141+
set slave_pid [srv 0 pid]
142+
set master [srv -1 client]
143+
set master_host [srv -1 host]
144+
set master_port [srv -1 port]
145+
146+
$master config set appendonly yes
147+
$master config set appendfsync everysec
148+
$slave config set appendonly yes
149+
$slave config set appendfsync everysec
150+
151+
test {Setup slave} {
152+
$slave slaveof $master_host $master_port
153+
wait_for_condition 50 100 {
154+
[s 0 master_link_status] eq {up}
155+
} else {
156+
fail "Replication not started."
157+
}
158+
}
159+
160+
test {test module replicate only to replicas and WAITAOF} {
161+
$master set x 1
162+
assert_equal [$master waitaof 1 1 10000] {1 1}
163+
$master usercall.call_with_user_flag A! config set loglevel notice
164+
# Make sure WAITAOF doesn't hang
165+
assert_equal [$master waitaof 1 1 10000] {1 1}
166+
}
167+
}
136168
}

tests/unit/wait.tcl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -121,10 +121,10 @@ tags {"wait aof network external:skip"} {
121121
r config set appendfsync always
122122
$master incr foo
123123
assert_equal [$master waitaof 1 0 0] {1 0}
124-
r config set appendfsync everysec
125124
}
126125

127126
test {WAITAOF local wait and then stop aof} {
127+
r config set appendfsync no
128128
set rd [redis_deferring_client]
129129
$rd incr foo
130130
$rd read

0 commit comments

Comments
 (0)