Skip to content

Commit aa19b65

Browse files
committed
OCPEDGE-2213: fix(podman-etcd): prevent learner from starting before cluster is ready
Clear the stale learner_node attribute during stop and on restart when no active resources exist, ensuring that the learner always waits for peer availability.
1 parent 6cd23a8 commit aa19b65

File tree

1 file changed

+7
-2
lines changed

1 file changed

+7
-2
lines changed

heartbeat/podman-etcd

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1064,7 +1064,7 @@ reconcile_member_state()
10641064

10651065
if [ -n "$learner_member_id" ]; then
10661066
if ! promote_learner_member "$learner_member_id"; then
1067-
return $?
1067+
return $OCF_ERR_GENERIC
10681068
fi
10691069
# promotion succeeded: continue to clear standalone_node and learner_node
10701070
fi
@@ -1258,6 +1258,7 @@ manage_peer_membership()
12581258
set_standalone_node
12591259
else
12601260
ocf_log debug "$name is in the members list by IP: $ip"
1261+
# Errors from reconcile_member_state are logged internally. Ignoring them here prevents stopping a healthy voter agent; critical local failures are caught by detect_cluster_leadership_loss.
12611262
reconcile_member_state "$member_list_json"
12621263
fi
12631264
done
@@ -1369,7 +1370,7 @@ container_health_check()
13691370
# Could not execute monitor check command and state file exists - the container failed, check recovery status in this lifecycle
13701371
local time_since_heartbeat
13711372
time_since_heartbeat=$(get_time_since_last_heartbeat)
1372-
ocf_log err "Container ${CONTAINER} failed (last healthy: ${time_since_heartbeat}s ago)"
1373+
ocf_log err "Container ${CONTAINER} failed (last healthy: ${time_since_heartbeat}s ago, error code: $rc)"
13731374

13741375
# Check if peer has set force_new_cluster for recovery
13751376
local fnc_holders
@@ -1796,6 +1797,9 @@ podman_start()
17961797
fi
17971798
;;
17981799
0)
1800+
# No active resources: clear any stale learner_node attribute from previous failed session
1801+
ocf_log debug "clearing stale learner_node attribute (safe when active_resources_count=0)"
1802+
attribute_learner_node clear
17991803
# count how many agents are starting now
18001804
local start_resources_count
18011805
start_resources_count=$(echo "$OCF_RESKEY_CRM_meta_notify_start_resource" | wc -w)
@@ -2033,6 +2037,7 @@ podman_stop()
20332037
ocf_log err "could not delete container health check state file"
20342038
fi
20352039

2040+
attribute_learner_node clear
20362041
attribute_node_revision update
20372042
attribute_node_cluster_id update
20382043

0 commit comments

Comments
 (0)