OCPBUGS-60098: podman-etcd: prevent last active member from leaving the etcd member list

clobrano · clobrano · commit f049fb89db76 · 2025-11-19T17:40:02.000+01:00
When stopping etcd instances, simultaneous member removal from both
nodes can corrupt the etcd Write-Ahead Log (WAL). This change implements
a two-part solution:

1. Concurrent stop protection: When multiple nodes are stopping, the
   alphabetically second node delays its member removal by 10
   seconds. This prevents simultaneous member list updates that can
   corrupt the WAL.

2. Last member detection: Checks the active resource count after any
   delay. If this is the last active member, skips member removal to
   avoid leaving an empty cluster.

Additionally, reorders podman_stop() to clear the member_id attribute
after leaving the member list, ensuring the attribute reflects actual
cluster state during shutdown.
diff --git a/heartbeat/podman-etcd b/heartbeat/podman-etcd
@@ -1639,7 +1639,7 @@ can_reuse_container() {
 		OCF_RESKEY_reuse=0
 		return "$OCF_SUCCESS"
 	fi
-	
+
 	if ! filtered_original_pod_manifest=$(filter_pod_manifest "$OCF_RESKEY_pod_manifest"); then
 		return $OCF_ERR_GENERIC
 	fi
@@ -2021,6 +2021,62 @@ podman_start()
 	done
 }
 
+# leave_etcd_member_list removes the current node from the etcd member list during
+# shutdown to ensure clean cluster state.
+#
+# Skips removal if this is the standalone (last) node. When both nodes are stopping
+# concurrently, delays the second node to prevent simultaneous member removal that
+# could corrupt the etcd WAL.
+#
+# Best effort: failures are only logged; the exit status carries no meaning.
+leave_etcd_member_list()
+{
+	local member_id stopping_resources_count node_names_sorted delayed_node
+	local active_resources_count endpoint rc=0
+
+	if ! member_id=$(attribute_node_member_id get); then
+		ocf_log err "error leaving members list: could not get member-id"
+		return
+	fi
+
+	if is_standalone; then
+		ocf_log info "last member. Not leaving the member list"
+		return
+	fi
+
+	stopping_resources_count=$(echo "$OCF_RESKEY_CRM_meta_notify_stop_resource" | wc -w)
+	ocf_log info "found '$stopping_resources_count' stopping etcd resources (stop: '$OCF_RESKEY_CRM_meta_notify_stop_resource')"
+	if [ "$stopping_resources_count" -gt 1 ]; then
+		# Prevent WAL corruption by delaying the alphabetically second node's member
+		# removal when both nodes are stopping concurrently.
+		node_names_sorted=$(echo "$OCF_RESKEY_node_ip_map" | sed 's/:[^;]*//g; s/;/ /g' | tr ' ' '\n' | sort | tr '\n' ' ')
+		delayed_node="$(echo "$node_names_sorted" | cut -d' ' -f2)"
+
+		if [ -z "$delayed_node" ]; then
+			ocf_log warn "could not determine node to be delayed: not leaving the member list"
+			return
+		fi
+
+		if [ "$NODENAME" = "$delayed_node" ]; then
+			ocf_log info "delaying stop for ${DELAY_SECOND_NODE_LEAVE_SEC}s to prevent simultaneous etcd member removal"
+			sleep "$DELAY_SECOND_NODE_LEAVE_SEC"
+		fi
+	fi
+
+	# Evaluated only after the optional delay above, right before the removal.
+	active_resources_count=$(get_truly_active_resources_count)
+	if [ "$active_resources_count" -lt 1 ]; then
+		ocf_log info "last member. Not leaving the member list"
+		return
+	fi
+
+	ocf_log info "leaving members list as member with ID $member_id"
+	endpoint="$(ip_url $(attribute_node_ip get)):2379"
+	# Capture the status via '|| rc=$?': inside 'if ! cmd; then' $? is always 0.
+	ocf_run podman exec "$CONTAINER" etcdctl member remove "$member_id" --endpoints="$endpoint" || rc=$?
+	[ "$rc" -eq 0 ] || ocf_log err "error leaving members list, error code: $rc"
+}
+
 podman_stop()
 {
 	local timeout=60
@@ -2042,21 +2098,7 @@ podman_stop()
 		return $OCF_SUCCESS
 	fi
 
-	if ! member_id=$(attribute_node_member_id get); then
-		ocf_log err "error leaving members list: could not get member-id"
-	else
-		# TODO: is it worth/possible to check the current status instead than relying on cached attributes?
-		if is_standalone; then
-			ocf_log info "last member. Not leaving the member list"
-		else
-			ocf_log info "leaving members list as member with ID $member_id"
-			endpoint="$(ip_url $(attribute_node_ip get)):2379"
-			if ! ocf_run podman exec "$CONTAINER" etcdctl member remove "$member_id" --endpoints="$endpoint"; then
-				rc=$?
-				ocf_log err "error leaving members list, error code: $rc"
-			fi
-		fi
-	fi
+	leave_etcd_member_list
 	attribute_node_member_id clear
 
 	if [ -n "$OCF_RESKEY_CRM_meta_timeout" ]; then
@@ -2197,6 +2239,7 @@ ETCD_CERTS_HASH_FILE="${OCF_RESKEY_config_location}/certs.hash"
 # State file location: Uses HA_RSCTMP to ensure automatic cleanup on reboot.
 # This is intentional - reboots are controlled stops, not failures requiring detection.
 CONTAINER_HEARTBEAT_FILE=${HA_RSCTMP}/podman-container-last-running
+DELAY_SECOND_NODE_LEAVE_SEC=10 # seconds the alphabetically second node waits before leaving the etcd member list on concurrent stop
 
 # Note: we currently monitor podman containers by with the "podman exec"
 # command, so make sure that invocation is always valid by enforcing the