33 changes: 19 additions & 14 deletions heartbeat/podman-etcd
@@ -880,7 +880,7 @@ add_member_as_learner()
  local endpoint_url=$(ip_url $(attribute_node_ip get))
  local peer_url=$(ip_url $member_ip)

- ocf_log info "add $member_name ($member_ip, $endpoint_url) to the member list as learner"
+ ocf_log info "add $member_name ($member_ip) to the member list as learner"


Should we be checking if it's already a learner before we add it as one? I see this error sometimes with the debug-start command: one of the nodes is already a learner, so add_member_as_learner fails when it tries to add it again.
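A guard along these lines would make the add idempotent. This is only a sketch: the helper name `member_already_listed` and the inline sample JSON are illustrative, not the agent's actual code; the JSON shape loosely mirrors `etcdctl member list -w json` output.

```shell
#!/bin/sh
# Hypothetical guard: skip "member add" when the peer already appears in
# the member list, so a repeated start (e.g. debug-start) does not fail.
member_already_listed() {
    # $1: member list JSON, $2: member name
    echo "$1" | grep -q "\"name\":\"$2\""
}

# Illustrative sample data standing in for `etcdctl member list -w json`
member_list_json='{"members":[{"name":"node1","isLearner":true},{"name":"node2"}]}'

if member_already_listed "$member_list_json" "node1"; then
    echo "node1 already in member list, skipping add"
else
    echo "adding node1 as learner"
fi
```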

Collaborator Author


Interesting. It shouldn't even try to add a new member if it finds it in the member list.

Collaborator Author

@clobrano clobrano Nov 11, 2025


It calls add_member_as_learner in two cases:

  • in podman_etcd, right after forcing a new cluster, so there can't be any learner yet
  • in monitor, when a peer is missing from the member list.

I never tried with debug-start and force-new-cluster


The issue in debug-start would be the first case, which means it just forced a new cluster. Could be a race condition, since I only see it in CI.

out=$(podman exec "${CONTAINER}" etcdctl --endpoints="$endpoint_url:2379" member add "$member_name" --peer-urls="$peer_url:2380" --learner)
rc=$?
if [ $rc -ne 0 ]; then
@@ -1032,7 +1032,7 @@ promote_learner_member()
  if ! ocf_run podman exec "${CONTAINER}" etcdctl member promote "$learner_member_id_hex" 2>&1; then
      # promotion is expected to fail if the peer is not yet up-to-date
      ocf_log info "could not promote member $learner_member_id_hex, error code: $?"
-     return $OCF_SUCCESS
+     return $OCF_ERR_GENERIC


Why do we treat this as an error now? I know we need to retry this later, but, as the comment says, it is OK for this to fail if we're just not ready yet.

Contributor


You should update the code to react to the rc if we return an error here.

Collaborator Author


> Why do we treat this as an error now?

I made this change to clean up standalone_node and learner_node attributes immediately after promotion. See https://github.com/clobrano/resource-agents/blob/99d36fa651ce8f3aadde818de166955f48a680d5/heartbeat/podman-etcd#L1065-L1079

The problem I wanted to address was that the attributes were not cleaned up immediately, but in the next monitor loop, meaning the member remained in a learner state for an additional 30 seconds (https://github.com/clusterlabs/resource-agents/blob/677e3add17957a59b3b96137ff3d39ca7b99b280/heartbeat/podman-etcd#L1065-L1079).

> You should update the code to react to the rc if we return an error here.

While reconcile_member_state needs to check the return code, so it can skip attribute cleanup on failure, manage_peer_membership (above in the call stack) should ignore it, because failing to promote a member shouldn't stop the agent.
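The intended rc flow can be sketched like this. The function names come from the patch, but the bodies here are stand-ins (the promote stub simulates a learner that is not yet up-to-date); only the return-code plumbing reflects the change under discussion.

```shell
#!/bin/sh
# Sketch of the rc flow: a failed promotion aborts attribute cleanup in
# reconcile_member_state, while manage_peer_membership ignores the rc so
# the agent keeps running.
OCF_SUCCESS=0
OCF_ERR_GENERIC=1

promote_learner_member() {
    # stand-in: simulate the expected failure while the peer catches up
    return "$OCF_ERR_GENERIC"
}

reconcile_member_state() {
    if ! promote_learner_member; then
        # skip attribute cleanup; retried on the next monitor cycle
        return "$OCF_ERR_GENERIC"
    fi
    echo "clearing standalone_node and learner_node"
    return "$OCF_SUCCESS"
}

manage_peer_membership() {
    # a failed promotion must not stop the agent
    reconcile_member_state || true
    return "$OCF_SUCCESS"
}

manage_peer_membership
echo "agent still running, rc=$?"
```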


Gotcha - this distinguishes between cases where we need to clean up the attribute and not. I wonder if that was the case for my debug-start (since it would run multiple times in a row). If it had already forced a new cluster, the learner attribute may have been stale from the previous run.

fi
ocf_log info "successfully promoted member '$learner_member_id_hex'"
return $OCF_SUCCESS
@@ -1063,19 +1063,19 @@ reconcile_member_state()
fi

  if [ -n "$learner_member_id" ]; then
-     promote_learner_member "$learner_member_id"
-     return $?
- fi
-
- if [ -z "$learner_member_id" ]; then
-     if ! clear_standalone_node; then
-         ocf_log error "could not clear standalone_node attribute, error code: $?"
-         return $OCF_ERR_GENERIC
-     fi
-     if ! attribute_learner_node clear; then
-         ocf_log error "could not clear learner_node attribute, error code: $?"
-         return $OCF_ERR_GENERIC
-     fi
- fi
+     if ! promote_learner_member "$learner_member_id"; then
+         return $OCF_ERR_GENERIC
+     fi
+     # promotion succeeded: continue to clear standalone_node and learner_node
+ fi
+
+ if ! clear_standalone_node; then
+     ocf_log error "could not clear standalone_node attribute, error code: $?"
+     return $OCF_ERR_GENERIC
+ fi
+ if ! attribute_learner_node clear; then
+     ocf_log error "could not clear learner_node attribute, error code: $?"
+     return $OCF_ERR_GENERIC
+ fi

return $OCF_SUCCESS
@@ -1258,6 +1258,7 @@ manage_peer_membership()
set_standalone_node
else
ocf_log debug "$name is in the members list by IP: $ip"
+ # Errors from reconcile_member_state are logged internally. Ignoring them here prevents stopping a healthy voter agent; critical local failures are caught by detect_cluster_leadership_loss.
reconcile_member_state "$member_list_json"
fi
done
@@ -1369,7 +1370,7 @@ container_health_check()
# Could not execute monitor check command and state file exists - the container failed, check recovery status in this lifecycle
local time_since_heartbeat
time_since_heartbeat=$(get_time_since_last_heartbeat)
- ocf_log err "Container ${CONTAINER} failed (last healthy: ${time_since_heartbeat}s ago)"
+ ocf_log err "Container ${CONTAINER} failed (last healthy: ${time_since_heartbeat}s ago, error code: $rc)"

# Check if peer has set force_new_cluster for recovery
local fnc_holders
@@ -1796,6 +1797,9 @@ podman_start()
fi
;;
0)
+ # No active resources: clear any stale learner_node attribute from previous failed session
+ ocf_log debug "clearing stale learner_node attribute (safe when active_resources_count=0)"
+ attribute_learner_node clear
# count how many agents are starting now
local start_resources_count
start_resources_count=$(echo "$OCF_RESKEY_CRM_meta_notify_start_resource" | wc -w)
@@ -2033,6 +2037,7 @@ podman_stop()
ocf_log err "could not delete container health check state file"
fi

+ attribute_learner_node clear
attribute_node_revision update
attribute_node_cluster_id update
