Skip to content

Commit ce5eff1

Browse files
committed
OCPBUGS-60098: podman-etcd: avoid leaving member list on last active agent
When stopping an etcd instance, the agent should not leave the member list if it is the last active agent in the cluster. Leaving the member list in this scenario can cause WAL corruption. This change introduces a check for the number of active resources before attempting to leave the member list. If no other active resources are found, the agent logs a message and skips the leave operation. NOTE: the check on `standalone_node` alone might not be enough if both agents stop at roughly the same time, because then neither of them has enough time to set the attribute.
1 parent 677e3ad commit ce5eff1

File tree

1 file changed

+31
-16
lines changed

1 file changed

+31
-16
lines changed

heartbeat/podman-etcd

Lines changed: 31 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -2021,6 +2021,35 @@ podman_start()
20212021
done
20222022
}
20232023

2024+
# Remove this node's etcd member from the cluster member list, unless doing
# so would leave the cluster without any active agent.
#
# The removal is skipped (with a log message) when:
#   - the member ID cannot be read via `attribute_node_member_id get`;
#   - `is_standalone` reports success (this node is the last member);
#   - `get_truly_active_resources_count` reports fewer than 1 active resource.
#
# Globals read: CONTAINER, OCF_RESKEY_CRM_meta_notify_active_resource,
#               OCF_RESKEY_CRM_meta_notify_stop_resource (the latter two are
#               only used for logging).
# Arguments:    none.
# Returns:      the status of the last command executed; a failing
#               `etcdctl member remove` is logged but deliberately not
#               propagated, so the leave stays a best-effort operation.
#
# NOTE(review): the first step reads the member-id attribute, so callers
# presumably must not clear that attribute before invoking this function,
# otherwise the lookup fails and the leave is skipped -- confirm call order.
leave_etcd_member_list()
{
	local member_id
	local active_resources_count
	local endpoint
	local rc

	if ! member_id=$(attribute_node_member_id get); then
		ocf_log err "error leaving members list: could not get member-id"
		return
	fi

	if is_standalone; then
		ocf_log info "last member. Not leaving the member list"
		return
	fi

	active_resources_count=$(get_truly_active_resources_count)
	ocf_log info "found '$active_resources_count' active etcd resources (active: '$OCF_RESKEY_CRM_meta_notify_active_resource', stop: '$OCF_RESKEY_CRM_meta_notify_stop_resource')"
	if [ "$active_resources_count" -lt 1 ]; then
		ocf_log info "No active agents left. Not leaving the member list"
		return
	fi

	ocf_log info "leaving members list as member with ID $member_id"
	endpoint="$(ip_url "$(attribute_node_ip get)"):2379"

	# Capture the real exit status of the command. Reading $? inside the
	# body of `if ! cmd; then` yields the status of the negated pipeline,
	# which is always 0 there, so the logged error code would be useless.
	rc=0
	ocf_run podman exec "$CONTAINER" etcdctl member remove "$member_id" --endpoints="$endpoint" || rc=$?
	if [ "$rc" -ne 0 ]; then
		ocf_log err "error leaving members list, error code: $rc"
	fi
}
2052+
20242053
podman_stop()
20252054
{
20262055
local timeout=60
@@ -2035,29 +2064,15 @@ podman_stop()
20352064

20362065
attribute_node_revision update
20372066
attribute_node_cluster_id update
2067+
attribute_node_member_id clear
20382068

20392069
podman_simple_status
20402070
if [ $? -eq $OCF_NOT_RUNNING ]; then
20412071
ocf_log info "could not leave members list: etcd container not running"
20422072
return $OCF_SUCCESS
20432073
fi
20442074

2045-
if ! member_id=$(attribute_node_member_id get); then
2046-
ocf_log err "error leaving members list: could not get member-id"
2047-
else
2048-
# TODO: is it worth/possible to check the current status instead than relying on cached attributes?
2049-
if is_standalone; then
2050-
ocf_log info "last member. Not leaving the member list"
2051-
else
2052-
ocf_log info "leaving members list as member with ID $member_id"
2053-
endpoint="$(ip_url $(attribute_node_ip get)):2379"
2054-
if ! ocf_run podman exec "$CONTAINER" etcdctl member remove "$member_id" --endpoints="$endpoint"; then
2055-
rc=$?
2056-
ocf_log err "error leaving members list, error code: $rc"
2057-
fi
2058-
fi
2059-
fi
2060-
attribute_node_member_id clear
2075+
leave_etcd_member_list
20612076

20622077
if [ -n "$OCF_RESKEY_CRM_meta_timeout" ]; then
20632078
timeout=$(((OCF_RESKEY_CRM_meta_timeout/1000) -10 ))

0 commit comments

Comments
 (0)