136 changes: 128 additions & 8 deletions heartbeat/podman-etcd
@@ -49,6 +49,7 @@ OCF_RESKEY_reuse_default="0"
OCF_RESKEY_oom_default="-997"
OCF_RESKEY_config_location_default="/var/lib/etcd"
OCF_RESKEY_backup_location_default="/var/lib/etcd"
OCF_RESKEY_max_backup_snapshots_default="3"

: ${OCF_RESKEY_image=${OCF_RESKEY_image_default}}
: ${OCF_RESKEY_pod_manifest=${OCF_RESKEY_pod_manifest_default}}
@@ -61,6 +62,7 @@ OCF_RESKEY_backup_location_default="/var/lib/etcd"
: ${OCF_RESKEY_oom=${OCF_RESKEY_oom_default}}
: ${OCF_RESKEY_config_location=${OCF_RESKEY_config_location_default}}
: ${OCF_RESKEY_backup_location=${OCF_RESKEY_backup_location_default}}
: ${OCF_RESKEY_max_backup_snapshots=${OCF_RESKEY_max_backup_snapshots_default}}


#######################################################################
@@ -275,6 +277,16 @@ The directory where the resource agent stores its backups.
<content type="string" default="${OCF_RESKEY_backup_location_default}"/>
</parameter>

<parameter name="max_backup_snapshots" required="0" unique="0">
<longdesc lang="en">
Maximum number of etcd database snapshots to retain. When a new snapshot is created,
older snapshots will be automatically removed to maintain this limit. This helps
control storage usage while ensuring recent backups are available for recovery.
</longdesc>
<shortdesc lang="en">Maximum number of backup snapshots to retain</shortdesc>
<content type="integer" default="${OCF_RESKEY_max_backup_snapshots_default}"/>
</parameter>

</parameters>
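
For illustration only (resource name and image value are hypothetical, not taken from this PR), the new option is set like any other agent parameter when defining the resource:

# keep at most 5 snapshots; older ones are pruned after each new backup
pcs resource create etcd ocf:heartbeat:podman-etcd \
    image="<etcd-container-image>" \
    max_backup_snapshots=5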

<actions>
@@ -702,20 +714,127 @@ EOF
} >> "$ETCD_CONFIGURATION_FILE"
}

# Archive etcd database with backup and cleanup
#
# Creates a timestamped snapshot of the etcd database and removes old member
# directory to allow the node to rejoin the cluster as a learner.
#
# Error handling strategy:
# - Backup failures (copy, validation) return OCF_SUCCESS to prevent blocking
# cluster recovery. Backups are beneficial but not critical for recovery.
# - Member directory removal failure returns OCF_ERR_GENERIC because this
# operation is critical for cluster rejoin.
#
# NOTE: the agent cannot use etcdctl/etcdutl utilities as there
# is no running etcd server when this backup is performed.
#
# Returns:
# OCF_SUCCESS - Archive completed or backup failed non-critically
# OCF_ERR_GENERIC - Critical failure removing member directory
archive_data_folder()
{
# TODO: use etcd snapshots
local dest_dir_name
local data_dir="/var/lib/etcd/member"
local etcd_db_path="$ETCD_MEMBER_DIR/snap/db"
local backup_dir="$OCF_RESKEY_backup_location"
local timestamp
local backup_file
local backup_size

# Check if the etcd database file exists
if [ ! -f "$etcd_db_path" ]; then
ocf_log warn "etcd database file not found at $etcd_db_path, skipping backup"
return $OCF_SUCCESS
fi

dest_dir_name="members-snapshot-$(date +%Y%M%d%H%M%S)"
if [ ! -d $data_dir ]; then
ocf_log info "no data dir to backup"
# Ensure backup directory exists
Contributor:
Do we want to make sure there is enough disk space before committing to the backup? I worry about users setting a high rotation threshold and filling the disk.

Collaborator Author:
Good point!
I wonder how to estimate the threshold. On the one hand, I can use the other backups (if any, of course) + some guard, but if 1 single new backup can fill up the disk, it is probably too late already 😁. I wonder if we want to use a percentage of the available disk instead 🤔

Contributor (@fonta-rh, Nov 11, 2025):
I'm torn. The simple "will we fill the disk" check is very straightforward, but as you say, might be useless and doesn't give any options to the user.

Adding a config parameter for the threshold would be the more complete option, but it's a lot more code. Just the check function might be something like this (Claude dixit), and you still need to call it, document it, etc.

# Check if filesystem has sufficient space for backup based on usage threshold
#
# Args:
#   $1 - Directory path to check
#
# Returns:
#   0 - Sufficient space available
#   1 - Insufficient space (usage above threshold)
check_backup_disk_space()
{
	local backup_dir="$1"
	local threshold="$OCF_RESKEY_backup_disk_usage_threshold"
	local usage_percent
	local fs_path

	# Validate threshold is a number between 0-100
	if ! echo "$threshold" | grep -q '^[0-9]\{1,3\}$' || [ "$threshold" -gt 100 ]; then
		ocf_log warn "invalid backup_disk_usage_threshold value: $threshold (must be 0-100), using default"
		threshold="$OCF_RESKEY_backup_disk_usage_threshold_default"
	fi

	# Get filesystem usage percentage (without % sign)
	# df output: Filesystem Size Used Avail Use% Mounted
	usage_percent=$(df -P "$backup_dir" 2>/dev/null | awk 'NR==2 {sub(/%/, "", $5); print $5}')

	if [ -z "$usage_percent" ]; then
		ocf_log warn "could not determine disk usage for $backup_dir, skipping space check"
		return 0
	fi

	if [ "$usage_percent" -ge "$threshold" ]; then
		ocf_log warn "backup filesystem usage ${usage_percent}% exceeds threshold ${threshold}%, skipping backup"
		return 1
	fi

	ocf_log debug "backup filesystem usage ${usage_percent}% is below threshold ${threshold}%"
	return 0
}

Collaborator Author:
> I can use the other backups (if any, of course) + some guard

of course I was hallucinating here... we are backing up 1 specific file, I know what size it has 🤦

Collaborator Author:
> The simple "will we fill the disk" check is very straightforward, but as you say, might be useless and doesn't give any options to the user.

Considering that this is going to run on a Kubernetes node that has better ways to detect disk pressure, I don't think we need to complicate this change further. What do you think?

Contributor (@fonta-rh, Nov 11, 2025):
Yeah, what about just making sure we're not the ones filling the disk with something like:

# Check available disk space before creating backup
local available_kb
available_kb=$(df -P "$backup_dir" | awk 'NR==2 {print $4}')
local db_size_kb
db_size_kb=$(($(stat -c %s "$etcd_db_path") / 1024))

if [ "$available_kb" -lt "$((db_size_kb * 2))" ]; then
    ocf_log warn "insufficient disk space in $backup_dir (available: ${available_kb}KB, needed: ~$((db_size_kb * 2))KB), skipping backup"
    return $OCF_SUCCESS
fi

and it's out of our hands!

Collaborator Author:
I came up with similar code 😁 🤖

if [ ! -d "$backup_dir" ]; then
ocf_log info "creating backup directory: $backup_dir"
if ! mkdir -p "$backup_dir"; then
ocf_log err "failed to create backup directory $backup_dir, skipping backup"
return $OCF_SUCCESS
fi
fi

# Generate timestamp and backup filename
timestamp=$(date +%Y%m%d-%H%M%S)
backup_file="$backup_dir/etcd-snapshot-$timestamp.db"

ocf_log info "creating etcd database backup: $backup_file"

# Create the backup by copying the database file
if ! cp "$etcd_db_path" "$backup_file"; then
ocf_log err "failed to create backup file $backup_file, error code: $?"
return $OCF_SUCCESS
fi
ocf_log info "backing up $data_dir under $HA_RSCTMP/$dest_dir_name"
mv "$data_dir" "$HA_RSCTMP/$dest_dir_name"

# Validate the backup file exists and has non-zero size
if [ ! -f "$backup_file" ]; then
ocf_log err "backup validation failed: file $backup_file does not exist"
return $OCF_SUCCESS
fi

backup_size=$(stat -c %s "$backup_file" 2>/dev/null || echo "0")
if [ "$backup_size" -eq 0 ]; then
ocf_log err "backup validation failed: file $backup_file has zero size"
rm -f "$backup_file"
return $OCF_SUCCESS
fi

ocf_log info "backup created successfully: $backup_file (size: $backup_size bytes)"

# Cleanup old backups based on retention policy
cleanup_old_backups "$backup_dir"

# Remove Etcd "member" folder to allow the client to rejoin as learner
ocf_log info "Removing Etcd members directory to allow rejoin as learner"
if ! rm -rf "$ETCD_MEMBER_DIR"; then
ocf_log err "could not remove $ETCD_MEMBER_DIR, error code: $?"
return $OCF_ERR_GENERIC
fi

sync
return $OCF_SUCCESS
}
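
# Sketch, not part of this PR: a possible offline restore from one of these raw
# db copies. Because the snapshot is a plain copy of the member db file (not an
# "etcdctl snapshot save" artifact), etcdutl's integrity hash check has to be
# skipped; the paths below are hypothetical examples.
etcdutl snapshot restore "/var/lib/etcd/etcd-snapshot-20250101-000000.db" \
    --data-dir "/var/lib/etcd-restore" \
    --skip-hash-check=true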

cleanup_old_backups()
{
local backup_dir="$1"
local max_snapshots="$OCF_RESKEY_max_backup_snapshots"
local backup_count
local backups_to_remove
local old_backups

# Validate max_snapshots is a positive integer
if ! echo "$max_snapshots" | grep -q '^[1-9][0-9]*$'; then
ocf_log warn "invalid max_backup_snapshots value: $max_snapshots, skipping cleanup"
Contributor:
We might want to update the error message to give more info about the regex validation. Something like "invalid max_backup_snapshots value: $max_snapshots: it should be a number between 0 and 99. Skipping cleanup"

Collaborator Author:
Actually, I wonder if we want to allow 0 snapshots, to disable the feature 🤔

Contributor:
As long as it's not the default and we document it, I think that's a good option to give, and easy to add at this point :)

Collaborator Author (@clobrano, Nov 11, 2025):
yeah, we must take into account that "configurability" now means we need to change CEO to configure the agent differently 😁
In the future, there might be some real configurability and this option will be useful
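
A minimal sketch of the "allow 0 to disable" idea discussed above (assuming 0 means "skip retention cleanup entirely"; this is not code from the PR):

# accept 0 or any positive integer; 0 disables cleanup
if ! echo "$max_snapshots" | grep -Eq '^(0|[1-9][0-9]*)$'; then
    ocf_log warn "invalid max_backup_snapshots value: $max_snapshots (expected 0 or a positive integer), skipping cleanup"
    return $OCF_SUCCESS
fi
if [ "$max_snapshots" -eq 0 ]; then
    ocf_log info "max_backup_snapshots is 0, backup retention cleanup disabled"
    return $OCF_SUCCESS
fi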

return $OCF_SUCCESS
fi

# Count existing backup files
backup_count=$(find "$backup_dir" -maxdepth 1 -name "etcd-snapshot-*.db" -type f 2>/dev/null | wc -l)

if [ "$backup_count" -le "$max_snapshots" ]; then
ocf_log info "backup count ($backup_count) is within retention limit ($max_snapshots), no cleanup needed"
return $OCF_SUCCESS
fi

# Calculate how many backups to remove
backups_to_remove=$((backup_count - max_snapshots))
ocf_log info "removing $backups_to_remove old backup(s) to maintain retention limit of $max_snapshots"

# Find oldest backups sorted by modification time
old_backups=$(find "$backup_dir" -maxdepth 1 -name "etcd-snapshot-*.db" -type f -printf '%T@ %p\n' 2>/dev/null | \
sort -n | \
head -n "$backups_to_remove" | \
cut -d' ' -f2-)

if [ -n "$old_backups" ]; then
ocf_log info "removing old backups: $old_backups"
if ! echo "$old_backups" | xargs -r rm -f; then
ocf_log warn "failed to remove some old backups, error code: $?"
fi
fi

return $OCF_SUCCESS
}
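
# Sketch, not part of this PR: a standalone illustration of the retention
# pipeline above, using dummy files in a temporary directory (GNU find/touch
# assumed, as in the function). With keep=3 and five snapshots, the two oldest
# files are removed and the three newest remain.
demo_dir=$(mktemp -d)
for day in 01 02 03 04 05; do
    f="$demo_dir/etcd-snapshot-202501${day}-000000.db"
    : > "$f"
    touch -d "2025-01-${day}" "$f"
done
keep=3
count=$(find "$demo_dir" -maxdepth 1 -name "etcd-snapshot-*.db" -type f | wc -l)
if [ "$count" -gt "$keep" ]; then
    find "$demo_dir" -maxdepth 1 -name "etcd-snapshot-*.db" -type f -printf '%T@ %p\n' | \
        sort -n | head -n "$((count - keep))" | cut -d' ' -f2- | xargs -r rm -f
fi
ls "$demo_dir"    # expect only the three newest snapshots
rm -rf "$demo_dir"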

etcd_pod_container_exists() {
@@ -2189,6 +2308,7 @@ CONTAINER=$OCF_RESKEY_name
POD_MANIFEST_COPY="${OCF_RESKEY_config_location}/pod.yaml"
ETCD_CONFIGURATION_FILE="${OCF_RESKEY_config_location}/config.yaml"
ETCD_BACKUP_FILE="${OCF_RESKEY_backup_location}/config-previous.tar.gz"
ETCD_MEMBER_DIR="/var/lib/etcd/member"
ETCD_REVISION_JSON="/var/lib/etcd/revision.json"
ETCD_REVISION_BUMP_PERCENTAGE=0.2
ETCD_BUMP_REV_DEFAULT=1000000000