Skip to content

Commit

Permalink
Backup and restore (#184)
Browse files Browse the repository at this point in the history
* Enable backup action and list backups.

Enabled unit tests for list and displaying backups

* Updated restore test

* Update timeouts and attempts for restore is pbm is
resyncing

* Address review comments

* Address review comments

* Fix lint errors

* Fix

* Update lib/charms/mongodb/v0/mongodb_backups.py

Inform a user about restore in-progress

Co-authored-by: Mykola Marzhan <303592+delgod@users.noreply.github.com>

---------

Co-authored-by: Mykola Marzhan <303592+delgod@users.noreply.github.com>
  • Loading branch information
dmitry-ratushnyy and delgod authored Aug 15, 2023
1 parent c3eb59d commit 5f3c059
Show file tree
Hide file tree
Showing 9 changed files with 404 additions and 156 deletions.
166 changes: 161 additions & 5 deletions lib/charms/mongodb/v0/mongodb_backups.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,12 @@
This user is named "backup".
"""

import json
import logging
import re
import subprocess
import time
from typing import Dict
from typing import Dict, List

from charms.data_platform_libs.v0.s3 import CredentialsChangedEvent, S3Requirer
from charms.mongodb.v0.helpers import (
Expand Down Expand Up @@ -63,6 +65,8 @@
REMAPPING_PATTERN = r"\ABackup doesn't match current cluster topology - it has different replica set names. Extra shards in the backup will cause this, for a simple example. The extra/unknown replica set names found in the backup are: ([^,\s]+)([.] Backup has no data for the config server or sole replicaset)?\Z"
PBM_STATUS_CMD = ["status", "-o", "json"]
MONGODB_SNAP_DATA_DIR = "/var/snap/charmed-mongodb/current"
RESTORE_MAX_ATTEMPTS = 5
RESTORE_ATTEMPT_COOLDOWN = 15


class ResyncError(Exception):
Expand All @@ -77,6 +81,22 @@ class PBMBusyError(Exception):
"""Raised when PBM is busy and cannot run another operation."""


class RestoreError(Exception):
"""Raised when backup operation is failed."""


def _restore_retry_before_sleep(retry_state) -> None:
logger.error(
f"Attempt {retry_state.attempt_number} failed. {RESTORE_MAX_ATTEMPTS - retry_state.attempt_number} attempts left. Retrying after {RESTORE_ATTEMPT_COOLDOWN} seconds."
),


def _restore_stop_condition(retry_state) -> bool:
if isinstance(retry_state.outcome.exception(), RestoreError):
return True
return retry_state.attempt_number >= RESTORE_MAX_ATTEMPTS


class MongoDBBackups(Object):
"""Manages MongoDB backups."""

Expand Down Expand Up @@ -143,7 +163,13 @@ def _on_create_backup_action(self, event) -> None:
event.fail(f"Cannot create backup {pbm_status.message}.")
return

# TODO create backup
try:
self.charm.run_pbm_command(["backup"])
event.set_results({"backup-status": "backup started"})
self.charm.unit.status = MaintenanceStatus("backup started/running")
except (subprocess.CalledProcessError, ExecError, Exception) as e:
event.fail(f"Failed to backup MongoDB with error: {str(e)}")
return

def _on_list_backups_action(self, event) -> None:
if self.model.get_relation(S3_RELATION) is None:
Expand All @@ -164,7 +190,12 @@ def _on_list_backups_action(self, event) -> None:
event.fail(f"Cannot list backups: {pbm_status.message}.")
return

# TODO list backups
try:
formatted_list = self._generate_backup_list_output()
event.set_results({"backups": formatted_list})
except (subprocess.CalledProcessError, ExecError) as e:
event.fail(f"Failed to list MongoDB backups with error: {str(e)}")
return

def _on_restore_action(self, event) -> None:
if self.model.get_relation(S3_RELATION) is None:
Expand All @@ -185,7 +216,6 @@ def _on_restore_action(self, event) -> None:
# cannot restore backup if pbm is not ready. This could be due to: resyncing, incompatible,
# options, incorrect credentials, creating a backup, or already performing a restore.
pbm_status = self._get_pbm_status()
# TOD check status
self.charm.unit.status = pbm_status
if isinstance(pbm_status, MaintenanceStatus):
event.fail("Please wait for current backup/restore to finish.")
Expand All @@ -198,7 +228,15 @@ def _on_restore_action(self, event) -> None:
event.fail(f"Cannot restore backup {pbm_status.message}.")
return

# TODO restore backup
# sometimes when we are trying to restore pmb can be resyncing, so we need to retry
try:
self._try_to_restore(backup_id)
event.set_results({"restore-status": "restore started"})
self.charm.unit.status = MaintenanceStatus("restore started/running")
except ResyncError:
raise
except RestoreError as restore_error:
event.fail(str(restore_error))

# BEGIN: helper functions

Expand Down Expand Up @@ -357,3 +395,121 @@ def _get_pbm_status(self) -> StatusBase:
# necessary to parse the output
logger.error(f"Failed to get pbm status: {e}")
return BlockedStatus("PBM error")

def _generate_backup_list_output(self) -> str:
"""Generates a list of backups in a formatted table.
List contains successful, failed, and in progress backups in order of ascending time.
Raises ExecError if pbm command fails.
"""
backup_list = []
pbm_status = self.charm.run_pbm_command(["status", "--out=json"])
# processes finished and failed backups
pbm_status = json.loads(pbm_status)
backups = pbm_status["backups"]["snapshot"] or []
for backup in backups:
backup_status = "finished"
if backup["status"] == "error":
# backups from a different cluster have an error status, but they should show as
# finished
if self._backup_from_different_cluster(backup.get("error", "")):
backup_status = "finished"
else:
# display reason for failure if available
backup_status = "failed: " + backup.get("error", "N/A")
if backup["status"] not in ["error", "done"]:
backup_status = "in progress"
backup_list.append((backup["name"], backup["type"], backup_status))

# process in progress backups
running_backup = pbm_status["running"]
if running_backup.get("type", None) == "backup":
# backups are sorted in reverse order
last_reported_backup = backup_list[0]
# pbm will occasionally report backups that are currently running as failed, so it is
# necessary to correct the backup list in this case.
if last_reported_backup[0] == running_backup["name"]:
backup_list[0] = (last_reported_backup[0], last_reported_backup[1], "in progress")
else:
backup_list.append((running_backup["name"], "logical", "in progress"))

# sort by time and return formatted output
return self._format_backup_list(sorted(backup_list, key=lambda pair: pair[0]))

def _format_backup_list(self, backup_list: List[str]) -> str:
"""Formats provided list of backups as a table."""
backups = ["{:<21s} | {:<12s} | {:s}".format("backup-id", "backup-type", "backup-status")]

backups.append("-" * len(backups[0]))
for backup_id, backup_type, backup_status in backup_list:
backups.append(
"{:<21s} | {:<12s} | {:s}".format(backup_id, backup_type, backup_status)
)

return "\n".join(backups)

def _backup_from_different_cluster(self, backup_status: str) -> bool:
"""Returns if a given backup was made on a different cluster."""
return re.search(REMAPPING_PATTERN, backup_status) is not None

def _try_to_restore(self, backup_id: str) -> None:
for attempt in Retrying(
stop=_restore_stop_condition,
wait=wait_fixed(RESTORE_ATTEMPT_COOLDOWN),
reraise=True,
before_sleep=_restore_retry_before_sleep,
):
with attempt:
try:
remapping_args = self._remap_replicaset(backup_id)
self.charm.run_pbm_restore_command(backup_id, remapping_args)
except (subprocess.CalledProcessError, ExecError) as e:
if type(e) == subprocess.CalledProcessError:
error_message = e.output.decode("utf-8")
else:
error_message = str(e.stderr)
fail_message = f"Failed to restore MongoDB with error: {str(e)}"

if "Resync" in error_message:
raise ResyncError

if f"backup '{backup_id}' not found" in error_message:
fail_message = f"Backup id: {backup_id} does not exist in list of backups, please check list-backups for the available backup_ids."

raise RestoreError(fail_message)

def _remap_replicaset(self, backup_id: str) -> str:
"""Returns options for remapping a replica set during a cluster migration restore.
Args:
backup_id: str of the backup to check for remapping
Raises: CalledProcessError
"""
pbm_status = self.charm.run_pbm_command(PBM_STATUS_CMD)
pbm_status = json.loads(pbm_status)

# grab the error status from the backup if present
backups = pbm_status["backups"]["snapshot"] or []
backup_status = ""
for backup in backups:
if not backup_id == backup["name"]:
continue

backup_status = backup.get("error", "")
break

if not self._backup_from_different_cluster(backup_status):
return ""

# TODO in the future when we support conf servers and shards this will need to be more
# comprehensive.
old_cluster_name = re.search(REMAPPING_PATTERN, backup_status).group(1)
current_cluster_name = self.charm.app.name
logger.debug(
"Replica set remapping is necessary for restore, old cluster name: %s ; new cluster name: %s",
old_cluster_name,
current_cluster_name,
)
return f"--replset-remapping {current_cluster_name}={old_cluster_name}"
16 changes: 15 additions & 1 deletion src/charm.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@
ActiveStatus,
BlockedStatus,
Container,
MaintenanceStatus,
Relation,
RelationDataContent,
SecretNotFoundError,
Expand Down Expand Up @@ -507,6 +508,12 @@ def _on_get_password(self, event: ActionEvent) -> None:

def _on_set_password(self, event: ActionEvent) -> None:
"""Set the password for the specified user."""
# changing the backup password while a backup/restore is in progress can be disastrous
pbm_status = self.backups._get_pbm_status()
if isinstance(pbm_status, MaintenanceStatus):
event.fail("Cannot change password while a backup/restore is in progress.")
return

# only leader can write the new password into peer relation.
if not self.unit.is_leader():
event.fail("The action can be run only on leader unit.")
Expand Down Expand Up @@ -685,7 +692,7 @@ def _get_mongodb_config_for_user(
password = self.get_secret(APP_SCOPE, user.get_password_key_name())
if not password:
raise MissingSecretError(
"Password for {APP_SCOPE}, {user.get_username()} couldn't be retrieved"
f"Password for '{APP_SCOPE}', '{user.get_username()}' couldn't be retrieved"
)
else:
return MongoDBConfiguration(
Expand Down Expand Up @@ -1127,6 +1134,13 @@ def run_pbm_command(self, cmd: List[str]) -> str:
stdout, _ = process.wait_output()
return stdout

def run_pbm_restore_command(self, backup_id: str, remapping_args: str) -> str:
"""Executes a restore command in the workload container."""
restore_cmd = ["restore", backup_id]
if remapping_args:
restore_cmd = restore_cmd + remapping_args.split(" ")
return self.run_pbm_command(restore_cmd)

def set_pbm_config_file(self) -> None:
"""Sets the pbm config file."""
container = self.unit.get_container(Config.CONTAINER_NAME)
Expand Down
Loading

0 comments on commit 5f3c059

Please sign in to comment.