[DPE-1918] Improve logs for backup (#189)
* Improve logs for backup when backup-related actions are successful

* Added retries for backup and improved logs for backup/restore results

* Added doc strings
dmitry-ratushnyy authored Aug 23, 2023
1 parent 7add0ff commit f06a956
Showing 4 changed files with 163 additions and 32 deletions.
22 changes: 17 additions & 5 deletions lib/charms/mongodb/v0/helpers.py
@@ -1,9 +1,9 @@
"""Simple functions, which can be used in both K8s and VM charms."""
# Copyright 2023 Canonical Ltd.
# See LICENSE file for licensing details.

import logging
import os
import re
import secrets
import string
import subprocess
@@ -236,6 +236,9 @@ def current_pbm_op(pbm_status: str) -> str:

def process_pbm_status(pbm_status: str) -> StatusBase:
"""Parses current pbm operation and returns unit status."""
if type(pbm_status) == bytes:
pbm_status = pbm_status.decode("utf-8")

# pbm is running resync operation
if "Resync" in current_pbm_op(pbm_status):
return WaitingStatus("waiting to sync s3 configurations.")
@@ -244,10 +247,19 @@ def process_pbm_status(pbm_status: str) -> StatusBase:
if "(none)" in current_pbm_op(pbm_status):
return ActiveStatus("")

if "Snapshot backup" in current_pbm_op(pbm_status):
return MaintenanceStatus("backup started/running")
# Example of backup id: 2023-08-21T13:08:22Z
backup_match = re.search(
r'Snapshot backup "(?P<backup_id>\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z)"', pbm_status
)
if backup_match:
backup_id = backup_match.group("backup_id")
return MaintenanceStatus(f"backup started/running, backup id:'{backup_id}'")

if "Snapshot restore" in current_pbm_op(pbm_status):
return MaintenanceStatus("restore started/running")
restore_match = re.search(
r'Snapshot restore "(?P<backup_id>\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z)"', pbm_status
)
if restore_match:
backup_id = restore_match.group("backup_id")
return MaintenanceStatus(f"restore started/running, backup id:'{backup_id}'")

return ActiveStatus()
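
For illustration only (not part of this commit): a minimal sketch of how the pattern above pulls the backup id out of pbm's status text; the sample status string is assumed here and mirrors the one used in the unit tests further down.

import re

pbm_status = 'Currently running:\n====\nSnapshot backup "2023-08-21T13:08:22Z"'
backup_match = re.search(
    r'Snapshot backup "(?P<backup_id>\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z)"', pbm_status
)
# Prints the captured id, or "N/A" when no backup is currently running.
print(backup_match.group("backup_id") if backup_match else "N/A")  # 2023-08-21T13:08:22Z
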
140 changes: 119 additions & 21 deletions lib/charms/mongodb/v0/mongodb_backups.py
@@ -65,8 +65,8 @@
REMAPPING_PATTERN = r"\ABackup doesn't match current cluster topology - it has different replica set names. Extra shards in the backup will cause this, for a simple example. The extra/unknown replica set names found in the backup are: ([^,\s]+)([.] Backup has no data for the config server or sole replicaset)?\Z"
PBM_STATUS_CMD = ["status", "-o", "json"]
MONGODB_SNAP_DATA_DIR = "/var/snap/charmed-mongodb/current"
RESTORE_MAX_ATTEMPTS = 5
RESTORE_ATTEMPT_COOLDOWN = 15
BACKUP_RESTORE_MAX_ATTEMPTS = 5
BACKUP_RESTORE_ATTEMPT_COOLDOWN = 15


class ResyncError(Exception):
@@ -85,16 +85,26 @@ class RestoreError(Exception):
"""Raised when backup operation is failed."""


def _restore_retry_before_sleep(retry_state) -> None:
class BackupError(Exception):
"""Raised when backup operation is failed."""


def _backup_restore_retry_before_sleep(retry_state) -> None:
logger.error(
f"Attempt {retry_state.attempt_number} failed. {RESTORE_MAX_ATTEMPTS - retry_state.attempt_number} attempts left. Retrying after {RESTORE_ATTEMPT_COOLDOWN} seconds."
f"Attempt {retry_state.attempt_number} failed. {BACKUP_RESTORE_MAX_ATTEMPTS - retry_state.attempt_number} attempts left. Retrying after {BACKUP_RESTORE_ATTEMPT_COOLDOWN} seconds."
)


def _restore_stop_condition(retry_state) -> bool:
def _backup_retry_stop_condition(retry_state) -> bool:
if isinstance(retry_state.outcome.exception(), BackupError):
return True
return retry_state.attempt_number >= BACKUP_RESTORE_MAX_ATTEMPTS


def _restore_retry_stop_condition(retry_state) -> bool:
if isinstance(retry_state.outcome.exception(), RestoreError):
return True
return retry_state.attempt_number >= RESTORE_MAX_ATTEMPTS
return retry_state.attempt_number >= BACKUP_RESTORE_MAX_ATTEMPTS
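
For illustration only (not part of this diff): a minimal sketch of how the stop conditions above plug into tenacity's Retrying loop (the same pattern _try_to_backup and _try_to_restore use below), assuming the module's existing Retrying/wait_fixed imports from tenacity. A ResyncError keeps retrying until BACKUP_RESTORE_MAX_ATTEMPTS is reached, while a BackupError makes the stop condition return True immediately. The _flaky_backup helper is hypothetical.

def _flaky_backup():  # hypothetical helper, illustration only
    raise ResyncError  # retried; raising BackupError instead would stop after one attempt

try:
    for attempt in Retrying(
        stop=_backup_retry_stop_condition,
        wait=wait_fixed(BACKUP_RESTORE_ATTEMPT_COOLDOWN),
        reraise=True,
        before_sleep=_backup_restore_retry_before_sleep,
    ):
        with attempt:
            _flaky_backup()
except ResyncError:
    pass  # reraise=True surfaces the last exception once attempts are exhausted
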


class MongoDBBackups(Object):
@@ -181,10 +191,13 @@ def _on_create_backup_action(self, event) -> None:
return

try:
event.set_results({"backup-status": "backup started"})
self.charm.unit.status = MaintenanceStatus("backup started/running")
self.charm.run_pbm_command(["backup"])
logger.info("Backup succeeded.")
backup_id = self._try_to_backup()
self.charm.unit.status = MaintenanceStatus(
f"backup started/running, backup id:'{backup_id}'"
)
self._success_action_with_info_log(
event, action, {"backup-status": f"backup started. backup id: {backup_id}"}
)
except (subprocess.CalledProcessError, ExecError, Exception) as e:
self._fail_action_with_error_log(event, action, str(e))
return
@@ -218,7 +231,7 @@ def _on_list_backups_action(self, event) -> None:

try:
formatted_list = self._generate_backup_list_output()
event.set_results({"backups": formatted_list})
self._success_action_with_info_log(event, action, {"backups": formatted_list})
except (subprocess.CalledProcessError, ExecError) as e:
self._fail_action_with_error_log(event, action, str(e))
return
@@ -272,10 +285,13 @@ def _on_restore_action(self, event) -> None:

# sometimes when we are trying to restore, pbm can be resyncing, so we need to retry
try:
event.set_results({"restore-status": "restore started"})
self.charm.unit.status = MaintenanceStatus("restore started/running")
self._try_to_restore(backup_id)
logger.info("Restore succeeded.")
self.charm.unit.status = MaintenanceStatus(
f"restore started/running, backup id:'{backup_id}'"
)
self._success_action_with_info_log(
event, action, {"restore-status": "restore started"}
)
except ResyncError:
raise
except RestoreError as restore_error:
@@ -426,11 +442,12 @@ def _get_pbm_status(self) -> StatusBase:
return WaitingStatus("waiting for pbm to start")

try:
# TODO VM charm should implement this method
previous_pbm_status = self.charm.unit.status
pbm_status = self.charm.run_pbm_command(PBM_STATUS_CMD)
self._log_backup_restore_result(pbm_status, previous_pbm_status)
return process_pbm_status(pbm_status)
except ExecError as e:
logger.error("Failed to get pbm status.")
logger.error(f"Failed to get pbm status. {e}")
return BlockedStatus(process_pbm_error(e.stdout))
except subprocess.CalledProcessError as e:
# pbm pipes a return code of 1, but its output shows the true error code so it is
@@ -500,11 +517,19 @@ def _backup_from_different_cluster(self, backup_status: str) -> bool:
return re.search(REMAPPING_PATTERN, backup_status) is not None

def _try_to_restore(self, backup_id: str) -> None:
"""Try to restore cluster a backup specified by backup id.
If PBM is resyncing, the function will retry to create backup
(up to BACKUP_RESTORE_MAX_ATTEMPTS times) with BACKUP_RESTORE_ATTEMPT_COOLDOWN
time between attepts.
If PMB returen any other error, the function will raise RestoreError.
"""
for attempt in Retrying(
stop=_restore_stop_condition,
wait=wait_fixed(RESTORE_ATTEMPT_COOLDOWN),
stop=_restore_retry_stop_condition,
wait=wait_fixed(BACKUP_RESTORE_ATTEMPT_COOLDOWN),
reraise=True,
before_sleep=_restore_retry_before_sleep,
before_sleep=_backup_restore_retry_before_sleep,
):
with attempt:
try:
@@ -515,16 +540,50 @@ def _try_to_restore(self, backup_id: str) -> None:
error_message = e.output.decode("utf-8")
else:
error_message = str(e.stderr)
fail_message = f"Restore failed: {str(e)}"

if "Resync" in error_message:
raise ResyncError

fail_message = f"Restore failed: {str(e)}"
if f"backup '{backup_id}' not found" in error_message:
fail_message = f"Restore failed: Backup id '{backup_id}' does not exist in list of backups, please check list-backups for the available backup_ids."

raise RestoreError(fail_message)

def _try_to_backup(self):
"""Try to create a backup and return the backup id.
If PBM is resyncing, the function will retry to create the backup
(up to BACKUP_RESTORE_MAX_ATTEMPTS times)
with BACKUP_RESTORE_ATTEMPT_COOLDOWN seconds between attempts.
If PBM returns any other error, the function will raise BackupError.
"""
for attempt in Retrying(
stop=_backup_retry_stop_condition,
wait=wait_fixed(BACKUP_RESTORE_ATTEMPT_COOLDOWN),
reraise=True,
before_sleep=_backup_restore_retry_before_sleep,
):
with attempt:
try:
output = self.charm.run_pbm_command(["backup"])
backup_id_match = re.search(
r"Starting backup '(?P<backup_id>\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z)'",
output,
)
return backup_id_match.group("backup_id") if backup_id_match else "N/A"
except (subprocess.CalledProcessError, ExecError) as e:
if type(e) == subprocess.CalledProcessError:
error_message = e.output.decode("utf-8")
else:
error_message = str(e.stderr)
if "Resync" in error_message:
raise ResyncError

fail_message = f"Backup failed: {str(e)}"

raise BackupError(fail_message)

def _remap_replicaset(self, backup_id: str) -> str:
"""Returns options for remapping a replica set during a cluster migration restore.
@@ -567,3 +626,42 @@ def _fail_action_with_error_log(self, event, action: str, message: str) -> None:
def _defer_action_with_info_log(self, event, action: str, message: str) -> None:
logger.info("Deferring %s: %s", action, message)
event.defer()

def _success_action_with_info_log(self, event, action: str, results: Dict[str, str]) -> None:
logger.info("%s completed successfully", action.capitalize())
event.set_results(results)

def _log_backup_restore_result(self, current_pbm_status, previous_pbm_status) -> None:
"""Logs the result of the backup/restore operation.
Expected to be called only for operations that did not fail.
"""
operation_result = self._get_backup_restore_operation_result(
current_pbm_status, previous_pbm_status
)
logger.info(operation_result)

def _get_backup_restore_operation_result(self, current_pbm_status, previous_pbm_status) -> str:
"""Returns a string with the result of the backup/restore operation.
The function is expected to be called only for operations that did not fail.
The operation is taken from the previous status of the unit, which is expected
to contain the operation type (backup/restore) and the backup id.
"""
if (
type(current_pbm_status) == type(previous_pbm_status)
and current_pbm_status.message == previous_pbm_status.message
):
return f"Operation is still in progress: '{current_pbm_status.message}'"

if (
type(previous_pbm_status) == MaintenanceStatus
and "backup id:" in previous_pbm_status.message
):
backup_id = previous_pbm_status.message.split("backup id:")[-1].strip()
if "restore" in previous_pbm_status.message:
return f"Restore from backup {backup_id} completed successfully"
if "backup" in previous_pbm_status.message:
return f"Backup {backup_id} completed successfully"

return "Unknown operation result"
4 changes: 2 additions & 2 deletions tests/integration/backup_tests/test_backups.py
@@ -181,7 +181,7 @@ async def test_create_and_list_backups(ops_test: OpsTest) -> None:
# verify backup is started
action = await db_unit.run_action(action_name="create-backup")
backup_result = await action.wait()
assert backup_result.results["backup-status"] == "backup started", "backup didn't start"
assert "backup started" in backup_result.results["backup-status"], "backup didn't start"

# verify backup is present in the list of backups
# the action `create-backup` only confirms that the command was sent to the `pbm`. Creating a
@@ -475,7 +475,7 @@ async def test_update_backup_password(ops_test: OpsTest) -> None:
# verify we still have connection to pbm via creating a backup
action = await leader_unit.run_action(action_name="create-backup")
backup_result = await action.wait()
assert backup_result.results["backup-status"] == "backup started", "backup didn't start"
assert "backup started" in backup_result.results["backup-status"], "backup didn't start"


# TODO remove this workaround once issue with juju secrets is fixed
29 changes: 25 additions & 4 deletions tests/unit/test_mongodb_backups.py
@@ -565,7 +565,9 @@ def test_restore_running_backup(self, pbm_command, service):
action_event = mock.Mock()
action_event.params = {"backup-id": "back-me-up"}
service.return_value = "pbm"
pbm_command.return_value = "Currently running:\n====\nSnapshot backup"
pbm_command.return_value = (
'Currently running:\n====\nSnapshot backup "2023-08-21T13:08:22Z"'
)
self.harness.add_relation(RELATION_NAME, "s3-integrator")
self.harness.charm.backups._on_restore_action(action_event)

@@ -660,7 +662,9 @@ def test_remap_replicaset_remap_necessary(self, run_pbm_command):
def test_get_pbm_status_backup(self, run_pbm_command, service):
"""Tests that when pbm running a backup that pbm is in maintenance state."""
service.return_value = "pbm"
run_pbm_command.return_value = "Currently running:\n====\nSnapshot backup"
run_pbm_command.return_value = (
'Currently running:\n====\nSnapshot backup "2023-08-21T13:08:22Z"'
)
self.assertTrue(
isinstance(self.harness.charm.backups._get_pbm_status(), MaintenanceStatus)
)
@@ -694,11 +698,12 @@ def test_backup_running_backup(self, run_pbm_command, service):
action_event = mock.Mock()
action_event.params = {}
service.return_value = "pbm"
run_pbm_command.return_value = "Currently running:\n====\nSnapshot backup"
run_pbm_command.return_value = (
'Currently running:\n====\nSnapshot backup "2023-08-21T13:08:22Z"'
)

self.harness.add_relation(RELATION_NAME, "s3-integrator")
self.harness.charm.backups._on_create_backup_action(action_event)

action_event.fail.assert_called()

@patch("charm.MongoDBCharm.get_backup_service")
@@ -720,3 +725,19 @@ def test_backup_wrong_cred(self, run_pbm_command, service):
self.harness.add_relation(RELATION_NAME, "s3-integrator")
self.harness.charm.backups._on_create_backup_action(action_event)
action_event.fail.assert_called()

def test_get_backup_restore_operation_result(self):
backup_id = "2023-08-21T13:08:22Z"
current_pbm_status = ActiveStatus("")
previous_pbm_status = MaintenanceStatus(f"backup started/running, backup id:'{backup_id}'")
operation_result = self.harness.charm.backups._get_backup_restore_operation_result(
current_pbm_status, previous_pbm_status
)
assert operation_result == f"Backup '{backup_id}' completed successfully"
previous_pbm_status = MaintenanceStatus(
f"restore started/running, backup id:'{backup_id}'"
)
operation_result = self.harness.charm.backups._get_backup_restore_operation_result(
current_pbm_status, previous_pbm_status
)
assert operation_result == f"Restore from backup '{backup_id}' completed successfully"
