From f06a956a0aa9f9af5a40e6f2f0eea363fa9cd6fc Mon Sep 17 00:00:00 2001
From: Dmitry Ratushnyy <132273757+dmitry-ratushnyy@users.noreply.github.com>
Date: Wed, 23 Aug 2023 14:27:19 +0200
Subject: [PATCH] [DPE-1918] Improve logs for backup (#189)

* Improve logs for backup when backup related actions are successful

* Added retries for backup and improved logs for backup/restore results

* Added doc strings
---
 lib/charms/mongodb/v0/helpers.py              |  22 ++-
 lib/charms/mongodb/v0/mongodb_backups.py      | 140 +++++++++++++++---
 .../integration/backup_tests/test_backups.py  |   4 +-
 tests/unit/test_mongodb_backups.py            |  29 +++-
 4 files changed, 163 insertions(+), 32 deletions(-)

diff --git a/lib/charms/mongodb/v0/helpers.py b/lib/charms/mongodb/v0/helpers.py
index 8d0938b06..1dc877da7 100644
--- a/lib/charms/mongodb/v0/helpers.py
+++ b/lib/charms/mongodb/v0/helpers.py
@@ -1,9 +1,9 @@
 """Simple functions, which can be used in both K8s and VM charms."""
 # Copyright 2023 Canonical Ltd.
 # See LICENSE file for licensing details.
-
 import logging
 import os
+import re
 import secrets
 import string
 import subprocess
@@ -236,6 +236,9 @@ def current_pbm_op(pbm_status: str) -> str:
 
 def process_pbm_status(pbm_status: str) -> StatusBase:
     """Parses current pbm operation and returns unit status."""
+    if type(pbm_status) == bytes:
+        pbm_status = pbm_status.decode("utf-8")
+
     # pbm is running resync operation
     if "Resync" in current_pbm_op(pbm_status):
         return WaitingStatus("waiting to sync s3 configurations.")
@@ -244,10 +247,19 @@ def process_pbm_status(pbm_status: str) -> StatusBase:
     if "(none)" in current_pbm_op(pbm_status):
         return ActiveStatus("")
 
-    if "Snapshot backup" in current_pbm_op(pbm_status):
-        return MaintenanceStatus("backup started/running")
+    # Example of backup id: 2023-08-21T13:08:22Z
+    backup_match = re.search(
+        r'Snapshot backup "(?P<backup_id>\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z)"', pbm_status
+    )
+    if backup_match:
+        backup_id = backup_match.group("backup_id")
+        return MaintenanceStatus(f"backup started/running, backup id:'{backup_id}'")
 
-    if "Snapshot restore" in current_pbm_op(pbm_status):
-        return MaintenanceStatus("restore started/running")
+    restore_match = re.search(
+        r'Snapshot restore "(?P<backup_id>\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z)"', pbm_status
+    )
+    if restore_match:
+        backup_id = restore_match.group("backup_id")
+        return MaintenanceStatus(f"restore started/running, backup id:'{backup_id}'")
 
     return ActiveStatus()
diff --git a/lib/charms/mongodb/v0/mongodb_backups.py b/lib/charms/mongodb/v0/mongodb_backups.py
index 73b9f139f..0b5e85989 100644
--- a/lib/charms/mongodb/v0/mongodb_backups.py
+++ b/lib/charms/mongodb/v0/mongodb_backups.py
@@ -65,8 +65,8 @@ REMAPPING_PATTERN = r"\ABackup doesn't match current cluster topology - it has different replica set names. Extra shards in the backup will cause this, for a simple example. The extra/unknown replica set names found in the backup are: ([^,\s]+)([.] Backup has no data for the config server or sole replicaset)?\Z"
 PBM_STATUS_CMD = ["status", "-o", "json"]
 MONGODB_SNAP_DATA_DIR = "/var/snap/charmed-mongodb/current"
-RESTORE_MAX_ATTEMPTS = 5
-RESTORE_ATTEMPT_COOLDOWN = 15
+BACKUP_RESTORE_MAX_ATTEMPTS = 5
+BACKUP_RESTORE_ATTEMPT_COOLDOWN = 15
 
 
 class ResyncError(Exception):
     """Raised when pbm is resyncing configurations and is not ready to be used."""
@@ -85,16 +85,26 @@ class RestoreError(Exception):
     """Raised when backup operation is failed."""
 
 
-def _restore_retry_before_sleep(retry_state) -> None:
+class BackupError(Exception):
+    """Raised when the backup operation fails."""
+
+
+def _backup_restore_retry_before_sleep(retry_state) -> None:
     logger.error(
-        f"Attempt {retry_state.attempt_number} failed. {RESTORE_MAX_ATTEMPTS - retry_state.attempt_number} attempts left. Retrying after {RESTORE_ATTEMPT_COOLDOWN} seconds."
+        f"Attempt {retry_state.attempt_number} failed. {BACKUP_RESTORE_MAX_ATTEMPTS - retry_state.attempt_number} attempts left. Retrying after {BACKUP_RESTORE_ATTEMPT_COOLDOWN} seconds."
     ),
 
 
-def _restore_stop_condition(retry_state) -> bool:
+def _backup_retry_stop_condition(retry_state) -> bool:
+    if isinstance(retry_state.outcome.exception(), BackupError):
+        return True
+    return retry_state.attempt_number >= BACKUP_RESTORE_MAX_ATTEMPTS
+
+
+def _restore_retry_stop_condition(retry_state) -> bool:
     if isinstance(retry_state.outcome.exception(), RestoreError):
         return True
-    return retry_state.attempt_number >= RESTORE_MAX_ATTEMPTS
+    return retry_state.attempt_number >= BACKUP_RESTORE_MAX_ATTEMPTS
 
 
 class MongoDBBackups(Object):
@@ -181,10 +191,13 @@ def _on_create_backup_action(self, event) -> None:
             return
 
         try:
-            event.set_results({"backup-status": "backup started"})
-            self.charm.unit.status = MaintenanceStatus("backup started/running")
-            self.charm.run_pbm_command(["backup"])
-            logger.info("Backup succeeded.")
+            backup_id = self._try_to_backup()
+            self.charm.unit.status = MaintenanceStatus(
+                f"backup started/running, backup id:'{backup_id}'"
+            )
+            self._success_action_with_info_log(
+                event, action, {"backup-status": f"backup started. backup id: {backup_id}"}
+            )
         except (subprocess.CalledProcessError, ExecError, Exception) as e:
             self._fail_action_with_error_log(event, action, str(e))
             return
@@ -218,7 +231,7 @@ def _on_list_backups_action(self, event) -> None:
 
         try:
             formatted_list = self._generate_backup_list_output()
-            event.set_results({"backups": formatted_list})
+            self._success_action_with_info_log(event, action, {"backups": formatted_list})
         except (subprocess.CalledProcessError, ExecError) as e:
             self._fail_action_with_error_log(event, action, str(e))
             return
@@ -272,10 +285,13 @@ def _on_restore_action(self, event) -> None:
 
         # sometimes when we are trying to restore pmb can be resyncing, so we need to retry
         try:
-            event.set_results({"restore-status": "restore started"})
-            self.charm.unit.status = MaintenanceStatus("restore started/running")
             self._try_to_restore(backup_id)
+            self.charm.unit.status = MaintenanceStatus(
+                f"restore started/running, backup id:'{backup_id}'"
+            )
+            self._success_action_with_info_log(
+                event, action, {"restore-status": "restore started"}
+            )
         except ResyncError:
             raise
         except RestoreError as restore_error:
@@ -426,11 +442,12 @@ def _get_pbm_status(self) -> StatusBase:
             return WaitingStatus("waiting for pbm to start")
 
         try:
-            # TODO VM charm should implement this method
+            previous_pbm_status = self.charm.unit.status
             pbm_status = self.charm.run_pbm_command(PBM_STATUS_CMD)
+            self._log_backup_restore_result(pbm_status, previous_pbm_status)
             return process_pbm_status(pbm_status)
         except ExecError as e:
-            logger.error("Failed to get pbm status.")
+            logger.error(f"Failed to get pbm status. {e}")
             return BlockedStatus(process_pbm_error(e.stdout))
         except subprocess.CalledProcessError as e:
             # pbm pipes a return code of 1, but its output shows the true error code so it is
@@ -500,11 +517,19 @@ def _backup_from_different_cluster(self, backup_status: str) -> bool:
         return re.search(REMAPPING_PATTERN, backup_status) is not None
 
     def _try_to_restore(self, backup_id: str) -> None:
+        """Try to restore the cluster from the backup specified by the backup id.
+
+        If PBM is resyncing, the function will retry the restore
+        (up to BACKUP_RESTORE_MAX_ATTEMPTS times) with BACKUP_RESTORE_ATTEMPT_COOLDOWN
+        time between attempts.
+
+        If PBM returns any other error, the function will raise RestoreError.
+        """
         for attempt in Retrying(
-            stop=_restore_stop_condition,
-            wait=wait_fixed(RESTORE_ATTEMPT_COOLDOWN),
+            stop=_restore_retry_stop_condition,
+            wait=wait_fixed(BACKUP_RESTORE_ATTEMPT_COOLDOWN),
             reraise=True,
-            before_sleep=_restore_retry_before_sleep,
+            before_sleep=_backup_restore_retry_before_sleep,
         ):
             with attempt:
                 try:
@@ -515,16 +540,50 @@ def _try_to_restore(self, backup_id: str) -> None:
                         error_message = e.output.decode("utf-8")
                     else:
                         error_message = str(e.stderr)
-                    fail_message = f"Restore failed: {str(e)}"
-
                     if "Resync" in error_message:
                         raise ResyncError
 
+                    fail_message = f"Restore failed: {str(e)}"
                     if f"backup '{backup_id}' not found" in error_message:
                         fail_message = f"Restore failed: Backup id '{backup_id}' does not exist in list of backups, please check list-backups for the available backup_ids."
 
                     raise RestoreError(fail_message)
 
+    def _try_to_backup(self):
+        """Try to create a backup and return the backup id.
+
+        If PBM is resyncing, the function will retry to create the backup
+        (up to BACKUP_RESTORE_MAX_ATTEMPTS times)
+        with BACKUP_RESTORE_ATTEMPT_COOLDOWN time between attempts.
+
+        If PBM returns any other error, the function will raise BackupError.
+ """ + for attempt in Retrying( + stop=_backup_retry_stop_condition, + wait=wait_fixed(BACKUP_RESTORE_ATTEMPT_COOLDOWN), + reraise=True, + before_sleep=_backup_restore_retry_before_sleep, + ): + with attempt: + try: + output = self.charm.run_pbm_command(["backup"]) + backup_id_match = re.search( + r"Starting backup '(?P\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z)'", + output, + ) + return backup_id_match.group("backup_id") if backup_id_match else "N/A" + except (subprocess.CalledProcessError, ExecError) as e: + if type(e) == subprocess.CalledProcessError: + error_message = e.output.decode("utf-8") + else: + error_message = str(e.stderr) + if "Resync" in error_message: + raise ResyncError + + fail_message = f"Backup failed: {str(e)}" + + raise BackupError(fail_message) + def _remap_replicaset(self, backup_id: str) -> str: """Returns options for remapping a replica set during a cluster migration restore. @@ -567,3 +626,42 @@ def _fail_action_with_error_log(self, event, action: str, message: str) -> None: def _defer_action_with_info_log(self, event, action: str, message: str) -> None: logger.info("Deferring %s: %s", action, message) event.defer() + + def _success_action_with_info_log(self, event, action: str, results: Dict[str, str]) -> None: + logger.info("%s completed successfully", action.capitalize()) + event.set_results(results) + + def _log_backup_restore_result(self, current_pbm_status, previous_pbm_status) -> None: + """Logs the result of the backup/restore operation. + + Expected to be called for not failed operations. + """ + operation_result = self._get_backup_restore_operation_result( + current_pbm_status, previous_pbm_status + ) + logger.info(operation_result) + + def _get_backup_restore_operation_result(self, current_pbm_status, previous_pbm_status) -> str: + """Returns a string with the result of the backup/restore operation. + + The function call is expected to be only for not failed operations. + The operation is taken from previous status of the unit and expected + to contain the operation type (backup/restore) and the backup id. + """ + if ( + type(current_pbm_status) == type(previous_pbm_status) + and current_pbm_status.message == previous_pbm_status.message + ): + return f"Operation is still in progress: '{current_pbm_status.message}'" + + if ( + type(previous_pbm_status) == MaintenanceStatus + and "backup id:" in previous_pbm_status.message + ): + backup_id = previous_pbm_status.message.split("backup id:")[-1].strip() + if "restore" in previous_pbm_status.message: + return f"Restore from backup {backup_id} completed successfully" + if "backup" in previous_pbm_status.message: + return f"Backup {backup_id} completed successfully" + + return "Unknown operation result" diff --git a/tests/integration/backup_tests/test_backups.py b/tests/integration/backup_tests/test_backups.py index 1cfe58edf..cb807c2e6 100644 --- a/tests/integration/backup_tests/test_backups.py +++ b/tests/integration/backup_tests/test_backups.py @@ -181,7 +181,7 @@ async def test_create_and_list_backups(ops_test: OpsTest) -> None: # verify backup is started action = await db_unit.run_action(action_name="create-backup") backup_result = await action.wait() - assert backup_result.results["backup-status"] == "backup started", "backup didn't start" + assert "backup started" in backup_result.results["backup-status"], "backup didn't start" # verify backup is present in the list of backups # the action `create-backup` only confirms that the command was sent to the `pbm`. 
@@ -475,7 +475,7 @@ async def test_update_backup_password(ops_test: OpsTest) -> None:
     # verify we still have connection to pbm via creating a backup
     action = await leader_unit.run_action(action_name="create-backup")
     backup_result = await action.wait()
-    assert backup_result.results["backup-status"] == "backup started", "backup didn't start"
+    assert "backup started" in backup_result.results["backup-status"], "backup didn't start"
 
 
 # TODO remove this workaround once issue with juju secrets is fixed
diff --git a/tests/unit/test_mongodb_backups.py b/tests/unit/test_mongodb_backups.py
index ec8869996..d099dd193 100644
--- a/tests/unit/test_mongodb_backups.py
+++ b/tests/unit/test_mongodb_backups.py
@@ -565,7 +565,9 @@ def test_restore_running_backup(self, pbm_command, service):
         action_event = mock.Mock()
         action_event.params = {"backup-id": "back-me-up"}
         service.return_value = "pbm"
-        pbm_command.return_value = "Currently running:\n====\nSnapshot backup"
+        pbm_command.return_value = (
+            'Currently running:\n====\nSnapshot backup "2023-08-21T13:08:22Z"'
+        )
 
         self.harness.add_relation(RELATION_NAME, "s3-integrator")
         self.harness.charm.backups._on_restore_action(action_event)
@@ -660,7 +662,9 @@ def test_remap_replicaset_remap_necessary(self, run_pbm_command):
     def test_get_pbm_status_backup(self, run_pbm_command, service):
         """Tests that when pbm running a backup that pbm is in maintenance state."""
         service.return_value = "pbm"
-        run_pbm_command.return_value = "Currently running:\n====\nSnapshot backup"
+        run_pbm_command.return_value = (
+            'Currently running:\n====\nSnapshot backup "2023-08-21T13:08:22Z"'
+        )
         self.assertTrue(
             isinstance(self.harness.charm.backups._get_pbm_status(), MaintenanceStatus)
         )
@@ -694,11 +698,12 @@ def test_backup_running_backup(self, run_pbm_command, service):
         action_event = mock.Mock()
         action_event.params = {}
         service.return_value = "pbm"
-        run_pbm_command.return_value = "Currently running:\n====\nSnapshot backup"
+        run_pbm_command.return_value = (
+            'Currently running:\n====\nSnapshot backup "2023-08-21T13:08:22Z"'
+        )
 
         self.harness.add_relation(RELATION_NAME, "s3-integrator")
         self.harness.charm.backups._on_create_backup_action(action_event)
-
         action_event.fail.assert_called()
 
 @patch("charm.MongoDBCharm.get_backup_service")
@@ -720,3 +725,19 @@ def test_backup_wrong_cred(self, run_pbm_command, service):
         self.harness.add_relation(RELATION_NAME, "s3-integrator")
         self.harness.charm.backups._on_create_backup_action(action_event)
         action_event.fail.assert_called()
+
+    def test_get_backup_restore_operation_result(self):
+        backup_id = "2023-08-21T13:08:22Z"
+        current_pbm_status = ActiveStatus("")
+        previous_pbm_status = MaintenanceStatus(f"backup started/running, backup id:'{backup_id}'")
+        operation_result = self.harness.charm.backups._get_backup_restore_operation_result(
+            current_pbm_status, previous_pbm_status
+        )
+        assert operation_result == f"Backup '{backup_id}' completed successfully"
+        previous_pbm_status = MaintenanceStatus(
+            f"restore started/running, backup id:'{backup_id}'"
+        )
+        operation_result = self.harness.charm.backups._get_backup_restore_operation_result(
+            current_pbm_status, previous_pbm_status
+        )
+        assert operation_result == f"Restore from backup '{backup_id}' completed successfully"
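
A note on the retry pattern introduced above (illustration only, not part of the patch): both `_try_to_backup` and `_try_to_restore` wrap the `pbm` call in tenacity's `Retrying` with a custom stop condition, so transient failures (PBM still resyncing) are retried up to BACKUP_RESTORE_MAX_ATTEMPTS times with a fixed cooldown, while fatal ones (`BackupError`/`RestoreError`) stop the loop immediately and are re-raised via `reraise=True`. Below is a minimal, self-contained sketch of that pattern. It assumes only that `tenacity` is installed; `TransientError`, `FatalError`, and `flaky_operation` are hypothetical stand-ins for `ResyncError`, `BackupError`/`RestoreError`, and the `pbm` call.

    from tenacity import Retrying, wait_fixed

    MAX_ATTEMPTS = 5  # the charm uses BACKUP_RESTORE_MAX_ATTEMPTS = 5
    COOLDOWN = 1      # seconds; the charm uses BACKUP_RESTORE_ATTEMPT_COOLDOWN = 15


    class TransientError(Exception):
        """Stand-in for ResyncError: retrying may succeed."""


    class FatalError(Exception):
        """Stand-in for BackupError/RestoreError: retrying cannot help."""


    def stop_condition(retry_state) -> bool:
        # Stop immediately on a fatal error; otherwise cap the attempt count.
        if isinstance(retry_state.outcome.exception(), FatalError):
            return True
        return retry_state.attempt_number >= MAX_ATTEMPTS


    attempts = 0


    def flaky_operation() -> str:
        # Hypothetical stand-in for self.charm.run_pbm_command(["backup"]):
        # fails twice as if PBM were resyncing, then succeeds.
        global attempts
        attempts += 1
        if attempts < 3:
            raise TransientError("still resyncing")
        return "2023-08-21T13:08:22Z"


    # reraise=True surfaces the original exception (rather than tenacity's
    # RetryError) once stop_condition gives up, matching the charm's behaviour.
    for attempt in Retrying(stop=stop_condition, wait=wait_fixed(COOLDOWN), reraise=True):
        with attempt:
            backup_id = flaky_operation()

    print(f"succeeded on attempt {attempts}, backup id: {backup_id}")

If `flaky_operation` raised `FatalError` instead, the first failure would propagate immediately, because the stop condition returns True without consuming the remaining attempts.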