Skip to content

Commit

Permalink
fix: Fix crash when Sync_Leave returns error during a deployment.
Browse files Browse the repository at this point in the history
The problem is that once we have entered the `sync_leave_download`
internal state, a deployment has started, and a deployment always has
to be ended, to properly clean up. This is fine when
`sync_leave_download` is successful and proceeds into subsequent
download states, which always end with the
`end_of_deployment_state`. But if `sync_leave_download` fails, then it
goes directly to `sync_error`, which does not know about the
deployment, and does not go to `end_of_deployment_state`.

Fix this by creating a dedicated `sync_error_download` state.

This bug happened only occasionally in the test
`test_state_scripts[Corrupted_script_version_in_etc-test_set11]` in
the integration repository, typically under load. The reason it
happened only occasionally is that the test is programmed to fail in
`Sync_Leave` only once, and inventory submission is always run
first. So by the time it got to the deployment it was always returning
success. But under load it could happen that the inventory submission
failed, which would run `Sync_Error` instead and skip `Sync_Leave`,
leaving it around for the upcoming deployment, where the bug would
then occur.

Testing this in unit tests requires supporting more than one
deployment run, which requires an extra member in the exit state. In
the spirit of trying to limit space requirements for embedded system,
I've made this member, which is only used for tests, debug-only.

Changelog: Fix crash when `Sync_Leave` returns error during a
deployment. The error message would be:
```
State machine event DeploymentStarted was not handled by any transition
```
and would happen on the next deployment following the `Sync_Leave`
error. With a long polling interval, this could cause the bug to be
latent for quite a while.

Ticket: MEN-7379

Signed-off-by: Kristian Amlie <kristian.amlie@northern.tech>
  • Loading branch information
kacf committed Aug 9, 2024
1 parent 660eeb8 commit 949a7a9
Show file tree
Hide file tree
Showing 5 changed files with 123 additions and 2 deletions.
14 changes: 14 additions & 0 deletions src/mender-update/daemon/state_machine.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,9 @@ class StateMachine {

// Mainly for tests.
void StopAfterDeployment();
#ifndef NDEBUG
void StopAfterDeployments(int number);
#endif

private:
Context &ctx_;
Expand Down Expand Up @@ -198,6 +201,16 @@ class StateMachine {
artifact_script_path,
rootfs_script_path,
script_executor::OnError::Ignore),
sync_error_download_(
loop,
script_executor::State::Sync,
script_executor::Action::Error,
script_timeout,
retry_interval,
retry_timeout,
artifact_script_path,
rootfs_script_path,
script_executor::OnError::Ignore),
download_enter_(
loop,
script_executor::State::Download,
Expand Down Expand Up @@ -458,6 +471,7 @@ class StateMachine {
StateScriptState sync_leave_;
StateScriptState sync_leave_download_;
StateScriptState sync_error_;
StateScriptState sync_error_download_;

SaveStateScriptState download_enter_;
StateScriptState download_leave_;
Expand Down
18 changes: 17 additions & 1 deletion src/mender-update/daemon/state_machine/state_machine.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,10 @@ StateMachine::StateMachine(Context &ctx, events::EventLoop &event_loop) :
main_states_.AddTransition(ss.sync_leave_, se::Failure, ss.sync_error_, tf::Immediate);

main_states_.AddTransition(ss.sync_leave_download_, se::Success, send_download_status_state_, tf::Immediate);
main_states_.AddTransition(ss.sync_leave_download_, se::Failure, ss.sync_error_, tf::Immediate);
main_states_.AddTransition(ss.sync_leave_download_, se::Failure, ss.sync_error_download_, tf::Immediate);

main_states_.AddTransition(ss.sync_error_download_, se::Success, end_of_deployment_state_, tf::Immediate);
main_states_.AddTransition(ss.sync_error_download_, se::Failure, end_of_deployment_state_, tf::Immediate);

// Cannot fail due to FailureMode::Ignore.
main_states_.AddTransition(send_download_status_state_, se::Success, ss.download_enter_, tf::Immediate);
Expand Down Expand Up @@ -454,6 +457,19 @@ void StateMachine::StopAfterDeployment() {
sm::TransitionFlag::Immediate);
}

#ifndef NDEBUG
void StateMachine::StopAfterDeployments(int number) {
exit_state_.ExitAfter(number);
main_states_.AddTransition(
end_of_deployment_state_, StateEvent::Success, exit_state_, sm::TransitionFlag::Immediate);
main_states_.AddTransition(
exit_state_,
StateEvent::Success,
state_scripts_.idle_enter_,
sm::TransitionFlag::Immediate);
}
#endif

} // namespace daemon
} // namespace update
} // namespace mender
8 changes: 8 additions & 0 deletions src/mender-update/daemon/states.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -953,7 +953,15 @@ ExitState::ExitState(events::EventLoop &event_loop) :
}

void ExitState::OnEnter(Context &ctx, sm::EventPoster<StateEvent> &poster) {
#ifndef NDEBUG
if (--iterations_left_ <= 0) {
event_loop_.Stop();
} else {
poster.PostEvent(StateEvent::Success);
}
#else
event_loop_.Stop();
#endif
}

namespace deployment_tracking {
Expand Down
10 changes: 10 additions & 0 deletions src/mender-update/daemon/states.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -264,8 +264,18 @@ class ExitState : virtual public StateType {

error::Error exit_error;

#ifndef NDEBUG
void ExitAfter(int number) {
iterations_left_ = number;
}
#endif

private:
events::EventLoop &event_loop_;

#ifndef NDEBUG
int iterations_left_ {1};
#endif
};


Expand Down
75 changes: 74 additions & 1 deletion tests/src/mender-update/daemon/state_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,7 @@ struct StateTransitionsTestCase {
bool generate_idle_sync_scripts {false};
// Set it to the string that the database should contain.
string update_control_string;
int stop_after_n_deployments {1};
};


Expand Down Expand Up @@ -367,6 +368,65 @@ vector<StateTransitionsTestCase> idle_and_sync_test_cases {
.error_states = {"Sync_Leave_00"},
.generate_idle_sync_scripts = true,
},

StateTransitionsTestCase {
.case_name = "Sync_Leave_Error_multiple_deployments",
.state_chain =
{
"Idle_Enter_00",
"Idle_Leave_00",
"Sync_Enter_00", // <- Only fails the first time, during inventory
// Start of inventory.
"Sync_Error_00",
// End of inventory.
"Idle_Enter_00",
"Idle_Leave_00",
"Sync_Enter_00",
// Start of deployment.
"Sync_Leave_00", // <- Only fails the second time, during deployment
"Sync_Error_00",
// End of deployment.
"Idle_Enter_00",
"Idle_Leave_00",
// Start of inventory.
"Sync_Enter_00",
"Sync_Leave_00",
// End of inventory.
"Idle_Enter_00",
"Idle_Leave_00",
// Start of deployment.
"Sync_Enter_00",
"Sync_Leave_00",
"Download_Enter_00",
"ProvidePayloadFileSizes",
"Download",
"Download_Leave_00",
"ArtifactInstall_Enter_00",
"ArtifactInstall",
"ArtifactInstall_Leave_00",
"ArtifactReboot_Enter_00",
"ArtifactReboot",
"ArtifactVerifyReboot",
"ArtifactReboot_Leave_00",
"ArtifactCommit_Enter_00",
"ArtifactCommit",
"ArtifactCommit_Leave_00",
"Cleanup",
// End of deployment.
},
.status_log =
{
"downloading",
"installing",
"rebooting",
"installing",
"success",
},
.install_outcome = InstallOutcome::SuccessfulInstall,
.error_states = {"Sync_Enter_00", "Sync_Leave_00"},
.generate_idle_sync_scripts = true,
.stop_after_n_deployments = 2,
},
};

vector<StateTransitionsTestCase> GenerateStateTransitionsTestCases() {
Expand Down Expand Up @@ -3588,6 +3648,7 @@ void StateTransitionsTestSubProcess(
// exit handlers which should not be invoked while these objects are still alive.
{
conf::MenderConfig config {};
config.update_poll_interval_seconds = 1;
config.module_timeout_seconds = 2;
config.paths.SetDataStore(tmpdir);
config.paths.SetArtScriptsPath(path::Join(tmpdir, "artifact-scripts"));
Expand Down Expand Up @@ -3648,7 +3709,13 @@ void StateTransitionsTestSubProcess(
test.GetParam().fail_status_report_status,
test.GetParam().fail_status_aborted);

state_machine.StopAfterDeployment();
if (test.GetParam().stop_after_n_deployments > 1) {
#ifndef NDEBUG
state_machine.StopAfterDeployments(test.GetParam().stop_after_n_deployments);
#endif
} else {
state_machine.StopAfterDeployment();
}
err = state_machine.Run();
ASSERT_IN_DEATH_TEST(err == error::NoError) << err.String();
}
Expand Down Expand Up @@ -3686,6 +3753,12 @@ TEST_P(StateDeathTest, StateTransitionsTest) {
// information.
GTEST_FLAG_SET(death_test_style, "fast");

#ifdef NDEBUG
if (GetParam().stop_after_n_deployments > 1) {
GTEST_SKIP() << "Stopping after N deployments requires debug mode";
}
#endif

{
ofstream f(path::Join(tmpdir.Path(), "device_type"));
f << "device_type=test-type\n";
Expand Down

0 comments on commit 949a7a9

Please sign in to comment.