systemd: fix snapd systemd-unit stop progress notifications (canonica…

…l#11340) * systemd: fix snapd systemd-unit stop progress notifications The current behaviour of the systemd stop progress code is to print the list of systemd units not stopped after 250ms, and then after every 20 seconds (notice ticker), erroneously print out the same list even though some of the units may have stopped in the meantime. However, the bigger issue is that the non-mocked (real) code starts the stop sequence by making a blocking systemctl stop request and then afterwords (once all units are stopped) proceed to the progress code, which has no effect and exits immediately. The unit tests does not pick this up because the mock systemctl does not no anything, and therefore the simulation of progress works as expected because the output of the 'systemctl show' progress calls are mocked to simulate certain outcomes. Finally, the 'Stop()' function provided a 'timeout' argument which suggested the user could control when a stop attempt should timeout and fail. However, this feature did not work as the original call was blocking. Furthermore, this features does not make sense because it creates a second place where timeouts are introduced. The first place is in the systemd back-end where 'TimeoutStopSec' is optionally configured by snapd. In contrast with systemd's actual timeout mechanism described, the 'Stop()' call offered another timeout, but this mechanism is inconsistently used all over the code-base with different hard-coded timeout values, not matching the underlying service defaults to configured values. In the worse case scenario the snapd 'Stop()' would return an error due to timeout while systemd is still busy stopping a unit, because it was a longer unit timeout configure. The timeout value offered by snapd has no real purpose if carefully analysed. The following changes were made: 1. Remove the systemd Stop() timeout argument. The timeout is now automatically provided by systemdctl when it returns (blocking). This allows the timeout to always be aligned with the actual timeout 'TimeoutStopSec' provided by the unit, and takes into account systemd timeout behaviour related to 'ExecStop' commands. 2. Change the blocking systemctl stop command to async run in a go routine allowing progress tracking to continue in parallel. Note that the call to Stop() is still blocking because the progress code waits until either units are all stopped or the stop command returns. 3. Change the notice ticker to rather implement an initial notice silence period (1s). After this expires, the first notification is generated with the list of units still not stopped. This delay was chosen based on empirical tests performed on a Raspberry Pi 3 (representing slower targets) under heavy IO and CPU load (where it took up to 750ms for 4 services to stop). 4. Change the progress logic to always send a notification after the notice silence period on any change to the list of unit we are still waiting for, and always only poll the state of the remaining units. 5. Add additional unit tests to better cover the new progress and notification logic, and make existing tests more thorough. 6. Add critical sections inside the mocked systemctl functions to prevent corruption during concurrent modifications to some of the test suite logs and counters. This was needed because the Stop() is now issued from a go routine. The following example shows the expected output for 6 systemd units terminated with the 'Stop' function. The last four (test3 - test6) has a random post SIGTERM delay before the process exits. The following notifications are generated for this example after the 1s notification silence period expires: Waiting for "test3", "test4", "test5", "test6" to stop. Waiting for "test3", "test4", "test5" to stop. Waiting for "test3", "test5" to stop. Waiting for "test3" to stop. This patch is required by canonical#11113 Signed-off-by: Fred Lotter <fred.lotter@canonical.com> * typos * Improved the system stop progress logic. * Number of small review improvements * Remove redundant channel write * reviews: add thread safety into MockSystemctl * many: have a separate systemd.MockSystemctlWithDelay keep MockSystemctl signature as before as we need to introduce controlled delays only in the systemd package own tests so far Co-authored-by: Samuele Pedroni <pedronis@lucediurna.net>
ernestl · May 20, 2022 · 27b88f8 · 27b88f8
1 parent 473ec55
commit 27b88f8
Show file tree

Hide file tree

Showing 15 changed files with 477 additions and 153 deletions.
diff --git a/interfaces/systemd/backend.go b/interfaces/systemd/backend.go
@@ -25,7 +25,6 @@ import (
 	"fmt"
 	"os"
 	"path/filepath"
-	"time"
 
 	"github.com/snapcore/snapd/dirs"
 	"github.com/snapcore/snapd/interfaces"
@@ -107,7 +106,7 @@ func (b *Backend) Setup(snapInfo *snap.Info, confinement interfaces.ConfinementO
 		if !b.preseed {
 			// If we have new services here which aren't started yet the restart
 			// operation will start them.
-			if err := systemd.Restart(changed, 10*time.Second); err != nil {
+			if err := systemd.Restart(changed); err != nil {
 				logger.Noticef("cannot restart services %q: %s", changed, err)
 			}
 		}
@@ -140,7 +139,7 @@ func (b *Backend) Remove(snapName string) error {
 			logger.Noticef("cannot disable services %q: %s", removed, err)
 		}
 		if !b.preseed {
-			if err := systemd.Stop(removed, 5*time.Second); err != nil {
+			if err := systemd.Stop(removed); err != nil {
 				logger.Noticef("cannot stop services %q: %s", removed, err)
 			}
 		}
@@ -205,7 +204,7 @@ func (b *Backend) disableRemovedServices(systemd sysd.Systemd, dir, glob string,
 		}
 	}
 	if len(stopUnits) > 0 {
-		if err := systemd.Stop(stopUnits, 5*time.Second); err != nil {
+		if err := systemd.Stop(stopUnits); err != nil {
 			logger.Noticef("cannot stop services %q: %s", stopUnits, err)
 		}
 	}

diff --git a/overlord/configstate/configcore/services.go b/overlord/configstate/configcore/services.go
@@ -24,7 +24,6 @@ import (
 	"io/ioutil"
 	"os"
 	"path/filepath"
-	"time"
 
 	"github.com/snapcore/snapd/boot"
 	"github.com/snapcore/snapd/dirs"
@@ -73,7 +72,7 @@ func switchDisableSSHService(sysd systemd.Systemd, serviceName string, disabled
 			return err
 		}
 		if opts == nil {
-			return sysd.Stop(units, 5*time.Minute)
+			return sysd.Stop(units)
 		}
 	} else {
 		err := os.Remove(sshCanary)
@@ -186,7 +185,7 @@ func switchDisableService(serviceName string, disabled bool, opts *fsOnlyContext
 		}
 		// mask triggered a reload already
 		if opts == nil {
-			return sysd.Stop(units, 5*time.Minute)
+			return sysd.Stop(units)
 		}
 	} else {
 		if err := sysd.Unmask(serviceName); err != nil {

diff --git a/overlord/configstate/configcore/swap.go b/overlord/configstate/configcore/swap.go
@@ -24,7 +24,6 @@ import (
 	"io/ioutil"
 	"os"
 	"path/filepath"
-	"time"
 
 	"github.com/mvo5/goconfigparser"
 
@@ -146,8 +145,7 @@ func handlesystemSwapConfiguration(_ sysconfig.Device, tr config.ConfGetter, opt
 		// restart the swap service
 		sysd := systemd.NewUnderRoot(dirs.GlobalRootDir, systemd.SystemMode, &backlightSysdLogger{})
 
-		// TODO: what's an appropriate amount of time to wait here?
-		if err := sysd.Restart([]string{"swapfile.service"}, 60*time.Second); err != nil {
+		if err := sysd.Restart([]string{"swapfile.service"}); err != nil {
 			return err
 		}
 	}

diff --git a/overlord/managers_test.go b/overlord/managers_test.go
@@ -9078,14 +9078,17 @@ WantedBy=multi-user.target
 			// service also dies?
 			c.Check(cmd, DeepEquals, []string{"stop", "snap.test-snap.svc1.service"})
 			return nil, fmt.Errorf("the snap service is still having a bad day")
+		case 15:
+			c.Check(cmd, DeepEquals, []string{"show", "--property=ActiveState", "snap.test-snap.svc1.service"})
+			return nil, nil
 		default:
 			c.Errorf("unexpected call to systemctl: %+v", cmd)
 			return nil, fmt.Errorf("broken test")
 		}
 	})
 	s.AddCleanup(r)
 	// make sure that we get the expected number of systemctl calls
-	s.AddCleanup(func() { c.Assert(systemctlCalls, Equals, 14) })
+	s.AddCleanup(func() { c.Assert(systemctlCalls, Equals, 15) })
 
 	// also add the snapd snap to state which we will refresh
 	si1 := &snap.SideInfo{RealName: "snapd", Revision: snap.R(1)}

diff --git a/overlord/servicestate/quota_handlers.go b/overlord/servicestate/quota_handlers.go
@@ -22,7 +22,6 @@ package servicestate
 import (
 	"fmt"
 	"sort"
-	"time"
 
 	tomb "gopkg.in/tomb.v2"
 
@@ -529,7 +528,7 @@ func ensureSnapServicesForGroup(st *state.State, t *state.Task, grp *quota.Group
 	if _, ok := allGrps[grp.Name]; !ok {
 		// stop the quota group, then remove it
 		if !ensureOpts.Preseeding {
-			if err := systemSysd.Stop([]string{grp.SliceFileName()}, 5*time.Second); err != nil {
+			if err := systemSysd.Stop([]string{grp.SliceFileName()}); err != nil {
 				logger.Noticef("unable to stop systemd slice while removing group %q: %v", grp.Name, err)
 			}
 		}

diff --git a/overlord/servicestate/servicemgr_test.go b/overlord/servicestate/servicemgr_test.go
@@ -1294,6 +1294,9 @@ func (s *ensureSnapServiceSuite) TestEnsureSnapServicesWritesServicesFilesAndRes
 			expArgs: []string{"stop", "snap.test-snap.svc1.service"},
 			err:     fmt.Errorf("this service is still having a bad day"),
 		},
+		{
+			expArgs: []string{"show", "--property=ActiveState", "snap.test-snap.svc1.service"},
+		},
 	})
 	defer r()
 

diff --git a/systemd/emulation.go b/systemd/emulation.go
@@ -76,15 +76,15 @@ func (s *emulation) StartNoBlock(services []string) error {
 	return nil
 }
 
-func (s *emulation) Stop(services []string, timeout time.Duration) error {
+func (s *emulation) Stop(services []string) error {
 	return nil
 }
 
 func (s *emulation) Kill(service, signal, who string) error {
 	return &notImplementedError{"Kill"}
 }
 
-func (s *emulation) Restart(services []string, timeout time.Duration) error {
+func (s *emulation) Restart(services []string) error {
 	return nil
 }