From ca443e0e58fefa14eb8814eca50439fdc928e6de Mon Sep 17 00:00:00 2001 From: Pier-Hugues Pellerin Date: Wed, 16 Mar 2022 16:31:12 -0400 Subject: [PATCH] Move the `context cancelled` error message into a debug statement (#187) * Move the `context cancelled` error message into a debug statement This code improves the error reporting when Elastic Agent is stopped through the passed context. This moves the state reporting closer to the execution of the request. Errors related to the request are still logged as errors and will change the internal state to non-healthy. Fixes: #154 --- CHANGELOG.next.asciidoc | 1 + .../application/gateway/fleet/fleet_gateway.go | 16 ++++++++++++---- 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.next.asciidoc b/CHANGELOG.next.asciidoc index 39368c63c69..8de038af86e 100644 --- a/CHANGELOG.next.asciidoc +++ b/CHANGELOG.next.asciidoc @@ -96,6 +96,7 @@ - Allow agent containers to use basic auth to create a service token. {pull}29651[29651] - Fix issue where a failing artifact verification does not remove the bad artifact. {pull}30281[30281] - Reduce Elastic Agent shut down time by stopping processes concurrently {pull}29650[29650] +- Move `context cancelled` error from fleet gateway into debug level. {pull}187[187] ==== New features diff --git a/internal/pkg/agent/application/gateway/fleet/fleet_gateway.go b/internal/pkg/agent/application/gateway/fleet/fleet_gateway.go index 3ec60fb8512..967b85c4825 100644 --- a/internal/pkg/agent/application/gateway/fleet/fleet_gateway.go +++ b/internal/pkg/agent/application/gateway/fleet/fleet_gateway.go @@ -165,8 +165,6 @@ func (f *fleetGateway) worker() { // jitter to help better distribute the load from a fleet of agents. 
resp, err := f.doExecute() if err != nil { - f.log.Error(err) - f.statusReporter.Update(state.Failed, err.Error(), nil) continue } @@ -198,24 +196,34 @@ func (f *fleetGateway) worker() { func (f *fleetGateway) doExecute() (*fleetapi.CheckinResponse, error) { f.backoff.Reset() + + // Guard if the context is stopped by a out of bound call, + // this mean we are rebooting to change the log level or the system is shutting us down. for f.bgContext.Err() == nil { - // TODO: wrap with timeout context f.log.Debugf("Checking started") resp, err := f.execute(f.bgContext) if err != nil { f.log.Errorf("Could not communicate with fleet-server Checking API will retry, error: %s", err) if !f.backoff.Wait() { - return nil, errors.New( + // Something bad has happened and we log it and we should update our current state. + err := errors.New( "execute retry loop was stopped", errors.TypeNetwork, errors.M(errors.MetaKeyURI, f.client.URI()), ) + + f.log.Error(err) + f.statusReporter.Update(state.Failed, err.Error(), nil) + return nil, err } continue } + // Request was successful, return the collected actions. return resp, nil } + // This mean that the next loop was cancelled because of the context, we should return the error + // but we should not log it, because we are in the process of shutting down. return nil, f.bgContext.Err() }