Skip to content

Commit d4a5230

Browse files
joshdover authored and mergify[bot] committed
Add success log message after previous checkin failures (#1327)
(cherry picked from commit e614321)
1 parent 1e38c9a commit d4a5230

File tree

2 files changed

+12
-5
lines changed

2 files changed

+12
-5
lines changed

CHANGELOG.next.asciidoc

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -119,6 +119,7 @@
119119
- Remove fleet event reporter and events from checkin body. {issue}993[993]
120120
- Fix unintended reset of source URI when downloading components {pull}1252[1252]
121121
- Create separate status reporter for local only events so that degraded fleet-checkins no longer affect health on successful fleet-checkins. {issue}1157[1157] {pull}1285[1285]
122+
- Add success log message after previous checkin failures {pull}1327[1327]
122123

123124
==== New features
124125

internal/pkg/agent/application/gateway/fleet/fleet_gateway.go

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -210,7 +210,7 @@ func (f *fleetGateway) worker() {
210210
f.statusReporter.Update(state.Failed, errMsg, nil)
211211
} else {
212212
f.statusReporter.Update(state.Healthy, "", nil)
213-
f.localReporter.Update(state.Healthy, "", nil) // we don't need to specifically set the local reporter to failed above, but it needs to be reset to healthy if a checking succeeds
213+
f.localReporter.Update(state.Healthy, "", nil) // we don't need to specifically set the local reporter to failed above, but it needs to be reset to healthy if a checkin succeeds
214214
}
215215

216216
case <-f.bgContext.Done():
@@ -280,11 +280,11 @@ func (f *fleetGateway) doExecute() (*fleetapi.CheckinResponse, error) {
280280
// Guard if the context is stopped by a out of bound call,
281281
// this mean we are rebooting to change the log level or the system is shutting us down.
282282
for f.bgContext.Err() == nil {
283-
f.log.Debugf("Checking started")
283+
f.log.Debugf("Checkin started")
284284
resp, err := f.execute(f.bgContext)
285285
if err != nil {
286286
f.checkinFailCounter++
287-
f.log.Errorf("Could not communicate with fleet-server Checking API will retry, error: %s", err)
287+
f.log.Errorf("Could not communicate with fleet-server checkin API will retry, error: %s", err)
288288
if !f.backoff.Wait() {
289289
// Something bad has happened and we log it and we should update our current state.
290290
err := errors.New(
@@ -299,10 +299,16 @@ func (f *fleetGateway) doExecute() (*fleetapi.CheckinResponse, error) {
299299
}
300300
if f.checkinFailCounter > 1 {
301301
f.localReporter.Update(state.Degraded, fmt.Sprintf("checkin failed: %v", err), nil)
302-
f.log.Errorf("checking number %d failed: %s", f.checkinFailCounter, err.Error())
302+
f.log.Errorf("checkin number %d failed: %s", f.checkinFailCounter, err.Error())
303303
}
304304
continue
305305
}
306+
307+
if f.checkinFailCounter > 0 {
308+
// Log at same level as error logs above so subsequent successes are visible when log level is set to 'error'.
309+
f.log.Errorf("Checkin request to fleet-server succeeded after %d failures", f.checkinFailCounter)
310+
}
311+
306312
f.checkinFailCounter = 0
307313
// Request was successful, return the collected actions.
308314
return resp, nil
@@ -338,7 +344,7 @@ func (f *fleetGateway) execute(ctx context.Context) (*fleetapi.CheckinResponse,
338344
f.unauthCounter++
339345

340346
if f.shouldUnenroll() {
341-
f.log.Warnf("retrieved an invalid api key error '%d' times. Starting to unenroll the elastic agent.", f.unauthCounter)
347+
f.log.Warnf("received an invalid api key error '%d' times. Starting to unenroll the elastic agent.", f.unauthCounter)
342348
return &fleetapi.CheckinResponse{
343349
Actions: []fleetapi.Action{&fleetapi.ActionUnenroll{ActionID: "", ActionType: "UNENROLL", IsDetected: true}},
344350
}, nil

0 commit comments

Comments
 (0)