Skip to content

Commit

Permalink
fix: Keep health checker running when health check failed. Make healt…
Browse files Browse the repository at this point in the history
…hcheck function pure (#1779)
  • Loading branch information
basefas authored Apr 21, 2023
1 parent a414df7 commit abfacd6
Show file tree
Hide file tree
Showing 2 changed files with 23 additions and 26 deletions.
33 changes: 13 additions & 20 deletions pkg/apisix/cluster.go
Original file line number Diff line number Diff line change
Expand Up @@ -506,33 +506,22 @@ func (c *cluster) UpstreamServiceRelation() UpstreamServiceRelation {

// HealthCheck implements Cluster.HealthCheck method.
//
// It probes the cluster through c.healthCheck under a wait.Backoff of
// Steps: 3, Duration: 5s, Factor: 1, and hands back whatever
// wait.ExponentialBackoffWithContext returns.
//
// NOTE(review): this listing appears to be a rendered diff whose +/- markers
// were lost, so lines from before and after the change sit side by side (see
// the two consecutive `if` headers inside the closure, which share a single
// closing brace). Lines flagged "presumably pre-change" below are a best
// guess; confirm against the repository before relying on them.
func (c *cluster) HealthCheck(ctx context.Context) (err error) {
// Presumably pre-change: a previously recorded cacheSyncErr is returned
// immediately, without probing the cluster.
if c.cacheSyncErr != nil {
err = c.cacheSyncErr
return
}
// Presumably pre-change: while cacheState equals _cacheSyncing the probe is
// skipped; the naked return yields the still-nil named result err.
if atomic.LoadInt32(&c.cacheState) == _cacheSyncing {
return
}

// Retry three times in a row, and exit if all of them fail.
// Factor: 1 presumably keeps the 5-second interval constant between
// attempts; TODO confirm against the wait.Backoff documentation.
backoff := wait.Backoff{
Duration: 5 * time.Second,
Factor: 1,
Steps: 3,
}
// Presumably pre-change: keeps the most recent probe error so it can be
// stored on the cluster once the retries are exhausted.
var lastCheckErr error

err = wait.ExponentialBackoffWithContext(ctx, backoff, func() (done bool, _ error) {
// Two variants of the same condition follow. The first assigns to the
// lastCheckErr declared above; the second declares a closure-local
// lastCheckErr with := and therefore touches no outer state. Only one of
// them can belong to a single version of this function.
if lastCheckErr = c.healthCheck(ctx); lastCheckErr != nil {
if lastCheckErr := c.healthCheck(ctx); lastCheckErr != nil {
log.Warnf("failed to check health for cluster %s: %s, will retry", c.name, lastCheckErr)
// Naked return: done stays false and the closure's error result is nil,
// so the probe failure is only logged here, not returned.
return
}
done = true
return
})
// Presumably pre-change: on any error from the backoff call, the last probe
// error is written to c.cacheSyncErr.
if err != nil {
// The original note mentions ErrWaitTimeout and "lastSyncErr"; the field
// actually written is cacheSyncErr, and the guard is a plain err != nil.
c.cacheSyncErr = lastCheckErr
}

return err
}

Expand All @@ -543,12 +532,16 @@ func (c *cluster) healthCheck(ctx context.Context) (err error) {
if err != nil {
return err
}
if er := conn.Close(); er != nil {
log.Warnw("failed to close tcp probe connection",
zap.Error(err),
zap.String("cluster", c.name),
)
}
defer func(conn net.Conn) {
err := conn.Close()
if err != nil {
log.Warnw("failed to close tcp probe connection",
zap.Error(err),
zap.String("cluster", c.name),
)
}
}(conn)

return
}

Expand Down
16 changes: 10 additions & 6 deletions pkg/providers/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -570,16 +570,20 @@ func (c *Controller) checkClusterHealth(ctx context.Context, cancelFunc context.

err := c.apisix.Cluster(c.cfg.APISIX.DefaultClusterName).HealthCheck(ctx)
if err != nil {
// Finally failed health check, then give up leader.
log.Warnf("failed to check health for default cluster: %s, give up leader", err)
c.apiServer.HealthState.Lock()
c.apiServer.HealthState.Err = err
c.apiServer.HealthState.Unlock()

return
// Finally failed health check, then give up leader.
log.Warnf("failed to check health for default cluster: %s, give up leader", err)
} else {
if c.apiServer.HealthState.Err != nil {
c.apiServer.HealthState.Lock()
c.apiServer.HealthState.Err = err
c.apiServer.HealthState.Unlock()
}
log.Debugf("success check health for default cluster")
c.MetricsCollector.IncrCheckClusterHealth(c.name)
}
log.Debugf("success check health for default cluster")
c.MetricsCollector.IncrCheckClusterHealth(c.name)
}
}

Expand Down

0 comments on commit abfacd6

Please sign in to comment.