From e90d6bf6b63bd07c7a3a8aa34dd2d356dbaa53ae Mon Sep 17 00:00:00 2001 From: Weidong Cai Date: Wed, 30 Aug 2023 10:13:32 +0800 Subject: [PATCH] fix: Health check from lister not apiserver (#11375) Signed-off-by: weidongcai --- docs/environment-variables.md | 1 - workflow/controller/healthz.go | 17 ++++++++++------- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/docs/environment-variables.md b/docs/environment-variables.md index d68a639525ed..c8b782a5d33f 100644 --- a/docs/environment-variables.md +++ b/docs/environment-variables.md @@ -33,7 +33,6 @@ most users. Environment variables may be removed at any time. | `GZIP_IMPLEMENTATION` | `string` | `PGZip` | The implementation of compression/decompression. Currently only "`PGZip`" and "`GZip`" are supported. | | `INFORMER_WRITE_BACK` | `bool` | `true` | Whether to write back to informer instead of catching up. | | `HEALTHZ_AGE` | `time.Duration` | `5m` | How old a un-reconciled workflow is to report unhealthy. | -| `HEALTHZ_LIST_LIMIT` | `int` | `200` | The maximum number of responses to return for a list call on workflows for liveness check. | | `INDEX_WORKFLOW_SEMAPHORE_KEYS` | `bool` | `true` | Whether or not to index semaphores. | | `LEADER_ELECTION_IDENTITY` | `string` | Controller's `metadata.name` | The ID used for workflow controllers to elect a leader. | | `LEADER_ELECTION_DISABLE` | `bool` | `false` | Whether leader election should be disabled. | diff --git a/workflow/controller/healthz.go b/workflow/controller/healthz.go index 95603faa8f82..a0dcc92b8377 100644 --- a/workflow/controller/healthz.go +++ b/workflow/controller/healthz.go @@ -6,21 +6,20 @@ import ( "time" log "github.com/sirupsen/logrus" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/labels" + "github.com/argoproj/argo-workflows/v3/pkg/client/listers/workflow/v1alpha1" "github.com/argoproj/argo-workflows/v3/util/env" "github.com/argoproj/argo-workflows/v3/workflow/common" ) var ( - age = env.LookupEnvDurationOr("HEALTHZ_AGE", 5*time.Minute) - limit = int64(env.LookupEnvIntOr("HEALTHZ_LIST_LIMIT", 200)) + age = env.LookupEnvDurationOr("HEALTHZ_AGE", 5*time.Minute) ) // https://kubernetes.io/docs/tasks/configure-pod-container/configure-liveness-readiness-startup-probes/#define-a-liveness-http-request // If we are in a state where there are any workflows that have not been reconciled in the last 2m, we've gone wrong. func (wfc *WorkflowController) Healthz(w http.ResponseWriter, r *http.Request) { - ctx := r.Context() instanceID := wfc.Config.InstanceID instanceIDSelector := func() string { if instanceID != "" { @@ -30,12 +29,16 @@ func (wfc *WorkflowController) Healthz(w http.ResponseWriter, r *http.Request) { }() labelSelector := "!" + common.LabelKeyPhase + "," + instanceIDSelector err := func() error { - // avoid problems with informers, but directly querying the API - list, err := wfc.wfclientset.ArgoprojV1alpha1().Workflows(wfc.managedNamespace).List(ctx, metav1.ListOptions{LabelSelector: labelSelector, Limit: limit}) + seletor, err := labels.Parse(labelSelector) if err != nil { return err } - for _, wf := range list.Items { + lister := v1alpha1.NewWorkflowLister(wfc.wfInformer.GetIndexer()) + list, err := lister.Workflows(wfc.managedNamespace).List(seletor) + if err != nil { + return err + } + for _, wf := range list { if time.Since(wf.GetCreationTimestamp().Time) > age { return fmt.Errorf("workflow never reconciled: %s", wf.Name) }