From 6acd9fd9ed1fa3bf7f77940ed0a7a5ee24fe30a3 Mon Sep 17 00:00:00 2001 From: Jason Young Date: Wed, 22 Aug 2018 10:33:26 -0700 Subject: [PATCH] Improve config validation health checking (#8118) * Improve config validation health checks (#7605) Readiness and liveliness probes were failing on some providers because GET requests were blocking for multiple seconds. Mitigate the issue by decreasing the frequency of GET (to avoid possible throttling) and increase the acceptable health check interval from 4s to 10s. Shorm term fix for https://github.com/istio/istio/issues/7586. Longer term fix requires switching to proper controller-style reconciliation. That work should be aligned with the sidecar injector. * decouple validation webhook's health checking and reconciliation loop (#7986) The validation webhook's health file updates and configuration reconciliation were invoked from the same goroutine. Delays in checking and updating the k8s configuration could result in the health file not being updated in time to pass the health checks. Use istio.io/istio/pkg/probe to decouple the two code paths. --- galley/pkg/crd/validation/webhook.go | 36 ++++++++++++------- .../charts/galley/templates/deployment.yaml | 16 ++++----- 2 files changed, 31 insertions(+), 21 deletions(-) diff --git a/galley/pkg/crd/validation/webhook.go b/galley/pkg/crd/validation/webhook.go index 316644c0d8eb..3f7fed1e8dc4 100644 --- a/galley/pkg/crd/validation/webhook.go +++ b/galley/pkg/crd/validation/webhook.go @@ -17,6 +17,7 @@ package validation import ( "crypto/tls" "encoding/json" + "errors" "fmt" "io/ioutil" "net/http" @@ -26,6 +27,7 @@ import ( "github.com/ghodss/yaml" "github.com/howeyc/fsnotify" + admissionv1beta1 "k8s.io/api/admission/v1beta1" "k8s.io/api/admissionregistration/v1beta1" extensionsv1beta1 "k8s.io/api/extensions/v1beta1" @@ -41,6 +43,7 @@ import ( "istio.io/istio/pilot/pkg/config/kube/crd" "istio.io/istio/pilot/pkg/model" "istio.io/istio/pkg/log" + "istio.io/istio/pkg/probe" ) var ( @@ -51,6 +54,7 @@ var ( const ( watchDebounceDelay = 100 * time.Millisecond + reconcilePeriod = 5 * time.Second ) // WebhookParameters contains the configuration for the Istio Pilot validation @@ -117,6 +121,8 @@ type Webhook struct { // mixer validator store.BackendValidator + healthProbe *probe.Probe + healthController probe.Controller healthCheckInterval time.Duration healthCheckFile string server *http.Server @@ -187,6 +193,17 @@ func NewWebhook(p WebhookParameters) (*Webhook, error) { clientset: p.Clientset, deploymentName: p.DeploymentName, deploymentNamespace: p.DeploymentNamespace, + healthProbe: probe.NewProbe(), + healthController: probe.NewFileController(&probe.Options{ + Path: p.HealthCheckFile, + UpdateInterval: p.HealthCheckInterval, + }), + } + + if wh.healthCheckInterval != 0 && wh.healthCheckFile != "" { + wh.healthProbe.RegisterProbe(wh.healthController, "validation") + wh.healthProbe.SetAvailable(errors.New("not ready")) + wh.healthController.Start() } if galleyDeployment, err := wh.clientset.ExtensionsV1beta1().Deployments(wh.deploymentNamespace).Get(wh.deploymentName, v1.GetOptions{}); err != nil { // nolint: lll @@ -226,13 +243,6 @@ func (wh *Webhook) Run(stop <-chan struct{}) { }() defer wh.stop() - var healthC <-chan time.Time - if wh.healthCheckInterval != 0 && wh.healthCheckFile != "" { - t := time.NewTicker(wh.healthCheckInterval) - healthC = t.C - defer t.Stop() - } - // use a timer to debounce file updates var keyCertTimerC <-chan time.Time var configTimerC <-chan time.Time @@ -245,7 +255,12 @@ func (wh *Webhook) Run(stop <-chan struct{}) { var reconcileTickerC <-chan time.Time if wh.webhookConfigFile != "" { - reconcileTickerC = time.NewTicker(time.Second).C + reconcileTickerC = time.NewTicker(reconcilePeriod).C + } + + if wh.healthCheckInterval != 0 && wh.healthCheckFile != "" { + wh.healthProbe.SetAvailable(nil) + defer wh.healthController.Close() } for { @@ -278,11 +293,6 @@ func (wh *Webhook) Run(stop <-chan struct{}) { log.Errorf("keyCertWatcher error: %v", err) case err := <-wh.configWatcher.Error: log.Errorf("configWatcher error: %v", err) - case <-healthC: - content := []byte(`ok`) - if err := ioutil.WriteFile(wh.healthCheckFile, content, 0644); err != nil { - log.Errorf("Health check update of %q failed: %v", wh.healthCheckFile, err) - } case <-stop: return } diff --git a/install/kubernetes/helm/istio/charts/galley/templates/deployment.yaml b/install/kubernetes/helm/istio/charts/galley/templates/deployment.yaml index dd05a35644b4..4fbaf64f90cf 100644 --- a/install/kubernetes/helm/istio/charts/galley/templates/deployment.yaml +++ b/install/kubernetes/helm/istio/charts/galley/templates/deployment.yaml @@ -39,7 +39,7 @@ spec: - --caCertFile=/etc/istio/certs/root-cert.pem - --tlsCertFile=/etc/istio/certs/cert-chain.pem - --tlsKeyFile=/etc/istio/certs/key.pem - - --healthCheckInterval=2s + - --healthCheckInterval=1s - --healthCheckFile=/health - --webhook-config-file - /etc/istio/config/validatingwebhookconfiguration.yaml @@ -56,18 +56,18 @@ spec: - /usr/local/bin/galley - probe - --probe-path=/health - - --interval=4s - initialDelaySeconds: 4 - periodSeconds: 4 + - --interval=10s + initialDelaySeconds: 5 + periodSeconds: 5 readinessProbe: exec: command: - /usr/local/bin/galley - probe - --probe-path=/health - - --interval=4s - initialDelaySeconds: 4 - periodSeconds: 4 + - --interval=10s + initialDelaySeconds: 5 + periodSeconds: 5 - name: server image: "{{ .Values.global.hub }}/{{ .Values.image }}:{{ .Values.global.tag }}" imagePullPolicy: {{ .Values.global.imagePullPolicy }} @@ -101,4 +101,4 @@ spec: configMap: name: istio-galley-configuration affinity: - {{- include "nodeaffinity" . | indent 6 }} \ No newline at end of file + {{- include "nodeaffinity" . | indent 6 }}