Skip to content

Commit

Permalink
[operator] Add health checks for observability components to `garden-…
Browse files Browse the repository at this point in the history
…care` reconciler (gardener#8346)

* [cleanup] Remove observability components migration code

This reverts commit ce9b10e
of PR 7318.

* Make `CheckMonitoringControlPlane` method of `HealthChecker` reusable

* Add health checks for observability components to `garden-care` controller

* Add tests for `ObservabilityComponentsHealthy` condition

* Address PR review feedback

* Remove `gardenerVersion` from `pkg/operation/care`

* Address PR review feedback

* Address PR review feedback
  • Loading branch information
oliver-goetz authored Aug 17, 2023
1 parent ce973d6 commit c79cf43
Show file tree
Hide file tree
Showing 13 changed files with 401 additions and 151 deletions.
2 changes: 2 additions & 0 deletions charts/gardener/operator/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,8 @@ config:
duration: 1m
- type: VirtualGardenAPIServerAvailable
duration: 1m
- type: ObservabilityComponentsHealthy
duration: 1m
networkPolicy:
concurrentSyncs: 5
# additionalNamespaceSelectors:
Expand Down
2 changes: 2 additions & 0 deletions example/operator/10-componentconfig.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,8 @@ controllers:
duration: 1m
- type: VirtualGardenAPIServerAvailable
duration: 1m
- type: ObservabilityComponentsHealthy
duration: 1m
# backupLeaderElection:
# reelectionPeriod: 5s
# etcdConnectionTimeout: 5s
Expand Down
5 changes: 0 additions & 5 deletions pkg/apis/core/v1beta1/constants/types_constants.go
Original file line number Diff line number Diff line change
Expand Up @@ -144,8 +144,6 @@ const (
// DeploymentNameGardenerResourceManager is a constant for the name of a Kubernetes deployment object that contains
// the gardener-resource-manager pod.
DeploymentNameGardenerResourceManager = "gardener-resource-manager"
// DeploymentNameGrafana is a constant for the name of a Kubernetes deployment object that contains the grafana pod.
DeploymentNameGrafana = "grafana"
// DeploymentNamePlutono is a constant for the name of a Kubernetes deployment object that contains the plutono pod.
DeploymentNamePlutono = "plutono"
// DeploymentNameEventLogger is a constant for the name of a Kubernetes deployment object that contains
Expand Down Expand Up @@ -192,9 +190,6 @@ const (
ETCDMain = "etcd-" + ETCDRoleMain
// ETCDEvents is a constant for the name of etcd-events Etcd object.
ETCDEvents = "etcd-" + ETCDRoleEvents
// StatefulSetNameLoki is a constant for the name of a Kubernetes stateful set object that contains
// the loki pod.
StatefulSetNameLoki = "loki"
// StatefulSetNameVali is a constant for the name of a Kubernetes stateful set object that contains
// the vali pod.
StatefulSetNameVali = "vali"
Expand Down
2 changes: 2 additions & 0 deletions pkg/apis/operator/v1alpha1/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -577,6 +577,8 @@ const (
VirtualComponentsHealthy gardencorev1beta1.ConditionType = "VirtualComponentsHealthy"
// VirtualGardenAPIServerAvailable is a constant for a condition type indicating that the virtual garden's API server is available.
VirtualGardenAPIServerAvailable gardencorev1beta1.ConditionType = "VirtualGardenAPIServerAvailable"
// ObservabilityComponentsHealthy is a constant for a condition type indicating the health of observability components.
ObservabilityComponentsHealthy gardencorev1beta1.ConditionType = "ObservabilityComponentsHealthy"
)

// AvailableOperationAnnotations is the set of available operation annotations for Garden resources.
Expand Down
87 changes: 35 additions & 52 deletions pkg/operation/care/checker.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,6 @@ import (
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/labels"
"k8s.io/apimachinery/pkg/selection"
utilruntime "k8s.io/apimachinery/pkg/util/runtime"
"k8s.io/apimachinery/pkg/util/sets"
"k8s.io/utils/clock"
"sigs.k8s.io/controller-runtime/pkg/client"
Expand All @@ -54,18 +53,11 @@ var (
v1beta1constants.ETCDEvents,
)

requiredMonitoringSeedDeploymentsBefore171 = sets.New(
v1beta1constants.DeploymentNameGrafana,
)

requiredMonitoringSeedDeployments = sets.New(
requiredMonitoringDeployments = sets.New(
v1beta1constants.DeploymentNameKubeStateMetrics,
v1beta1constants.DeploymentNamePlutono,
)

requiredLoggingStatefulSetsBefore171 = sets.New(
v1beta1constants.StatefulSetNameLoki,
)

requiredLoggingStatefulSets = sets.New(
v1beta1constants.StatefulSetNameVali,
)
Expand All @@ -75,16 +67,6 @@ var (
)
)

// TODO(rickardsjp, istvanballok): remove in release v1.77
var versionConstraintLessThan171 *semver.Constraints

func init() {
var err error

versionConstraintLessThan171, err = semver.NewConstraint("< 1.71-0")
utilruntime.Must(err)
}

func mustGardenRoleLabelSelector(gardenRoles ...string) labels.Selector {
if len(gardenRoles) == 1 {
return labels.SelectorFromSet(map[string]string{v1beta1constants.GardenRole: gardenRoles[0]})
Expand Down Expand Up @@ -114,7 +96,6 @@ type HealthChecker struct {
managedResourceProgressingThreshold *metav1.Duration
lastOperation *gardencorev1beta1.LastOperation
kubernetesVersion *semver.Version
gardenerVersion *semver.Version
}

// NewHealthChecker creates a new health checker.
Expand All @@ -126,7 +107,6 @@ func NewHealthChecker(
managedResourceProgressingThreshold *metav1.Duration,
lastOperation *gardencorev1beta1.LastOperation,
kubernetesVersion *semver.Version,
gardenerVersion *semver.Version,
) *HealthChecker {
return &HealthChecker{
reader: reader,
Expand All @@ -136,7 +116,6 @@ func NewHealthChecker(
managedResourceProgressingThreshold: managedResourceProgressingThreshold,
lastOperation: lastOperation,
kubernetesVersion: kubernetesVersion,
gardenerVersion: gardenerVersion,
}
}

Expand Down Expand Up @@ -465,7 +444,7 @@ func shootControlPlaneNotRunningMessage(lastOperation *gardencorev1beta1.LastOpe

// This is a hack to quickly do a cloud provider specific check for the required control plane deployments.
func computeRequiredControlPlaneDeployments(shoot *gardencorev1beta1.Shoot) (sets.Set[string], error) {
requiredControlPlaneDeployments := sets.New(requiredShootControlPlaneDeployments.UnsortedList()...)
requiredControlPlaneDeployments := requiredShootControlPlaneDeployments.Clone()

if !v1beta1helper.IsWorkerless(shoot) {
requiredControlPlaneDeployments.Insert(v1beta1constants.DeploymentNameKubeScheduler)
Expand Down Expand Up @@ -493,15 +472,10 @@ func computeRequiredControlPlaneDeployments(shoot *gardencorev1beta1.Shoot) (set
return requiredControlPlaneDeployments, nil
}

func computeRequiredMonitoringSeedDeployments(shoot *gardencorev1beta1.Shoot, gardenerVersion *semver.Version) sets.Set[string] {
requiredDeployments := requiredMonitoringSeedDeployments.Clone()
// TODO(rickardsjp, istvanballok): remove in release v1.77
if versionConstraintLessThan171.Check(gardenerVersion) {
requiredDeployments = requiredMonitoringSeedDeploymentsBefore171.Clone()
}

if !v1beta1helper.IsWorkerless(shoot) {
requiredDeployments.Insert(v1beta1constants.DeploymentNameKubeStateMetrics)
func computeRequiredMonitoringSeedDeployments(shoot *gardencorev1beta1.Shoot) sets.Set[string] {
requiredDeployments := requiredMonitoringDeployments.Clone()
if v1beta1helper.IsWorkerless(shoot) {
requiredDeployments.Delete(v1beta1constants.DeploymentNameKubeStateMetrics)
}

return requiredDeployments
Expand Down Expand Up @@ -728,41 +702,36 @@ func (b *HealthChecker) CheckClusterNodes(
return nil, nil
}

// CheckMonitoringControlPlane checks whether the monitoring in the given listers are complete and healthy.
func (b *HealthChecker) CheckMonitoringControlPlane(
func (b *HealthChecker) checkMonitoringControlPlane(
ctx context.Context,
shoot *gardencorev1beta1.Shoot,
namespace string,
shootMonitoringEnabled bool,
wantsAlertmanager bool,
requiredMonitoringDeployments sets.Set[string],
requiredMonitoringStatefulSets sets.Set[string],
appsSelector labels.Selector,
condition gardencorev1beta1.Condition,
) (
*gardencorev1beta1.Condition,
error,
) {
if !shootMonitoringEnabled {
return nil, nil
}

deploymentList := &appsv1.DeploymentList{}
if err := b.reader.List(ctx, deploymentList, client.InNamespace(namespace), client.MatchingLabelsSelector{Selector: monitoringSelector}); err != nil {
if err := b.reader.List(ctx, deploymentList, client.InNamespace(namespace), client.MatchingLabelsSelector{Selector: appsSelector}); err != nil {
return nil, err
}

statefulSetList := &appsv1.StatefulSetList{}
if err := b.reader.List(ctx, statefulSetList, client.InNamespace(namespace), client.MatchingLabelsSelector{Selector: monitoringSelector}); err != nil {
if err := b.reader.List(ctx, statefulSetList, client.InNamespace(namespace), client.MatchingLabelsSelector{Selector: appsSelector}); err != nil {
return nil, err
}

if exitCondition := b.checkRequiredDeployments(condition, computeRequiredMonitoringSeedDeployments(shoot, b.gardenerVersion), deploymentList.Items); exitCondition != nil {
if exitCondition := b.checkRequiredDeployments(condition, requiredMonitoringDeployments, deploymentList.Items); exitCondition != nil {
return exitCondition, nil
}

if exitCondition := b.checkDeployments(condition, deploymentList.Items); exitCondition != nil {
return exitCondition, nil
}

if exitCondition := b.checkRequiredStatefulSets(condition, computeRequiredMonitoringStatefulSets(wantsAlertmanager), statefulSetList.Items); exitCondition != nil {
if exitCondition := b.checkRequiredStatefulSets(condition, requiredMonitoringStatefulSets, statefulSetList.Items); exitCondition != nil {
return exitCondition, nil
}
if exitCondition := b.checkStatefulSets(condition, statefulSetList.Items); exitCondition != nil {
Expand All @@ -772,6 +741,25 @@ func (b *HealthChecker) CheckMonitoringControlPlane(
return nil, nil
}

// CheckShootMonitoringControlPlane runs the monitoring health check for a shoot's control plane
// in the given namespace.
//
// If shootMonitoringEnabled is false the check is skipped and (nil, nil) is returned. Otherwise the
// required deployments are derived from the shoot, the required stateful sets are derived from
// wantsAlertmanager, and both are handed to checkMonitoringControlPlane together with
// monitoringSelector and the passed condition. The result of that call is returned unchanged.
func (b *HealthChecker) CheckShootMonitoringControlPlane(
	ctx context.Context,
	shoot *gardencorev1beta1.Shoot,
	namespace string,
	shootMonitoringEnabled bool,
	wantsAlertmanager bool,
	condition gardencorev1beta1.Condition,
) (
	*gardencorev1beta1.Condition,
	error,
) {
	// Nothing to verify when monitoring is switched off for the shoot.
	if !shootMonitoringEnabled {
		return nil, nil
	}

	deployments := computeRequiredMonitoringSeedDeployments(shoot)
	statefulSets := computeRequiredMonitoringStatefulSets(wantsAlertmanager)

	return b.checkMonitoringControlPlane(ctx, namespace, deployments, statefulSets, monitoringSelector, condition)
}

// CheckLoggingControlPlane checks whether the logging components in the given listers are complete and healthy.
func (b *HealthChecker) CheckLoggingControlPlane(
ctx context.Context,
Expand All @@ -794,12 +782,7 @@ func (b *HealthChecker) CheckLoggingControlPlane(
return nil, err
}

// TODO(rickardsjp, istvanballok): remove in release v1.77
requiredStatefulSets := requiredLoggingStatefulSets
if versionConstraintLessThan171.Check(b.gardenerVersion) {
requiredStatefulSets = requiredLoggingStatefulSetsBefore171
}
if exitCondition := b.checkRequiredStatefulSets(condition, requiredStatefulSets, statefulSetList.Items); exitCondition != nil {
if exitCondition := b.checkRequiredStatefulSets(condition, requiredLoggingStatefulSets, statefulSetList.Items); exitCondition != nil {
return exitCondition, nil
}
if exitCondition := b.checkStatefulSets(condition, statefulSetList.Items); exitCondition != nil {
Expand Down
30 changes: 29 additions & 1 deletion pkg/operation/care/garden_health.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ import (
"time"

apierrors "k8s.io/apimachinery/pkg/api/errors"
"k8s.io/apimachinery/pkg/labels"
"k8s.io/apimachinery/pkg/util/sets"
"k8s.io/utils/clock"
"k8s.io/utils/pointer"
Expand Down Expand Up @@ -77,6 +78,8 @@ var (
virtualGardenPrefix+v1beta1constants.ETCDMain,
virtualGardenPrefix+v1beta1constants.ETCDEvents,
)

virtualGardenMonitoringSelector = labels.SelectorFromSet(map[string]string{v1beta1constants.LabelRole: v1beta1constants.LabelMonitoring})
)

// GardenHealth contains information needed to execute health checks for garden.
Expand Down Expand Up @@ -110,6 +113,7 @@ func (h *GardenHealth) CheckGarden(
apiServerAvailability gardencorev1beta1.Condition
runtimeComponentsCondition gardencorev1beta1.Condition
virtualComponentsCondition gardencorev1beta1.Condition
observabilityCondition gardencorev1beta1.Condition
)
for _, cond := range conditions {
switch cond.Type {
Expand All @@ -119,10 +123,12 @@ func (h *GardenHealth) CheckGarden(
runtimeComponentsCondition = cond
case operatorv1alpha1.VirtualComponentsHealthy:
virtualComponentsCondition = cond
case operatorv1alpha1.ObservabilityComponentsHealthy:
observabilityCondition = cond
}
}

checker := NewHealthChecker(h.runtimeClient, h.clock, thresholdMappings, nil, nil, lastOperation, nil, nil)
checker := NewHealthChecker(h.runtimeClient, h.clock, thresholdMappings, nil, nil, lastOperation, nil)

taskFns := []flow.TaskFn{
func(ctx context.Context) error {
Expand All @@ -139,6 +145,11 @@ func (h *GardenHealth) CheckGarden(
virtualComponentsCondition = NewConditionOrError(h.clock, virtualComponentsCondition, newVirtualComponentsCondition, err)
return nil
},
func(ctx context.Context) error {
newObservabilityCondition, err := h.checkObservabilityComponents(ctx, checker, observabilityCondition)
observabilityCondition = NewConditionOrError(h.clock, observabilityCondition, newObservabilityCondition, err)
return nil
},
}

_ = flow.Parallel(taskFns...)(ctx)
Expand All @@ -147,6 +158,7 @@ func (h *GardenHealth) CheckGarden(
runtimeComponentsCondition,
virtualComponentsCondition,
apiServerAvailability,
observabilityCondition,
}
}

Expand Down Expand Up @@ -225,6 +237,22 @@ func (h *GardenHealth) checkManagedResources(
return &c, nil
}

// checkObservabilityComponents evaluates the health of the observability components in the garden namespace.
//
// It first runs the monitoring check with a copy of requiredMonitoringDeployments, an empty set of
// required stateful sets and virtualGardenMonitoringSelector, then runs the logging check via
// CheckLoggingControlPlane. As soon as one of the checks yields an error or a non-nil condition,
// that pair is returned as is. If neither does, the passed condition is updated to status True with
// reason "ObservabilityComponentsRunning" and returned.
func (h *GardenHealth) checkObservabilityComponents(ctx context.Context, checker *HealthChecker, condition gardencorev1beta1.Condition) (*gardencorev1beta1.Condition, error) {
	// The shared set is cloned so that the package-level variable is never handed out directly.
	monitoringResult, err := checker.checkMonitoringControlPlane(ctx, h.gardenNamespace, requiredMonitoringDeployments.Clone(), sets.New[string](), virtualGardenMonitoringSelector, condition)
	if err != nil || monitoringResult != nil {
		return monitoringResult, err
	}

	// NOTE(review): the meaning of the three boolean flags is defined by CheckLoggingControlPlane's
	// signature, which is not visible here — confirm against that method.
	loggingResult, err := checker.CheckLoggingControlPlane(ctx, h.gardenNamespace, false, false, true, condition)
	if err != nil || loggingResult != nil {
		return loggingResult, err
	}

	healthy := v1beta1helper.UpdatedConditionWithClock(h.clock, condition, gardencorev1beta1.ConditionTrue, "ObservabilityComponentsRunning", "All observability components are healthy.")
	return &healthy, nil
}

func (h *GardenHealth) isVPAEnabled() bool {
return h.garden.Spec.RuntimeCluster.Settings != nil &&
h.garden.Spec.RuntimeCluster.Settings.VerticalPodAutoscaler != nil &&
Expand Down
Loading

0 comments on commit c79cf43

Please sign in to comment.