diff --git a/Makefile b/Makefile index adbfa6eb8df..e88b6cc5424 100644 --- a/Makefile +++ b/Makefile @@ -305,8 +305,11 @@ kind-ha-multi-zone-down: $(KIND) kind-operator-up: $(KIND) $(KUBECTL) $(HELM) ./hack/kind-up.sh --cluster-name gardener-operator-local --environment $(KIND_ENV) --path-kubeconfig $(REPO_ROOT)/example/gardener-local/kind/operator/kubeconfig --path-cluster-values $(REPO_ROOT)/example/gardener-local/kind/operator/values.yaml + mkdir -p $(REPO_ROOT)/dev/local-backupbuckets/gardener-operator kind-operator-down: $(KIND) ./hack/kind-down.sh --cluster-name gardener-operator-local --path-kubeconfig $(REPO_ROOT)/example/gardener-local/kind/operator/kubeconfig + # We need root privileges to clean the backup bucket directory, see https://github.com/gardener/gardener/issues/6752 + docker run --user root:root -v $(REPO_ROOT)/dev/local-backupbuckets:/dev/local-backupbuckets alpine rm -rf /dev/local-backupbuckets/gardener-operator # speed-up skaffold deployments by building all images concurrently export SKAFFOLD_BUILD_CONCURRENCY = 0 diff --git a/charts/gardener/operator/templates/customresouredefintion.yaml b/charts/gardener/operator/templates/customresouredefintion.yaml index 532e082586e..03db95c93a1 100644 --- a/charts/gardener/operator/templates/customresouredefintion.yaml +++ b/charts/gardener/operator/templates/customresouredefintion.yaml @@ -84,8 +84,129 @@ spec: required: - provider type: object + virtualCluster: + description: VirtualCluster contains configuration for the virtual + cluster. + properties: + etcd: + description: ETCD contains configuration for the etcds of the + virtual garden cluster. + properties: + events: + description: Events contains configuration for the events + etcd. + properties: + storage: + description: Storage contains storage configuration. + properties: + capacity: + anyOf: + - type: integer + - type: string + default: 10Gi + description: Capacity is the storage capacity for + the volumes. + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + className: + description: ClassName is the name of a storage class. + type: string + type: object + type: object + main: + description: Main contains configuration for the main etcd. + properties: + backup: + description: Backup contains the object store configuration + for backups for the virtual garden etcd. + properties: + bucketName: + description: BucketName is the name of the backup + bucket. + type: string + x-kubernetes-validations: + - message: Value is immutable + rule: self == oldSelf + provider: + description: Provider is a provider name. This field + is immutable. + type: string + x-kubernetes-validations: + - message: Value is immutable + rule: self == oldSelf + secretRef: + description: SecretRef is a reference to a Secret + object containing the cloud provider credentials + for the object store where backups should be stored. + It should have enough privileges to manipulate the + objects as well as buckets. + properties: + name: + description: name is unique within a namespace + to reference a secret resource. + type: string + namespace: + description: namespace defines the space within + which the secret name must be unique. + type: string + type: object + x-kubernetes-map-type: atomic + required: + - bucketName + - provider + - secretRef + type: object + storage: + description: Storage contains storage configuration. 
+ properties: + capacity: + anyOf: + - type: integer + - type: string + default: 10Gi + description: Capacity is the storage capacity for + the volumes. + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + className: + description: ClassName is the name of a storage class. + type: string + type: object + type: object + type: object + maintenance: + description: Maintenance contains information about the time window + for maintenance operations. + properties: + timeWindow: + description: TimeWindow contains information about the time + window for maintenance operations. + properties: + begin: + description: Begin is the beginning of the time window + in the format HHMMSS+ZONE, e.g. "220000+0100". If not + present, a random value will be computed. + pattern: ([0-1][0-9]|2[0-3])[0-5][0-9][0-5][0-9]\+[0-1][0-4]00 + type: string + end: + description: End is the end of the time window in the + format HHMMSS+ZONE, e.g. "220000+0100". If not present, + the value will be computed based on the "Begin" value. + pattern: ([0-1][0-9]|2[0-3])[0-5][0-9][0-5][0-9]\+[0-1][0-4]00 + type: string + required: + - begin + - end + type: object + required: + - timeWindow + type: object + required: + - maintenance + type: object required: - runtimeCluster + - virtualCluster type: object status: description: Status contains the status of this garden. diff --git a/charts/gardener/operator/templates/role.yaml b/charts/gardener/operator/templates/role.yaml index 11d4b8e4519..0073bcd22df 100644 --- a/charts/gardener/operator/templates/role.yaml +++ b/charts/gardener/operator/templates/role.yaml @@ -143,3 +143,35 @@ rules: - watch - patch - update +- apiGroups: + - networking.k8s.io + resources: + - networkpolicies + verbs: + - create + - delete + - get + - list + - watch + - patch + - update +- apiGroups: + - autoscaling.k8s.io + resources: + - hvpas + verbs: + - create + - get + - list + - watch +- apiGroups: + - autoscaling.k8s.io + resources: + - hvpas + resourceNames: + - virtual-garden-etcd-events + - virtual-garden-etcd-main + verbs: + - delete + - patch + - update diff --git a/docs/api-reference/operator.md b/docs/api-reference/operator.md index 457f3456d87..d956c9a1b75 100644 --- a/docs/api-reference/operator.md +++ b/docs/api-reference/operator.md @@ -10,6 +10,188 @@

Resource Types: +

Backup +

+

+(Appears on: +ETCDMain) +

+

+

Backup contains the object store configuration for backups for the virtual garden etcd.

+

+ + + + + + + + + + + + + + + + + + + + + +
FieldDescription
+provider
+ +string + +
+

Provider is a provider name. This field is immutable.

+
+bucketName
+ +string + +
+

BucketName is the name of the backup bucket.

+
+secretRef
+ + +Kubernetes core/v1.SecretReference + + +
+

SecretRef is a reference to a Secret object containing the cloud provider credentials for the object store where +backups should be stored. It should have enough privileges to manipulate the objects as well as buckets.

+
+

ETCD +

+

+(Appears on: +VirtualCluster) +

+

+

ETCD contains configuration for the etcds of the virtual garden cluster.

+

+ + + + + + + + + + + + + + + + + +
FieldDescription
+main
+ + +ETCDMain + + +
+(Optional) +

Main contains configuration for the main etcd.

+
+events
+ + +ETCDEvents + + +
+(Optional) +

Events contains configuration for the events etcd.

+
+

ETCDEvents +

+

+(Appears on: +ETCD) +

+

+

ETCDEvents contains configuration for the events etcd.

+

+ + + + + + + + + + + + + +
FieldDescription
+storage
+ + +Storage + + +
+(Optional) +

Storage contains storage configuration.

+
+

ETCDMain +

+

+(Appears on: +ETCD) +

+

+

ETCDMain contains configuration for the main etcd.

+

+ + + + + + + + + + + + + + + + + +
FieldDescription
+backup
+ + +Backup + + +
+(Optional) +

Backup contains the object store configuration for backups for the virtual garden etcd.

+
+storage
+ + +Storage + + +
+(Optional) +

Storage contains storage configuration.

+

Garden

@@ -65,6 +247,19 @@ RuntimeCluster

RuntimeCluster contains configuration for the runtime cluster.

+ + +virtualCluster
+ + +VirtualCluster + + + + +

VirtualCluster contains configuration for the virtual cluster.

+ + @@ -113,6 +308,19 @@ RuntimeCluster

RuntimeCluster contains configuration for the runtime cluster.

+ + +virtualCluster
+ + +VirtualCluster + + + + +

VirtualCluster contains configuration for the virtual cluster.

+ +

GardenStatus @@ -168,6 +376,36 @@ int64 +

Maintenance +

+

+(Appears on: +VirtualCluster) +

+

+

Maintenance contains information about the time window for maintenance operations.

+

+ + + + + + + + + + + + + +
FieldDescription
+timeWindow
+ +github.com/gardener/gardener/pkg/apis/core/v1beta1.MaintenanceTimeWindow + +
+

TimeWindow contains information about the time window for maintenance operations.

+

Provider

@@ -314,6 +552,96 @@ cluster.

+

Storage +

+

+(Appears on: +ETCDEvents, +ETCDMain) +

+

+

Storage contains storage configuration.

+

+ + + + + + + + + + + + + + + + + +
FieldDescription
+capacity
+ +k8s.io/apimachinery/pkg/api/resource.Quantity + +
+(Optional) +

Capacity is the storage capacity for the volumes.

+
+className
+ +string + +
+(Optional) +

ClassName is the name of a storage class.

+
+

VirtualCluster +

+

+(Appears on: +GardenSpec) +

+

+

VirtualCluster contains configuration for the virtual cluster.

+

+ + + + + + + + + + + + + + + + + +
FieldDescription
+etcd
+ + +ETCD + + +
+(Optional) +

ETCD contains configuration for the etcds of the virtual garden cluster.

+
+maintenance
+ + +Maintenance + + +
+

Maintenance contains information about the time window for maintenance operations.

+

Generated with gen-crd-api-reference-docs diff --git a/docs/concepts/etcd.md index 2d8d074bee9..92431dbe8c5 100644 --- a/docs/concepts/etcd.md +++ b/docs/concepts/etcd.md @@ -1,63 +1,62 @@ -# etcd - Key-Value Store for Kubernetes +# ETCD - Key-Value Store for Kubernetes [etcd](https://etcd.io/) is a strongly consistent key-value store and the most prevalent choice for the Kubernetes persistence layer. All API cluster objects like `Pod`s, `Deployment`s, `Secret`s, etc. are stored in `etcd` which makes it an essential part of a [Kubernetes control plane](https://kubernetes.io/docs/concepts/overview/components/#control-plane-components). -## Shoot cluster persistence +## Garden or Shoot Cluster Persistence -Each shoot cluster gets its very own persistence for the control plane. It runs in the shoot namespace on the respective -seed cluster. Concretely, there are two etcd instances per shoot cluster which the `Kube-Apiserver` is configured -to use in the following way: +Each garden or shoot cluster gets its very own persistence for the control plane. +It runs in the shoot namespace on the respective seed cluster (or in the `garden` namespace in the garden cluster, respectively). +Concretely, there are two etcd instances per garden or shoot cluster which the `kube-apiserver` is configured to use in the following way: -* etcd-main +* `etcd-main` -A store that contains all "cluster critical" or "long-term" objects. These object kinds are typically considered -for a backup to prevent any data loss. +A store that contains all "cluster critical" or "long-term" objects. +These object kinds are typically considered for a backup to prevent any data loss. -* etcd-events +* `etcd-events` -A store that contains all `Event` objects (`events.k8s.io`) of a cluster. `Events` have usually a short retention -period, occur frequently but are not essential for a disaster recovery. +A store that contains all `Event` objects (`events.k8s.io`) of a cluster. +`Events` usually have a short retention period, occur frequently, but are not essential for disaster recovery. -The setup above prevents both, the critical `etcd-main` is not flooded by Kubernetes `Events` as well as backup space is -not occupied by non-critical data. This segmentation saves time and resources. +The setup above ensures that the critical `etcd-main` is not flooded by Kubernetes `Events` and that backup space is not occupied by non-critical data. +This separation saves time and resources. -## etcd Operator +## ETCD Operator -Configuring, maintaining and health-checking `etcd` is outsourced to a dedicated operator called [ETCD Druid](https://github.com/gardener/etcd-druid/). -When [Gardenlet](../concepts/gardenlet.md) reconciles a `Shoot` resource, it creates or updates an [Etcd](https://github.com/gardener/etcd-druid/blob/1d427e9167adac1476d1847c0e265c2c09d6bc62/config/samples/druid_v1alpha1_etcd.yaml) -resources in the seed cluster, containing necessary information (backup information, defragmentation schedule, resources, etc.) `etcd-druid` -needs to manage the lifecycle of the desired etcd instance (today `main` or `events`). Likewise, when the shoot is deleted, -Gardenlet deletes the `Etcd` resource and [ETCD Druid](https://github.com/gardener/etcd-druid/) takes care about cleaning up -all related objects, e.g. the backing `StatefulSet`. +Configuring, maintaining and health-checking ETCD is outsourced to a dedicated operator called [ETCD Druid](https://github.com/gardener/etcd-druid/). 
+When [`gardenlet`](gardenlet.md) reconciles a `Shoot` resource or [`gardener-operator`](operator.md) reconciles a `Garden` resource, they manage an [`Etcd`](https://github.com/gardener/etcd-druid/blob/1d427e9167adac1476d1847c0e265c2c09d6bc62/config/samples/druid_v1alpha1_etcd.yaml) resource in the seed or garden cluster, containing necessary information (backup information, defragmentation schedule, resources, etc.). +`etcd-druid` uses this information to manage the lifecycle of the desired ETCD instance (today `main` or `events`). +Likewise, when the `Shoot` or `Garden` is deleted, `gardenlet` or `gardener-operator` delete the `Etcd` resources and [ETCD Druid](https://github.com/gardener/etcd-druid/) takes care of cleaning up all related objects, e.g. the backing `StatefulSet`s. ## Autoscaling -Gardenlet maintains [HVPA](https://github.com/gardener/hvpa-controller/blob/master/config/samples/autoscaling_v1alpha1_hvpa.yaml) -objects for etcd `StatefulSet`s if the corresponding [feature gate](../deployment/feature_gates.md) is enabled. This enables -a vertical scaling for `etcd`. Downscaling is handled more pessimistic to prevent many subsequent `etcd` restarts. Thus, -for `production` and `infrastructure` clusters downscaling is deactivated and for all other clusters lower advertised requests/limits are only -applied during a shoot's maintenance time window. +Gardenlet maintains [`HVPA`](https://github.com/gardener/hvpa-controller/blob/master/config/samples/autoscaling_v1alpha1_hvpa.yaml) objects for etcd `StatefulSet`s if the corresponding [feature gate](../deployment/feature_gates.md) is enabled. +This enables vertical scaling for ETCD. +Downscaling is handled more pessimistically to prevent many subsequent ETCD restarts. +Thus, for `production` and `infrastructure` shoot clusters (or all garden clusters), downscaling is deactivated for the main ETCD. +For all other shoot clusters, lower advertised requests/limits are only applied during a shoot's maintenance time window. ## Backup -If `Seed`s specify backups for etcd ([example](https://github.com/gardener/gardener/blob/e9bf88a7a091a8cf8c495bef298bdada17a03c7f/example/50-seed.yaml#L19)), -then Gardener and the respective [provider extensions](../extensions/overview.md) are responsible for creating a bucket -on the cloud provider's side (modelled through [BackupBucket resource](../extensions/backupbucket.md)). The bucket stores -backups of shoots scheduled on that seed. Furthermore, Gardener creates a [BackupEntry](../extensions/backupentry.md) -which subdivides the bucket and thus makes it possible to store backups of multiple shoot clusters. +If `Seed`s specify backups for ETCD ([example](../../example/50-seed.yaml)), then Gardener and the respective [provider extensions](../extensions/overview.md) are responsible for creating a bucket on the cloud provider's side (modelled through [BackupBucket resource](../extensions/backupbucket.md)). +The bucket stores backups of `Shoot`s scheduled on that `Seed`. +Furthermore, Gardener creates a [BackupEntry](../extensions/backupentry.md) which subdivides the bucket and thus makes it possible to store backups of multiple shoot clusters. -The `etcd-main` instance itself is configured to run with a special backup-restore _sidecar_. It takes care about regularly -backing up etcd data and restoring it in case of data loss. More information can be found on the component's GitHub -page https://github.com/gardener/etcd-backup-restore. 
+How long backups are stored in the bucket after a shoot has been deleted depends on the configured _retention period_ in the `Seed` resource. +Please see this [example configuration](https://github.com/gardener/gardener/blob/849cd857d0d20e5dde26b9740ca2814603a56dfd/example/20-componentconfig-gardenlet.yaml#L20) for more information. -How long backups are stored in the bucket after a shoot has been deleted, depends on the configured _retention period_ in the -`Seed` resource. Please see this [example configuration](https://github.com/gardener/gardener/blob/849cd857d0d20e5dde26b9740ca2814603a56dfd/example/20-componentconfig-gardenlet.yaml#L20) for more information. +For `Garden`s specifying backups for ETCD ([example](../../example/operator/20-garden.yaml)), the bucket must be pre-created externally and provided via the `Garden` specification. + +Both ETCD instances are configured to run with a special backup-restore _sidecar_. +It takes care of regularly backing up etcd data and restoring it in case of data loss (in the main ETCD only). +The sidecar also performs defragmentation and other housekeeping tasks. +More information can be found on the [component's GitHub repository](https://github.com/gardener/etcd-backup-restore). ## Housekeeping -[etcd maintenance tasks](https://etcd.io/docs/v3.3/op-guide/maintenance/) must be performed from time to time in order -to re-gain database storage and to ensure the system's reliability. The [backup-restore](https://github.com/gardener/etcd-backup-restore) -_sidecar_ takes care about this job as well. Gardener chooses a random time **within the shoot's maintenance time** to -schedule these tasks. \ No newline at end of file +[ETCD maintenance tasks](https://etcd.io/docs/v3.3/op-guide/maintenance/) must be performed from time to time in order to regain database storage and to ensure the system's reliability. +The [backup-restore](https://github.com/gardener/etcd-backup-restore) _sidecar_ takes care of this job as well. + +For both `Shoot`s and `Garden`s, a random time **within the shoot's maintenance time** is chosen for scheduling these tasks. diff --git a/docs/concepts/operator.md index e5a83fa6f97..56e969aaf7b 100644 --- a/docs/concepts/operator.md +++ b/docs/concepts/operator.md @@ -86,4 +86,13 @@ Other system components are: - HVPA controller (when `HVPA` feature gate is enabled) - ETCD Druid +As soon as all system components are up, the reconciler deploys the virtual garden cluster. +It consists of two ETCDs (one "main" etcd, one "events" etcd) which are managed by ETCD Druid via `druid.gardener.cloud/v1alpha1.Etcd` custom resources. +The whole management works similarly to how it works for `Shoot`s, so you can take a look at [this document](etcd.md) for more details. + +The virtual garden control plane components are: + +- `virtual-garden-etcd-main` +- `virtual-garden-etcd-events` + The controller maintains the `Reconciled` condition which indicates the status of an operation. 
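To make the documentation changes above easier to review, here is a minimal sketch of how the new `virtualCluster` section of a `Garden` manifest looks. It mirrors `example/operator/20-garden.yaml` from this change; the required `runtimeCluster` section is omitted for brevity, and the provider, bucket, and secret names are merely the placeholders used by the local setup, not prescriptive values.

```yaml
apiVersion: operator.gardener.cloud/v1alpha1
kind: Garden
metadata:
  name: garden
spec:
  # runtimeCluster: ... (required, unchanged by this PR, omitted here)
  virtualCluster:
    etcd:
      main:
        backup:
          provider: local                 # immutable once set
          bucketName: gardener-operator   # bucket must be pre-created externally
          secretRef:                      # credentials for the object store
            name: virtual-garden-etcd-main-backup-local
            namespace: garden
        storage:
          capacity: 25Gi                  # defaults to 10Gi if omitted
      events:
        storage:
          capacity: 10Gi
    maintenance:
      timeWindow:                         # HHMMSS+ZONE, validated by the new CRD pattern
        begin: 220000+0100
        end: 230000+0100
```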
diff --git a/docs/development/priority-classes.md b/docs/development/priority-classes.md index 922f7e0cfd1..c994b757652 100644 --- a/docs/development/priority-classes.md +++ b/docs/development/priority-classes.md @@ -22,7 +22,7 @@ When using the `gardener-operator` for managing the garden runtime and virtual c | Name | Priority | Associated Components (Examples) | |---------------------------------- |-----------|------------------------------------------------------------- | | `gardener-garden-system-critical` | 999999550 | `gardener-operator`, `gardener-resource-manager` | -| `gardener-garden-system-500` | 999999500 | | +| `gardener-garden-system-500` | 999999500 | `virtual-garden-etcd-events`, `virtual-garden-etcd-main` | | `gardener-garden-system-400` | 999999400 | | | `gardener-garden-system-300` | 999999300 | `vpa-admission-controller`, `etcd-druid` | | `gardener-garden-system-200` | 999999200 | `vpa-recommender`, `vpa-updater`, `hvpa-controller` | diff --git a/example/gardener-local/kind/cluster/templates/_extra_mounts.tpl b/example/gardener-local/kind/cluster/templates/_extra_mounts.tpl index 1e11c4754af..3d1c7564e3a 100644 --- a/example/gardener-local/kind/cluster/templates/_extra_mounts.tpl +++ b/example/gardener-local/kind/cluster/templates/_extra_mounts.tpl @@ -6,8 +6,8 @@ {{- end }} {{- end -}} -{{- define "extraMounts.gardener.seed.backupBuckets" -}} -{{- if .Values.gardener.seed.deployed -}} +{{- define "extraMounts.backupBucket" -}} +{{- if .Values.backupBucket.deployed -}} - hostPath: dev/local-backupbuckets containerPath: /etc/gardener/local-backupbuckets {{- end -}} diff --git a/example/gardener-local/kind/cluster/templates/cluster.yaml b/example/gardener-local/kind/cluster/templates/cluster.yaml index 3b719136981..da1595f46e5 100644 --- a/example/gardener-local/kind/cluster/templates/cluster.yaml +++ b/example/gardener-local/kind/cluster/templates/cluster.yaml @@ -12,7 +12,7 @@ nodes: {{ include "extraPortMappings.registry" . | indent 2 }} extraMounts: {{ include "extraMounts.gardener.controlPlane" . | indent 2 }} -{{ include "extraMounts.gardener.seed.backupBuckets" . | indent 2 }} +{{ include "extraMounts.backupBucket" . | indent 2 }} {{ include "extraMounts.registry" . | indent 2 }} kubeadmConfigPatches: {{ include "kubeadmConfigPatches" . 
| indent 2 }} @@ -24,7 +24,7 @@ nodes: topology.kubernetes.io/zone: {{ $worker.zone | quote }} extraMounts: {{ include "extraMounts.gardener.controlPlane" $ | indent 2 }} -{{ include "extraMounts.gardener.seed.backupBuckets" $ | indent 2 }} +{{ include "extraMounts.backupBucket" $ | indent 2 }} kubeadmConfigPatches: {{ include "kubeadmConfigPatches" $ | indent 2 }} {{- end }} diff --git a/example/gardener-local/kind/cluster/values.yaml b/example/gardener-local/kind/cluster/values.yaml index 0da203b646d..61980a3afb7 100644 --- a/example/gardener-local/kind/cluster/values.yaml +++ b/example/gardener-local/kind/cluster/values.yaml @@ -9,6 +9,9 @@ gardener: istio: listenAddress: 127.0.0.1 +backupBucket: + deployed: true + registry: deployed: true hostname: gardener-local-control-plane diff --git a/example/operator/10-crd-operator.gardener.cloud_gardens.yaml b/example/operator/10-crd-operator.gardener.cloud_gardens.yaml index 532e082586e..03db95c93a1 100644 --- a/example/operator/10-crd-operator.gardener.cloud_gardens.yaml +++ b/example/operator/10-crd-operator.gardener.cloud_gardens.yaml @@ -84,8 +84,129 @@ spec: required: - provider type: object + virtualCluster: + description: VirtualCluster contains configuration for the virtual + cluster. + properties: + etcd: + description: ETCD contains configuration for the etcds of the + virtual garden cluster. + properties: + events: + description: Events contains configuration for the events + etcd. + properties: + storage: + description: Storage contains storage configuration. + properties: + capacity: + anyOf: + - type: integer + - type: string + default: 10Gi + description: Capacity is the storage capacity for + the volumes. + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + className: + description: ClassName is the name of a storage class. + type: string + type: object + type: object + main: + description: Main contains configuration for the main etcd. + properties: + backup: + description: Backup contains the object store configuration + for backups for the virtual garden etcd. + properties: + bucketName: + description: BucketName is the name of the backup + bucket. + type: string + x-kubernetes-validations: + - message: Value is immutable + rule: self == oldSelf + provider: + description: Provider is a provider name. This field + is immutable. + type: string + x-kubernetes-validations: + - message: Value is immutable + rule: self == oldSelf + secretRef: + description: SecretRef is a reference to a Secret + object containing the cloud provider credentials + for the object store where backups should be stored. + It should have enough privileges to manipulate the + objects as well as buckets. + properties: + name: + description: name is unique within a namespace + to reference a secret resource. + type: string + namespace: + description: namespace defines the space within + which the secret name must be unique. + type: string + type: object + x-kubernetes-map-type: atomic + required: + - bucketName + - provider + - secretRef + type: object + storage: + description: Storage contains storage configuration. + properties: + capacity: + anyOf: + - type: integer + - type: string + default: 10Gi + description: Capacity is the storage capacity for + the volumes. 
+ pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + className: + description: ClassName is the name of a storage class. + type: string + type: object + type: object + type: object + maintenance: + description: Maintenance contains information about the time window + for maintenance operations. + properties: + timeWindow: + description: TimeWindow contains information about the time + window for maintenance operations. + properties: + begin: + description: Begin is the beginning of the time window + in the format HHMMSS+ZONE, e.g. "220000+0100". If not + present, a random value will be computed. + pattern: ([0-1][0-9]|2[0-3])[0-5][0-9][0-5][0-9]\+[0-1][0-4]00 + type: string + end: + description: End is the end of the time window in the + format HHMMSS+ZONE, e.g. "220000+0100". If not present, + the value will be computed based on the "Begin" value. + pattern: ([0-1][0-9]|2[0-3])[0-5][0-9][0-5][0-9]\+[0-1][0-4]00 + type: string + required: + - begin + - end + type: object + required: + - timeWindow + type: object + required: + - maintenance + type: object required: - runtimeCluster + - virtualCluster type: object status: description: Status contains the status of this garden. diff --git a/example/operator/20-garden.yaml b/example/operator/20-garden.yaml index da80836d5dd..6abf4fb855f 100644 --- a/example/operator/20-garden.yaml +++ b/example/operator/20-garden.yaml @@ -1,4 +1,13 @@ --- +apiVersion: v1 +kind: Secret +metadata: + name: virtual-garden-etcd-main-backup-local + namespace: garden +type: Opaque +stringData: + hostPath: "/etc/gardener/local-backupbuckets" +--- apiVersion: operator.gardener.cloud/v1alpha1 kind: Garden metadata: @@ -11,3 +20,23 @@ spec: settings: verticalPodAutoscaler: enabled: true + virtualCluster: + etcd: + main: + backup: + provider: local + bucketName: gardener-operator + secretRef: + name: virtual-garden-etcd-main-backup-local + namespace: garden + storage: + capacity: 25Gi + # className: default + events: + storage: + capacity: 10Gi + # className: default + maintenance: + timeWindow: + begin: 220000+0100 + end: 230000+0100 diff --git a/pkg/apis/core/v1beta1/generated.proto b/pkg/apis/core/v1beta1/generated.proto index fb435852a56..e95ab61f79e 100644 --- a/pkg/apis/core/v1beta1/generated.proto +++ b/pkg/apis/core/v1beta1/generated.proto @@ -1376,10 +1376,14 @@ message MaintenanceAutoUpdate { message MaintenanceTimeWindow { // Begin is the beginning of the time window in the format HHMMSS+ZONE, e.g. "220000+0100". // If not present, a random value will be computed. + // +kubebuilder:validation:Required + // +kubebuilder:validation:Pattern=`([0-1][0-9]|2[0-3])[0-5][0-9][0-5][0-9]\+[0-1][0-4]00` optional string begin = 1; // End is the end of the time window in the format HHMMSS+ZONE, e.g. "220000+0100". // If not present, the value will be computed based on the "Begin" value. 
+ // +kubebuilder:validation:Required + // +kubebuilder:validation:Pattern=`([0-1][0-9]|2[0-3])[0-5][0-9][0-5][0-9]\+[0-1][0-4]00` optional string end = 2; } diff --git a/pkg/apis/core/v1beta1/types_shoot.go b/pkg/apis/core/v1beta1/types_shoot.go index 89f2a9e19b0..be9b37e3f98 100644 --- a/pkg/apis/core/v1beta1/types_shoot.go +++ b/pkg/apis/core/v1beta1/types_shoot.go @@ -319,9 +319,9 @@ type ShootAdvertisedAddress struct { URL string `json:"url" protobuf:"bytes,2,opt,name=url"` } -////////////////////////////////////////////////////////////////////////////////////////////////// +// //////////////////////////////////////////////////////////////////////////////////////////////// // Addons relevant types // -////////////////////////////////////////////////////////////////////////////////////////////////// +// //////////////////////////////////////////////////////////////////////////////////////////////// // Addons is a collection of configuration for specific addons which are managed by the Gardener. type Addons struct { @@ -371,9 +371,9 @@ type NginxIngress struct { ExternalTrafficPolicy *corev1.ServiceExternalTrafficPolicyType `json:"externalTrafficPolicy,omitempty" protobuf:"bytes,4,opt,name=externalTrafficPolicy,casttype=k8s.io/api/core/v1.ServiceExternalTrafficPolicyType"` } -////////////////////////////////////////////////////////////////////////////////////////////////// +// //////////////////////////////////////////////////////////////////////////////////////////////// // ControlPlane relevant types // -////////////////////////////////////////////////////////////////////////////////////////////////// +// //////////////////////////////////////////////////////////////////////////////////////////////// // ControlPlane holds information about the general settings for the control plane of a shoot. type ControlPlane struct { @@ -383,9 +383,9 @@ type ControlPlane struct { HighAvailability *HighAvailability `json:"highAvailability" protobuf:"bytes,1,name=highAvailability"` } -////////////////////////////////////////////////////////////////////////////////////////////////// +// //////////////////////////////////////////////////////////////////////////////////////////////// // DNS relevant types // -////////////////////////////////////////////////////////////////////////////////////////////////// +// //////////////////////////////////////////////////////////////////////////////////////////////// // DNS holds information about the provider, the hosted zone id and the domain. type DNS struct { @@ -436,9 +436,9 @@ type DNSIncludeExclude struct { // DefaultDomain is the default value in the Shoot's '.spec.dns.domain' when '.spec.dns.provider' is 'unmanaged' const DefaultDomain = "cluster.local" -////////////////////////////////////////////////////////////////////////////////////////////////// +// //////////////////////////////////////////////////////////////////////////////////////////////// // Extension relevant types // -////////////////////////////////////////////////////////////////////////////////////////////////// +// //////////////////////////////////////////////////////////////////////////////////////////////// // Extension contains type and provider information for Shoot extensions. 
type Extension struct { @@ -452,9 +452,9 @@ type Extension struct { Disabled *bool `json:"disabled,omitempty" protobuf:"varint,3,opt,name=disabled"` } -////////////////////////////////////////////////////////////////////////////////////////////////// +// //////////////////////////////////////////////////////////////////////////////////////////////// // NamedResourceReference relevant types // -////////////////////////////////////////////////////////////////////////////////////////////////// +// //////////////////////////////////////////////////////////////////////////////////////////////// // NamedResourceReference is a named reference to a resource. type NamedResourceReference struct { @@ -464,9 +464,9 @@ type NamedResourceReference struct { ResourceRef autoscalingv1.CrossVersionObjectReference `json:"resourceRef" protobuf:"bytes,2,opt,name=resourceRef"` } -////////////////////////////////////////////////////////////////////////////////////////////////// +// //////////////////////////////////////////////////////////////////////////////////////////////// // Hibernation relevant types // -////////////////////////////////////////////////////////////////////////////////////////////////// +// //////////////////////////////////////////////////////////////////////////////////////////////// // Hibernation contains information whether the Shoot is suspended or not. type Hibernation struct { @@ -494,9 +494,9 @@ type HibernationSchedule struct { Location *string `json:"location,omitempty" protobuf:"bytes,3,opt,name=location"` } -////////////////////////////////////////////////////////////////////////////////////////////////// +// //////////////////////////////////////////////////////////////////////////////////////////////// // Kubernetes relevant types // -////////////////////////////////////////////////////////////////////////////////////////////////// +// //////////////////////////////////////////////////////////////////////////////////////////////// // Kubernetes contains the version and configuration variables for the Shoot control plane. type Kubernetes struct { @@ -1146,9 +1146,9 @@ type KubeletConfigReserved struct { PID *resource.Quantity `json:"pid,omitempty" protobuf:"bytes,4,opt,name=pid"` } -////////////////////////////////////////////////////////////////////////////////////////////////// +// //////////////////////////////////////////////////////////////////////////////////////////////// // Networking relevant types // -////////////////////////////////////////////////////////////////////////////////////////////////// +// //////////////////////////////////////////////////////////////////////////////////////////////// // Networking defines networking parameters for the shoot cluster. type Networking struct { @@ -1175,9 +1175,9 @@ const ( DefaultServiceNetworkCIDR = "100.64.0.0/13" ) -////////////////////////////////////////////////////////////////////////////////////////////////// +// //////////////////////////////////////////////////////////////////////////////////////////////// // Maintenance relevant types // -////////////////////////////////////////////////////////////////////////////////////////////////// +// //////////////////////////////////////////////////////////////////////////////////////////////// const ( // MaintenanceTimeWindowDurationMinimum is the minimum duration for a maintenance time window. @@ -1214,15 +1214,19 @@ type MaintenanceAutoUpdate struct { type MaintenanceTimeWindow struct { // Begin is the beginning of the time window in the format HHMMSS+ZONE, e.g. 
"220000+0100". // If not present, a random value will be computed. + // +kubebuilder:validation:Required + // +kubebuilder:validation:Pattern=`([0-1][0-9]|2[0-3])[0-5][0-9][0-5][0-9]\+[0-1][0-4]00` Begin string `json:"begin" protobuf:"bytes,1,opt,name=begin"` // End is the end of the time window in the format HHMMSS+ZONE, e.g. "220000+0100". // If not present, the value will be computed based on the "Begin" value. + // +kubebuilder:validation:Required + // +kubebuilder:validation:Pattern=`([0-1][0-9]|2[0-3])[0-5][0-9][0-5][0-9]\+[0-1][0-4]00` End string `json:"end" protobuf:"bytes,2,opt,name=end"` } -////////////////////////////////////////////////////////////////////////////////////////////////// +// //////////////////////////////////////////////////////////////////////////////////////////////// // Monitoring relevant types // -////////////////////////////////////////////////////////////////////////////////////////////////// +// //////////////////////////////////////////////////////////////////////////////////////////////// // Monitoring contains information about the monitoring configuration for the shoot. type Monitoring struct { @@ -1238,9 +1242,9 @@ type Alerting struct { EmailReceivers []string `json:"emailReceivers,omitempty" protobuf:"bytes,1,rep,name=emailReceivers"` } -////////////////////////////////////////////////////////////////////////////////////////////////// +// //////////////////////////////////////////////////////////////////////////////////////////////// // Provider relevant types // -////////////////////////////////////////////////////////////////////////////////////////////////// +// //////////////////////////////////////////////////////////////////////////////////////////////// // Provider contains provider-specific information that are handed-over to the provider-specific // extension controller. @@ -1453,9 +1457,9 @@ var ( DefaultWorkerSystemComponentsAllow = true ) -////////////////////////////////////////////////////////////////////////////////////////////////// +// //////////////////////////////////////////////////////////////////////////////////////////////// // System components relevant types // -////////////////////////////////////////////////////////////////////////////////////////////////// +// //////////////////////////////////////////////////////////////////////////////////////////////// // SystemComponents contains the settings of system components in the control or data plane of the Shoot cluster. type SystemComponents struct { @@ -1519,9 +1523,9 @@ type NodeLocalDNS struct { DisableForwardToUpstreamDNS *bool `json:"disableForwardToUpstreamDNS,omitempty" protobuf:"varint,4,opt,name=disableForwardToUpstreamDNS"` } -////////////////////////////////////////////////////////////////////////////////////////////////// +// //////////////////////////////////////////////////////////////////////////////////////////////// // Other/miscellaneous constants and types // -////////////////////////////////////////////////////////////////////////////////////////////////// +// //////////////////////////////////////////////////////////////////////////////////////////////// const ( // ShootMaintenanceFailed indicates that a shoot maintenance operation failed. 
diff --git a/pkg/apis/operator/v1alpha1/types.go b/pkg/apis/operator/v1alpha1/types.go index c34a5ad1e40..1fe50e37fa3 100644 --- a/pkg/apis/operator/v1alpha1/types.go +++ b/pkg/apis/operator/v1alpha1/types.go @@ -15,9 +15,11 @@ package v1alpha1 import ( - gardencorev1beta1 "github.com/gardener/gardener/pkg/apis/core/v1beta1" - + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + + gardencorev1beta1 "github.com/gardener/gardener/pkg/apis/core/v1beta1" ) // +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object @@ -53,6 +55,8 @@ type GardenList struct { type GardenSpec struct { // RuntimeCluster contains configuration for the runtime cluster. RuntimeCluster RuntimeCluster `json:"runtimeCluster"` + // VirtualCluster contains configuration for the virtual cluster. + VirtualCluster VirtualCluster `json:"virtualCluster"` } // RuntimeCluster contains configuration for the runtime cluster. @@ -91,6 +95,72 @@ type SettingVerticalPodAutoscaler struct { Enabled *bool `json:"enabled,omitempty"` } +// VirtualCluster contains configuration for the virtual cluster. +type VirtualCluster struct { + // ETCD contains configuration for the etcds of the virtual garden cluster. + // +optional + ETCD *ETCD `json:"etcd,omitempty"` + // Maintenance contains information about the time window for maintenance operations. + Maintenance Maintenance `json:"maintenance"` +} + +// ETCD contains configuration for the etcds of the virtual garden cluster. +type ETCD struct { + // Main contains configuration for the main etcd. + // +optional + Main *ETCDMain `json:"main,omitempty"` + // Events contains configuration for the events etcd. + // +optional + Events *ETCDEvents `json:"events,omitempty"` +} + +// ETCDMain contains configuration for the main etcd. +type ETCDMain struct { + // Backup contains the object store configuration for backups for the virtual garden etcd. + // +optional + Backup *Backup `json:"backup,omitempty"` + // Storage contains storage configuration. + // +optional + Storage *Storage `json:"storage,omitempty"` +} + +// ETCDEvents contains configuration for the events etcd. +type ETCDEvents struct { + // Storage contains storage configuration. + // +optional + Storage *Storage `json:"storage,omitempty"` +} + +// Storage contains storage configuration. +type Storage struct { + // Capacity is the storage capacity for the volumes. + // +kubebuilder:default=`10Gi` + // +optional + Capacity *resource.Quantity `json:"capacity,omitempty"` + // ClassName is the name of a storage class. + // +optional + ClassName *string `json:"className,omitempty"` +} + +// Backup contains the object store configuration for backups for the virtual garden etcd. +type Backup struct { + // Provider is a provider name. This field is immutable. + // +kubebuilder:validation:XValidation:rule="self == oldSelf",message="Value is immutable" + Provider string `json:"provider"` + // BucketName is the name of the backup bucket. + // +kubebuilder:validation:XValidation:rule="self == oldSelf",message="Value is immutable" + BucketName string `json:"bucketName"` + // SecretRef is a reference to a Secret object containing the cloud provider credentials for the object store where + // backups should be stored. It should have enough privileges to manipulate the objects as well as buckets. + SecretRef corev1.SecretReference `json:"secretRef"` +} + +// Maintenance contains information about the time window for maintenance operations. 
+type Maintenance struct { + // TimeWindow contains information about the time window for maintenance operations. + TimeWindow gardencorev1beta1.MaintenanceTimeWindow `json:"timeWindow"` +} + // GardenStatus is the status of a garden environment. type GardenStatus struct { // Gardener holds information about the Gardener which last acted on the Garden. diff --git a/pkg/apis/operator/v1alpha1/zz_generated.deepcopy.go b/pkg/apis/operator/v1alpha1/zz_generated.deepcopy.go index abe83f787c3..5503894a2ea 100644 --- a/pkg/apis/operator/v1alpha1/zz_generated.deepcopy.go +++ b/pkg/apis/operator/v1alpha1/zz_generated.deepcopy.go @@ -26,6 +26,96 @@ import ( runtime "k8s.io/apimachinery/pkg/runtime" ) +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *Backup) DeepCopyInto(out *Backup) { + *out = *in + out.SecretRef = in.SecretRef + return +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new Backup. +func (in *Backup) DeepCopy() *Backup { + if in == nil { + return nil + } + out := new(Backup) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *ETCD) DeepCopyInto(out *ETCD) { + *out = *in + if in.Main != nil { + in, out := &in.Main, &out.Main + *out = new(ETCDMain) + (*in).DeepCopyInto(*out) + } + if in.Events != nil { + in, out := &in.Events, &out.Events + *out = new(ETCDEvents) + (*in).DeepCopyInto(*out) + } + return +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ETCD. +func (in *ETCD) DeepCopy() *ETCD { + if in == nil { + return nil + } + out := new(ETCD) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *ETCDEvents) DeepCopyInto(out *ETCDEvents) { + *out = *in + if in.Storage != nil { + in, out := &in.Storage, &out.Storage + *out = new(Storage) + (*in).DeepCopyInto(*out) + } + return +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ETCDEvents. +func (in *ETCDEvents) DeepCopy() *ETCDEvents { + if in == nil { + return nil + } + out := new(ETCDEvents) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *ETCDMain) DeepCopyInto(out *ETCDMain) { + *out = *in + if in.Backup != nil { + in, out := &in.Backup, &out.Backup + *out = new(Backup) + **out = **in + } + if in.Storage != nil { + in, out := &in.Storage, &out.Storage + *out = new(Storage) + (*in).DeepCopyInto(*out) + } + return +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ETCDMain. +func (in *ETCDMain) DeepCopy() *ETCDMain { + if in == nil { + return nil + } + out := new(ETCDMain) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
func (in *Garden) DeepCopyInto(out *Garden) { *out = *in @@ -91,6 +181,7 @@ func (in *GardenList) DeepCopyObject() runtime.Object { func (in *GardenSpec) DeepCopyInto(out *GardenSpec) { *out = *in in.RuntimeCluster.DeepCopyInto(&out.RuntimeCluster) + in.VirtualCluster.DeepCopyInto(&out.VirtualCluster) return } @@ -132,6 +223,23 @@ func (in *GardenStatus) DeepCopy() *GardenStatus { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *Maintenance) DeepCopyInto(out *Maintenance) { + *out = *in + out.TimeWindow = in.TimeWindow + return +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new Maintenance. +func (in *Maintenance) DeepCopy() *Maintenance { + if in == nil { + return nil + } + out := new(Maintenance) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *Provider) DeepCopyInto(out *Provider) { *out = *in @@ -216,3 +324,51 @@ func (in *Settings) DeepCopy() *Settings { in.DeepCopyInto(out) return out } + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *Storage) DeepCopyInto(out *Storage) { + *out = *in + if in.Capacity != nil { + in, out := &in.Capacity, &out.Capacity + x := (*in).DeepCopy() + *out = &x + } + if in.ClassName != nil { + in, out := &in.ClassName, &out.ClassName + *out = new(string) + **out = **in + } + return +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new Storage. +func (in *Storage) DeepCopy() *Storage { + if in == nil { + return nil + } + out := new(Storage) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *VirtualCluster) DeepCopyInto(out *VirtualCluster) { + *out = *in + if in.ETCD != nil { + in, out := &in.ETCD, &out.ETCD + *out = new(ETCD) + (*in).DeepCopyInto(*out) + } + out.Maintenance = in.Maintenance + return +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new VirtualCluster. +func (in *VirtualCluster) DeepCopy() *VirtualCluster { + if in == nil { + return nil + } + out := new(VirtualCluster) + in.DeepCopyInto(out) + return out +} diff --git a/pkg/operation/botanist/component/etcd/etcd.go b/pkg/operation/botanist/component/etcd/etcd.go index 839b2cd5752..868a1fd48bb 100644 --- a/pkg/operation/botanist/component/etcd/etcd.go +++ b/pkg/operation/botanist/component/etcd/etcd.go @@ -17,6 +17,7 @@ package etcd import ( "context" "fmt" + "strings" "time" druidv1alpha1 "github.com/gardener/etcd-druid/api/v1alpha1" @@ -124,36 +125,21 @@ type Interface interface { // New creates a new instance of DeployWaiter for the Etcd. 
func New( - c client.Client, log logr.Logger, + c client.Client, namespace string, secretsManager secretsmanager.Interface, - role string, - class Class, - failureToleranceType *gardencorev1beta1.FailureToleranceType, - replicas *int32, - storageCapacity string, - defragmentationSchedule *string, - caRotationPhase gardencorev1beta1.ShootCredentialsRotationPhase, - k8sVersion string, + values Values, ) Interface { - name := "etcd-" + role + name := values.NamePrefix + "etcd-" + values.Role log = log.WithValues("etcd", client.ObjectKey{Namespace: namespace, Name: name}) return &etcd{ - client: c, - log: log, - namespace: namespace, - secretsManager: secretsManager, - role: role, - class: class, - failureToleranceType: failureToleranceType, - replicas: replicas, - storageCapacity: storageCapacity, - defragmentationSchedule: defragmentationSchedule, - caRotationPhase: caRotationPhase, - k8sVersion: k8sVersion, - + client: c, + log: log, + namespace: namespace, + secretsManager: secretsManager, + values: values, etcd: &druidv1alpha1.Etcd{ ObjectMeta: metav1.ObjectMeta{ Name: name, @@ -164,25 +150,33 @@ func New( } type etcd struct { - client client.Client - log logr.Logger - namespace string - secretsManager secretsmanager.Interface - role string - class Class - failureToleranceType *gardencorev1beta1.FailureToleranceType - replicas *int32 - storageCapacity string - defragmentationSchedule *string - caRotationPhase gardencorev1beta1.ShootCredentialsRotationPhase - k8sVersion string - etcd *druidv1alpha1.Etcd - backupConfig *BackupConfig - hvpaConfig *HVPAConfig + client client.Client + log logr.Logger + namespace string + secretsManager secretsmanager.Interface + values Values + etcd *druidv1alpha1.Etcd +} + +// Values are the configuration values for the ETCD. 
+type Values struct { + NamePrefix string + Role string + Class Class + FailureToleranceType *gardencorev1beta1.FailureToleranceType + Replicas *int32 + StorageCapacity string + StorageClassName *string + DefragmentationSchedule *string + CARotationPhase gardencorev1beta1.ShootCredentialsRotationPhase + K8sVersion string + BackupConfig *BackupConfig + HvpaConfig *HVPAConfig + PriorityClassName string } func (e *etcd) hasHAControlPlane() bool { - return helper.IsFailureToleranceTypeNode(e.failureToleranceType) || helper.IsFailureToleranceTypeZone(e.failureToleranceType) + return helper.IsFailureToleranceTypeNode(e.values.FailureToleranceType) || helper.IsFailureToleranceTypeZone(e.values.FailureToleranceType) } func (e *etcd) Deploy(ctx context.Context) error { @@ -227,7 +221,7 @@ func (e *etcd) Deploy(ctx context.Context) error { resourcesEtcd, resourcesBackupRestore = e.computeContainerResources(existingSts) quota = resource.MustParse("8Gi") - storageCapacity = resource.MustParse(e.storageCapacity) + storageCapacity = resource.MustParse(e.values.StorageCapacity) garbageCollectionPolicy = druidv1alpha1.GarbageCollectionPolicy(druidv1alpha1.GarbageCollectionPolicyExponential) garbageCollectionPeriod = metav1.Duration{Duration: 12 * time.Hour} compressionPolicy = druidv1alpha1.GzipCompression @@ -245,10 +239,10 @@ func (e *etcd) Deploy(ctx context.Context) error { } ) - if e.class == ClassImportant { + if e.values.Class == ClassImportant { annotations = map[string]string{"cluster-autoscaler.kubernetes.io/safe-to-evict": "false"} metrics = druidv1alpha1.Extensive - volumeClaimTemplate = e.role + "-etcd" + volumeClaimTemplate = e.values.Role + "-" + strings.TrimSuffix(e.etcd.Name, "-"+e.values.Role) minAllowed = corev1.ResourceList{ corev1.ResourceCPU: resource.MustParse("200m"), corev1.ResourceMemory: resource.MustParse("700M"), @@ -261,7 +255,7 @@ func (e *etcd) Deploy(ctx context.Context) error { } serverSecret, err := e.secretsManager.Generate(ctx, &secretutils.CertificateSecretConfig{ - Name: secretNamePrefixServer + e.role, + Name: secretNamePrefixServer + e.values.Role, CommonName: "etcd-server", DNSNames: e.clientServiceDNSNames(), CertType: secretutils.ServerClientCert, @@ -291,97 +285,37 @@ func (e *etcd) Deploy(ctx context.Context) error { return err } - if _, err := controllerutils.GetAndCreateOrMergePatch(ctx, e.client, clientNetworkPolicy, func() error { - clientNetworkPolicy.Annotations = map[string]string{ - v1beta1constants.GardenerDescription: "Allows Ingress to etcd pods from the Shoot's Kubernetes API Server.", - } - clientNetworkPolicy.Labels = map[string]string{ - v1beta1constants.GardenRole: v1beta1constants.GardenRoleControlPlane, - } - clientNetworkPolicy.Spec.PodSelector = metav1.LabelSelector{ - MatchLabels: GetLabels(), - } - clientNetworkPolicy.Spec.Ingress = []networkingv1.NetworkPolicyIngressRule{ - { - From: []networkingv1.NetworkPolicyPeer{ - { - PodSelector: &metav1.LabelSelector{ - // TODO: Replace below map with a function call to the to-be-introduced kubeapiserver package. 
- MatchLabels: map[string]string{ - v1beta1constants.GardenRole: v1beta1constants.GardenRoleControlPlane, - v1beta1constants.LabelApp: v1beta1constants.LabelKubernetes, - v1beta1constants.LabelRole: v1beta1constants.LabelAPIServer, - }, - }, - }, - { - PodSelector: &metav1.LabelSelector{ - MatchLabels: monitoring.GetPrometheusLabels(), - }, - }, - }, - Ports: []networkingv1.NetworkPolicyPort{ - { - Protocol: &protocolTCP, - Port: &intStrPortEtcdClient, - }, - { - Protocol: &protocolTCP, - Port: &intStrPortBackupRestore, - }, - }, - }, - } - clientNetworkPolicy.Spec.Egress = nil - clientNetworkPolicy.Spec.PolicyTypes = []networkingv1.PolicyType{networkingv1.PolicyTypeIngress} - return nil - }); err != nil { - return err - } - - // create peer network policy only if the shoot has a HA control plane - if e.hasHAControlPlane() { - if _, err := controllerutils.GetAndCreateOrMergePatch(ctx, e.client, peerNetworkPolicy, func() error { - peerNetworkPolicy.Annotations = map[string]string{ - v1beta1constants.GardenerDescription: "Allows Ingress to etcd pods from etcd pods for peer communication.", + // Without this if condition, both `etcdMain` and `etcdEvents` component deployers execute this code. However, these + // network policies are not specified to them (they apply to both because there is no `role` label in the selector). + // Hence, it doesn't make sense if both component deployers are running this code - let's only do it for the main + // ETCD. + if e.values.Role == v1beta1constants.ETCDRoleMain { + if _, err := controllerutils.GetAndCreateOrMergePatch(ctx, e.client, clientNetworkPolicy, func() error { + clientNetworkPolicy.Annotations = map[string]string{ + v1beta1constants.GardenerDescription: "Allows Ingress to etcd pods from the Shoot's Kubernetes API Server.", } - peerNetworkPolicy.Labels = map[string]string{ + clientNetworkPolicy.Labels = map[string]string{ v1beta1constants.GardenRole: v1beta1constants.GardenRoleControlPlane, } - peerNetworkPolicy.Spec.PodSelector = metav1.LabelSelector{ + clientNetworkPolicy.Spec.PodSelector = metav1.LabelSelector{ MatchLabels: GetLabels(), } - peerNetworkPolicy.Spec.Egress = []networkingv1.NetworkPolicyEgressRule{ + clientNetworkPolicy.Spec.Ingress = []networkingv1.NetworkPolicyIngressRule{ { - Ports: []networkingv1.NetworkPolicyPort{ - { - Protocol: &protocolTCP, - Port: &intStrPortEtcdClient, - }, - { - Protocol: &protocolTCP, - Port: &intStrPortBackupRestore, - }, - { - Protocol: &protocolTCP, - Port: &intStrPortEtcdPeer, - }, - }, - To: []networkingv1.NetworkPolicyPeer{ + From: []networkingv1.NetworkPolicyPeer{ { PodSelector: &metav1.LabelSelector{ - MatchLabels: GetLabels(), + // TODO: Replace below map with a function call to the to-be-introduced kubeapiserver package. 
+ MatchLabels: map[string]string{ + v1beta1constants.GardenRole: v1beta1constants.GardenRoleControlPlane, + v1beta1constants.LabelApp: v1beta1constants.LabelKubernetes, + v1beta1constants.LabelRole: v1beta1constants.LabelAPIServer, + }, }, }, - }, - }, - } - peerNetworkPolicy.Spec.Ingress = []networkingv1.NetworkPolicyIngressRule{ - { - From: []networkingv1.NetworkPolicyPeer{ { PodSelector: &metav1.LabelSelector{ - MatchLabels: GetLabels(), + MatchLabels: monitoring.GetPrometheusLabels(), }, }, }, @@ -394,21 +328,87 @@ func (e *etcd) Deploy(ctx context.Context) error { Protocol: &protocolTCP, Port: &intStrPortBackupRestore, }, - { - Protocol: &protocolTCP, - Port: &intStrPortEtcdPeer, - }, }, }, } - peerNetworkPolicy.Spec.PolicyTypes = []networkingv1.PolicyType{ - networkingv1.PolicyTypeIngress, - networkingv1.PolicyTypeEgress, - } + clientNetworkPolicy.Spec.Egress = nil + clientNetworkPolicy.Spec.PolicyTypes = []networkingv1.PolicyType{networkingv1.PolicyTypeIngress} return nil }); err != nil { return err } + + // create peer network policy only if the shoot has a HA control plane + if e.hasHAControlPlane() { + if _, err := controllerutils.GetAndCreateOrMergePatch(ctx, e.client, peerNetworkPolicy, func() error { + peerNetworkPolicy.Annotations = map[string]string{ + v1beta1constants.GardenerDescription: "Allows Ingress to etcd pods from etcd pods for peer communication.", + } + peerNetworkPolicy.Labels = map[string]string{ + v1beta1constants.GardenRole: v1beta1constants.GardenRoleControlPlane, + } + peerNetworkPolicy.Spec.PodSelector = metav1.LabelSelector{ + MatchLabels: GetLabels(), + } + peerNetworkPolicy.Spec.Egress = []networkingv1.NetworkPolicyEgressRule{ + { + Ports: []networkingv1.NetworkPolicyPort{ + { + Protocol: &protocolTCP, + Port: &intStrPortEtcdClient, + }, + { + Protocol: &protocolTCP, + Port: &intStrPortBackupRestore, + }, + { + Protocol: &protocolTCP, + Port: &intStrPortEtcdPeer, + }, + }, + To: []networkingv1.NetworkPolicyPeer{ + { + PodSelector: &metav1.LabelSelector{ + MatchLabels: GetLabels(), + }, + }, + }, + }, + } + peerNetworkPolicy.Spec.Ingress = []networkingv1.NetworkPolicyIngressRule{ + { + From: []networkingv1.NetworkPolicyPeer{ + { + PodSelector: &metav1.LabelSelector{ + MatchLabels: GetLabels(), + }, + }, + }, + Ports: []networkingv1.NetworkPolicyPort{ + { + Protocol: &protocolTCP, + Port: &intStrPortEtcdClient, + }, + { + Protocol: &protocolTCP, + Port: &intStrPortBackupRestore, + }, + { + Protocol: &protocolTCP, + Port: &intStrPortEtcdPeer, + }, + }, + }, + } + peerNetworkPolicy.Spec.PolicyTypes = []networkingv1.PolicyType{ + networkingv1.PolicyTypeIngress, + networkingv1.PolicyTypeEgress, + } + return nil + }); err != nil { + return err + } + } } if _, err := controllerutils.GetAndCreateOrMergePatch(ctx, e.client, e.etcd, func() error { @@ -416,11 +416,11 @@ func (e *etcd) Deploy(ctx context.Context) error { metav1.SetMetaDataAnnotation(&e.etcd.ObjectMeta, v1beta1constants.GardenerTimestamp, TimeNow().UTC().String()) e.etcd.Labels = map[string]string{ - v1beta1constants.LabelRole: e.role, + v1beta1constants.LabelRole: e.values.Role, v1beta1constants.GardenRole: v1beta1constants.GardenRoleControlPlane, } e.etcd.Spec.Replicas = replicas - e.etcd.Spec.PriorityClassName = pointer.String(v1beta1constants.PriorityClassNameShootControlPlane500) + e.etcd.Spec.PriorityClassName = &e.values.PriorityClassName e.etcd.Spec.Annotations = annotations e.etcd.Spec.Labels = utils.MergeStringMaps(e.getRoleLabels(), map[string]string{ v1beta1constants.LabelApp: 
LabelAppValue, @@ -502,48 +502,49 @@ func (e *etcd) Deploy(ctx context.Context) error { SnapshotCompression: &compressionSpec, } - if e.backupConfig != nil { + if e.values.BackupConfig != nil { var ( - provider = druidv1alpha1.StorageProvider(e.backupConfig.Provider) + provider = druidv1alpha1.StorageProvider(e.values.BackupConfig.Provider) deltaSnapshotPeriod = metav1.Duration{Duration: 5 * time.Minute} deltaSnapshotMemoryLimit = resource.MustParse("100Mi") ) e.etcd.Spec.Backup.Store = &druidv1alpha1.StoreSpec{ - SecretRef: &corev1.SecretReference{Name: e.backupConfig.SecretRefName}, - Container: &e.backupConfig.Container, + SecretRef: &corev1.SecretReference{Name: e.values.BackupConfig.SecretRefName}, + Container: &e.values.BackupConfig.Container, Provider: &provider, - Prefix: fmt.Sprintf("%s/etcd-%s", e.backupConfig.Prefix, e.role), + Prefix: fmt.Sprintf("%s/etcd-%s", e.values.BackupConfig.Prefix, e.values.Role), } e.etcd.Spec.Backup.FullSnapshotSchedule = e.computeFullSnapshotSchedule(existingEtcd) e.etcd.Spec.Backup.DeltaSnapshotPeriod = &deltaSnapshotPeriod e.etcd.Spec.Backup.DeltaSnapshotMemoryLimit = &deltaSnapshotMemoryLimit - if e.backupConfig.LeaderElection != nil { + if e.values.BackupConfig.LeaderElection != nil { e.etcd.Spec.Backup.LeaderElection = &druidv1alpha1.LeaderElectionSpec{ - EtcdConnectionTimeout: e.backupConfig.LeaderElection.EtcdConnectionTimeout, - ReelectionPeriod: e.backupConfig.LeaderElection.ReelectionPeriod, + EtcdConnectionTimeout: e.values.BackupConfig.LeaderElection.EtcdConnectionTimeout, + ReelectionPeriod: e.values.BackupConfig.LeaderElection.ReelectionPeriod, } } } e.etcd.Spec.StorageCapacity = &storageCapacity + e.etcd.Spec.StorageClass = e.values.StorageClassName e.etcd.Spec.VolumeClaimTemplate = &volumeClaimTemplate return nil }); err != nil { return err } - if e.hvpaConfig != nil && e.hvpaConfig.Enabled { + if e.values.HvpaConfig != nil && e.values.HvpaConfig.Enabled { var ( - hpaLabels = map[string]string{v1beta1constants.LabelRole: "etcd-hpa-" + e.role} - vpaLabels = map[string]string{v1beta1constants.LabelRole: "etcd-vpa-" + e.role} + hpaLabels = map[string]string{v1beta1constants.LabelRole: "etcd-hpa-" + e.values.Role} + vpaLabels = map[string]string{v1beta1constants.LabelRole: "etcd-vpa-" + e.values.Role} updateModeAuto = hvpav1alpha1.UpdateModeAuto containerPolicyOff = vpaautoscalingv1.ContainerScalingModeOff controlledValues = vpaautoscalingv1.ContainerControlledValuesRequestsOnly ) - scaleDownUpdateMode := e.hvpaConfig.ScaleDownUpdateMode + scaleDownUpdateMode := e.values.HvpaConfig.ScaleDownUpdateMode if scaleDownUpdateMode == nil { scaleDownUpdateMode = pointer.String(hvpav1alpha1.UpdateModeMaintenanceWindow) } @@ -554,8 +555,8 @@ func (e *etcd) Deploy(ctx context.Context) error { }) hvpa.Spec.Replicas = pointer.Int32(1) hvpa.Spec.MaintenanceTimeWindow = &hvpav1alpha1.MaintenanceTimeWindow{ - Begin: e.hvpaConfig.MaintenanceTimeWindow.Begin, - End: e.hvpaConfig.MaintenanceTimeWindow.End, + Begin: e.values.HvpaConfig.MaintenanceTimeWindow.Begin, + End: e.values.HvpaConfig.MaintenanceTimeWindow.End, } hvpa.Spec.Hpa = hvpav1alpha1.HpaSpec{ Selector: &metav1.LabelSelector{MatchLabels: hpaLabels}, @@ -707,7 +708,7 @@ func (e *etcd) Destroy(ctx context.Context) error { func (e *etcd) getRoleLabels() map[string]string { return utils.MergeStringMaps(map[string]string{ v1beta1constants.GardenRole: v1beta1constants.GardenRoleControlPlane, - v1beta1constants.LabelRole: e.role, + v1beta1constants.LabelRole: e.values.Role, }) } @@ -728,7 +729,7 @@ 
func (e *etcd) emptyHVPA() *hvpav1alpha1.Hvpa { } func (e *etcd) Snapshot(ctx context.Context, podExecutor kubernetes.PodExecutor) error { - if e.backupConfig == nil { + if e.values.BackupConfig == nil { return fmt.Errorf("no backup is configured for this etcd, cannot make a snapshot") } @@ -747,27 +748,29 @@ func (e *etcd) Snapshot(ctx context.Context, podExecutor kubernetes.PodExecutor) podsList.Items[0].GetName(), containerNameBackupRestore, "/bin/sh", - fmt.Sprintf("curl -k https://etcd-%s-local:%d/snapshot/full?final=true", e.role, PortBackupRestore), + fmt.Sprintf("curl -k https://etcd-%s-local:%d/snapshot/full?final=true", e.values.Role, PortBackupRestore), ) return err } func (e *etcd) clientServiceDNSNames() []string { var domainNames []string - domainNames = append(domainNames, fmt.Sprintf("etcd-%s-local", e.role)) - domainNames = append(domainNames, kutil.DNSNamesForService(fmt.Sprintf("etcd-%s-client", e.role), e.namespace)...) + domainNames = append(domainNames, fmt.Sprintf("%s-local", e.etcd.Name)) + domainNames = append(domainNames, kutil.DNSNamesForService(fmt.Sprintf("%s-client", e.etcd.Name), e.namespace)...) // The peer service needs to be considered here since the etcd-backup-restore side-car // connects to member pods via pod domain names (e.g. for defragmentation). // See https://github.com/gardener/etcd-backup-restore/issues/494 - domainNames = append(domainNames, kutil.DNSNamesForService(fmt.Sprintf("*.etcd-%s-peer", e.role), e.namespace)...) + domainNames = append(domainNames, kutil.DNSNamesForService(fmt.Sprintf("*.%s-peer", e.etcd.Name), e.namespace)...) return domainNames } func (e *etcd) peerServiceDNSNames() []string { - return append(kutil.DNSNamesForService(fmt.Sprintf("etcd-%s-peer", e.role), e.namespace), - kutil.DNSNamesForService(fmt.Sprintf("*.etcd-%s-peer", e.role), e.namespace)...) 
+ return append( + kutil.DNSNamesForService(fmt.Sprintf("%s-peer", e.etcd.Name), e.namespace), + kutil.DNSNamesForService(fmt.Sprintf("*.%s-peer", e.etcd.Name), e.namespace)..., + ) } // Get retrieves the Etcd resource @@ -778,8 +781,8 @@ func (e *etcd) Get(ctx context.Context) (*druidv1alpha1.Etcd, error) { return e.etcd, nil } -func (e *etcd) SetBackupConfig(backupConfig *BackupConfig) { e.backupConfig = backupConfig } -func (e *etcd) SetHVPAConfig(hvpaConfig *HVPAConfig) { e.hvpaConfig = hvpaConfig } +func (e *etcd) SetBackupConfig(backupConfig *BackupConfig) { e.values.BackupConfig = backupConfig } +func (e *etcd) SetHVPAConfig(hvpaConfig *HVPAConfig) { e.values.HvpaConfig = hvpaConfig } func (e *etcd) Scale(ctx context.Context, replicas int32) error { etcdObj := &druidv1alpha1.Etcd{} @@ -856,7 +859,7 @@ func (e *etcd) RolloutPeerCA(ctx context.Context) error { func (e *etcd) podLabelSelector() labels.Selector { app, _ := labels.NewRequirement(v1beta1constants.LabelApp, selection.Equals, []string{LabelAppValue}) - role, _ := labels.NewRequirement(v1beta1constants.LabelRole, selection.Equals, []string{e.role}) + role, _ := labels.NewRequirement(v1beta1constants.LabelRole, selection.Equals, []string{e.values.Role}) return labels.NewSelector().Add(*role, *app) } @@ -876,7 +879,7 @@ func (e *etcd) computeContainerResources(existingSts *appsv1.StatefulSet) (*core } ) - if existingSts != nil && e.hvpaConfig != nil && e.hvpaConfig.Enabled { + if existingSts != nil && e.values.HvpaConfig != nil && e.values.HvpaConfig.Enabled { for k := range existingSts.Spec.Template.Spec.Containers { v := existingSts.Spec.Template.Spec.Containers[k] switch v.Name { @@ -896,8 +899,8 @@ func (e *etcd) computeContainerResources(existingSts *appsv1.StatefulSet) (*core } func (e *etcd) computeReplicas(existingEtcd *druidv1alpha1.Etcd) int32 { - if e.replicas != nil { - return *e.replicas + if e.values.Replicas != nil { + return *e.values.Replicas } if existingEtcd != nil { @@ -907,7 +910,7 @@ func (e *etcd) computeReplicas(existingEtcd *druidv1alpha1.Etcd) int32 { } func (e *etcd) computeDefragmentationSchedule(existingEtcd *druidv1alpha1.Etcd) *string { - defragmentationSchedule := e.defragmentationSchedule + defragmentationSchedule := e.values.DefragmentationSchedule if existingEtcd != nil && existingEtcd.Spec.Etcd.DefragmentationSchedule != nil { defragmentationSchedule = existingEtcd.Spec.Etcd.DefragmentationSchedule } @@ -915,7 +918,7 @@ func (e *etcd) computeDefragmentationSchedule(existingEtcd *druidv1alpha1.Etcd) } func (e *etcd) computeFullSnapshotSchedule(existingEtcd *druidv1alpha1.Etcd) *string { - fullSnapshotSchedule := &e.backupConfig.FullSnapshotSchedule + fullSnapshotSchedule := &e.values.BackupConfig.FullSnapshotSchedule if existingEtcd != nil && existingEtcd.Spec.Backup.FullSnapshotSchedule != nil { fullSnapshotSchedule = existingEtcd.Spec.Backup.FullSnapshotSchedule } @@ -935,12 +938,12 @@ func (e *etcd) handlePeerCertificates(ctx context.Context) (caSecretName, peerSe var singedByCAOptions []secretsmanager.SignedByCAOption - if e.caRotationPhase == gardencorev1beta1.RotationPreparing { + if e.values.CARotationPhase == gardencorev1beta1.RotationPreparing { singedByCAOptions = append(singedByCAOptions, secretsmanager.UseCurrentCA) } peerServerSecret, err := e.secretsManager.Generate(ctx, &secretutils.CertificateSecretConfig{ - Name: secretNamePrefixPeerServer + e.role, + Name: secretNamePrefixPeerServer + e.values.Role, CommonName: "etcd-server", DNSNames: e.peerServiceDNSNames(), CertType: 
secretutils.ServerClientCert, diff --git a/pkg/operation/botanist/component/etcd/etcd_suite_test.go b/pkg/operation/botanist/component/etcd/etcd_suite_test.go index 9a1720ab3e1..552084128e5 100644 --- a/pkg/operation/botanist/component/etcd/etcd_suite_test.go +++ b/pkg/operation/botanist/component/etcd/etcd_suite_test.go @@ -33,8 +33,8 @@ func TestEtcd(t *testing.T) { const ( testNamespace = "shoot--test--test" - testRole = "test" - testROLE = "Test" + testRole = "main" + testROLE = "Main" ) var _ = BeforeSuite(func() { diff --git a/pkg/operation/botanist/component/etcd/etcd_test.go b/pkg/operation/botanist/component/etcd/etcd_test.go index 7b5daa84c63..e6c00ae9be4 100644 --- a/pkg/operation/botanist/component/etcd/etcd_test.go +++ b/pkg/operation/botanist/component/etcd/etcd_test.go @@ -80,7 +80,9 @@ var _ = Describe("Etcd", func() { replicas = pointer.Int32(1) storageCapacity = "12Gi" storageCapacityQuantity = resource.MustParse(storageCapacity) + storageClassName = "my-storage-class" defragmentationSchedule = "abcd" + priorityClassName = "some-priority-class" secretNameCA = "ca-etcd" secretNamePeerCA = "ca-etcd-peer" @@ -271,7 +273,6 @@ var _ = Describe("Etcd", func() { peerCASecretName *string, peerServerSecretName *string, ) *druidv1alpha1.Etcd { - defragSchedule := defragmentationSchedule if existingDefragmentationSchedule != "" { defragSchedule = existingDefragmentationSchedule @@ -312,7 +313,7 @@ var _ = Describe("Etcd", func() { }, Spec: druidv1alpha1.EtcdSpec{ Replicas: replicas, - PriorityClassName: pointer.String("gardener-system-500"), + PriorityClassName: &priorityClassName, Labels: map[string]string{ "gardener.cloud/role": "controlplane", "role": testRole, @@ -379,6 +380,7 @@ var _ = Describe("Etcd", func() { SnapshotCompression: &compressionSpec, }, StorageCapacity: &storageCapacityQuantity, + StorageClass: &storageClassName, VolumeClaimTemplate: pointer.String(etcdName), }, } @@ -619,7 +621,18 @@ var _ = Describe("Etcd", func() { By("creating secrets managed outside of this package for whose secretsmanager.Get() will be called") Expect(fakeClient.Create(ctx, &corev1.Secret{ObjectMeta: metav1.ObjectMeta{Name: "ca-etcd", Namespace: testNamespace}})).To(Succeed()) - etcd = New(c, log, testNamespace, sm, testRole, class, failureToleranceType, replicas, storageCapacity, &defragmentationSchedule, "", "1.20.1") + etcd = New(log, c, testNamespace, sm, Values{ + Role: testRole, + Class: class, + FailureToleranceType: failureToleranceType, + Replicas: replicas, + StorageCapacity: storageCapacity, + StorageClassName: &storageClassName, + DefragmentationSchedule: &defragmentationSchedule, + CARotationPhase: "", + K8sVersion: "1.20.1", + PriorityClassName: priorityClassName, + }) }) AfterEach(func() { @@ -781,7 +794,18 @@ var _ = Describe("Etcd", func() { existingReplicas int32 = 245 ) - etcd = New(c, log, testNamespace, sm, testRole, class, failureToleranceType, nil, storageCapacity, &defragmentationSchedule, "", "1.20.1") + etcd = New(log, c, testNamespace, sm, Values{ + Role: testRole, + Class: class, + FailureToleranceType: nil, + Replicas: nil, + StorageCapacity: storageCapacity, + StorageClassName: &storageClassName, + DefragmentationSchedule: &defragmentationSchedule, + CARotationPhase: "", + K8sVersion: "1.20.1", + PriorityClassName: priorityClassName, + }) setHVPAConfig() gomock.InOrder( @@ -841,7 +865,18 @@ var _ = Describe("Etcd", func() { existingReplicas int32 = 245 ) - etcd = New(c, log, testNamespace, sm, testRole, class, failureToleranceType, nil, storageCapacity, 
&defragmentationSchedule, "", "1.20.1") + etcd = New(log, c, testNamespace, sm, Values{ + Role: testRole, + Class: class, + FailureToleranceType: failureToleranceType, + Replicas: nil, + StorageCapacity: storageCapacity, + StorageClassName: &storageClassName, + DefragmentationSchedule: &defragmentationSchedule, + CARotationPhase: "", + K8sVersion: "1.20.1", + PriorityClassName: priorityClassName, + }) setHVPAConfig() gomock.InOrder( @@ -1132,7 +1167,18 @@ var _ = Describe("Etcd", func() { replicas = pointer.Int32(1) - etcd = New(c, log, testNamespace, sm, testRole, class, failureToleranceType, replicas, storageCapacity, &defragmentationSchedule, "", "1.20.1") + etcd = New(log, c, testNamespace, sm, Values{ + Role: testRole, + Class: class, + FailureToleranceType: failureToleranceType, + Replicas: replicas, + StorageCapacity: storageCapacity, + StorageClassName: &storageClassName, + DefragmentationSchedule: &defragmentationSchedule, + CARotationPhase: "", + K8sVersion: "1.20.1", + PriorityClassName: priorityClassName, + }) newSetHVPAConfigFunc(updateMode)() gomock.InOrder( @@ -1341,7 +1387,18 @@ var _ = Describe("Etcd", func() { }) JustBeforeEach(func() { - etcd = New(c, log, testNamespace, sm, testRole, class, failureToleranceType, replicas, storageCapacity, &defragmentationSchedule, rotationPhase, "1.20.1") + etcd = New(log, c, testNamespace, sm, Values{ + Role: testRole, + Class: class, + FailureToleranceType: failureToleranceType, + Replicas: replicas, + StorageCapacity: storageCapacity, + StorageClassName: &storageClassName, + DefragmentationSchedule: &defragmentationSchedule, + CARotationPhase: rotationPhase, + K8sVersion: "1.20.1", + PriorityClassName: priorityClassName, + }) }) Context("when CA rotation phase is in `Preparing` state", func() { @@ -1411,14 +1468,14 @@ var _ = Describe("Etcd", func() { Name: "etcd-peer-server-" + testRole, CommonName: "etcd-server", DNSNames: []string{ - "etcd-test-peer", - "etcd-test-peer.shoot--test--test", - "etcd-test-peer.shoot--test--test.svc", - "etcd-test-peer.shoot--test--test.svc.cluster.local", - "*.etcd-test-peer", - "*.etcd-test-peer.shoot--test--test", - "*.etcd-test-peer.shoot--test--test.svc", - "*.etcd-test-peer.shoot--test--test.svc.cluster.local", + "etcd-" + testRole + "-peer", + "etcd-" + testRole + "-peer.shoot--test--test", + "etcd-" + testRole + "-peer.shoot--test--test.svc", + "etcd-" + testRole + "-peer.shoot--test--test.svc.cluster.local", + "*.etcd-" + testRole + "-peer", + "*.etcd-" + testRole + "-peer.shoot--test--test", + "*.etcd-" + testRole + "-peer.shoot--test--test.svc", + "*.etcd-" + testRole + "-peer.shoot--test--test.svc.cluster.local", }, CertType: secretutils.ServerClientCert, SkipPublishingCACertificate: true, @@ -1437,15 +1494,15 @@ var _ = Describe("Etcd", func() { Name: "etcd-server-" + testRole, CommonName: "etcd-server", DNSNames: []string{ - "etcd-test-local", - "etcd-test-client", - "etcd-test-client.shoot--test--test", - "etcd-test-client.shoot--test--test.svc", - "etcd-test-client.shoot--test--test.svc.cluster.local", - "*.etcd-test-peer", - "*.etcd-test-peer.shoot--test--test", - "*.etcd-test-peer.shoot--test--test.svc", - "*.etcd-test-peer.shoot--test--test.svc.cluster.local", + "etcd-" + testRole + "-local", + "etcd-" + testRole + "-client", + "etcd-" + testRole + "-client.shoot--test--test", + "etcd-" + testRole + "-client.shoot--test--test.svc", + "etcd-" + testRole + "-client.shoot--test--test.svc.cluster.local", + "*.etcd-" + testRole + "-peer", + "*.etcd-" + testRole + 
"-peer.shoot--test--test", + "*.etcd-" + testRole + "-peer.shoot--test--test.svc", + "*.etcd-" + testRole + "-peer.shoot--test--test.svc.cluster.local", }, CertType: secretutils.ServerClientCert, SkipPublishingCACertificate: true, @@ -1500,7 +1557,18 @@ var _ = Describe("Etcd", func() { ) JustBeforeEach(func() { - etcd = New(c, log, testNamespace, sm, testRole, class, shootFailureToleranceType, replicas, storageCapacity, &defragmentationSchedule, "", "1.20.1") + etcd = New(log, c, testNamespace, sm, Values{ + Role: testRole, + Class: class, + FailureToleranceType: shootFailureToleranceType, + Replicas: replicas, + StorageCapacity: storageCapacity, + StorageClassName: &storageClassName, + DefragmentationSchedule: &defragmentationSchedule, + CARotationPhase: "", + K8sVersion: "1.20.1", + PriorityClassName: priorityClassName, + }) }) BeforeEach(func() { @@ -1690,7 +1758,7 @@ var _ = Describe("Etcd", func() { BeforeEach(func() { etcdObj = &druidv1alpha1.Etcd{ ObjectMeta: metav1.ObjectMeta{ - Name: "etcd-test", + Name: "etcd-" + testRole, Namespace: testNamespace, }, } @@ -1796,7 +1864,18 @@ var _ = Describe("Etcd", func() { var failureToleranceTypeZone *gardencorev1beta1.FailureToleranceType JustBeforeEach(func() { - etcd = New(c, log, testNamespace, sm, testRole, class, failureToleranceTypeZone, replicas, storageCapacity, &defragmentationSchedule, "", "1.20.1") + etcd = New(log, c, testNamespace, sm, Values{ + Role: testRole, + Class: class, + FailureToleranceType: failureToleranceTypeZone, + Replicas: replicas, + StorageCapacity: storageCapacity, + StorageClassName: &storageClassName, + DefragmentationSchedule: &defragmentationSchedule, + CARotationPhase: "", + K8sVersion: "1.20.1", + PriorityClassName: priorityClassName, + }) }) Context("when HA control-plane is not requested", func() { diff --git a/pkg/operation/botanist/component/etcd/monitoring.go b/pkg/operation/botanist/component/etcd/monitoring.go index 833ec6db345..0da917348b3 100644 --- a/pkg/operation/botanist/component/etcd/monitoring.go +++ b/pkg/operation/botanist/component/etcd/monitoring.go @@ -373,7 +373,7 @@ func init() { func (e *etcd) ScrapeConfigs() ([]string, error) { values := map[string]interface{}{ "namespace": e.namespace, - "role": e.role, + "role": e.values.Role, } var scrapeConfigEtcd bytes.Buffer @@ -396,22 +396,22 @@ func (e *etcd) ScrapeConfigs() ([]string, error) { func (e *etcd) AlertingRules() (map[string]string, error) { var alertingRules bytes.Buffer - k8sGTE121, err := versionutils.CompareVersions(e.k8sVersion, ">=", "1.21") + k8sGTE121, err := versionutils.CompareVersions(e.values.K8sVersion, ">=", "1.21") if err != nil { return nil, err } etcdReplicas := int32(1) - if e.replicas != nil { - etcdReplicas = *e.replicas + if e.values.Replicas != nil { + etcdReplicas = *e.values.Replicas } if err := monitoringAlertingRulesTemplate.Execute(&alertingRules, map[string]interface{}{ - "role": e.role, - "Role": strings.Title(e.role), - "class": e.class, + "role": e.values.Role, + "Role": strings.Title(e.values.Role), + "class": e.values.Class, "classImportant": ClassImportant, - "backupEnabled": e.backupConfig != nil, + "backupEnabled": e.values.BackupConfig != nil, "k8sGTE121": k8sGTE121, "etcdQuorumReplicas": int(etcdReplicas/2) + 1, "isHA": etcdReplicas > 1, @@ -419,5 +419,5 @@ func (e *etcd) AlertingRules() (map[string]string, error) { return nil, err } - return map[string]string{fmt.Sprintf("kube-etcd3-%s.rules.yaml", e.role): alertingRules.String()}, nil + return 
map[string]string{fmt.Sprintf("kube-etcd3-%s.rules.yaml", e.values.Role): alertingRules.String()}, nil } diff --git a/pkg/operation/botanist/component/etcd/monitoring_test.go b/pkg/operation/botanist/component/etcd/monitoring_test.go index 116a136ec35..e203061b2d2 100644 --- a/pkg/operation/botanist/component/etcd/monitoring_test.go +++ b/pkg/operation/botanist/component/etcd/monitoring_test.go @@ -18,18 +18,23 @@ import ( "fmt" "path/filepath" - . "github.com/gardener/gardener/pkg/operation/botanist/component/etcd" - "github.com/gardener/gardener/pkg/operation/botanist/component/test" - "k8s.io/utils/pointer" - "github.com/go-logr/logr" . "github.com/onsi/ginkgo/v2" + "k8s.io/utils/pointer" + + . "github.com/gardener/gardener/pkg/operation/botanist/component/etcd" + "github.com/gardener/gardener/pkg/operation/botanist/component/test" ) var _ = Describe("Monitoring", func() { Describe("#ScrapeConfig", func() { It("should successfully test the scrape configuration", func() { - etcd := New(nil, logr.Discard(), testNamespace, nil, testRole, ClassNormal, nil, pointer.Int32Ptr(1), "", nil, "", "1.20.1") + etcd := New(logr.Discard(), nil, testNamespace, nil, Values{ + Role: testRole, + Class: ClassNormal, + Replicas: pointer.Int32(1), + K8sVersion: "1.20.1", + }) test.ScrapeConfigs(etcd, expectedScrapeConfigEtcd, expectedScrapeConfigBackupRestore) }) }) @@ -38,7 +43,12 @@ var _ = Describe("Monitoring", func() { Context("for single-node etcd", func() { Context("w/o backup", func() { It("should successfully test the alerting rules (normal) for single-node etcd", func() { - etcd := New(nil, logr.Discard(), testNamespace, nil, testRole, ClassNormal, nil, pointer.Int32Ptr(1), "", nil, "", "1.20.1") + etcd := New(logr.Discard(), nil, testNamespace, nil, Values{ + Role: testRole, + Class: ClassNormal, + Replicas: pointer.Int32(1), + K8sVersion: "1.20.1", + }) test.AlertingRulesWithPromtool( etcd, map[string]string{fmt.Sprintf("kube-etcd3-%s.rules.yaml", testRole): expectedAlertingRulesNormalSingleNodeWithoutBackup}, @@ -47,7 +57,12 @@ var _ = Describe("Monitoring", func() { }) It("should successfully test the alerting rules (important) for single-node etcd", func() { - etcd := New(nil, logr.Discard(), testNamespace, nil, testRole, ClassImportant, nil, pointer.Int32Ptr(1), "", nil, "", "1.20.1") + etcd := New(logr.Discard(), nil, testNamespace, nil, Values{ + Role: testRole, + Class: ClassImportant, + Replicas: pointer.Int32(1), + K8sVersion: "1.20.1", + }) test.AlertingRulesWithPromtool( etcd, map[string]string{fmt.Sprintf("kube-etcd3-%s.rules.yaml", testRole): expectedAlertingRulesImportantSingleNodeWithoutBackup}, @@ -56,7 +71,12 @@ var _ = Describe("Monitoring", func() { }) It("should successfully test the alerting rules for k8s >= 1.21 (normal) for single-node etcd", func() { - etcd := New(nil, logr.Discard(), testNamespace, nil, testRole, ClassNormal, nil, pointer.Int32Ptr(1), "", nil, "", "1.21.1") + etcd := New(logr.Discard(), nil, testNamespace, nil, Values{ + Role: testRole, + Class: ClassNormal, + Replicas: pointer.Int32(1), + K8sVersion: "1.21.1", + }) test.AlertingRulesWithPromtool( etcd, map[string]string{fmt.Sprintf("kube-etcd3-%s.rules.yaml", testRole): expectedAlertingRulesNormalK8SGTE121}, @@ -67,7 +87,12 @@ var _ = Describe("Monitoring", func() { Context("w/ backup", func() { It("should successfully test the alerting rules (normal) for single-node etcd", func() { - etcd := New(nil, logr.Discard(), testNamespace, nil, testRole, ClassNormal, nil, pointer.Int32Ptr(1), "", nil, "", 
"1.20.1") + etcd := New(logr.Discard(), nil, testNamespace, nil, Values{ + Role: testRole, + Class: ClassNormal, + Replicas: pointer.Int32(1), + K8sVersion: "1.20.1", + }) etcd.SetBackupConfig(&BackupConfig{}) test.AlertingRulesWithPromtool( etcd, @@ -77,7 +102,12 @@ var _ = Describe("Monitoring", func() { }) It("should successfully test the alerting rules (important) for single-node etcd", func() { - etcd := New(nil, logr.Discard(), testNamespace, nil, testRole, ClassImportant, nil, pointer.Int32Ptr(1), "", nil, "", "1.20.1") + etcd := New(logr.Discard(), nil, testNamespace, nil, Values{ + Role: testRole, + Class: ClassImportant, + Replicas: pointer.Int32(1), + K8sVersion: "1.20.1", + }) etcd.SetBackupConfig(&BackupConfig{}) test.AlertingRulesWithPromtool( etcd, @@ -90,7 +120,12 @@ var _ = Describe("Monitoring", func() { Context("for multinode etcd", func() { Context("w/o backup", func() { It("should successfully test the alerting rules (normal) for multinode etcd", func() { - etcd := New(nil, logr.Discard(), testNamespace, nil, testRole, ClassNormal, nil, pointer.Int32Ptr(3), "", nil, "", "1.20.1") + etcd := New(logr.Discard(), nil, testNamespace, nil, Values{ + Role: testRole, + Class: ClassNormal, + Replicas: pointer.Int32(3), + K8sVersion: "1.20.1", + }) test.AlertingRulesWithPromtool( etcd, map[string]string{fmt.Sprintf("kube-etcd3-%s.rules.yaml", testRole): expectedAlertingRulesNormalMultiNodeWithoutBackup}, @@ -99,7 +134,12 @@ var _ = Describe("Monitoring", func() { }) It("should successfully test the alerting rules (important) for multinode etcd", func() { - etcd := New(nil, logr.Discard(), testNamespace, nil, testRole, ClassImportant, nil, pointer.Int32Ptr(3), "", nil, "", "1.20.1") + etcd := New(logr.Discard(), nil, testNamespace, nil, Values{ + Role: testRole, + Class: ClassImportant, + Replicas: pointer.Int32(3), + K8sVersion: "1.20.1", + }) test.AlertingRulesWithPromtool( etcd, map[string]string{fmt.Sprintf("kube-etcd3-%s.rules.yaml", testRole): expectedAlertingRulesImportantMultiNodeWithoutBackup}, @@ -110,7 +150,12 @@ var _ = Describe("Monitoring", func() { Context("w/ backup", func() { It("should successfully test the alerting rules (normal) for multinode etcd", func() { - etcd := New(nil, logr.Discard(), testNamespace, nil, testRole, ClassNormal, nil, pointer.Int32Ptr(3), "", nil, "", "1.20.1") + etcd := New(logr.Discard(), nil, testNamespace, nil, Values{ + Role: testRole, + Class: ClassNormal, + Replicas: pointer.Int32(3), + K8sVersion: "1.20.1", + }) etcd.SetBackupConfig(&BackupConfig{}) test.AlertingRulesWithPromtool( etcd, @@ -120,7 +165,12 @@ var _ = Describe("Monitoring", func() { }) It("should successfully test the alerting rules (important) for multinode etcd", func() { - etcd := New(nil, logr.Discard(), testNamespace, nil, testRole, ClassImportant, nil, pointer.Int32Ptr(3), "", nil, "", "1.20.1") + etcd := New(logr.Discard(), nil, testNamespace, nil, Values{ + Role: testRole, + Class: ClassImportant, + Replicas: pointer.Int32(3), + K8sVersion: "1.20.1", + }) etcd.SetBackupConfig(&BackupConfig{}) test.AlertingRulesWithPromtool( etcd, diff --git a/pkg/operation/botanist/component/etcd/testdata/monitoring_alertingrules_important_multinode_with_backup.yaml b/pkg/operation/botanist/component/etcd/testdata/monitoring_alertingrules_important_multinode_with_backup.yaml index 076736257b8..463c6085fec 100644 --- a/pkg/operation/botanist/component/etcd/testdata/monitoring_alertingrules_important_multinode_with_backup.yaml +++ 
b/pkg/operation/botanist/component/etcd/testdata/monitoring_alertingrules_important_multinode_with_backup.yaml @@ -1,44 +1,44 @@ rule_files: -- kube-etcd3-test.rules.yaml +- kube-etcd3-main.rules.yaml evaluation_interval: 30s tests: - interval: 30s input_series: - # KubeEtcdTestDown - - series: 'up{job="kube-etcd3-test"}' + # KubeEtcdMainDown + - series: 'up{job="kube-etcd3-main"}' values: '1+0x20' - # KubeEtcd3TestNoLeader - - series: 'etcd_server_has_leader{job="kube-etcd3-test"}' + # KubeEtcd3MainNoLeader + - series: 'etcd_server_has_leader{job="kube-etcd3-main"}' values: '0+0x20' # KubeEtcd3HighNumberOfFailedProposals - - series: 'etcd_server_proposals_failed_total{job="kube-etcd3-test", pod="etcd"}' + - series: 'etcd_server_proposals_failed_total{job="kube-etcd3-main", pod="etcd"}' values: '0+1x6 6+0x115' # KubeEtcd3DbSizeLimitApproaching # KubeEtcd3DbSizeLimitCrossed - - series: 'etcd_mvcc_db_total_size_in_bytes{job="kube-etcd3-test"}' + - series: 'etcd_mvcc_db_total_size_in_bytes{job="kube-etcd3-main"}' values: '7194070000+107374182x20' # 6.7GB 6.8GB 6.9GB .. 7.7GB # KubeEtcdDeltaBackupFailed - - series: 'etcdbr_snapshot_latest_timestamp{job="kube-etcd3-backup-restore-test",kind="Incr",pod="etcd",role="test"}' + - series: 'etcdbr_snapshot_latest_timestamp{job="kube-etcd3-backup-restore-main",kind="Incr",pod="etcd",role="test"}' values: '0+0x62' - - series: 'etcdbr_snapshot_required{job="kube-etcd3-backup-restore-test",kind="Incr",pod="etcd",role="test"}' + - series: 'etcdbr_snapshot_required{job="kube-etcd3-backup-restore-main",kind="Incr",pod="etcd",role="test"}' values: '1+0x62' - - series: 'etcd_server_is_leader{job="kube-etcd3-test", pod="etcd",role="test"}' + - series: 'etcd_server_is_leader{job="kube-etcd3-main", pod="etcd",role="test"}' values: '1+0x100' # KubeEtcdFullBackupFailed - - series: 'etcdbr_snapshot_latest_timestamp{job="kube-etcd3-backup-restore-test",kind="Full",pod="etcd",role="test"}' + - series: 'etcdbr_snapshot_latest_timestamp{job="kube-etcd3-backup-restore-main",kind="Full",pod="etcd",role="test"}' values: '0+0x2912' - - series: 'etcdbr_snapshot_required{job="kube-etcd3-backup-restore-test",kind="Full",pod="etcd",role="test"}' + - series: 'etcdbr_snapshot_required{job="kube-etcd3-backup-restore-main",kind="Full",pod="etcd",role="test"}' values: '1+0x2912' - - series: 'etcd_server_is_leader{job="kube-etcd3-test",pod="etcd",role="test"}' + - series: 'etcd_server_is_leader{job="kube-etcd3-main",pod="etcd",role="test"}' values: '1+0x2912' # KubeEtcdRestorationFailed - - series: 'etcdbr_restoration_duration_seconds_count{job="kube-etcd3-backup-restore-test",succeeded="false"}' + - series: 'etcdbr_restoration_duration_seconds_count{job="kube-etcd3-backup-restore-main",succeeded="false"}' values: '0+0x7 1 2 2' alert_rule_test: - eval_time: 5m - alertname: KubeEtcdTestDown + alertname: KubeEtcdMainDown exp_alerts: - exp_labels: service: etcd @@ -46,10 +46,10 @@ tests: type: seed visibility: operator exp_annotations: - description: Etcd3 cluster test is unavailable (due to possible quorum loss) or cannot be scraped. As long as etcd3 test is down, the cluster is unreachable. - summary: Etcd3 test cluster down. + description: Etcd3 cluster main is unavailable (due to possible quorum loss) or cannot be scraped. As long as etcd3 main is down, the cluster is unreachable. + summary: Etcd3 main cluster down. 
- eval_time: 15m - alertname: KubeEtcd3TestNoLeader + alertname: KubeEtcd3MainNoLeader exp_alerts: - exp_labels: service: etcd @@ -57,8 +57,8 @@ tests: type: seed visibility: operator exp_annotations: - description: Etcd3 test has no leader. Possible network partition in the etcd cluster. - summary: Etcd3 test has no leader. + description: Etcd3 main has no leader. Possible network partition in the etcd cluster. + summary: Etcd3 main has no leader. - eval_time: 1h alertname: KubeEtcd3HighNumberOfFailedProposals exp_alerts: @@ -68,41 +68,41 @@ tests: type: seed visibility: operator pod: etcd - job: kube-etcd3-test + job: kube-etcd3-main exp_annotations: - description: Etcd3 test pod etcd has seen 6 proposal failures within the last hour. + description: Etcd3 main pod etcd has seen 6 proposal failures within the last hour. summary: High number of failed etcd proposals - eval_time: 5m alertname: KubeEtcd3DbSizeLimitApproaching exp_alerts: - exp_labels: - job: kube-etcd3-test + job: kube-etcd3-main service: etcd severity: warning type: seed visibility: all exp_annotations: - description: Etcd3 test DB size is approaching its current practical limit of 8GB. Etcd quota might need to be increased. - summary: Etcd3 test DB size is approaching its current practical limit. + description: Etcd3 main DB size is approaching its current practical limit of 8GB. Etcd quota might need to be increased. + summary: Etcd3 main DB size is approaching its current practical limit. - eval_time: 10m alertname: KubeEtcd3DbSizeLimitCrossed exp_alerts: - exp_labels: - job: kube-etcd3-test + job: kube-etcd3-main service: etcd severity: critical type: seed visibility: all exp_annotations: - description: Etcd3 test DB size has crossed its current practical limit of 8GB. Etcd quota must be increased to allow updates. - summary: Etcd3 test DB size has crossed its current practical limit. + description: Etcd3 main DB size has crossed its current practical limit of 8GB. Etcd quota must be increased to allow updates. + summary: Etcd3 main DB size has crossed its current practical limit. 
- eval_time: 31m alertname: KubeEtcdDeltaBackupFailed exp_alerts: - exp_labels: pod: etcd role: test - job: kube-etcd3-backup-restore-test + job: kube-etcd3-backup-restore-main kind: Incr service: etcd severity: critical @@ -117,7 +117,7 @@ tests: - exp_labels: pod: etcd role: test - job: kube-etcd3-backup-restore-test + job: kube-etcd3-backup-restore-main kind: Full service: etcd severity: critical @@ -130,7 +130,7 @@ tests: alertname: KubeEtcdRestorationFailed exp_alerts: - exp_labels: - job: kube-etcd3-backup-restore-test + job: kube-etcd3-backup-restore-main succeeded: false service: etcd severity: critical diff --git a/pkg/operation/botanist/component/etcd/testdata/monitoring_alertingrules_important_multinode_without_backup.yaml b/pkg/operation/botanist/component/etcd/testdata/monitoring_alertingrules_important_multinode_without_backup.yaml index 74850499d4d..ec5191e1f9c 100644 --- a/pkg/operation/botanist/component/etcd/testdata/monitoring_alertingrules_important_multinode_without_backup.yaml +++ b/pkg/operation/botanist/component/etcd/testdata/monitoring_alertingrules_important_multinode_without_backup.yaml @@ -1,27 +1,27 @@ rule_files: -- kube-etcd3-test.rules.yaml +- kube-etcd3-main.rules.yaml evaluation_interval: 30s tests: - interval: 30s input_series: - # KubeEtcdTestDown - - series: 'up{job="kube-etcd3-test"}' + # KubeEtcdMainDown + - series: 'up{job="kube-etcd3-main"}' values: '1+0x20' - # KubeEtcd3TestNoLeader - - series: 'etcd_server_has_leader{job="kube-etcd3-test"}' + # KubeEtcd3MainNoLeader + - series: 'etcd_server_has_leader{job="kube-etcd3-main"}' values: '0+0x20' # KubeEtcd3HighNumberOfFailedProposals - - series: 'etcd_server_proposals_failed_total{job="kube-etcd3-test", pod="etcd"}' + - series: 'etcd_server_proposals_failed_total{job="kube-etcd3-main", pod="etcd"}' values: '0+1x6 6+0x115' # KubeEtcd3DbSizeLimitApproaching # KubeEtcd3DbSizeLimitCrossed - - series: 'etcd_mvcc_db_total_size_in_bytes{job="kube-etcd3-test"}' + - series: 'etcd_mvcc_db_total_size_in_bytes{job="kube-etcd3-main"}' values: '7194070000+107374182x20' # 6.7GB 6.8GB 6.9GB .. 7.7GB alert_rule_test: - eval_time: 5m - alertname: KubeEtcdTestDown + alertname: KubeEtcdMainDown exp_alerts: - exp_labels: service: etcd @@ -29,10 +29,10 @@ tests: type: seed visibility: operator exp_annotations: - description: Etcd3 cluster test is unavailable (due to possible quorum loss) or cannot be scraped. As long as etcd3 test is down, the cluster is unreachable. - summary: Etcd3 test cluster down. + description: Etcd3 cluster main is unavailable (due to possible quorum loss) or cannot be scraped. As long as etcd3 main is down, the cluster is unreachable. + summary: Etcd3 main cluster down. - eval_time: 15m - alertname: KubeEtcd3TestNoLeader + alertname: KubeEtcd3MainNoLeader exp_alerts: - exp_labels: service: etcd @@ -40,8 +40,8 @@ tests: type: seed visibility: operator exp_annotations: - description: Etcd3 test has no leader. Possible network partition in the etcd cluster. - summary: Etcd3 test has no leader. + description: Etcd3 main has no leader. Possible network partition in the etcd cluster. + summary: Etcd3 main has no leader. - eval_time: 1h alertname: KubeEtcd3HighNumberOfFailedProposals exp_alerts: @@ -51,31 +51,31 @@ tests: type: seed visibility: operator pod: etcd - job: kube-etcd3-test + job: kube-etcd3-main exp_annotations: - description: Etcd3 test pod etcd has seen 6 proposal failures within the last hour. + description: Etcd3 main pod etcd has seen 6 proposal failures within the last hour. 
summary: High number of failed etcd proposals - eval_time: 5m alertname: KubeEtcd3DbSizeLimitApproaching exp_alerts: - exp_labels: - job: kube-etcd3-test + job: kube-etcd3-main service: etcd severity: warning type: seed visibility: all exp_annotations: - description: Etcd3 test DB size is approaching its current practical limit of 8GB. Etcd quota might need to be increased. - summary: Etcd3 test DB size is approaching its current practical limit. + description: Etcd3 main DB size is approaching its current practical limit of 8GB. Etcd quota might need to be increased. + summary: Etcd3 main DB size is approaching its current practical limit. - eval_time: 10m alertname: KubeEtcd3DbSizeLimitCrossed exp_alerts: - exp_labels: - job: kube-etcd3-test + job: kube-etcd3-main service: etcd severity: critical type: seed visibility: all exp_annotations: - description: Etcd3 test DB size has crossed its current practical limit of 8GB. Etcd quota must be increased to allow updates. - summary: Etcd3 test DB size has crossed its current practical limit. + description: Etcd3 main DB size has crossed its current practical limit of 8GB. Etcd quota must be increased to allow updates. + summary: Etcd3 main DB size has crossed its current practical limit. diff --git a/pkg/operation/botanist/component/etcd/testdata/monitoring_alertingrules_important_singlenode_with_backup.yaml b/pkg/operation/botanist/component/etcd/testdata/monitoring_alertingrules_important_singlenode_with_backup.yaml index 4920ddfccbd..dfbe2b31b40 100644 --- a/pkg/operation/botanist/component/etcd/testdata/monitoring_alertingrules_important_singlenode_with_backup.yaml +++ b/pkg/operation/botanist/component/etcd/testdata/monitoring_alertingrules_important_singlenode_with_backup.yaml @@ -1,44 +1,44 @@ rule_files: -- kube-etcd3-test.rules.yaml +- kube-etcd3-main.rules.yaml evaluation_interval: 30s tests: - interval: 30s input_series: - # KubeEtcdTestDown - - series: 'up{job="kube-etcd3-test"}' + # KubeEtcdMainDown + - series: 'up{job="kube-etcd3-main"}' values: '0+0x20' - # KubeEtcd3TestNoLeader - - series: 'etcd_server_has_leader{job="kube-etcd3-test"}' + # KubeEtcd3MainNoLeader + - series: 'etcd_server_has_leader{job="kube-etcd3-main"}' values: '0+0x20' # KubeEtcd3HighNumberOfFailedProposals - - series: 'etcd_server_proposals_failed_total{job="kube-etcd3-test", pod="etcd"}' + - series: 'etcd_server_proposals_failed_total{job="kube-etcd3-main", pod="etcd"}' values: '0+1x6 6+0x115' # KubeEtcd3DbSizeLimitApproaching # KubeEtcd3DbSizeLimitCrossed - - series: 'etcd_mvcc_db_total_size_in_bytes{job="kube-etcd3-test"}' + - series: 'etcd_mvcc_db_total_size_in_bytes{job="kube-etcd3-main"}' values: '7194070000+107374182x20' # 6.7GB 6.8GB 6.9GB .. 
7.7GB # KubeEtcdDeltaBackupFailed - - series: 'etcdbr_snapshot_latest_timestamp{job="kube-etcd3-backup-restore-test",kind="Incr", pod="etcd", role="test"}' + - series: 'etcdbr_snapshot_latest_timestamp{job="kube-etcd3-backup-restore-main",kind="Incr", pod="etcd", role="test"}' values: '0+0x62' - - series: 'etcdbr_snapshot_required{job="kube-etcd3-backup-restore-test",kind="Incr", pod="etcd", role="test"}' + - series: 'etcdbr_snapshot_required{job="kube-etcd3-backup-restore-main",kind="Incr", pod="etcd", role="test"}' values: '1+0x62' - - series: 'etcd_server_is_leader{job="kube-etcd3-test", pod="etcd", role="test"}' + - series: 'etcd_server_is_leader{job="kube-etcd3-main", pod="etcd", role="test"}' values: '1+0x62' # KubeEtcdFullBackupFailed - - series: 'etcdbr_snapshot_latest_timestamp{job="kube-etcd3-backup-restore-test",kind="Full", pod="etcd", role="test"}' + - series: 'etcdbr_snapshot_latest_timestamp{job="kube-etcd3-backup-restore-main",kind="Full", pod="etcd", role="test"}' values: '0+0x2912' - - series: 'etcdbr_snapshot_required{job="kube-etcd3-backup-restore-test",kind="Full", pod="etcd", role="test"}' + - series: 'etcdbr_snapshot_required{job="kube-etcd3-backup-restore-main",kind="Full", pod="etcd", role="test"}' values: '1+0x2912' - - series: 'etcd_server_is_leader{job="kube-etcd3-test", pod="etcd", role="test"}' + - series: 'etcd_server_is_leader{job="kube-etcd3-main", pod="etcd", role="test"}' values: '1+0x2912' # KubeEtcdRestorationFailed - - series: 'etcdbr_restoration_duration_seconds_count{job="kube-etcd3-backup-restore-test",succeeded="false"}' + - series: 'etcdbr_restoration_duration_seconds_count{job="kube-etcd3-backup-restore-main",succeeded="false"}' values: '0+0x7 1 2 2' alert_rule_test: - eval_time: 5m - alertname: KubeEtcdTestDown + alertname: KubeEtcdMainDown exp_alerts: - exp_labels: service: etcd @@ -46,10 +46,10 @@ tests: type: seed visibility: operator exp_annotations: - description: Etcd3 cluster test is unavailable or cannot be scraped. As long as etcd3 test is down, the cluster is unreachable. - summary: Etcd3 test cluster down. + description: Etcd3 cluster main is unavailable or cannot be scraped. As long as etcd3 main is down, the cluster is unreachable. + summary: Etcd3 main cluster down. - eval_time: 15m - alertname: KubeEtcd3TestNoLeader + alertname: KubeEtcd3MainNoLeader exp_alerts: - exp_labels: service: etcd @@ -57,8 +57,8 @@ tests: type: seed visibility: operator exp_annotations: - description: Etcd3 test has no leader. - summary: Etcd3 test has no leader. + description: Etcd3 main has no leader. + summary: Etcd3 main has no leader. - eval_time: 1h alertname: KubeEtcd3HighNumberOfFailedProposals exp_alerts: @@ -68,41 +68,41 @@ tests: type: seed visibility: operator pod: etcd - job: kube-etcd3-test + job: kube-etcd3-main exp_annotations: - description: Etcd3 test pod etcd has seen 6 proposal failures within the last hour. + description: Etcd3 main pod etcd has seen 6 proposal failures within the last hour. summary: High number of failed etcd proposals - eval_time: 5m alertname: KubeEtcd3DbSizeLimitApproaching exp_alerts: - exp_labels: - job: kube-etcd3-test + job: kube-etcd3-main service: etcd severity: warning type: seed visibility: all exp_annotations: - description: Etcd3 test DB size is approaching its current practical limit of 8GB. Etcd quota might need to be increased. - summary: Etcd3 test DB size is approaching its current practical limit. + description: Etcd3 main DB size is approaching its current practical limit of 8GB. 
Etcd quota might need to be increased. + summary: Etcd3 main DB size is approaching its current practical limit. - eval_time: 10m alertname: KubeEtcd3DbSizeLimitCrossed exp_alerts: - exp_labels: - job: kube-etcd3-test + job: kube-etcd3-main service: etcd severity: critical type: seed visibility: all exp_annotations: - description: Etcd3 test DB size has crossed its current practical limit of 8GB. Etcd quota must be increased to allow updates. - summary: Etcd3 test DB size has crossed its current practical limit. + description: Etcd3 main DB size has crossed its current practical limit of 8GB. Etcd quota must be increased to allow updates. + summary: Etcd3 main DB size has crossed its current practical limit. - eval_time: 31m alertname: KubeEtcdDeltaBackupFailed exp_alerts: - exp_labels: pod: etcd role: test - job: kube-etcd3-backup-restore-test + job: kube-etcd3-backup-restore-main kind: Incr service: etcd severity: critical @@ -117,7 +117,7 @@ tests: - exp_labels: pod: etcd role: test - job: kube-etcd3-backup-restore-test + job: kube-etcd3-backup-restore-main kind: Full service: etcd severity: critical @@ -130,7 +130,7 @@ tests: alertname: KubeEtcdRestorationFailed exp_alerts: - exp_labels: - job: kube-etcd3-backup-restore-test + job: kube-etcd3-backup-restore-main succeeded: false service: etcd severity: critical diff --git a/pkg/operation/botanist/component/etcd/testdata/monitoring_alertingrules_important_singlenode_without_backup.yaml b/pkg/operation/botanist/component/etcd/testdata/monitoring_alertingrules_important_singlenode_without_backup.yaml index 1cc0e74f2d6..30c83dd5739 100644 --- a/pkg/operation/botanist/component/etcd/testdata/monitoring_alertingrules_important_singlenode_without_backup.yaml +++ b/pkg/operation/botanist/component/etcd/testdata/monitoring_alertingrules_important_singlenode_without_backup.yaml @@ -1,27 +1,27 @@ rule_files: -- kube-etcd3-test.rules.yaml +- kube-etcd3-main.rules.yaml evaluation_interval: 30s tests: - interval: 30s input_series: - # KubeEtcdTestDown - - series: 'up{job="kube-etcd3-test"}' + # KubeEtcdMainDown + - series: 'up{job="kube-etcd3-main"}' values: '0+0x20' - # KubeEtcd3TestNoLeader - - series: 'etcd_server_has_leader{job="kube-etcd3-test"}' + # KubeEtcd3MainNoLeader + - series: 'etcd_server_has_leader{job="kube-etcd3-main"}' values: '0+0x20' # KubeEtcd3HighNumberOfFailedProposals - - series: 'etcd_server_proposals_failed_total{job="kube-etcd3-test", pod="etcd"}' + - series: 'etcd_server_proposals_failed_total{job="kube-etcd3-main", pod="etcd"}' values: '0+1x6 6+0x115' # KubeEtcd3DbSizeLimitApproaching # KubeEtcd3DbSizeLimitCrossed - - series: 'etcd_mvcc_db_total_size_in_bytes{job="kube-etcd3-test"}' + - series: 'etcd_mvcc_db_total_size_in_bytes{job="kube-etcd3-main"}' values: '7194070000+107374182x20' # 6.7GB 6.8GB 6.9GB .. 7.7GB alert_rule_test: - eval_time: 5m - alertname: KubeEtcdTestDown + alertname: KubeEtcdMainDown exp_alerts: - exp_labels: service: etcd @@ -29,10 +29,10 @@ tests: type: seed visibility: operator exp_annotations: - description: Etcd3 cluster test is unavailable or cannot be scraped. As long as etcd3 test is down, the cluster is unreachable. - summary: Etcd3 test cluster down. + description: Etcd3 cluster main is unavailable or cannot be scraped. As long as etcd3 main is down, the cluster is unreachable. + summary: Etcd3 main cluster down. 
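# These fixtures follow promtool's rule unit-test format (rule_files, input_series, alert_rule_test),
# which the AlertingRulesWithPromtool test helper in monitoring_test.go exercises: the rules are
# evaluated against the synthetic series at each eval_time and the firing alerts are compared with
# exp_alerts. For example, 'up{job="kube-etcd3-main"}' with values '0+0x20' stays at 0 for 21 samples
# of the 30s interval, so KubeEtcdMainDown above is expected to be firing at eval_time 5m.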
- eval_time: 15m - alertname: KubeEtcd3TestNoLeader + alertname: KubeEtcd3MainNoLeader exp_alerts: - exp_labels: service: etcd @@ -40,8 +40,8 @@ tests: type: seed visibility: operator exp_annotations: - description: Etcd3 test has no leader. - summary: Etcd3 test has no leader. + description: Etcd3 main has no leader. + summary: Etcd3 main has no leader. - eval_time: 1h alertname: KubeEtcd3HighNumberOfFailedProposals exp_alerts: @@ -51,31 +51,31 @@ tests: type: seed visibility: operator pod: etcd - job: kube-etcd3-test + job: kube-etcd3-main exp_annotations: - description: Etcd3 test pod etcd has seen 6 proposal failures within the last hour. + description: Etcd3 main pod etcd has seen 6 proposal failures within the last hour. summary: High number of failed etcd proposals - eval_time: 5m alertname: KubeEtcd3DbSizeLimitApproaching exp_alerts: - exp_labels: - job: kube-etcd3-test + job: kube-etcd3-main service: etcd severity: warning type: seed visibility: all exp_annotations: - description: Etcd3 test DB size is approaching its current practical limit of 8GB. Etcd quota might need to be increased. - summary: Etcd3 test DB size is approaching its current practical limit. + description: Etcd3 main DB size is approaching its current practical limit of 8GB. Etcd quota might need to be increased. + summary: Etcd3 main DB size is approaching its current practical limit. - eval_time: 10m alertname: KubeEtcd3DbSizeLimitCrossed exp_alerts: - exp_labels: - job: kube-etcd3-test + job: kube-etcd3-main service: etcd severity: critical type: seed visibility: all exp_annotations: - description: Etcd3 test DB size has crossed its current practical limit of 8GB. Etcd quota must be increased to allow updates. - summary: Etcd3 test DB size has crossed its current practical limit. + description: Etcd3 main DB size has crossed its current practical limit of 8GB. Etcd quota must be increased to allow updates. + summary: Etcd3 main DB size has crossed its current practical limit. diff --git a/pkg/operation/botanist/component/etcd/testdata/monitoring_alertingrules_normal_multinode_with_backup.yaml b/pkg/operation/botanist/component/etcd/testdata/monitoring_alertingrules_normal_multinode_with_backup.yaml index ac8f2644e2b..70557f09432 100644 --- a/pkg/operation/botanist/component/etcd/testdata/monitoring_alertingrules_normal_multinode_with_backup.yaml +++ b/pkg/operation/botanist/component/etcd/testdata/monitoring_alertingrules_normal_multinode_with_backup.yaml @@ -1,50 +1,50 @@ rule_files: -- kube-etcd3-test.rules.yaml +- kube-etcd3-main.rules.yaml evaluation_interval: 30s tests: - interval: 30s input_series: - # KubeEtcdTestDown - # KubeEtcdBackupRestoreTestDown - - series: 'up{job="kube-etcd3-test"}' + # KubeEtcdMainDown + # KubeEtcdBackupRestoreMainDown + - series: 'up{job="kube-etcd3-main"}' values: '1+0x70' - # KubeEtcd3TestNoLeader - - series: 'etcd_server_has_leader{job="kube-etcd3-test"}' + # KubeEtcd3MainNoLeader + - series: 'etcd_server_has_leader{job="kube-etcd3-main"}' values: '0+0x30' # KubeEtcd3HighNumberOfFailedProposals - - series: 'etcd_server_proposals_failed_total{job="kube-etcd3-test", pod="etcd"}' + - series: 'etcd_server_proposals_failed_total{job="kube-etcd3-main", pod="etcd"}' values: '0+1x6 6+0x115' # KubeEtcd3DbSizeLimitApproaching # KubeEtcd3DbSizeLimitCrossed - - series: 'etcd_mvcc_db_total_size_in_bytes{job="kube-etcd3-test"}' + - series: 'etcd_mvcc_db_total_size_in_bytes{job="kube-etcd3-main"}' values: '7194070000+107374182x20' # 6.7GB 6.8GB 6.9GB .. 
7.7GB # KubeEtcdDeltaBackupFailed - - series: 'etcdbr_snapshot_latest_timestamp{job="kube-etcd3-backup-restore-test",kind="Incr", pod="etcd", role="test"}' + - series: 'etcdbr_snapshot_latest_timestamp{job="kube-etcd3-backup-restore-main",kind="Incr", pod="etcd", role="test"}' values: '0+0x62' - - series: 'etcdbr_snapshot_required{job="kube-etcd3-backup-restore-test",kind="Incr", pod="etcd", role="test"}' + - series: 'etcdbr_snapshot_required{job="kube-etcd3-backup-restore-main",kind="Incr", pod="etcd", role="test"}' values: '1+0x62' - - series: 'etcd_server_is_leader{job="kube-etcd3-test", pod="etcd", role="test"}' + - series: 'etcd_server_is_leader{job="kube-etcd3-main", pod="etcd", role="test"}' values: '1+0x70' # KubeEtcdFullBackupFailed - - series: 'etcdbr_snapshot_latest_timestamp{job="kube-etcd3-backup-restore-test",kind="Full", pod="etcd", role="test"}' + - series: 'etcdbr_snapshot_latest_timestamp{job="kube-etcd3-backup-restore-main",kind="Full", pod="etcd", role="test"}' values: '0+0x2912' - - series: 'etcdbr_snapshot_required{job="kube-etcd3-backup-restore-test",kind="Full", pod="etcd", role="test"}' + - series: 'etcdbr_snapshot_required{job="kube-etcd3-backup-restore-main",kind="Full", pod="etcd", role="test"}' values: '1+0x2912' - - series: 'etcd_server_is_leader{job="kube-etcd3-test", pod="etcd", role="test"}' + - series: 'etcd_server_is_leader{job="kube-etcd3-main", pod="etcd", role="test"}' values: '1+0x2912' # KubeEtcdRestorationFailed - - series: 'etcdbr_restoration_duration_seconds_count{job="kube-etcd3-backup-restore-test",succeeded="false"}' + - series: 'etcdbr_restoration_duration_seconds_count{job="kube-etcd3-backup-restore-main",succeeded="false"}' values: '0+0x7 1 2 2' - # KubeEtcdBackupRestoreTestDown - - series: 'up{job="kube-etcd3-backup-restore-test"}' + # KubeEtcdBackupRestoreMainDown + - series: 'up{job="kube-etcd3-backup-restore-main"}' values: '0+0x70' - - series: 'etcdbr_snapshotter_failure{job="kube-etcd3-backup-restore-test"}' + - series: 'etcdbr_snapshotter_failure{job="kube-etcd3-backup-restore-main"}' values: '1+1x30 1+0x40' alert_rule_test: - eval_time: 15m - alertname: KubeEtcdTestDown + alertname: KubeEtcdMainDown exp_alerts: - exp_labels: service: etcd @@ -52,10 +52,10 @@ tests: type: seed visibility: operator exp_annotations: - description: Etcd3 cluster test is unavailable (due to possible quorum loss) or cannot be scraped. As long as etcd3 test is down, the cluster is unreachable. - summary: Etcd3 test cluster down. + description: Etcd3 cluster main is unavailable (due to possible quorum loss) or cannot be scraped. As long as etcd3 main is down, the cluster is unreachable. + summary: Etcd3 main cluster down. - eval_time: 15m - alertname: KubeEtcd3TestNoLeader + alertname: KubeEtcd3MainNoLeader exp_alerts: - exp_labels: service: etcd @@ -63,8 +63,8 @@ tests: type: seed visibility: operator exp_annotations: - description: Etcd3 test has no leader. Possible network partition in the etcd cluster. - summary: Etcd3 test has no leader. + description: Etcd3 main has no leader. Possible network partition in the etcd cluster. + summary: Etcd3 main has no leader. - eval_time: 1h alertname: KubeEtcd3HighNumberOfFailedProposals exp_alerts: @@ -74,41 +74,41 @@ tests: type: seed visibility: operator pod: etcd - job: kube-etcd3-test + job: kube-etcd3-main exp_annotations: - description: Etcd3 test pod etcd has seen 6 proposal failures within the last hour. + description: Etcd3 main pod etcd has seen 6 proposal failures within the last hour. 
summary: High number of failed etcd proposals - eval_time: 5m alertname: KubeEtcd3DbSizeLimitApproaching exp_alerts: - exp_labels: - job: kube-etcd3-test + job: kube-etcd3-main service: etcd severity: warning type: seed visibility: all exp_annotations: - description: Etcd3 test DB size is approaching its current practical limit of 8GB. Etcd quota might need to be increased. - summary: Etcd3 test DB size is approaching its current practical limit. + description: Etcd3 main DB size is approaching its current practical limit of 8GB. Etcd quota might need to be increased. + summary: Etcd3 main DB size is approaching its current practical limit. - eval_time: 10m alertname: KubeEtcd3DbSizeLimitCrossed exp_alerts: - exp_labels: - job: kube-etcd3-test + job: kube-etcd3-main service: etcd severity: critical type: seed visibility: all exp_annotations: - description: Etcd3 test DB size has crossed its current practical limit of 8GB. Etcd quota must be increased to allow updates. - summary: Etcd3 test DB size has crossed its current practical limit. + description: Etcd3 main DB size has crossed its current practical limit of 8GB. Etcd quota must be increased to allow updates. + summary: Etcd3 main DB size has crossed its current practical limit. - eval_time: 31m alertname: KubeEtcdDeltaBackupFailed exp_alerts: - exp_labels: pod: etcd role: test - job: kube-etcd3-backup-restore-test + job: kube-etcd3-backup-restore-main kind: Incr service: etcd severity: critical @@ -123,7 +123,7 @@ tests: - exp_labels: pod: etcd role: test - job: kube-etcd3-backup-restore-test + job: kube-etcd3-backup-restore-main kind: Full service: etcd severity: critical @@ -136,7 +136,7 @@ tests: alertname: KubeEtcdRestorationFailed exp_alerts: - exp_labels: - job: kube-etcd3-backup-restore-test + job: kube-etcd3-backup-restore-main succeeded: false service: etcd severity: critical @@ -146,7 +146,7 @@ tests: description: Etcd data restoration was triggered, but has failed. summary: Etcd data restoration failure. - eval_time: 30m - alertname: KubeEtcdBackupRestoreTestDown + alertname: KubeEtcdBackupRestoreMainDown exp_alerts: - exp_labels: service: etcd @@ -154,5 +154,5 @@ tests: type: seed visibility: operator exp_annotations: - description: Etcd backup restore test process down or snapshotter failed with error. Backups will not be triggered unless backup restore is brought back up. This is unsafe behaviour and may cause data loss. - summary: Etcd backup restore test process down or snapshotter failed with error + description: Etcd backup restore main process down or snapshotter failed with error. Backups will not be triggered unless backup restore is brought back up. This is unsafe behaviour and may cause data loss. 
+ summary: Etcd backup restore main process down or snapshotter failed with error diff --git a/pkg/operation/botanist/component/etcd/testdata/monitoring_alertingrules_normal_multinode_without_backup.yaml b/pkg/operation/botanist/component/etcd/testdata/monitoring_alertingrules_normal_multinode_without_backup.yaml index c82ba935fc9..cbf931269a2 100644 --- a/pkg/operation/botanist/component/etcd/testdata/monitoring_alertingrules_normal_multinode_without_backup.yaml +++ b/pkg/operation/botanist/component/etcd/testdata/monitoring_alertingrules_normal_multinode_without_backup.yaml @@ -1,27 +1,27 @@ rule_files: -- kube-etcd3-test.rules.yaml +- kube-etcd3-main.rules.yaml evaluation_interval: 30s tests: - interval: 30s input_series: - # KubeEtcdTestDown - - series: 'up{job="kube-etcd3-test"}' + # KubeEtcdMainDown + - series: 'up{job="kube-etcd3-main"}' values: '1+0x30' - # KubeEtcd3TestNoLeader - - series: 'etcd_server_has_leader{job="kube-etcd3-test"}' + # KubeEtcd3MainNoLeader + - series: 'etcd_server_has_leader{job="kube-etcd3-main"}' values: '0+0x30' # KubeEtcd3HighNumberOfFailedProposals - - series: 'etcd_server_proposals_failed_total{job="kube-etcd3-test", pod="etcd"}' + - series: 'etcd_server_proposals_failed_total{job="kube-etcd3-main", pod="etcd"}' values: '0+1x6 6+0x115' # KubeEtcd3DbSizeLimitApproaching # KubeEtcd3DbSizeLimitCrossed - - series: 'etcd_mvcc_db_total_size_in_bytes{job="kube-etcd3-test"}' + - series: 'etcd_mvcc_db_total_size_in_bytes{job="kube-etcd3-main"}' values: '7194070000+107374182x20' # 6.7GB 6.8GB 6.9GB .. 7.7GB alert_rule_test: - eval_time: 15m - alertname: KubeEtcdTestDown + alertname: KubeEtcdMainDown exp_alerts: - exp_labels: service: etcd @@ -29,10 +29,10 @@ tests: type: seed visibility: operator exp_annotations: - description: Etcd3 cluster test is unavailable (due to possible quorum loss) or cannot be scraped. As long as etcd3 test is down, the cluster is unreachable. - summary: Etcd3 test cluster down. + description: Etcd3 cluster main is unavailable (due to possible quorum loss) or cannot be scraped. As long as etcd3 main is down, the cluster is unreachable. + summary: Etcd3 main cluster down. - eval_time: 15m - alertname: KubeEtcd3TestNoLeader + alertname: KubeEtcd3MainNoLeader exp_alerts: - exp_labels: service: etcd @@ -40,8 +40,8 @@ tests: type: seed visibility: operator exp_annotations: - description: Etcd3 test has no leader. Possible network partition in the etcd cluster. - summary: Etcd3 test has no leader. + description: Etcd3 main has no leader. Possible network partition in the etcd cluster. + summary: Etcd3 main has no leader. - eval_time: 1h alertname: KubeEtcd3HighNumberOfFailedProposals exp_alerts: @@ -51,31 +51,31 @@ tests: type: seed visibility: operator pod: etcd - job: kube-etcd3-test + job: kube-etcd3-main exp_annotations: - description: Etcd3 test pod etcd has seen 6 proposal failures within the last hour. + description: Etcd3 main pod etcd has seen 6 proposal failures within the last hour. summary: High number of failed etcd proposals - eval_time: 5m alertname: KubeEtcd3DbSizeLimitApproaching exp_alerts: - exp_labels: - job: kube-etcd3-test + job: kube-etcd3-main service: etcd severity: warning type: seed visibility: all exp_annotations: - description: Etcd3 test DB size is approaching its current practical limit of 8GB. Etcd quota might need to be increased. - summary: Etcd3 test DB size is approaching its current practical limit. + description: Etcd3 main DB size is approaching its current practical limit of 8GB. 
Etcd quota might need to be increased. + summary: Etcd3 main DB size is approaching its current practical limit. - eval_time: 10m alertname: KubeEtcd3DbSizeLimitCrossed exp_alerts: - exp_labels: - job: kube-etcd3-test + job: kube-etcd3-main service: etcd severity: critical type: seed visibility: all exp_annotations: - description: Etcd3 test DB size has crossed its current practical limit of 8GB. Etcd quota must be increased to allow updates. - summary: Etcd3 test DB size has crossed its current practical limit. + description: Etcd3 main DB size has crossed its current practical limit of 8GB. Etcd quota must be increased to allow updates. + summary: Etcd3 main DB size has crossed its current practical limit. diff --git a/pkg/operation/botanist/component/etcd/testdata/monitoring_alertingrules_normal_singlenode_with_backup.yaml b/pkg/operation/botanist/component/etcd/testdata/monitoring_alertingrules_normal_singlenode_with_backup.yaml index 6b4fb58e290..9d3f75e46c4 100644 --- a/pkg/operation/botanist/component/etcd/testdata/monitoring_alertingrules_normal_singlenode_with_backup.yaml +++ b/pkg/operation/botanist/component/etcd/testdata/monitoring_alertingrules_normal_singlenode_with_backup.yaml @@ -1,50 +1,50 @@ rule_files: -- kube-etcd3-test.rules.yaml +- kube-etcd3-main.rules.yaml evaluation_interval: 30s tests: - interval: 30s input_series: - # KubeEtcdTestDown - # KubeEtcdBackupRestoreTestDown - - series: 'up{job="kube-etcd3-test"}' + # KubeEtcdMainDown + # KubeEtcdBackupRestoreMainDown + - series: 'up{job="kube-etcd3-main"}' values: '0+0x30 1+0x40' - # KubeEtcd3TestNoLeader - - series: 'etcd_server_has_leader{job="kube-etcd3-test"}' + # KubeEtcd3MainNoLeader + - series: 'etcd_server_has_leader{job="kube-etcd3-main"}' values: '0+0x30' # KubeEtcd3HighNumberOfFailedProposals - - series: 'etcd_server_proposals_failed_total{job="kube-etcd3-test", pod="etcd"}' + - series: 'etcd_server_proposals_failed_total{job="kube-etcd3-main", pod="etcd"}' values: '0+1x6 6+0x115' # KubeEtcd3DbSizeLimitApproaching # KubeEtcd3DbSizeLimitCrossed - - series: 'etcd_mvcc_db_total_size_in_bytes{job="kube-etcd3-test"}' + - series: 'etcd_mvcc_db_total_size_in_bytes{job="kube-etcd3-main"}' values: '7194070000+107374182x20' # 6.7GB 6.8GB 6.9GB .. 
7.7GB # KubeEtcdDeltaBackupFailed - - series: 'etcdbr_snapshot_latest_timestamp{job="kube-etcd3-backup-restore-test",kind="Incr", pod="etcd", role="test"}' + - series: 'etcdbr_snapshot_latest_timestamp{job="kube-etcd3-backup-restore-main",kind="Incr", pod="etcd", role="test"}' values: '0+0x62' - - series: 'etcdbr_snapshot_required{job="kube-etcd3-backup-restore-test",kind="Incr", pod="etcd", role="test"}' + - series: 'etcdbr_snapshot_required{job="kube-etcd3-backup-restore-main",kind="Incr", pod="etcd", role="test"}' values: '1+0x62' - - series: 'etcd_server_is_leader{job="kube-etcd3-test", pod="etcd", role="test"}' + - series: 'etcd_server_is_leader{job="kube-etcd3-main", pod="etcd", role="test"}' values: '1+0x62' # KubeEtcdFullBackupFailed - - series: 'etcdbr_snapshot_latest_timestamp{job="kube-etcd3-backup-restore-test",kind="Full", pod="etcd", role="test"}' + - series: 'etcdbr_snapshot_latest_timestamp{job="kube-etcd3-backup-restore-main",kind="Full", pod="etcd", role="test"}' values: '0+0x2912' - - series: 'etcdbr_snapshot_required{job="kube-etcd3-backup-restore-test",kind="Full", pod="etcd", role="test"}' + - series: 'etcdbr_snapshot_required{job="kube-etcd3-backup-restore-main",kind="Full", pod="etcd", role="test"}' values: '1+0x2912' - - series: 'etcd_server_is_leader{job="kube-etcd3-test", pod="etcd", role="test"}' + - series: 'etcd_server_is_leader{job="kube-etcd3-main", pod="etcd", role="test"}' values: '1+0x2912' # KubeEtcdRestorationFailed - - series: 'etcdbr_restoration_duration_seconds_count{job="kube-etcd3-backup-restore-test",succeeded="false"}' + - series: 'etcdbr_restoration_duration_seconds_count{job="kube-etcd3-backup-restore-main",succeeded="false"}' values: '0+0x7 1 2 2' - # KubeEtcdBackupRestoreTestDown - - series: 'up{job="kube-etcd3-backup-restore-test"}' + # KubeEtcdBackupRestoreMainDown + - series: 'up{job="kube-etcd3-backup-restore-main"}' values: '0+0x60 1+0x10' - - series: 'etcdbr_snapshotter_failure{job="kube-etcd3-backup-restore-test"}' + - series: 'etcdbr_snapshotter_failure{job="kube-etcd3-backup-restore-main"}' values: '1+1x30 1+0x40' alert_rule_test: - eval_time: 15m - alertname: KubeEtcdTestDown + alertname: KubeEtcdMainDown exp_alerts: - exp_labels: service: etcd @@ -52,10 +52,10 @@ tests: type: seed visibility: operator exp_annotations: - description: Etcd3 cluster test is unavailable or cannot be scraped. As long as etcd3 test is down, the cluster is unreachable. - summary: Etcd3 test cluster down. + description: Etcd3 cluster main is unavailable or cannot be scraped. As long as etcd3 main is down, the cluster is unreachable. + summary: Etcd3 main cluster down. - eval_time: 15m - alertname: KubeEtcd3TestNoLeader + alertname: KubeEtcd3MainNoLeader exp_alerts: - exp_labels: service: etcd @@ -63,8 +63,8 @@ tests: type: seed visibility: operator exp_annotations: - description: Etcd3 test has no leader. - summary: Etcd3 test has no leader. + description: Etcd3 main has no leader. + summary: Etcd3 main has no leader. - eval_time: 1h alertname: KubeEtcd3HighNumberOfFailedProposals exp_alerts: @@ -74,41 +74,41 @@ tests: type: seed visibility: operator pod: etcd - job: kube-etcd3-test + job: kube-etcd3-main exp_annotations: - description: Etcd3 test pod etcd has seen 6 proposal failures within the last hour. + description: Etcd3 main pod etcd has seen 6 proposal failures within the last hour. 
summary: High number of failed etcd proposals - eval_time: 5m alertname: KubeEtcd3DbSizeLimitApproaching exp_alerts: - exp_labels: - job: kube-etcd3-test + job: kube-etcd3-main service: etcd severity: warning type: seed visibility: all exp_annotations: - description: Etcd3 test DB size is approaching its current practical limit of 8GB. Etcd quota might need to be increased. - summary: Etcd3 test DB size is approaching its current practical limit. + description: Etcd3 main DB size is approaching its current practical limit of 8GB. Etcd quota might need to be increased. + summary: Etcd3 main DB size is approaching its current practical limit. - eval_time: 10m alertname: KubeEtcd3DbSizeLimitCrossed exp_alerts: - exp_labels: - job: kube-etcd3-test + job: kube-etcd3-main service: etcd severity: critical type: seed visibility: all exp_annotations: - description: Etcd3 test DB size has crossed its current practical limit of 8GB. Etcd quota must be increased to allow updates. - summary: Etcd3 test DB size has crossed its current practical limit. + description: Etcd3 main DB size has crossed its current practical limit of 8GB. Etcd quota must be increased to allow updates. + summary: Etcd3 main DB size has crossed its current practical limit. - eval_time: 31m alertname: KubeEtcdDeltaBackupFailed exp_alerts: - exp_labels: pod: etcd role: test - job: kube-etcd3-backup-restore-test + job: kube-etcd3-backup-restore-main kind: Incr service: etcd severity: critical @@ -123,7 +123,7 @@ tests: - exp_labels: pod: etcd role: test - job: kube-etcd3-backup-restore-test + job: kube-etcd3-backup-restore-main kind: Full service: etcd severity: critical @@ -136,7 +136,7 @@ tests: alertname: KubeEtcdRestorationFailed exp_alerts: - exp_labels: - job: kube-etcd3-backup-restore-test + job: kube-etcd3-backup-restore-main succeeded: false service: etcd severity: critical @@ -146,19 +146,19 @@ tests: description: Etcd data restoration was triggered, but has failed. summary: Etcd data restoration failure. - eval_time: 16m - alertname: KubeEtcdBackupRestoreTestDown + alertname: KubeEtcdBackupRestoreMainDown exp_alerts: - exp_labels: - job: kube-etcd3-backup-restore-test + job: kube-etcd3-backup-restore-main service: etcd severity: critical type: seed visibility: operator exp_annotations: - description: Etcd backup restore test process down or snapshotter failed with error. Backups will not be triggered unless backup restore is brought back up. This is unsafe behaviour and may cause data loss. - summary: Etcd backup restore test process down or snapshotter failed with error + description: Etcd backup restore main process down or snapshotter failed with error. Backups will not be triggered unless backup restore is brought back up. This is unsafe behaviour and may cause data loss. + summary: Etcd backup restore main process down or snapshotter failed with error - eval_time: 30m - alertname: KubeEtcdBackupRestoreTestDown + alertname: KubeEtcdBackupRestoreMainDown exp_alerts: - exp_labels: service: etcd @@ -166,8 +166,8 @@ tests: type: seed visibility: operator exp_annotations: - description: Etcd backup restore test process down or snapshotter failed with error. Backups will not be triggered unless backup restore is brought back up. This is unsafe behaviour and may cause data loss. - summary: Etcd backup restore test process down or snapshotter failed with error + description: Etcd backup restore main process down or snapshotter failed with error. Backups will not be triggered unless backup restore is brought back up. 
This is unsafe behaviour and may cause data loss. + summary: Etcd backup restore main process down or snapshotter failed with error - eval_time: 35m - alertname: KubeEtcdBackupRestoreTestDown + alertname: KubeEtcdBackupRestoreMainDown exp_alerts: diff --git a/pkg/operation/botanist/component/etcd/testdata/monitoring_alertingrules_normal_singlenode_without_backup.yaml b/pkg/operation/botanist/component/etcd/testdata/monitoring_alertingrules_normal_singlenode_without_backup.yaml index c88b2ebc273..03c3132ee68 100644 --- a/pkg/operation/botanist/component/etcd/testdata/monitoring_alertingrules_normal_singlenode_without_backup.yaml +++ b/pkg/operation/botanist/component/etcd/testdata/monitoring_alertingrules_normal_singlenode_without_backup.yaml @@ -1,27 +1,27 @@ rule_files: -- kube-etcd3-test.rules.yaml +- kube-etcd3-main.rules.yaml evaluation_interval: 30s tests: - interval: 30s input_series: - # KubeEtcdTestDown - - series: 'up{job="kube-etcd3-test"}' + # KubeEtcdMainDown + - series: 'up{job="kube-etcd3-main"}' values: '0+0x30' - # KubeEtcd3TestNoLeader - - series: 'etcd_server_has_leader{job="kube-etcd3-test"}' + # KubeEtcd3MainNoLeader + - series: 'etcd_server_has_leader{job="kube-etcd3-main"}' values: '0+0x30' # KubeEtcd3HighNumberOfFailedProposals - - series: 'etcd_server_proposals_failed_total{job="kube-etcd3-test", pod="etcd"}' + - series: 'etcd_server_proposals_failed_total{job="kube-etcd3-main", pod="etcd"}' values: '0+1x6 6+0x115' # KubeEtcd3DbSizeLimitApproaching # KubeEtcd3DbSizeLimitCrossed - - series: 'etcd_mvcc_db_total_size_in_bytes{job="kube-etcd3-test"}' + - series: 'etcd_mvcc_db_total_size_in_bytes{job="kube-etcd3-main"}' values: '7194070000+107374182x20' # 6.7GB 6.8GB 6.9GB .. 7.7GB alert_rule_test: - eval_time: 15m - alertname: KubeEtcdTestDown + alertname: KubeEtcdMainDown exp_alerts: - exp_labels: service: etcd @@ -29,10 +29,10 @@ tests: type: seed visibility: operator exp_annotations: - description: Etcd3 cluster test is unavailable or cannot be scraped. As long as etcd3 test is down, the cluster is unreachable. - summary: Etcd3 test cluster down. + description: Etcd3 cluster main is unavailable or cannot be scraped. As long as etcd3 main is down, the cluster is unreachable. + summary: Etcd3 main cluster down. - eval_time: 15m - alertname: KubeEtcd3TestNoLeader + alertname: KubeEtcd3MainNoLeader exp_alerts: - exp_labels: service: etcd @@ -40,8 +40,8 @@ tests: type: seed visibility: operator exp_annotations: - description: Etcd3 test has no leader. - summary: Etcd3 test has no leader. + description: Etcd3 main has no leader. + summary: Etcd3 main has no leader. - eval_time: 1h alertname: KubeEtcd3HighNumberOfFailedProposals exp_alerts: @@ -51,31 +51,31 @@ tests: type: seed visibility: operator pod: etcd - job: kube-etcd3-test + job: kube-etcd3-main exp_annotations: - description: Etcd3 test pod etcd has seen 6 proposal failures within the last hour. + description: Etcd3 main pod etcd has seen 6 proposal failures within the last hour. summary: High number of failed etcd proposals - eval_time: 5m alertname: KubeEtcd3DbSizeLimitApproaching exp_alerts: - exp_labels: - job: kube-etcd3-test + job: kube-etcd3-main service: etcd severity: warning type: seed visibility: all exp_annotations: - description: Etcd3 test DB size is approaching its current practical limit of 8GB. Etcd quota might need to be increased. - summary: Etcd3 test DB size is approaching its current practical limit. + description: Etcd3 main DB size is approaching its current practical limit of 8GB. 
Etcd quota might need to be increased. + summary: Etcd3 main DB size is approaching its current practical limit. - eval_time: 10m alertname: KubeEtcd3DbSizeLimitCrossed exp_alerts: - exp_labels: - job: kube-etcd3-test + job: kube-etcd3-main service: etcd severity: critical type: seed visibility: all exp_annotations: - description: Etcd3 test DB size has crossed its current practical limit of 8GB. Etcd quota must be increased to allow updates. - summary: Etcd3 test DB size has crossed its current practical limit. + description: Etcd3 main DB size has crossed its current practical limit of 8GB. Etcd quota must be increased to allow updates. + summary: Etcd3 main DB size has crossed its current practical limit. diff --git a/pkg/operation/botanist/component/etcd/waiter.go b/pkg/operation/botanist/component/etcd/waiter.go index a1582ab8abe..ac2c9029b25 100644 --- a/pkg/operation/botanist/component/etcd/waiter.go +++ b/pkg/operation/botanist/component/etcd/waiter.go @@ -28,7 +28,7 @@ import ( "sigs.k8s.io/controller-runtime/pkg/client" ) -const ( +var ( // DefaultInterval is the default interval for retry operations. DefaultInterval = 5 * time.Second // DefaultSevereThreshold is the default threshold until an error reported by another component is treated as 'severe'. diff --git a/pkg/operation/botanist/component/etcd/waiter_test.go b/pkg/operation/botanist/component/etcd/waiter_test.go index 1665821d9a0..300a01aa717 100644 --- a/pkg/operation/botanist/component/etcd/waiter_test.go +++ b/pkg/operation/botanist/component/etcd/waiter_test.go @@ -88,7 +88,12 @@ var _ = Describe("#Wait", func() { &retry.UntilTimeout, waiter.UntilTimeout, ) - etcd = New(c, log, testNamespace, sm, testRole, ClassNormal, nil, pointer.Int32(1), "12Gi", pointer.String("abcd"), "", "1.20.1") + etcd = New(log, c, testNamespace, sm, Values{ + Role: testRole, + Class: ClassNormal, + K8sVersion: "1.20.1", + StorageCapacity: "20Gi", + }) etcd.SetHVPAConfig(&HVPAConfig{ Enabled: true, MaintenanceTimeWindow: gardencorev1beta1.MaintenanceTimeWindow{ diff --git a/pkg/operation/botanist/etcd.go b/pkg/operation/botanist/etcd.go index 2f74610fdc2..13c89f1c2c3 100644 --- a/pkg/operation/botanist/etcd.go +++ b/pkg/operation/botanist/etcd.go @@ -16,8 +16,6 @@ package botanist import ( "context" - "fmt" - "hash/crc32" gardencorev1beta1 "github.com/gardener/gardener/pkg/apis/core/v1beta1" v1beta1constants "github.com/gardener/gardener/pkg/apis/core/v1beta1/constants" @@ -37,7 +35,6 @@ import ( hvpav1alpha1 "github.com/gardener/hvpa-controller/api/v1alpha1" corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/types" "k8s.io/utils/pointer" ) @@ -57,18 +54,21 @@ func (b *Botanist) DefaultEtcd(role string, class etcd.Class) (etcd.Interface, e } e := NewEtcd( - b.SeedClientSet.Client(), b.Logger, + b.SeedClientSet.Client(), b.Shoot.SeedNamespace, b.SecretsManager, - role, - class, - v1beta1helper.GetFailureToleranceType(b.Shoot.GetInfo()), - replicas, - b.Seed.GetValidVolumeSize("10Gi"), - &defragmentationSchedule, - gardencorev1beta1helper.GetShootCARotationPhase(b.Shoot.GetInfo().Status.Credentials), - b.ShootVersion(), + etcd.Values{ + Role: role, + Class: class, + FailureToleranceType: v1beta1helper.GetFailureToleranceType(b.Shoot.GetInfo()), + Replicas: replicas, + StorageCapacity: b.Seed.GetValidVolumeSize("10Gi"), + DefragmentationSchedule: &defragmentationSchedule, + CARotationPhase: gardencorev1beta1helper.GetShootCARotationPhase(b.Shoot.GetInfo().Status.Credentials), + K8sVersion: 
b.ShootVersion(), + PriorityClassName: v1beta1constants.PriorityClassNameShootControlPlane500, + }, ) hvpaEnabled := gardenletfeatures.FeatureGate.Enabled(features.HVPA) @@ -192,61 +192,31 @@ func (b *Botanist) scaleETCD(ctx context.Context, replicas int32) error { } func determineBackupSchedule(shoot *gardencorev1beta1.Shoot) (string, error) { - schedule := "%d %d * * *" - - return determineSchedule(shoot, schedule, func(maintenanceTimeWindow *timewindow.MaintenanceTimeWindow, shootUID types.UID) string { - // Randomize the snapshot timing daily but within last hour. - // The 15 minutes buffer is set to snapshot upload time before actual maintenance window start. - snapshotWindowBegin := maintenanceTimeWindow.Begin().Add(-1, -15, 0) - randomMinutes := int(crc32.ChecksumIEEE([]byte(shootUID)) % 60) - snapshotTime := snapshotWindowBegin.Add(0, randomMinutes, 0) - return fmt.Sprintf(schedule, snapshotTime.Minute(), snapshotTime.Hour()) - }) + return timewindow.DetermineSchedule( + "%d %d * * *", + shoot.Spec.Maintenance.TimeWindow.Begin, + shoot.Spec.Maintenance.TimeWindow.End, + shoot.Status.UID, + shoot.CreationTimestamp, + timewindow.RandomizeWithinFirstHourOfTimeWindow, + ) } func determineDefragmentationSchedule(shoot *gardencorev1beta1.Shoot, managedSeed *seedmanagementv1alpha1.ManagedSeed, class etcd.Class) (string, error) { - schedule := "%d %d */3 * *" + scheduleFormat := "%d %d */3 * *" if managedSeed != nil && class == etcd.ClassImportant { // defrag important etcds of ManagedSeeds daily in the maintenance window - schedule = "%d %d * * *" + scheduleFormat = "%d %d * * *" } - return determineSchedule(shoot, schedule, func(maintenanceTimeWindow *timewindow.MaintenanceTimeWindow, shootUID types.UID) string { - // Randomize the defragmentation timing but within the maintenance window. 
- maintenanceWindowBegin := maintenanceTimeWindow.Begin() - windowInMinutes := uint32(maintenanceTimeWindow.Duration().Minutes()) - randomMinutes := int(crc32.ChecksumIEEE([]byte(shootUID)) % windowInMinutes) - maintenanceTime := maintenanceWindowBegin.Add(0, randomMinutes, 0) - return fmt.Sprintf(schedule, maintenanceTime.Minute(), maintenanceTime.Hour()) - }) -} - -func determineSchedule(shoot *gardencorev1beta1.Shoot, schedule string, f func(*timewindow.MaintenanceTimeWindow, types.UID) string) (string, error) { - var ( - begin, end string - shootUID types.UID + return timewindow.DetermineSchedule( + scheduleFormat, + shoot.Spec.Maintenance.TimeWindow.Begin, + shoot.Spec.Maintenance.TimeWindow.End, + shoot.Status.UID, + shoot.CreationTimestamp, + timewindow.RandomizeWithinTimeWindow, ) - - if shoot.Spec.Maintenance != nil && shoot.Spec.Maintenance.TimeWindow != nil { - begin = shoot.Spec.Maintenance.TimeWindow.Begin - end = shoot.Spec.Maintenance.TimeWindow.End - shootUID = shoot.Status.UID - } - - if len(begin) != 0 && len(end) != 0 { - maintenanceTimeWindow, err := timewindow.ParseMaintenanceTimeWindow(begin, end) - if err != nil { - return "", err - } - - if !maintenanceTimeWindow.Equal(timewindow.AlwaysTimeWindow) { - return f(maintenanceTimeWindow, shootUID), nil - } - } - - creationMinute := shoot.CreationTimestamp.Minute() - creationHour := shoot.CreationTimestamp.Hour() - return fmt.Sprintf(schedule, creationMinute, creationHour), nil } func getEtcdReplicas(shoot *gardencorev1beta1.Shoot) int32 { diff --git a/pkg/operation/botanist/etcd_test.go b/pkg/operation/botanist/etcd_test.go index 33c711f07f3..20d0259e2d0 100644 --- a/pkg/operation/botanist/etcd_test.go +++ b/pkg/operation/botanist/etcd_test.go @@ -464,28 +464,21 @@ type newEtcdValidator struct { } func (v *newEtcdValidator) NewEtcd( - client client.Client, log logr.Logger, + client client.Client, namespace string, secretsManager secretsmanager.Interface, - role string, - class etcd.Class, - _ *gardencorev1beta1.FailureToleranceType, - replicas *int32, - storageCapacity string, - defragmentationSchedule *string, - _ gardencorev1beta1.ShootCredentialsRotationPhase, - _ string, + values etcd.Values, ) etcd.Interface { - Expect(client).To(v.expectedClient) Expect(log).To(v.expectedLogger) + Expect(client).To(v.expectedClient) Expect(namespace).To(v.expectedNamespace) Expect(secretsManager).To(v.expectedSecretsManager) - Expect(role).To(v.expectedRole) - Expect(class).To(v.expectedClass) - Expect(replicas).To(v.expectedReplicas) - Expect(storageCapacity).To(v.expectedStorageCapacity) - Expect(defragmentationSchedule).To(v.expectedDefragmentationSchedule) + Expect(values.Role).To(v.expectedRole) + Expect(values.Class).To(v.expectedClass) + Expect(values.Replicas).To(v.expectedReplicas) + Expect(values.StorageCapacity).To(v.expectedStorageCapacity) + Expect(values.DefragmentationSchedule).To(v.expectedDefragmentationSchedule) return v } diff --git a/pkg/operator/client/scheme.go b/pkg/operator/client/scheme.go index cfaaa9dd627..a62837feaf2 100644 --- a/pkg/operator/client/scheme.go +++ b/pkg/operator/client/scheme.go @@ -16,6 +16,7 @@ package client import ( druidv1alpha1 "github.com/gardener/etcd-druid/api/v1alpha1" + hvpav1alpha1 "github.com/gardener/hvpa-controller/api/v1alpha1" apiextensionsinstall "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/install" "k8s.io/apimachinery/pkg/runtime" utilruntime "k8s.io/apimachinery/pkg/util/runtime" @@ -39,6 +40,7 @@ func init() { resourcesv1alpha1.AddToScheme, 
vpaautoscalingv1.AddToScheme, druidv1alpha1.AddToScheme, + hvpav1alpha1.AddToScheme, ) ) diff --git a/pkg/operator/controller/garden/components.go b/pkg/operator/controller/garden/components.go index a63cf48a88f..d3bda6b9ac3 100644 --- a/pkg/operator/controller/garden/components.go +++ b/pkg/operator/controller/garden/components.go @@ -15,14 +15,20 @@ package garden import ( + hvpav1alpha1 "github.com/gardener/hvpa-controller/api/v1alpha1" + "github.com/go-logr/logr" + "k8s.io/utils/pointer" + v1beta1constants "github.com/gardener/gardener/pkg/apis/core/v1beta1/constants" operatorv1alpha1 "github.com/gardener/gardener/pkg/apis/operator/v1alpha1" "github.com/gardener/gardener/pkg/features" "github.com/gardener/gardener/pkg/operation/botanist/component" + "github.com/gardener/gardener/pkg/operation/botanist/component/etcd" "github.com/gardener/gardener/pkg/operation/botanist/component/gardensystem" sharedcomponent "github.com/gardener/gardener/pkg/operation/botanist/component/shared" operatorfeatures "github.com/gardener/gardener/pkg/operator/features" secretsmanager "github.com/gardener/gardener/pkg/utils/secrets/manager" + "github.com/gardener/gardener/pkg/utils/timewindow" ) func (r *Reconciler) newGardenerResourceManager(garden *operatorv1alpha1.Garden, secretsManager secretsmanager.Interface) (component.DeployWaiter, error) { @@ -81,3 +87,79 @@ func (r *Reconciler) newEtcdDruid() (component.DeployWaiter, error) { func (r *Reconciler) newSystem() component.DeployWaiter { return gardensystem.New(r.RuntimeClient, r.GardenNamespace) } + +func (r *Reconciler) newEtcd( + log logr.Logger, + garden *operatorv1alpha1.Garden, + secretsManager secretsmanager.Interface, + role string, + class etcd.Class, +) ( + etcd.Interface, + error, +) { + var ( + hvpaScaleDownUpdateMode *string + defragmentationScheduleFormat string + storageClassName *string + storageCapacity string + ) + + switch role { + case v1beta1constants.ETCDRoleMain: + hvpaScaleDownUpdateMode = pointer.String(hvpav1alpha1.UpdateModeOff) + defragmentationScheduleFormat = "%d %d * * *" // defrag main etcd daily in the maintenance window + storageCapacity = "25Gi" + if etcd := garden.Spec.VirtualCluster.ETCD; etcd != nil && etcd.Main != nil && etcd.Main.Storage != nil { + storageClassName = etcd.Main.Storage.ClassName + if etcd.Main.Storage.Capacity != nil { + storageCapacity = etcd.Main.Storage.Capacity.String() + } + } + + case v1beta1constants.ETCDRoleEvents: + hvpaScaleDownUpdateMode = pointer.String(hvpav1alpha1.UpdateModeMaintenanceWindow) + defragmentationScheduleFormat = "%d %d */3 * *" + storageCapacity = "10Gi" + if etcd := garden.Spec.VirtualCluster.ETCD; etcd != nil && etcd.Events != nil && etcd.Events.Storage != nil { + storageClassName = etcd.Events.Storage.ClassName + if etcd.Events.Storage.Capacity != nil { + storageCapacity = etcd.Events.Storage.Capacity.String() + } + } + } + + defragmentationSchedule, err := timewindow.DetermineSchedule( + defragmentationScheduleFormat, + garden.Spec.VirtualCluster.Maintenance.TimeWindow.Begin, + garden.Spec.VirtualCluster.Maintenance.TimeWindow.End, + garden.UID, + garden.CreationTimestamp, + timewindow.RandomizeWithinTimeWindow, + ) + if err != nil { + return nil, err + } + + return etcd.New( + log, + r.RuntimeClient, + r.GardenNamespace, + secretsManager, + etcd.Values{ + NamePrefix: "virtual-garden-", + Role: role, + Class: class, + Replicas: pointer.Int32(1), + DefragmentationSchedule: &defragmentationSchedule, + StorageCapacity: storageCapacity, + StorageClassName: 
storageClassName, + PriorityClassName: v1beta1constants.PriorityClassNameGardenSystem500, + HvpaConfig: &etcd.HVPAConfig{ + Enabled: hvpaEnabled(), + MaintenanceTimeWindow: garden.Spec.VirtualCluster.Maintenance.TimeWindow, + ScaleDownUpdateMode: hvpaScaleDownUpdateMode, + }, + }, + ), nil +} diff --git a/pkg/operator/controller/garden/reconciler_delete.go b/pkg/operator/controller/garden/reconciler_delete.go index 2155e853785..51cc45ce250 100644 --- a/pkg/operator/controller/garden/reconciler_delete.go +++ b/pkg/operator/controller/garden/reconciler_delete.go @@ -31,6 +31,7 @@ import ( "github.com/gardener/gardener/pkg/controllerutils" reconcilerutils "github.com/gardener/gardener/pkg/controllerutils/reconciler" "github.com/gardener/gardener/pkg/operation/botanist/component" + "github.com/gardener/gardener/pkg/operation/botanist/component/etcd" "github.com/gardener/gardener/pkg/operation/botanist/component/hvpa" "github.com/gardener/gardener/pkg/operation/botanist/component/vpa" "github.com/gardener/gardener/pkg/utils/flow" @@ -50,6 +51,7 @@ func (r *Reconciler) delete( applier := kubernetes.NewApplier(r.RuntimeClient, r.RuntimeClient.RESTMapper()) log.Info("Instantiating component destroyers") + // garden system components hvpaCRD := hvpa.NewCRD(applier) vpaCRD := vpa.NewCRD(applier, nil) gardenerResourceManager, err := r.newGardenerResourceManager(garden, secretsManager) @@ -70,19 +72,44 @@ func (r *Reconciler) delete( return reconcile.Result{}, err } + // virtual garden control plane components + etcdMain, err := r.newEtcd(log, garden, secretsManager, v1beta1constants.ETCDRoleMain, etcd.ClassImportant) + if err != nil { + return reconcile.Result{}, err + } + etcdEvents, err := r.newEtcd(log, garden, secretsManager, v1beta1constants.ETCDRoleEvents, etcd.ClassNormal) + if err != nil { + return reconcile.Result{}, err + } + var ( - g = flow.NewGraph("Garden deletion") + g = flow.NewGraph("Garden deletion") + destroyEtcd = g.Add(flow.Task{ + Name: "Destroying main and events ETCDs of virtual garden", + Fn: flow.Parallel(etcdMain.Destroy, etcdEvents.Destroy), + }) + waitUntilEtcdDeleted = g.Add(flow.Task{ + Name: "Waiting until main and event ETCDs have been destroyed", + Fn: flow.Parallel(etcdMain.WaitCleanup, etcdEvents.WaitCleanup), + Dependencies: flow.NewTaskIDs(destroyEtcd), + }) + syncPointVirtualGardenControlPlaneDestroyed = flow.NewTaskIDs( + waitUntilEtcdDeleted, + ) destroyEtcdDruid = g.Add(flow.Task{ - Name: "Destroying ETCD Druid", - Fn: component.OpDestroyAndWait(etcdDruid).Destroy, + Name: "Destroying ETCD Druid", + Fn: component.OpDestroyAndWait(etcdDruid).Destroy, + Dependencies: flow.NewTaskIDs(syncPointVirtualGardenControlPlaneDestroyed), }) destroyHVPAController = g.Add(flow.Task{ - Name: "Destroying HVPA controller", - Fn: component.OpDestroyAndWait(hvpaController).Destroy, + Name: "Destroying HVPA controller", + Fn: component.OpDestroyAndWait(hvpaController).Destroy, + Dependencies: flow.NewTaskIDs(syncPointVirtualGardenControlPlaneDestroyed), }) destroyVerticalPodAutoscaler = g.Add(flow.Task{ - Name: "Destroying Kubernetes vertical pod autoscaler", - Fn: component.OpDestroyAndWait(verticalPodAutoscaler).Destroy, + Name: "Destroying Kubernetes vertical pod autoscaler", + Fn: component.OpDestroyAndWait(verticalPodAutoscaler).Destroy, + Dependencies: flow.NewTaskIDs(syncPointVirtualGardenControlPlaneDestroyed), }) syncPointCleanedUp = flow.NewTaskIDs( destroyEtcdDruid, diff --git a/pkg/operator/controller/garden/reconciler_reconcile.go 
b/pkg/operator/controller/garden/reconciler_reconcile.go index 875f07bf24f..8d3b7291f7f 100644 --- a/pkg/operator/controller/garden/reconciler_reconcile.go +++ b/pkg/operator/controller/garden/reconciler_reconcile.go @@ -28,16 +28,21 @@ import ( "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" "sigs.k8s.io/controller-runtime/pkg/reconcile" + gardencorev1beta1 "github.com/gardener/gardener/pkg/apis/core/v1beta1" + v1beta1constants "github.com/gardener/gardener/pkg/apis/core/v1beta1/constants" operatorv1alpha1 "github.com/gardener/gardener/pkg/apis/operator/v1alpha1" resourcesv1alpha1 "github.com/gardener/gardener/pkg/apis/resources/v1alpha1" "github.com/gardener/gardener/pkg/client/kubernetes" "github.com/gardener/gardener/pkg/controllerutils" + gardenletconfig "github.com/gardener/gardener/pkg/gardenlet/apis/config" "github.com/gardener/gardener/pkg/operation/botanist/component" + "github.com/gardener/gardener/pkg/operation/botanist/component/etcd" "github.com/gardener/gardener/pkg/operation/botanist/component/hvpa" "github.com/gardener/gardener/pkg/operation/botanist/component/vpa" "github.com/gardener/gardener/pkg/utils/flow" secretutils "github.com/gardener/gardener/pkg/utils/secrets" secretsmanager "github.com/gardener/gardener/pkg/utils/secrets/manager" + "github.com/gardener/gardener/pkg/utils/timewindow" ) func (r *Reconciler) reconcile( @@ -77,19 +82,20 @@ func (r *Reconciler) reconcile( return reconcile.Result{}, err } - log.Info("Generating general CA certificate for runtime cluster") - if _, err := secretsManager.Generate(ctx, &secretutils.CertificateSecretConfig{ - Name: operatorv1alpha1.SecretNameCARuntime, - CommonName: "garden-runtime", - CertType: secretutils.CACert, - Validity: pointer.Duration(30 * 24 * time.Hour), - }, secretsmanager.Rotate(secretsmanager.KeepOld), secretsmanager.IgnoreOldSecretsAfter(24*time.Hour)); err != nil { - return reconcile.Result{}, err + log.Info("Generating CA certificates for runtime and virtual clusters") + for _, config := range caCertConfigurations() { + if _, err := secretsManager.Generate(ctx, config, caCertGenerateOptionsFor(config.GetName(), "")...); err != nil { + return reconcile.Result{}, err + } } log.Info("Instantiating component deployers") + // garden system components vpaCRD := vpa.NewCRD(applier, nil) hvpaCRD := hvpa.NewCRD(applier) + if !hvpaEnabled() { + hvpaCRD = component.OpDestroy(hvpaCRD) + } gardenerResourceManager, err := r.newGardenerResourceManager(garden, secretsManager) if err != nil { return reconcile.Result{}, err @@ -108,8 +114,14 @@ func (r *Reconciler) reconcile( return reconcile.Result{}, err } - if !hvpaEnabled() { - hvpaCRD = component.OpDestroy(hvpaCRD) + // virtual garden control plane components + etcdMain, err := r.newEtcd(log, garden, secretsManager, v1beta1constants.ETCDRoleMain, etcd.ClassImportant) + if err != nil { + return reconcile.Result{}, err + } + etcdEvents, err := r.newEtcd(log, garden, secretsManager, v1beta1constants.ETCDRoleEvents, etcd.ClassNormal) + if err != nil { + return reconcile.Result{}, err } var ( @@ -127,26 +139,42 @@ func (r *Reconciler) reconcile( Fn: component.OpWait(gardenerResourceManager).Deploy, Dependencies: flow.NewTaskIDs(deployVPACRD, reconcileHVPACRD), }) - _ = g.Add(flow.Task{ + deploySystemResources = g.Add(flow.Task{ Name: "Deploying system resources", Fn: system.Deploy, Dependencies: flow.NewTaskIDs(deployGardenerResourceManager), }) - _ = g.Add(flow.Task{ + deployVPA = g.Add(flow.Task{ Name: "Deploying Kubernetes vertical pod autoscaler", Fn: 
verticalPodAutoscaler.Deploy, Dependencies: flow.NewTaskIDs(deployGardenerResourceManager), }) - _ = g.Add(flow.Task{ + deployHVPA = g.Add(flow.Task{ Name: "Deploying HVPA controller", Fn: hvpaController.Deploy, Dependencies: flow.NewTaskIDs(deployGardenerResourceManager), }) - _ = g.Add(flow.Task{ + deployEtcdDruid = g.Add(flow.Task{ Name: "Deploying ETCD Druid", Fn: etcdDruid.Deploy, Dependencies: flow.NewTaskIDs(deployGardenerResourceManager), }) + syncPointSystemComponents = flow.NewTaskIDs( + deploySystemResources, + deployVPA, + deployHVPA, + deployEtcdDruid, + ) + deployEtcds = g.Add(flow.Task{ + Name: "Deploying main and events ETCDs of virtual garden", + Fn: r.deployEtcdsFunc(garden, etcdMain, etcdEvents, ""), + Dependencies: flow.NewTaskIDs(syncPointSystemComponents), + }) + _ = g.Add(flow.Task{ + Name: "Waiting until main and event ETCDs report readiness", + Fn: flow.Parallel(etcdMain.Wait, etcdEvents.Wait), + Dependencies: flow.NewTaskIDs(deployEtcds), + }) ) if err := g.Compile().Run(ctx, flow.Opts{Log: log}); err != nil { @@ -155,3 +183,73 @@ func (r *Reconciler) reconcile( return reconcile.Result{}, secretsManager.Cleanup(ctx) } + +func caCertConfigurations() []secretutils.ConfigInterface { + return []secretutils.ConfigInterface{ + &secretutils.CertificateSecretConfig{Name: operatorv1alpha1.SecretNameCARuntime, CertType: secretutils.CACert, Validity: pointer.Duration(30 * 24 * time.Hour)}, + &secretutils.CertificateSecretConfig{Name: v1beta1constants.SecretNameCAETCD, CommonName: "etcd", CertType: secretutils.CACert}, + &secretutils.CertificateSecretConfig{Name: v1beta1constants.SecretNameCAETCDPeer, CommonName: "etcd-peer", CertType: secretutils.CACert}, + } +} + +func caCertGenerateOptionsFor(name string, rotationPhase gardencorev1beta1.ShootCredentialsRotationPhase) []secretsmanager.GenerateOption { + options := []secretsmanager.GenerateOption{secretsmanager.Rotate(secretsmanager.KeepOld)} + + if name == operatorv1alpha1.SecretNameCARuntime { + options = append(options, secretsmanager.IgnoreOldSecretsAfter(24*time.Hour)) + } else if rotationPhase == gardencorev1beta1.RotationCompleting { + options = append(options, secretsmanager.IgnoreOldSecrets()) + } + + return options +} + +func (r *Reconciler) deployEtcdsFunc( + garden *operatorv1alpha1.Garden, + etcdMain, etcdEvents etcd.Interface, + rotationPhase gardencorev1beta1.ShootCredentialsRotationPhase, +) func(context.Context) error { + return func(ctx context.Context) error { + if etcdConfig := garden.Spec.VirtualCluster.ETCD; etcdConfig != nil && etcdConfig.Main != nil && etcdConfig.Main.Backup != nil { + snapshotSchedule, err := timewindow.DetermineSchedule( + "%d %d * * *", + garden.Spec.VirtualCluster.Maintenance.TimeWindow.Begin, + garden.Spec.VirtualCluster.Maintenance.TimeWindow.End, + garden.UID, + garden.CreationTimestamp, + timewindow.RandomizeWithinFirstHourOfTimeWindow, + ) + if err != nil { + return err + } + + var backupLeaderElection *gardenletconfig.ETCDBackupLeaderElection + if r.Config.Controllers.Garden.ETCDConfig != nil { + backupLeaderElection = r.Config.Controllers.Garden.ETCDConfig.BackupLeaderElection + } + + etcdMain.SetBackupConfig(&etcd.BackupConfig{ + Provider: etcdConfig.Main.Backup.Provider, + SecretRefName: etcdConfig.Main.Backup.SecretRef.Name, + Container: etcdConfig.Main.Backup.BucketName, + Prefix: "virtual-garden-etcd-main", + FullSnapshotSchedule: snapshotSchedule, + LeaderElection: backupLeaderElection, + }) + } + + // Roll out the new peer CA first so that every member in the cluster 
trusts the old and the new CA. + // This is required because peer certificates which are used for client and server authentication at the same time, + // are re-created with the new CA in the `Deploy` step. + if rotationPhase == gardencorev1beta1.RotationPreparing { + if err := flow.Sequential( + flow.Parallel(etcdMain.RolloutPeerCA, etcdEvents.RolloutPeerCA), + flow.Parallel(etcdMain.Wait, etcdEvents.Wait), + )(ctx); err != nil { + return err + } + } + + return flow.Parallel(etcdMain.Deploy, etcdEvents.Deploy)(ctx) + } +} diff --git a/pkg/utils/timewindow/schedule.go b/pkg/utils/timewindow/schedule.go new file mode 100644 index 00000000000..bdb289503b2 --- /dev/null +++ b/pkg/utils/timewindow/schedule.go @@ -0,0 +1,77 @@ +// Copyright (c) 2022 SAP SE or an SAP affiliate company. All rights reserved. This file is licensed under the Apache Software License, v. 2 except as noted otherwise in the LICENSE file +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package timewindow + +import ( + "fmt" + "hash/crc32" + + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" +) + +// MutateScheduleFunc is a function for mutating the schedule based on the maintenance time window and UID. +type MutateScheduleFunc func(string, MaintenanceTimeWindow, types.UID) string + +// DetermineSchedule determines a schedule based on the provided format and the creation timestamp. If both the begin +// and end of a maintenance time window are provided and different from the 'always time window' then the provided +// mutation function is applied. +func DetermineSchedule( + scheduleFormat string, + begin, end string, + uid types.UID, + creationTimestamp metav1.Time, + mutate MutateScheduleFunc, +) ( + string, + error, +) { + if len(begin) != 0 && len(end) != 0 { + maintenanceTimeWindow, err := ParseMaintenanceTimeWindow(begin, end) + if err != nil { + return "", err + } + + if !maintenanceTimeWindow.Equal(AlwaysTimeWindow) { + return mutate(scheduleFormat, *maintenanceTimeWindow, uid), nil + } + } + + return fmt.Sprintf(scheduleFormat, creationTimestamp.Minute(), creationTimestamp.Hour()), nil +} + +// RandomizeWithinTimeWindow computes a random time (based on the provided UID) within the provided time window. +func RandomizeWithinTimeWindow(scheduleFormat string, window MaintenanceTimeWindow, uid types.UID) string { + var ( + windowBegin = window.Begin() + windowInMinutes = uint32(window.Duration().Minutes()) + randomMinutes = int(crc32.ChecksumIEEE([]byte(uid)) % windowInMinutes) + randomTime = windowBegin.Add(0, randomMinutes, 0) + ) + + return fmt.Sprintf(scheduleFormat, randomTime.Minute(), randomTime.Hour()) +} + +// RandomizeWithinFirstHourOfTimeWindow computes a random time (based on the provided UID) within the first hour of the +// provided time window. It adds a 15 minutes time buffer before the start. 
+func RandomizeWithinFirstHourOfTimeWindow(scheduleFormat string, window MaintenanceTimeWindow, uid types.UID) string { + var ( + windowBegin = window.Begin().Add(0, -15, 0) + randomMinutes = int(crc32.ChecksumIEEE([]byte(uid)) % 60) + randomTime = windowBegin.Add(-1, randomMinutes, 0) + ) + + return fmt.Sprintf(scheduleFormat, randomTime.Minute(), randomTime.Hour()) +} diff --git a/pkg/utils/timewindow/schedule_test.go b/pkg/utils/timewindow/schedule_test.go new file mode 100644 index 00000000000..04a68eb2b35 --- /dev/null +++ b/pkg/utils/timewindow/schedule_test.go @@ -0,0 +1,76 @@ +// Copyright (c) 2022 SAP SE or an SAP affiliate company. All rights reserved. This file is licensed under the Apache Software License, v. 2 except as noted otherwise in the LICENSE file +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package timewindow_test + +import ( + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" + + . "github.com/gardener/gardener/pkg/utils/timewindow" +) + +var _ = Describe("Schedule", func() { + var ( + scheduleFormat = "%d %d" + uid = types.UID("uid") + window = NewMaintenanceTimeWindow( + NewMaintenanceTime(14, 0, 0), + NewMaintenanceTime(22, 0, 0), + ) + ) + + Describe("#DetermineSchedule", func() { + var ( + begin = "140000+0100" + end = "220000+0100" + creationTimestamp = metav1.Time{} + mutate = func(string, MaintenanceTimeWindow, types.UID) string { + return "foo" + } + ) + + It("should return an error because the time window cannot be parsed", func() { + schedule, err := DetermineSchedule(scheduleFormat, begin, "not-parseable", uid, creationTimestamp, mutate) + Expect(err).To(HaveOccurred()) + Expect(schedule).To(BeEmpty()) + }) + + It("should use the mutate function", func() { + schedule, err := DetermineSchedule(scheduleFormat, begin, end, uid, creationTimestamp, mutate) + Expect(err).NotTo(HaveOccurred()) + Expect(schedule).To(Equal("foo")) + }) + + It("should not use the mutate function because time window is equal to always window", func() { + schedule, err := DetermineSchedule(scheduleFormat, "000000+0000", "235959+0000", uid, creationTimestamp, mutate) + Expect(err).NotTo(HaveOccurred()) + Expect(schedule).To(Equal("0 0")) + }) + }) + + Describe("#RandomizeWithinTimeWindow", func() { + It("should compute a pseudo-randomized time within the time window", func() { + Expect(RandomizeWithinTimeWindow(scheduleFormat, *window, uid)).To(Equal("10 15")) + }) + }) + + Describe("#RandomizeWithinFirstHourOfTimeWindow", func() { + It("should compute a pseudo-randomized time within the first hour of the time window", func() { + Expect(RandomizeWithinFirstHourOfTimeWindow(scheduleFormat, *window, uid)).To(Equal("55 12")) + }) + }) +}) diff --git a/test/e2e/operator/garden/create_delete.go b/test/e2e/operator/garden/create_delete.go index c58d55b6c8d..573fa64111d 100644 --- a/test/e2e/operator/garden/create_delete.go +++ b/test/e2e/operator/garden/create_delete.go @@ -19,6 +19,7 @@ 
import ( "os" "time" + druidv1alpha1 "github.com/gardener/etcd-druid/api/v1alpha1" . "github.com/onsi/ginkgo/v2" . "github.com/onsi/gomega" . "github.com/onsi/gomega/gstruct" @@ -46,7 +47,9 @@ var _ = Describe("Garden Tests", Label("Garden", "default"), func() { var ( parentCtx = context.Background() runtimeClient client.Client - garden *operatorv1alpha1.Garden + + backupSecret *corev1.Secret + garden *operatorv1alpha1.Garden ) BeforeEach(func() { @@ -56,6 +59,15 @@ var _ = Describe("Garden Tests", Label("Garden", "default"), func() { runtimeClient, err = client.New(restConfig, client.Options{Scheme: operatorclient.RuntimeScheme}) Expect(err).NotTo(HaveOccurred()) + backupSecret = &corev1.Secret{ + ObjectMeta: metav1.ObjectMeta{ + Name: "virtual-garden-etcd-main-backup", + Namespace: "garden", + }, + Type: corev1.SecretTypeOpaque, + Data: map[string][]byte{"hostPath": []byte("/etc/gardener/local-backupbuckets")}, + } + garden = &operatorv1alpha1.Garden{ ObjectMeta: metav1.ObjectMeta{ GenerateName: "garden-", @@ -71,6 +83,26 @@ var _ = Describe("Garden Tests", Label("Garden", "default"), func() { }, }, }, + VirtualCluster: operatorv1alpha1.VirtualCluster{ + ETCD: &operatorv1alpha1.ETCD{ + Main: &operatorv1alpha1.ETCDMain{ + Backup: &operatorv1alpha1.Backup{ + Provider: "local", + BucketName: "gardener-operator", + SecretRef: corev1.SecretReference{ + Name: backupSecret.Name, + Namespace: backupSecret.Namespace, + }, + }, + }, + }, + Maintenance: operatorv1alpha1.Maintenance{ + TimeWindow: gardencorev1beta1.MaintenanceTimeWindow{ + Begin: "220000+0100", + End: "230000+0100", + }, + }, + }, }, } }) @@ -80,6 +112,7 @@ var _ = Describe("Garden Tests", Label("Garden", "default"), func() { ctx, cancel := context.WithTimeout(parentCtx, 2*time.Minute) defer cancel() + Expect(runtimeClient.Create(ctx, backupSecret)).To(Succeed()) Expect(runtimeClient.Create(ctx, garden)).To(Succeed()) CEventually(ctx, func(g Gomega) []gardencorev1beta1.Condition { g.Expect(runtimeClient.Get(ctx, client.ObjectKeyFromObject(garden), garden)).To(Succeed()) @@ -98,11 +131,22 @@ var _ = Describe("Garden Tests", Label("Garden", "default"), func() { )) }).WithPolling(2 * time.Second).Should(Succeed()) + CEventually(ctx, func(g Gomega) []druidv1alpha1.Etcd { + etcdList := &druidv1alpha1.EtcdList{} + g.Expect(runtimeClient.List(ctx, etcdList, client.InNamespace("garden"))).To(Succeed()) + return etcdList.Items + }).Should(ConsistOf( + healthyEtcd("virtual-garden-etcd-main"), + healthyEtcd("virtual-garden-etcd-events"), + )) + By("Delete Garden") ctx, cancel = context.WithTimeout(parentCtx, 20*time.Minute) defer cancel() Expect(runtimeClient.Delete(ctx, garden)).To(Succeed()) + Expect(runtimeClient.Delete(ctx, backupSecret)).To(Succeed()) + CEventually(ctx, func() error { return runtimeClient.Get(ctx, client.ObjectKeyFromObject(garden), garden) }).WithPolling(2 * time.Second).Should(BeNotFoundError()) @@ -133,3 +177,10 @@ func healthyManagedResource(name string) gomegatypes.GomegaMatcher { )}), }) } + +func healthyEtcd(name string) gomegatypes.GomegaMatcher { + return MatchFields(IgnoreExtras, Fields{ + "ObjectMeta": MatchFields(IgnoreExtras, Fields{"Name": Equal(name)}), + "Status": MatchFields(IgnoreExtras, Fields{"Ready": PointTo(BeTrue())}), + }) +} diff --git a/test/integration/gardenlet/seed/seed/seed_test.go b/test/integration/gardenlet/seed/seed/seed_test.go index 7b969e01832..bfd12f13244 100644 --- a/test/integration/gardenlet/seed/seed/seed_test.go +++ b/test/integration/gardenlet/seed/seed/seed_test.go @@ -415,6 
+415,16 @@ var _ = Describe("Seed controller tests", func() { testID: testRunID, }, }, + Spec: operatorv1alpha1.GardenSpec{ + VirtualCluster: operatorv1alpha1.VirtualCluster{ + Maintenance: operatorv1alpha1.Maintenance{ + TimeWindow: gardencorev1beta1.MaintenanceTimeWindow{ + Begin: "220000+0100", + End: "230000+0100", + }, + }, + }, + }, } Expect(testClient.Create(ctx, garden)).To(Succeed()) log.Info("Created Garden for test", "garden", garden.Name) diff --git a/test/integration/operator/garden/garden_suite_test.go b/test/integration/operator/garden/garden_suite_test.go index 793d1f7417a..d11ca62d8de 100644 --- a/test/integration/operator/garden/garden_suite_test.go +++ b/test/integration/operator/garden/garden_suite_test.go @@ -76,7 +76,12 @@ var _ = BeforeSuite(func() { By("starting test environment") testEnv = &envtest.Environment{ CRDInstallOptions: envtest.CRDInstallOptions{ - Paths: []string{filepath.Join("..", "..", "..", "..", "example", "operator", "10-crd-operator.gardener.cloud_gardens.yaml")}, + Paths: []string{ + filepath.Join("..", "..", "..", "..", "example", "operator", "10-crd-operator.gardener.cloud_gardens.yaml"), + // This CRD would be installed by gardener-resource-manager (GRM) in a real system, however in this + // integration test GRM is not running. Hence, we have to create it manually to satisfy the test setup. + filepath.Join("..", "..", "..", "..", "example", "seed-crds", "10-crd-druid.gardener.cloud_etcds.yaml"), + }, }, ErrorIfCRDPathMissing: true, } diff --git a/test/integration/operator/garden/garden_test.go b/test/integration/operator/garden/garden_test.go index 3aea2014541..784973b1f38 100644 --- a/test/integration/operator/garden/garden_test.go +++ b/test/integration/operator/garden/garden_test.go @@ -15,6 +15,9 @@ package garden_test import ( + "time" + + druidv1alpha1 "github.com/gardener/etcd-druid/api/v1alpha1" . "github.com/onsi/ginkgo/v2" . "github.com/onsi/gomega" . 
"github.com/onsi/gomega/gstruct" @@ -31,6 +34,7 @@ import ( resourcesv1alpha1 "github.com/gardener/gardener/pkg/apis/resources/v1alpha1" "github.com/gardener/gardener/pkg/controllerutils" "github.com/gardener/gardener/pkg/features" + "github.com/gardener/gardener/pkg/operation/botanist/component/etcd" operatorfeatures "github.com/gardener/gardener/pkg/operator/features" secretutils "github.com/gardener/gardener/pkg/utils/secrets" "github.com/gardener/gardener/pkg/utils/test" @@ -43,6 +47,10 @@ var _ = Describe("Garden controller tests", func() { BeforeEach(func() { DeferCleanup(test.WithVar(&secretutils.GenerateKey, secretutils.FakeGenerateKey)) DeferCleanup(test.WithFeatureGate(operatorfeatures.FeatureGate, features.HVPA, true)) + DeferCleanup(test.WithVars( + &etcd.DefaultInterval, 100*time.Millisecond, + &etcd.DefaultTimeout, 500*time.Millisecond, + )) garden = &operatorv1alpha1.Garden{ ObjectMeta: metav1.ObjectMeta{ @@ -60,6 +68,14 @@ var _ = Describe("Garden controller tests", func() { }, }, }, + VirtualCluster: operatorv1alpha1.VirtualCluster{ + Maintenance: operatorv1alpha1.Maintenance{ + TimeWindow: gardencorev1beta1.MaintenanceTimeWindow{ + Begin: "220000+0100", + End: "230000+0100", + }, + }, + }, }, } @@ -109,7 +125,7 @@ var _ = Describe("Garden controller tests", func() { MatchFields(IgnoreExtras, Fields{"ObjectMeta": MatchFields(IgnoreExtras, Fields{"Name": Equal("hvpas.autoscaling.k8s.io")})}), )) - By("Verify that CA secret was generated") + By("Verify that garden runtime CA secret was generated") Eventually(func(g Gomega) []corev1.Secret { secretList := &corev1.SecretList{} g.Expect(testClient.List(ctx, secretList, client.InNamespace(testNamespace.Name), client.MatchingLabels{"name": "ca-garden-runtime", "managed-by": "secrets-manager", "manager-identity": "gardener-operator"})).To(Succeed()) @@ -161,6 +177,35 @@ var _ = Describe("Garden controller tests", func() { MatchFields(IgnoreExtras, Fields{"ObjectMeta": MatchFields(IgnoreExtras, Fields{"Name": Equal("etcd-druid")})}), )) + By("Verify that the virtual garden control plane components have been deployed") + Eventually(func(g Gomega) []druidv1alpha1.Etcd { + etcdList := &druidv1alpha1.EtcdList{} + g.Expect(testClient.List(ctx, etcdList, client.InNamespace(testNamespace.Name))).To(Succeed()) + return etcdList.Items + }).Should(ConsistOf( + MatchFields(IgnoreExtras, Fields{"ObjectMeta": MatchFields(IgnoreExtras, Fields{"Name": Equal("virtual-garden-etcd-main")})}), + MatchFields(IgnoreExtras, Fields{"ObjectMeta": MatchFields(IgnoreExtras, Fields{"Name": Equal("virtual-garden-etcd-events")})}), + )) + + // The garden controller waits for the Etcd resources to be healthy, but etcd-druid is not really running in + // this test, so let's fake this here. 
+ By("Patch Etcd resources to report healthiness") + Eventually(func(g Gomega) { + for _, suffix := range []string{"main", "events"} { + etcd := &druidv1alpha1.Etcd{ObjectMeta: metav1.ObjectMeta{Name: "virtual-garden-etcd-" + suffix, Namespace: testNamespace.Name}} + g.Expect(testClient.Get(ctx, client.ObjectKeyFromObject(etcd), etcd)).To(Succeed(), "for "+etcd.Name) + + patch := client.MergeFrom(etcd.DeepCopy()) + delete(etcd.Annotations, "gardener.cloud/operation") + g.Expect(testClient.Patch(ctx, etcd, patch)).To(Succeed(), "for "+etcd.Name) + + patch = client.MergeFrom(etcd.DeepCopy()) + etcd.Status.ObservedGeneration = &etcd.Generation + etcd.Status.Ready = pointer.Bool(true) + g.Expect(testClient.Status().Patch(ctx, etcd, patch)).To(Succeed(), "for "+etcd.Name) + } + }).Should(Succeed()) + By("Wait for Reconciled condition to be set to True") Eventually(func(g Gomega) []gardencorev1beta1.Condition { g.Expect(testClient.Get(ctx, client.ObjectKeyFromObject(garden), garden)).To(Succeed()) @@ -170,6 +215,16 @@ var _ = Describe("Garden controller tests", func() { By("Delete Garden") Expect(testClient.Delete(ctx, garden)).To(Succeed()) + By("Verify that the virtual garden control plane components have been deleted") + Eventually(func(g Gomega) []druidv1alpha1.Etcd { + etcdList := &druidv1alpha1.EtcdList{} + g.Expect(testClient.List(ctx, etcdList)).To(Succeed()) + return etcdList.Items + }).ShouldNot(ContainElements( + MatchFields(IgnoreExtras, Fields{"ObjectMeta": MatchFields(IgnoreExtras, Fields{"Name": Equal("virtual-garden-etcd-main")})}), + MatchFields(IgnoreExtras, Fields{"ObjectMeta": MatchFields(IgnoreExtras, Fields{"Name": Equal("virtual-garden-etcd-events")})}), + )) + By("Verify that the garden system components have been deleted") // When the controller succeeds then it deletes the `ManagedResource` CRD, so we only need to ensure here that // the `ManagedResource` API is no longer available. @@ -195,6 +250,13 @@ var _ = Describe("Garden controller tests", func() { return testClient.Get(ctx, client.ObjectKeyFromObject(deployment), deployment) }).Should(BeNotFoundError()) + By("Verify that secrets have been deleted") + Eventually(func(g Gomega) []corev1.Secret { + secretList := &corev1.SecretList{} + g.Expect(testClient.List(ctx, secretList, client.InNamespace(testNamespace.Name), client.MatchingLabels{"managed-by": "secrets-manager", "manager-identity": "gardener-operator"})).To(Succeed()) + return secretList.Items + }).Should(BeEmpty()) + By("Ensure Garden is gone") Eventually(func() error { return testClient.Get(ctx, client.ObjectKeyFromObject(garden), garden)