From 8d2a5712e509c37db72aeefa8251cdeb4ddb4cd9 Mon Sep 17 00:00:00 2001 From: Daichi Mukai Date: Thu, 16 Jun 2022 11:30:59 +0000 Subject: [PATCH 1/2] reconcile: do not resize volume if failed to get inode stats The pvc-autoresizer decides whether each volume needs to be resized based on volume stats. This stats consist of available bytes, capacity, free inode count and all inode counts. The controller gets these values from prometheus metrics. Without this patch, if metrics lack inodes counts for some target volumes, decision on resize necessity are done as if the lacking values are zero. This causes unnecessary resizes for such volumes, and may continues to resize up to the volume storage limit. This patch changes the reconciler to resize volumes only if the controller succeeded to get all four volume stats. Fixes: f7e1be3ceb9e ("Add inode checking feature") --- runners/metrics_client.go | 16 +++++++++++++--- runners/pvc_autoresizer.go | 15 +++++++++++++-- 2 files changed, 26 insertions(+), 5 deletions(-) diff --git a/runners/metrics_client.go b/runners/metrics_client.go index 40f3da09..c6c616ef 100644 --- a/runners/metrics_client.go +++ b/runners/metrics_client.go @@ -36,6 +36,11 @@ func NewPrometheusClient(url string) (MetricsClient, error) { // MetricsClient is an interface for getting metrics type MetricsClient interface { + // GetMetrics returns volume stats metrics of PVCs + // + // The volume stats consist of available bytes, capacity bytes, available inodes and capacity + // inodes. This method returns volume stats for a PVC only if all four metrics of the PVC was + // retrieved from the metrics source. GetMetrics(ctx context.Context) (map[types.NamespacedName]*VolumeStats, error) } @@ -51,6 +56,7 @@ type prometheusClient struct { prometheusAPI prometheusv1.API } +// GetMetrics implements MetricsClient.GetMetrics func (c *prometheusClient) GetMetrics(ctx context.Context) (map[types.NamespacedName]*VolumeStats, error) { volumeStatsMap := make(map[types.NamespacedName]*VolumeStats) @@ -76,16 +82,20 @@ func (c *prometheusClient) GetMetrics(ctx context.Context) (map[types.Namespaced for key, val := range availableBytes { vs := &VolumeStats{AvailableBytes: val} - if cb, ok := capacityBytes[key]; !ok { - continue - } else { + if cb, ok := capacityBytes[key]; ok { vs.CapacityBytes = cb + } else { + continue } if ais, ok := availableInodeSize[key]; ok { vs.AvailableInodeSize = ais + } else { + continue } if cis, ok := capacityInodeSize[key]; ok { vs.CapacityInodeSize = cis + } else { + continue } volumeStatsMap[key] = vs } diff --git a/runners/pvc_autoresizer.go b/runners/pvc_autoresizer.go index 41074428..c72299cf 100644 --- a/runners/pvc_autoresizer.go +++ b/runners/pvc_autoresizer.go @@ -118,10 +118,11 @@ func (w *pvcAutoresizer) reconcile(ctx context.Context) error { return nil } for _, pvc := range pvcs.Items { + log := w.log.WithValues("namespace", pvc.Namespace, "name", pvc.Name) isTarget, err := isTargetPVC(&pvc) if err != nil { metrics.ResizerFailedResizeTotal.Increment(pvc.Name, pvc.Namespace) - w.log.WithValues("namespace", pvc.Namespace, "name", pvc.Name).Error(err, "failed to check target PVC") + log.Error(err, "failed to check target PVC") continue } else if !isTarget { continue @@ -131,12 +132,22 @@ func (w *pvcAutoresizer) reconcile(ctx context.Context) error { Name: pvc.Name, } if _, ok := vsMap[namespacedName]; !ok { + // Do not increment ResizerFailedResizeTotal here. The controller cannot get volume + // stats for "offline" volumes (i.e. volumes not mounted by any pod) since kubelet + // exports volume stats of a persistent volume claim only if it is online. Besides, + // NodeExpandVolume RPC assumes that the volume to be published or staged on a node + // (and hence online), the resize request of controller for offline PVC will not be + // processed for the time being. So, we do not regard it as a resize failure that + // the controller failed to retrieve volume stats for the PVC. This may result in a + // failure to increment the counter in the case which the PVC is online but fails + // to retrieve its metrics, but accept this as a limitation for now. + log.Info("failed to get volume stats") continue } err = w.resize(ctx, &pvc, vsMap[namespacedName]) if err != nil { metrics.ResizerFailedResizeTotal.Increment(pvc.Name, pvc.Namespace) - w.log.WithValues("namespace", pvc.Namespace, "name", pvc.Name).Error(err, "failed to resize PVC") + log.Error(err, "failed to resize PVC") } } } From 35410dd47f78f03e3a6f302494ec6fe5f3c0c287 Mon Sep 17 00:00:00 2001 From: Daichi Mukai Date: Tue, 21 Jun 2022 04:07:04 +0000 Subject: [PATCH 2/2] e2e: bump TopoLVM version The e2e test depends on topolvm's example which had flakiness due to insufficient wait for webhook readiness. This is addressed in topolvm commit 31f8d5329c7e ("example: wait for topolvm controller mutating webhook to become ready"). Use this commit if possible to reduce CI flakiness. --- e2e/Makefile | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/e2e/Makefile b/e2e/Makefile index cf12cf9d..f47974e3 100644 --- a/e2e/Makefile +++ b/e2e/Makefile @@ -3,15 +3,19 @@ TEST_KUBERNETES_TARGET ?= current ifeq ($(TEST_KUBERNETES_TARGET),current) KUBERNETES_VERSION := 1.23.3 -TOPOLVM_VERSION := 0.11.1 +# In This commit, flakiness of CI due to webhook readiness is addressed. +# todo: Use tag if this commit is released +TOPOLVM_VERSION := 31f8d5329c7e518240fb33681245e43f2d3491f3 else ifeq ($(TEST_KUBERNETES_TARGET),prev) KUBERNETES_VERSION := 1.22.4 -TOPOLVM_VERSION := 0.11.1 +# In This commit, flakiness of CI due to webhook readiness is addressed. +# todo: Use tag if this commit is released +TOPOLVM_VERSION := 31f8d5329c7e518240fb33681245e43f2d3491f3 else ifeq ($(TEST_KUBERNETES_TARGET),prev2) KUBERNETES_VERSION := 1.21.2 # Kubernetes 1.21 does not support KubeSchedulerConfiguration API version v1beta2, # so use older version of TopoLVM -TOPOLVM_VERSION := 0.10.4 +TOPOLVM_VERSION := v0.10.4 endif export KUBERNETES_VERSION @@ -76,7 +80,7 @@ endef $(TMPDIR)/topolvm: git clone https://github.com/topolvm/topolvm.git $@ - cd $@ && git checkout v$(TOPOLVM_VERSION) + cd $@ && git checkout $(TOPOLVM_VERSION) make -C $(TMPDIR)/topolvm/example setup autoresizer.img: