Skip to content

Commit 2802f23

Browse files
author
mervynwang
committed
support recommend with pod metric
1 parent 2b0ddae commit 2802f23

File tree

5 files changed

+232
-66
lines changed

5 files changed

+232
-66
lines changed

pkg/features/features.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,9 @@ const (
3636

3737
// QOSInitializer enables the qos initialization featrues.
3838
QOSInitializer featuregate.Feature = "QOSInitializer"
39+
40+
// EnablePodRecommendation enables the pod recommendation features.
41+
EnablePodRecommendation featuregate.Feature = "EnablePodRecommendation"
3942
)
4043

4144
var defaultFeatureGates = map[featuregate.Feature]featuregate.FeatureSpec{
@@ -49,6 +52,7 @@ var defaultFeatureGates = map[featuregate.Feature]featuregate.FeatureSpec{
4952
CraneCPUManager: {Default: false, PreRelease: featuregate.Alpha},
5053
QOSInitializer: {Default: false, PreRelease: featuregate.Alpha},
5154
CraneDashboardControl: {Default: false, PreRelease: featuregate.Alpha},
55+
EnablePodRecommendation: {Default: false, PreRelease: featuregate.Alpha},
5256
}
5357

5458
func init() {

pkg/metricnaming/naming.go

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,22 @@ func ResourceToWorkloadMetricNamer(target *corev1.ObjectReference, resourceName
7676
}
7777
}
7878

79+
func ResourceToPodMetricNamer(namespace, podName string, resourceName corev1.ResourceName, caller string) MetricNamer {
80+
// pod
81+
return &GeneralMetricNamer{
82+
CallerName: caller,
83+
Metric: &metricquery.Metric{
84+
Type: metricquery.PodMetricType,
85+
MetricName: resourceName.String(),
86+
Pod: &metricquery.PodNamerInfo{
87+
Namespace: namespace,
88+
Name: podName,
89+
Selector: labels.Everything(),
90+
},
91+
},
92+
}
93+
}
94+
7995
func ResourceToContainerMetricNamer(namespace, apiVersion, workloadKind, workloadName, containerName string, resourceName corev1.ResourceName, caller string) MetricNamer {
8096
// container
8197
return &GeneralMetricNamer{

pkg/recommend/types/types.go

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,7 @@ type EffectiveHorizontalPodAutoscalerRecommendation struct {
5353
}
5454

5555
type ResourceRequestRecommendation struct {
56+
Pod PodRecommendation `json:"pod,omitempty"`
5657
Containers []ContainerRecommendation `json:"containers,omitempty"`
5758
}
5859

@@ -61,4 +62,9 @@ type ContainerRecommendation struct {
6162
Target ResourceList `json:"target,omitempty"`
6263
}
6364

65+
type PodRecommendation struct {
66+
PodName string `json:"podName,omitempty"`
67+
Target ResourceList `json:"target,omitempty"`
68+
}
69+
6470
type ResourceList map[corev1.ResourceName]string

pkg/recommendation/recommender/resource/recommend.go

Lines changed: 203 additions & 63 deletions
Original file line numberDiff line numberDiff line change
@@ -9,14 +9,19 @@ import (
99

1010
corev1 "k8s.io/api/core/v1"
1111
"k8s.io/apimachinery/pkg/api/resource"
12+
"k8s.io/apimachinery/pkg/util/errors"
13+
utilfeature "k8s.io/apiserver/pkg/util/feature"
1214
recommendermodel "k8s.io/autoscaler/vertical-pod-autoscaler/pkg/recommender/model"
1315
"k8s.io/klog/v2"
1416
"sigs.k8s.io/yaml"
1517

1618
predictionapi "github.com/gocrane/api/prediction/v1alpha1"
1719

20+
"github.com/gocrane/crane/pkg/common"
21+
"github.com/gocrane/crane/pkg/features"
1822
"github.com/gocrane/crane/pkg/metricnaming"
1923
"github.com/gocrane/crane/pkg/oom"
24+
"github.com/gocrane/crane/pkg/prediction"
2025
"github.com/gocrane/crane/pkg/prediction/config"
2126
"github.com/gocrane/crane/pkg/recommend/types"
2227
"github.com/gocrane/crane/pkg/recommendation/framework"
@@ -90,6 +95,8 @@ func (rr *ResourceRecommender) Recommend(ctx *framework.RecommendationContext) e
9095
}
9196

9297
resourceRecommendation := &types.ResourceRequestRecommendation{}
98+
namespace := ctx.Object.GetNamespace()
99+
caller := fmt.Sprintf(callerFormat, klog.KObj(ctx.Recommendation), ctx.Recommendation.UID)
93100

94101
var newContainers []corev1.Container
95102
var oldContainers []corev1.Container
@@ -99,82 +106,56 @@ func (rr *ResourceRecommender) Recommend(ctx *framework.RecommendationContext) e
99106
return err
100107
}
101108

102-
namespace := ctx.Object.GetNamespace()
103-
for _, c := range ctx.Pods[0].Spec.Containers {
104-
cr := types.ContainerRecommendation{
105-
ContainerName: c.Name,
106-
Target: map[corev1.ResourceName]string{},
107-
}
108-
109-
caller := fmt.Sprintf(callerFormat, klog.KObj(ctx.Recommendation), ctx.Recommendation.UID)
110-
metricNamer := metricnaming.ResourceToContainerMetricNamer(namespace, ctx.Recommendation.Spec.TargetRef.APIVersion,
111-
ctx.Recommendation.Spec.TargetRef.Kind, ctx.Recommendation.Spec.TargetRef.Name, c.Name, corev1.ResourceCPU, caller)
112-
klog.Infof("%s: CPU query for resource request recommendation: %s", ctx.String(), metricNamer.BuildUniqueKey())
113-
cpuConfig := rr.makeCpuConfig()
114-
tsList, err := utils.QueryPredictedValuesOnce(ctx.Recommendation, predictor, caller, cpuConfig, metricNamer)
109+
// pod
110+
if utilfeature.DefaultFeatureGate.Enabled(features.EnablePodRecommendation) {
111+
cpuTsList, memoryTsList, usePodMetrics, err := rr.getPodCpuAndMemoryTsList(ctx, namespace, caller, predictor)
115112
if err != nil {
116-
return err
113+
klog.Warningf("getPodCpuAndMemoryTsList err: %v", err)
117114
}
118-
if len(tsList) < 1 || len(tsList[0].Samples) < 1 {
119-
return fmt.Errorf("no value retured for queryExpr: %s", metricNamer.BuildUniqueKey())
120-
}
121-
// Check timestamp is completed
122-
if rr.HistoryCompletionCheck {
123-
completion, existDays, err := utils.DetectTimestampCompletion(tsList, rr.CpuModelHistoryLength, time.Now())
124-
if !completion || err != nil {
125-
return fmt.Errorf("%s: cpu timestamps are not completed, expect %s actual %d days", metricNamer.BuildUniqueKey(), rr.CpuModelHistoryLength, existDays)
115+
if usePodMetrics {
116+
klog.V(4).Infof("use pod metrics for pod %s", ctx.Pods[0].Name)
117+
pr := types.PodRecommendation{
118+
PodName: ctx.Pods[0].Name,
119+
Target: map[corev1.ResourceName]string{},
126120
}
127-
}
128121

129-
v := int64(tsList[0].Samples[0].Value * 1000)
130-
cpuQuantity := resource.NewMilliQuantity(v, resource.DecimalSI)
131-
klog.Infof("%s: container %s recommended cpu %s", ctx.String(), c.Name, cpuQuantity.String())
122+
cpuQuantity, memQuantity, err := rr.recommendCpuAndMemResources(ctx, cpuTsList, memoryTsList, oomRecords, namespace, ctx.Object.GetName(), ctx.Pods[0].Name)
123+
if err != nil {
124+
klog.Errorf("recommendCpuAndMemResources %v", err)
125+
}
132126

133-
metricNamer = metricnaming.ResourceToContainerMetricNamer(namespace, ctx.Recommendation.Spec.TargetRef.APIVersion,
134-
ctx.Recommendation.Spec.TargetRef.Kind, ctx.Recommendation.Spec.TargetRef.Name, c.Name, corev1.ResourceMemory, caller)
135-
klog.Infof("%s Memory query for resource request recommendation: %s", ctx.String(), metricNamer.BuildUniqueKey())
136-
memConfig := rr.makeMemConfig()
137-
tsList, err = utils.QueryPredictedValuesOnce(ctx.Recommendation, predictor, caller, memConfig, metricNamer)
127+
if cpuQuantity != nil {
128+
pr.Target[corev1.ResourceCPU] = cpuQuantity.String()
129+
}
130+
if memQuantity != nil {
131+
pr.Target[corev1.ResourceMemory] = memQuantity.String()
132+
}
133+
134+
resourceRecommendation.Pod = pr
135+
} else {
136+
klog.V(4).Infof("not use pod metrics for pod %s", ctx.Pods[0].Name)
137+
}
138+
}
139+
140+
// containers
141+
for _, c := range ctx.Pods[0].Spec.Containers {
142+
cpuTsList, memTsList, err := rr.getContainerCpuAndMemoryTsList(ctx, predictor, caller, namespace, c.Name)
138143
if err != nil {
139144
return err
140145
}
141-
if len(tsList) < 1 || len(tsList[0].Samples) < 1 {
142-
return fmt.Errorf("no value retured for queryExpr: %s", metricNamer.BuildUniqueKey())
143-
}
144-
// Check timestamp is completed
145-
if rr.HistoryCompletionCheck {
146-
completion, existDays, err := utils.DetectTimestampCompletion(tsList, rr.MemHistoryLength, time.Now())
147-
if !completion || err != nil {
148-
return fmt.Errorf("%s: memory timestamps are not completed, expect %s actual %d days ", metricNamer.BuildUniqueKey(), rr.MemHistoryLength, existDays)
149-
}
150-
}
151146

152-
v = int64(tsList[0].Samples[0].Value)
153-
if v <= 0 {
154-
return fmt.Errorf("no enough metrics")
147+
cpuQuantity, memQuantity, err := rr.recommendCpuAndMemResources(ctx, cpuTsList, memTsList, oomRecords, namespace, ctx.Object.GetName(), c.Name)
148+
if err != nil {
149+
return err
155150
}
156-
memQuantity := resource.NewQuantity(v, resource.BinarySI)
157-
klog.Infof("%s: container %s recommended memory %s", ctx.String(), c.Name, memQuantity.String())
158-
159-
// Use oom protected memory if exist
160-
if rr.OOMProtection {
161-
oomProtectMem := rr.MemoryOOMProtection(oomRecords, namespace, ctx.Object.GetName(), c.Name)
162-
if oomProtectMem != nil && !oomProtectMem.IsZero() && oomProtectMem.Cmp(*memQuantity) > 0 {
163-
klog.Infof("%s: container %s using oomProtect Memory %s", ctx.String(), c.Name, oomProtectMem.String())
164-
memQuantity = oomProtectMem
165-
}
151+
if cpuQuantity == nil || memQuantity == nil {
152+
return fmt.Errorf("resource recommendation failed for container %s: cpu=%v, memory=%v", c.Name, cpuQuantity != nil, memQuantity != nil)
166153
}
167154

168-
// Resource Specification enabled
169-
if rr.Specification {
170-
normalizedCpu, normalizedMem := GetNormalizedResource(cpuQuantity, memQuantity, rr.SpecificationConfigs)
171-
klog.Infof("GetNormalizedResource currentCpu %s normalizedCpu %s currentMem %s normalizedMem %s", cpuQuantity.String(), normalizedCpu.String(), memQuantity.String(), normalizedMem.String())
172-
if normalizedCpu.Value() > 0 && normalizedMem.Value() > 0 {
173-
cpuQuantity = &normalizedCpu
174-
memQuantity = &normalizedMem
175-
}
155+
cr := types.ContainerRecommendation{
156+
ContainerName: c.Name,
157+
Target: map[corev1.ResourceName]string{},
176158
}
177-
178159
cr.Target[corev1.ResourceCPU] = cpuQuantity.String()
179160
cr.Target[corev1.ResourceMemory] = memQuantity.String()
180161

@@ -269,3 +250,162 @@ func (rr *ResourceRecommender) MemoryOOMProtection(oomRecords []oom.OOMRecord, n
269250

270251
return nil
271252
}
253+
254+
// getContainerCpuAndMemoryTsList gets container metrics data
255+
func (rr *ResourceRecommender) getContainerCpuAndMemoryTsList(ctx *framework.RecommendationContext,
256+
predictor prediction.Interface,
257+
caller string,
258+
namespace, containerName string) ([]*common.TimeSeries, []*common.TimeSeries, error) {
259+
260+
// cpu
261+
cpuNamer := metricnaming.ResourceToContainerMetricNamer(namespace,
262+
ctx.Recommendation.Spec.TargetRef.APIVersion,
263+
ctx.Recommendation.Spec.TargetRef.Kind,
264+
ctx.Recommendation.Spec.TargetRef.Name,
265+
containerName,
266+
corev1.ResourceCPU,
267+
caller)
268+
269+
cpuTs, err := utils.QueryPredictedValuesOnce(ctx.Recommendation, predictor, caller, rr.makeCpuConfig(), cpuNamer)
270+
if err != nil {
271+
return nil, nil, err
272+
}
273+
274+
// memory
275+
memNamer := metricnaming.ResourceToContainerMetricNamer(namespace,
276+
ctx.Recommendation.Spec.TargetRef.APIVersion,
277+
ctx.Recommendation.Spec.TargetRef.Kind,
278+
ctx.Recommendation.Spec.TargetRef.Name,
279+
containerName,
280+
corev1.ResourceMemory,
281+
caller)
282+
283+
memTs, err := utils.QueryPredictedValuesOnce(ctx.Recommendation, predictor, caller, rr.makeMemConfig(), memNamer)
284+
if err != nil {
285+
return nil, nil, err
286+
}
287+
288+
return cpuTs, memTs, nil
289+
}
290+
291+
func (rr *ResourceRecommender) getPodCpuAndMemoryTsList(ctx *framework.RecommendationContext, namespace, caller string, predictor prediction.Interface) ([]*common.TimeSeries, []*common.TimeSeries, bool, error) {
292+
var errs []error
293+
cpuOK, memOK := true, true
294+
295+
// cpu
296+
cpuMetricNamer := metricnaming.ResourceToPodMetricNamer(namespace,
297+
ctx.Pods[0].Name,
298+
corev1.ResourceCPU,
299+
caller)
300+
cpuTsList, err := utils.QueryPredictedValuesOnce(ctx.Recommendation, predictor, caller, rr.makeCpuConfig(), cpuMetricNamer)
301+
if err != nil {
302+
cpuOK = false
303+
errs = append(errs, err)
304+
}
305+
306+
// memory
307+
memoryMetricNamer := metricnaming.ResourceToPodMetricNamer(namespace,
308+
ctx.Pods[0].Name,
309+
corev1.ResourceMemory,
310+
caller)
311+
memTsList, err := utils.QueryPredictedValuesOnce(ctx.Recommendation, predictor, caller, rr.makeMemConfig(), memoryMetricNamer)
312+
if err != nil {
313+
memOK = false
314+
errs = append(errs, err)
315+
}
316+
317+
if !cpuOK && !memOK {
318+
return nil, nil, false, errors.NewAggregate(errs)
319+
}
320+
321+
return cpuTsList, memTsList, true, errors.NewAggregate(errs)
322+
}
323+
324+
// recommendCpuAndMemResources recommends CPU and memory resources based on historical monitoring data, OOM records, and resource specification normalization
325+
func (rr *ResourceRecommender) recommendCpuAndMemResources(ctx *framework.RecommendationContext,
326+
cpuTsList []*common.TimeSeries,
327+
memTsList []*common.TimeSeries,
328+
oomRecords []oom.OOMRecord,
329+
namespace, workloadName, containerName string) (*resource.Quantity, *resource.Quantity, error) {
330+
331+
var errs []error
332+
cpuOK, memOK := true, true
333+
334+
// cpu
335+
cpuQuantity, err := rr.recommendSingleResource(ctx, cpuTsList, rr.CpuModelHistoryLength, corev1.ResourceCPU, containerName)
336+
if err != nil {
337+
cpuOK = false
338+
errs = append(errs, err)
339+
}
340+
341+
// memory
342+
memQuantity, err := rr.recommendSingleResource(ctx, memTsList, rr.MemHistoryLength, corev1.ResourceMemory, containerName)
343+
if err != nil {
344+
memOK = false
345+
errs = append(errs, err)
346+
}
347+
348+
if !cpuOK && !memOK {
349+
return nil, nil, errors.NewAggregate(errs)
350+
}
351+
352+
// adjust memory recommendations by analyzing historical OOM events
353+
if memOK && rr.OOMProtection {
354+
if oomMem := rr.MemoryOOMProtection(oomRecords, namespace, workloadName, containerName); oomMem != nil {
355+
if !oomMem.IsZero() && oomMem.Cmp(*memQuantity) > 0 {
356+
klog.Infof("%s: %s using oomProtect Memory %s", ctx.String(), containerName, oomMem.String())
357+
memQuantity = oomMem
358+
}
359+
}
360+
}
361+
362+
// standardize resource recommendations to predefined specifications
363+
if rr.Specification {
364+
if cpuOK && memOK {
365+
normalizedCpu, normalizedMem := GetNormalizedResource(cpuQuantity, memQuantity, rr.SpecificationConfigs)
366+
klog.Infof("GetNormalizedResource currentCpu %s normalizedCpu %s currentMem %s normalizedMem %s",
367+
cpuQuantity.String(), normalizedCpu.String(), memQuantity.String(), normalizedMem.String())
368+
if normalizedCpu.Value() > 0 && normalizedMem.Value() > 0 {
369+
cpuQuantity = &normalizedCpu
370+
memQuantity = &normalizedMem
371+
}
372+
} else {
373+
return nil, nil, fmt.Errorf("cpu or memory recommendation failed, cannot standardize resource recommendations to predefined specifications")
374+
}
375+
}
376+
377+
return cpuQuantity, memQuantity, nil
378+
}
379+
380+
func (rr *ResourceRecommender) recommendSingleResource(ctx *framework.RecommendationContext,
381+
tsList []*common.TimeSeries,
382+
historyLength string,
383+
resourceType corev1.ResourceName,
384+
containerName string) (*resource.Quantity, error) {
385+
386+
if len(tsList) == 0 || len(tsList[0].Samples) == 0 {
387+
return nil, fmt.Errorf("no metrics data for %s", resourceType)
388+
}
389+
390+
if rr.HistoryCompletionCheck {
391+
completion, existDays, err := utils.DetectTimestampCompletion(tsList, historyLength, time.Now())
392+
if !completion || err != nil {
393+
return nil, fmt.Errorf("%s timestamps not completed: expect %s actual %d days", resourceType, historyLength, existDays)
394+
}
395+
}
396+
397+
value := tsList[0].Samples[0].Value
398+
var quantity *resource.Quantity
399+
if resourceType == corev1.ResourceCPU {
400+
value *= 1000
401+
quantity = resource.NewQuantity(int64(value), resource.DecimalSI)
402+
} else if resourceType == corev1.ResourceMemory {
403+
quantity = resource.NewQuantity(int64(value), resource.BinarySI)
404+
if value <= 0 {
405+
return nil, fmt.Errorf("invalid %s value: %f", resourceType, value)
406+
}
407+
}
408+
409+
klog.Infof("%s: %s recommended %s %s", ctx.String(), containerName, resourceType, quantity.String())
410+
return quantity, nil
411+
}

pkg/utils/expression_prom_default.go

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -29,9 +29,9 @@ const (
2929
NodeMemUsageUtilizationExprTemplate = `sum(label_replace(container_memory_usage_bytes{instance="%s", namespace!="",container!="POD", container!="",image!=""EXTENSION_LABELS_HOLDER}, "node", "$1", "instance", "(^[^:]+)") * on (node) group_left() max(kube_node_labels{label_beta_kubernetes_io_instance_type!~"eklet", label_node_kubernetes_io_instance_type!~"eklet"EXTENSION_LABELS_HOLDER}) by (node)) by (node) / sum(kube_node_status_capacity{node="%s", resource="memory", unit="byte"EXTENSION_LABELS_HOLDER} * on (node) group_left() max(kube_node_labels{label_beta_kubernetes_io_instance_type!~"eklet", label_node_kubernetes_io_instance_type!~"eklet"EXTENSION_LABELS_HOLDER}) by (node)) by (node) `
3030

3131
// PodCpuUsageExprTemplate is used to query pod cpu usage by promql, param is namespace,pod, duration str
32-
PodCpuUsageExprTemplate = `sum(irate(container_cpu_usage_seconds_total{container!="POD",namespace="%s",pod="%s"EXTENSION_LABELS_HOLDER}[%s]))`
32+
PodCpuUsageExprTemplate = `sum(irate(pod_cpu_seconds_total{mode!~"idle|iowait",namespace="%s",pod="%s"EXTENSION_LABELS_HOLDER}[%s]))`
3333
// PodMemUsageExprTemplate is used to query pod cpu usage by promql, param is namespace,pod
34-
PodMemUsageExprTemplate = `sum(container_memory_working_set_bytes{container!="POD",namespace="%s",pod="%s"EXTENSION_LABELS_HOLDER})`
34+
PodMemUsageExprTemplate = `pod_memory_MemTotal_bytes{namespace="%s",pod="%s"EXTENSION_LABELS_HOLDER} - pod_memory_MemFree_bytes{namespace="%s",pod="%s"EXTENSION_LABELS_HOLDER}`
3535

3636
// ContainerCpuUsageExprTemplate is used to query container cpu usage by promql, param is namespace,pod,container duration str
3737
ContainerCpuUsageExprTemplate = `irate(container_cpu_usage_seconds_total{container!="POD",namespace="%s",pod=~"%s",container="%s"EXTENSION_LABELS_HOLDER}[%s])`
@@ -117,7 +117,7 @@ func GetPodCpuUsageExpression(namespace string, name string) string {
117117
}
118118

119119
func GetPodMemUsageExpression(namespace string, name string) string {
120-
return fmtSprintfInternal(PodMemUsageExprTemplate, namespace, name)
120+
return fmtSprintfInternal(PodMemUsageExprTemplate, namespace, name, namespace, name)
121121
}
122122

123123
func GetNodeCpuUsageExpression(nodeName string) string {

0 commit comments

Comments
 (0)