Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: proportion metrics accuracy #2297

Merged

Conversation

LY-today
Copy link
Contributor

@LY-today LY-today commented Jun 13, 2022

refer:#2296
What happened:
When there is only one vcjob under a queue, after deleting the vcjob, it will be found that the metric data in the proportion is inaccurate
What you expected to happen:
Metric data is accurate at all times
How to reproduce it (as minimally and precisely as possible):

/*
Copyright 2017 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package proportion

import (
	"github.com/prometheus/client_golang/prometheus/promhttp"
	"io/ioutil"
	"k8s.io/client-go/util/workqueue"
	"net/http"
	"reflect"
	"strconv"
	"strings"
	"testing"
	"time"
	"volcano.sh/volcano/pkg/scheduler/actions/allocate"

	"github.com/agiledragon/gomonkey/v2"
	apiv1 "k8s.io/api/core/v1"
	schedulingv1 "k8s.io/api/scheduling/v1"
	"k8s.io/apimachinery/pkg/api/resource"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/client-go/tools/record"

	schedulingv1beta1 "volcano.sh/apis/pkg/apis/scheduling/v1beta1"
	"volcano.sh/volcano/cmd/scheduler/app/options"
	"volcano.sh/volcano/pkg/scheduler/api"
	"volcano.sh/volcano/pkg/scheduler/cache"
	"volcano.sh/volcano/pkg/scheduler/conf"
	"volcano.sh/volcano/pkg/scheduler/framework"
	"volcano.sh/volcano/pkg/scheduler/plugins/gang"
	"volcano.sh/volcano/pkg/scheduler/plugins/priority"
	"volcano.sh/volcano/pkg/scheduler/util"
)

func getWorkerAffinity() *apiv1.Affinity {
	return &apiv1.Affinity{
		PodAntiAffinity: &apiv1.PodAntiAffinity{
			RequiredDuringSchedulingIgnoredDuringExecution: []apiv1.PodAffinityTerm{
				{
					LabelSelector: &metav1.LabelSelector{
						MatchExpressions: []metav1.LabelSelectorRequirement{
							{
								Key:      "role",
								Operator: "In",
								Values:   []string{"worker"},
							},
						},
					},
					TopologyKey: "kubernetes.io/hostname",
				},
			},
		},
	}
}

func getLocalMetrics() int {
	var data int

	url := "http://127.0.0.1:8081/metrics"
	method := "GET"

	client := &http.Client{
	}
	req, err := http.NewRequest(method, url, nil)

	if err != nil {
		return data
	}
	req.Header.Add("Authorization", "8cbdb37a-b880-4f2e-844c-e420858ea7eb")

	res, err := client.Do(req)
	if err != nil {
		return data
	}
	defer res.Body.Close()

	body, err := ioutil.ReadAll(res.Body)
	if err != nil {
		return data
	}

	split := strings.Split(string(body), "\n")
	for _, v := range split {
		if !strings.Contains(v, "#") && (strings.Contains(v, "volcano_queue_allocated_memory_bytes") || strings.Contains(v, "volcano_queue_allocated_milli_cpu")) {
			data, _ = strconv.Atoi(strings.Split(v, " ")[1])
		}
	}

	return data
}

func TestProportion(t *testing.T) {
	c := make(chan bool, 1)
	var tmp *cache.SchedulerCache
	patches := gomonkey.ApplyMethod(reflect.TypeOf(tmp), "AddBindTask", func(scCache *cache.SchedulerCache, task *api.TaskInfo) error {
		scCache.Binder.Bind(nil, []*api.TaskInfo{task})
		return nil
	})
	defer patches.Reset()

	framework.RegisterPluginBuilder(PluginName, New)
	framework.RegisterPluginBuilder(gang.PluginName, gang.New)
	framework.RegisterPluginBuilder(priority.PluginName, priority.New)
	options.ServerOpts = options.NewServerOption()
	defer framework.CleanupPluginBuilders()

	// Running pods
	w1 := util.BuildPod("ns1", "worker-1", "", apiv1.PodRunning, util.BuildResourceList("3", "3k"), "pg1", map[string]string{"role": "worker"}, map[string]string{"selector": "worker"})
	w2 := util.BuildPod("ns1", "worker-2", "", apiv1.PodRunning, util.BuildResourceList("5", "5k"), "pg1", map[string]string{"role": "worker"}, map[string]string{})
	w3 := util.BuildPod("ns1", "worker-3", "", apiv1.PodRunning, util.BuildResourceList("4", "4k"), "pg2", map[string]string{"role": "worker"}, map[string]string{})
	w1.Spec.Affinity = getWorkerAffinity()
	w2.Spec.Affinity = getWorkerAffinity()
	w3.Spec.Affinity = getWorkerAffinity()

	// nodes
	n1 := util.BuildNode("node1", util.BuildResourceList("4", "4k"), map[string]string{"selector": "worker"})
	n2 := util.BuildNode("node2", util.BuildResourceList("3", "3k"), map[string]string{})
	n1.Status.Allocatable["pods"] = resource.MustParse("15")
	n2.Status.Allocatable["pods"] = resource.MustParse("15")
	n1.Labels["kubernetes.io/hostname"] = "node1"
	n2.Labels["kubernetes.io/hostname"] = "node2"

	// priority
	p1 := &schedulingv1.PriorityClass{ObjectMeta: metav1.ObjectMeta{Name: "p1"}, Value: 1}
	p2 := &schedulingv1.PriorityClass{ObjectMeta: metav1.ObjectMeta{Name: "p2"}, Value: 2}
	// podgroup
	pg1 := &schedulingv1beta1.PodGroup{
		ObjectMeta: metav1.ObjectMeta{
			Namespace: "ns1",
			Name:      "pg1",
		},
		Spec: schedulingv1beta1.PodGroupSpec{
			Queue:             "q1",
			MinMember:         int32(2),
			PriorityClassName: p2.Name,
		},
	}
	pg2 := &schedulingv1beta1.PodGroup{
		ObjectMeta: metav1.ObjectMeta{
			Namespace: "ns1",
			Name:      "pg2",
		},
		Spec: schedulingv1beta1.PodGroupSpec{
			Queue:             "q1",
			MinMember:         int32(1),
			PriorityClassName: p1.Name,
		},
	}
	// queue
	queue1 := &schedulingv1beta1.Queue{
		ObjectMeta: metav1.ObjectMeta{
			Name: "q1",
		},
	}

	// tests
	tests := []struct {
		name     string
		pods     []*apiv1.Pod
		nodes    []*apiv1.Node
		pcs      []*schedulingv1.PriorityClass
		pgs      []*schedulingv1beta1.PodGroup
		expected map[string]string
	}{
		{
			name:  "pod-deallocate",
			pods:  []*apiv1.Pod{w1, w2, w3},
			nodes: []*apiv1.Node{n1, n2},
			pcs:   []*schedulingv1.PriorityClass{p1, p2},
			pgs:   []*schedulingv1beta1.PodGroup{pg1, pg2},
			expected: map[string]string{ // podKey -> node
				"ns1/worker-3": "node1",
			},
		},
	}

	for _, test := range tests {
		// initialize schedulerCache
		binder := &util.FakeBinder{
			Binds:   map[string]string{},
			Channel: make(chan string),
		}
		recorder := record.NewFakeRecorder(100)
		go func() {
			for {
				event := <-recorder.Events
				t.Logf("%s: [Event] %s", test.name, event)
			}
		}()
		schedulerCache := &cache.SchedulerCache{
			Nodes:           make(map[string]*api.NodeInfo),
			Jobs:            make(map[api.JobID]*api.JobInfo),
			PriorityClasses: make(map[string]*schedulingv1.PriorityClass),
			Queues:          make(map[api.QueueID]*api.QueueInfo),
			Binder:          binder,
			StatusUpdater:   &util.FakeStatusUpdater{},
			VolumeBinder:    &util.FakeVolumeBinder{},
			Recorder:        recorder,
		}

		// Here, if DeletedJobs are not modified to public resources, subsequent jobs cannot be deleted, and the problem cannot be reproduced.
		// ps: I have also tried other methods without modifying deletedJobs, such as switching to cached packages for testing, but it will introduce new problems of package conflicts. I also tried to use Cache.New, because there is no mechanism similar to fake, and the problem of failure to create the default queue will also occur. Finally, I had no choice but to use this method for testing.
		schedulerCache.DeletedJobs = workqueue.NewRateLimitingQueue(workqueue.DefaultControllerRateLimiter())

		for _, node := range test.nodes {
			schedulerCache.AddNode(node)
		}
		for _, pod := range test.pods {
			schedulerCache.AddPod(pod)
		}
		for _, pc := range test.pcs {
			schedulerCache.PriorityClasses[pc.Name] = pc
		}
		for _, pg := range test.pgs {
			pg.Status = schedulingv1beta1.PodGroupStatus{
				Phase: schedulingv1beta1.PodGroupInqueue,
			}
			schedulerCache.AddPodGroupV1beta1(pg)
		}
		schedulerCache.AddQueueV1beta1(queue1)
		// session
		trueValue := true

		num := 1
		// proportion
		go func() {
			for {
				select {
				default:
					ssn := framework.OpenSession(schedulerCache, []conf.Tier{
						{
							Plugins: []conf.PluginOption{
								{
									Name:             PluginName,
									EnabledPredicate: &trueValue,
								},
								{
									Name:                gang.PluginName,
									EnabledJobReady:     &trueValue,
									EnabledJobPipelined: &trueValue,
								},
								{
									Name:            priority.PluginName,
									EnabledJobOrder: &trueValue,
								},
							},
						},
					}, nil)

					allocator := allocate.New()
					allocator.Execute(ssn)
					framework.CloseSession(ssn)
					time.Sleep(time.Second * 3)
					if num == 1 {
						metrics := getLocalMetrics()
						if metrics == 12000 {
							t.Logf("init queue_allocated metrics is ok,%v", metrics)
						}
						schedulerCache.DeletePodGroupV1beta1(pg1)
					} else if num == 2 {
						metrics := getLocalMetrics()
						if metrics == 4000 {
							t.Logf("after delete vcjob pg1, queue_allocated metrics is ok,%v", metrics)
						}
						schedulerCache.DeletePodGroupV1beta1(pg2)
					} else {
						metrics := getLocalMetrics()
						if metrics != 0 {
							t.Errorf("after delete vcjob pg2, queue_allocated metrics is fail,%v", metrics)
							c <- false
							return
						} else {
							t.Logf("after delete vcjob pg2, queue_allocated metrics is ok,%v", metrics)
							c <- true
						}
					}
					num++
				}
			}
		}()

		go func() {
			http.Handle("/metrics", promhttp.Handler())
			err := http.ListenAndServe(":8081", nil)
			if err != nil {
				t.Errorf("ListenAndServe() err = %v", err.Error())
			}
		}()

		for {
			select {
			case res := <-c:
				if !res {
					t.Error("TestProportion failed")
				} else {
					t.Log("TestProportion successful")
				}
				return
			}

		}
	}
}

Anything else we need to know?:
Here, if DeletedJobs are not modified to public resources, subsequent jobs cannot be deleted, and the problem cannot be reproduced.
ps: I have also tried other methods without modifying deletedJobs, such as switching to cached packages for testing, but it will introduce new problems of package conflicts. I also tried to use Cache.New, because there is no mechanism similar to fake, and the problem of failure to create the default queue will also occur. Finally, I had no choice but to use this method for testing.
Environment:

Volcano Version:
master:5cfe62d01d27a5b6482ae6a9458b9451b7e5bc3e
Kubernetes version (use kubectl version):
v1.18.2

@volcano-sh-bot
Copy link
Contributor

Welcome @LY-today!

It looks like this is your first PR to volcano-sh/volcano 馃帀.

Thank you, and welcome to Volcano. 😃

@volcano-sh-bot volcano-sh-bot added the size/L Denotes a PR that changes 100-499 lines, ignoring generated files. label Jun 13, 2022
@LY-today LY-today force-pushed the proportion-metric-accuracy branch 2 times, most recently from 88f6ccb to 33c6d5e Compare June 13, 2022 09:47
@LY-today
Copy link
Contributor Author

/assign @shinytang6

@LY-today
Copy link
Contributor Author

/assign @shinytang6

@mangk
Copy link

mangk commented Jun 13, 2022

I also encountered this problem. The deletedJobs of the cache in volcano seems to be not very friendly to testing. hope to see changes in the future. Seeing your pr, I finally know why my prometheus is inaccurate, adopted locally

@Thor-wl
Copy link
Contributor

Thor-wl commented Jun 14, 2022

@LY-today Hey, thanks for your contribution! I've reivewed the description and the fix, and there are some questions. May I know that why

Here, if DeletedJobs are not modified to public resources, subsequent jobs cannot be deleted, and the problem cannot be reproduced.

Hey, @LY-today Thanks for your contribution! I've reviewed the scenario you mentioned and the fix. May I know why it is hard to reproduced if not publish the api?

@Thor-wl
Copy link
Contributor

Thor-wl commented Jun 14, 2022

Besides, pls rebase the latest master code to resolve the CI Code Verify.

@LY-today
Copy link
Contributor Author

@LY-today Hey, thanks for your contribution! I've reivewed the description and the fix, and there are some questions. May I know that why

Here, if DeletedJobs are not modified to public resources, subsequent jobs cannot be deleted, and the problem cannot be reproduced.

Hey, @LY-today Thanks for your contribution! I've reviewed the scenario you mentioned and the fix. May I know why it is hard to reproduced if not publish the api?
@Thor-wl
To keep reproducibility and precision to a minimum, I try not to modify deleted jobs. Example: 1. Cached packages test this problem, the result is a package conflict. 2. The schedulerCache is created in the Cache.New method, but the default queue creation fails. ps: The above problems are all due to the reproduction of the scene, and the vcjobs under the queue need to be deleted to 0 to appear. However, the deletion operation depends on the initialization of deletedJobs. However, the initialization of deletedJobs is not very friendly, and encountered many problems with the previous design, so finally modified deletedJobs to be public. I don't know if I explained it clearly

@LY-today
Copy link
Contributor Author

Besides, pls rebase the latest master code to resolve the CI Code Verify.

Thanks for the tip,done

@LY-today LY-today force-pushed the proportion-metric-accuracy branch from 75b19e1 to 949eaa6 Compare June 14, 2022 06:41
@volcano-sh-bot volcano-sh-bot added size/XXL Denotes a PR that changes 1000+ lines, ignoring generated files. size/L Denotes a PR that changes 100-499 lines, ignoring generated files. and removed size/L Denotes a PR that changes 100-499 lines, ignoring generated files. size/XXL Denotes a PR that changes 1000+ lines, ignoring generated files. labels Jun 14, 2022
@LY-today LY-today force-pushed the proportion-metric-accuracy branch 2 times, most recently from d369118 to 70ba674 Compare June 14, 2022 09:54
@LY-today
Copy link
Contributor Author

LY-today commented Jun 16, 2022

@Thor-wl hi,The tide process seems to be stuck, what information do I need to add to move the process forward?

@LY-today
Copy link
Contributor Author

@Thor-wl hi,The tide process seems to be stuck, what information do I need to add to move the process forward?

@LY-today
Copy link
Contributor Author

/assign @shinytang6

@hwdef
Copy link
Member

hwdef commented Jun 21, 2022

Please squash your commit.

@Thor-wl
Copy link
Contributor

Thor-wl commented Jun 21, 2022

@Thor-wl hi,The tide process seems to be stuck, what information do I need to add to move the process forward?

Sorry for the late reply. I'll finish the review ASAP.

@Thor-wl
Copy link
Contributor

Thor-wl commented Jun 21, 2022

/lgtm

@volcano-sh-bot volcano-sh-bot added the lgtm Indicates that a PR is ready to be merged. label Jun 21, 2022
@LY-today
Copy link
Contributor Author

@Thor-wl hi,The tide process seems to be stuck, what information do I need to add to move the process forward?

Sorry for the late reply. I'll finish the review ASAP.

That's fine, but I'm having an unexpected problem. I want to merge the previous commits through rebase, but some of your commits have caused conflicts in my local rebase.

@Thor-wl
Copy link
Contributor

Thor-wl commented Jun 21, 2022

@Thor-wl hi,The tide process seems to be stuck, what information do I need to add to move the process forward?

Sorry for the late reply. I'll finish the review ASAP.

That's fine, but I'm having an unexpected problem. I want to merge the previous commits through rebase, but some of your commits have caused conflicts in my local rebase.

Perhaps you can rebase all your commits first, and then execute git pull --rebase origin master to merge with the latest code and resolve the conflicts.

Signed-off-by: LY-today <724102053@qq.com>
@LY-today LY-today force-pushed the proportion-metric-accuracy branch from 70ba674 to 428ee73 Compare June 21, 2022 09:45
@volcano-sh-bot volcano-sh-bot removed the lgtm Indicates that a PR is ready to be merged. label Jun 21, 2022
@LY-today
Copy link
Contributor Author

@Thor-wl hi,The tide process seems to be stuck, what information do I need to add to move the process forward?

Sorry for the late reply. I'll finish the review ASAP.

That's fine, but I'm having an unexpected problem. I want to merge the previous commits through rebase, but some of your commits have caused conflicts in my local rebase.

Perhaps you can rebase all your commits first, and then execute git pull --rebase origin master to merge with the latest code and resolve the conflicts.

Thanks for the tip, I have rebase all the commits and push them again. After the test is passed, you can rejoin /lgtm

@LY-today
Copy link
Contributor Author

@Thor-wl您好,潮汐流程似乎卡住了,我需要添加哪些信息才能推动流程前进?

这么晚才回复很抱歉。我会尽快完成审查。

没关系,但我遇到了一个意想不到的问题。我想通过 rebase 合并之前的提交,但是你的一些提交在我的本地 rebase 中引起了冲突。

或许您可以先对所有提交进行 rebase,然后执行git pull --rebase origin master以与最新代码合并并解决冲突。

感谢您的提示,我已重新设置所有提交并再次推送它们。测试通过后可以重新加入/lgtm

@Thor-wl Looks like the test is done
@hwdef done

@Thor-wl
Copy link
Contributor

Thor-wl commented Jun 22, 2022

/lgtm

@volcano-sh-bot volcano-sh-bot added the lgtm Indicates that a PR is ready to be merged. label Jun 22, 2022
@LY-today
Copy link
Contributor Author

LY-today commented Jun 23, 2022

@Thor-wl hi, I would like to know, do I still need to make some changes?

@Thor-wl
Copy link
Contributor

Thor-wl commented Jun 24, 2022

@Thor-wl hi, I would like to know, do I still need to make some changes?

It's generally ok to me. And I've marked lgtm. Looking forward to other reviewers' adivce.

@LY-today
Copy link
Contributor Author

@Thor-wl hi, I would like to know, do I still need to make some changes?

It's generally ok to me. And I've marked lgtm. Looking forward to other reviewers' adivce.

Understood, thanks

@LY-today
Copy link
Contributor Author

LY-today commented Jun 24, 2022

@alcorj-mizar @zen-xu @hwdef If you have time, please take a look at this pr. If there are deficiencies, I can improve it as soon as possible. If there is no problem, can we proceed to the next step? Because this problem is affecting our resource monitoring system, thank you three reviewers.

@Thor-wl Thor-wl requested review from shinytang6, qiankunli, Thor-wl and william-wang and removed request for zen-xu and alcorj-mizar June 24, 2022 02:12
Copy link
Member

@william-wang william-wang left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

/approve

@volcano-sh-bot
Copy link
Contributor

[APPROVALNOTIFIER] This PR is APPROVED

This pull-request has been approved by: william-wang

The full list of commands accepted by this bot can be found here.

The pull request process is described here

Needs approval from an approver in each of these files:

Approvers can indicate their approval by writing /approve in a comment
Approvers can cancel approval by writing /approve cancel in a comment

@volcano-sh-bot volcano-sh-bot added the approved Indicates a PR has been approved by an approver from all required OWNERS files. label Jun 24, 2022
@volcano-sh-bot volcano-sh-bot merged commit 40b7fa7 into volcano-sh:master Jun 24, 2022
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
approved Indicates a PR has been approved by an approver from all required OWNERS files. lgtm Indicates that a PR is ready to be merged. size/L Denotes a PR that changes 100-499 lines, ignoring generated files.
Projects
None yet
Development

Successfully merging this pull request may close these issues.

7 participants