Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add network-topology-aware plugin and hyperNode score callback #3894

Open
wants to merge 1 commit into
base: network-topology
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 17 additions & 20 deletions pkg/scheduler/actions/allocate/allocate.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@
package allocate

import (
"sort"
"time"

"k8s.io/klog/v2"
Expand All @@ -34,7 +33,6 @@ type Action struct {
session *framework.Session
// configured flag for error cache
enablePredicateErrorCache bool
hyperNodesTiers []int

// hyperNodeScoresByJob stores job total score for all available hyperNodes, this is used for accumulate
// all nodes' scores in each available hyperNode only when job has hard network topology constrains
Expand All @@ -45,7 +43,6 @@ type Action struct {
func New() *Action {
return &Action{
enablePredicateErrorCache: true, // default to enable it
hyperNodesTiers: []int{},
hyperNodeScoresByJob: make(map[string]map[string]float64),
}
}
Expand All @@ -61,26 +58,11 @@ func (alloc *Action) parseArguments(ssn *framework.Session) {
arguments.GetBool(&alloc.enablePredicateErrorCache, conf.EnablePredicateErrCacheKey)
}

func (alloc *Action) parseHyperNodesTiers(ssn *framework.Session) {
if ssn.HyperNodesSetByTier == nil || len(ssn.HyperNodesSetByTier) == 0 {
return
}

// sort to guarantee the traverse order is from down to top.
var tiers []int
for tier := range ssn.HyperNodesSetByTier {
tiers = append(tiers, tier)
}
sort.Ints(tiers)
alloc.hyperNodesTiers = tiers
}

func (alloc *Action) Execute(ssn *framework.Session) {
klog.V(5).Infof("Enter Allocate ...")
defer klog.V(5).Infof("Leaving Allocate ...")

alloc.parseArguments(ssn)
alloc.parseHyperNodesTiers(ssn)

// the allocation for pod may have many stages
// 1. pick a queue named Q (using ssn.QueueOrderFn)
Expand Down Expand Up @@ -241,7 +223,7 @@ func (alloc *Action) allocateResourceForTasksWithTopology(tasks *util.PriorityQu
jobAllocatedHyperNode := job.PodGroup.Annotations[api.JobAllocatedHyperNode]

// Find a suitable hyperNode in one tier from down to top everytime to ensure that the selected hyperNode spans the least tier.
for _, tier := range alloc.hyperNodesTiers {
for _, tier := range ssn.HyperNodesTiers {
if tier > highestAllowedTier {
klog.V(4).ErrorS(nil, "Skip search for higher tier cause highest allowed tier reached", "jobName", job.UID, "highestAllowedTier", highestAllowedTier, "tier", tier)
break
Expand Down Expand Up @@ -375,6 +357,8 @@ func (alloc *Action) allocateResourcesForTasks(tasks *util.PriorityQueue, job *a
ssn := alloc.session
stmt := framework.NewStatement(ssn)
ph := util.NewPredicateHelper()
jobAllocatedHyperNode := job.PodGroup.Annotations[api.JobAllocatedHyperNode]
jobAllocatedNewHyperNode := jobAllocatedHyperNode

for !tasks.Empty() {
task := tasks.Pop().(*api.TaskInfo)
Expand Down Expand Up @@ -414,11 +398,21 @@ func (alloc *Action) allocateResourcesForTasks(tasks *util.PriorityQueue, job *a
}
}

task.JobAllocatedHyperNode = jobAllocatedNewHyperNode
bestNode, highestScore := alloc.prioritizeNodes(ssn, task, predicateNodes)
if bestNode == nil {
continue
}

if hyperNode == "" {
Copy link
Preview

Copilot AI Mar 4, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The variable 'hyperNode' is used without being declared or initialized. Please declare it before this block, for example initializing it as an empty string.

Copilot is powered by AI, so mistakes are possible. Review output carefully before use.

Positive Feedback
Negative Feedback

Provide additional feedback

Please help us improve GitHub Copilot by sharing more details about this comment.

Please select one or more of the options
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The 'hyperNode' is a function parameter.

hyperNode = util.FindHyperNodeOfNode(bestNode.Name, ssn.RealNodesList, ssn.HyperNodesTiers, ssn.HyperNodesSetByTier)
if hyperNode != "" {
if jobAllocatedNewHyperNode == "" {
jobAllocatedNewHyperNode = hyperNode
} else {
jobAllocatedNewHyperNode = ssn.HyperNodes.GetLCAHyperNode(hyperNode, jobAllocatedNewHyperNode)
}
}
}
alloc.sumNodeScoresInHyperNode(string(job.UID), hyperNode, highestScore)
alloc.allocateResourcesForTask(stmt, task, bestNode, job)

Expand All @@ -429,6 +423,9 @@ func (alloc *Action) allocateResourcesForTasks(tasks *util.PriorityQueue, job *a

if ssn.JobReady(job) {
klog.V(3).InfoS("Job ready, return statement", "jobName", job.UID)
if jobAllocatedNewHyperNode != "" && jobAllocatedNewHyperNode != jobAllocatedHyperNode {
job.PodGroup.GetAnnotations()[api.JobAllocatedHyperNode] = jobAllocatedNewHyperNode
}
return stmt
} else {
if !ssn.JobPipelined(job) {
Expand Down
Loading
Loading