generated from kubernetes/kubernetes-template-project
Commit 64e4c75 (1 parent: 4151402)
Showing 3 changed files with 199 additions and 0 deletions.
@@ -0,0 +1,181 @@
/*
Copyright 2023 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package controllers

import (
    "context"
    "fmt"

    batchv1 "k8s.io/api/batch/v1"
    corev1 "k8s.io/api/core/v1"
    metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    "k8s.io/apimachinery/pkg/runtime"
    "k8s.io/apimachinery/pkg/types"
    "k8s.io/client-go/tools/record"
    "k8s.io/klog/v2"
    ctrl "sigs.k8s.io/controller-runtime"
    "sigs.k8s.io/controller-runtime/pkg/client"
    "sigs.k8s.io/jobset/pkg/util/collections"

    jobset "sigs.k8s.io/jobset/api/jobset/v1alpha2"
)

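// podOwnerKey is the field index key used to look up Pods by the name of the
// Job that controls them; nodePoolKey is the GKE node label identifying which
// node pool a node belongs to.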
const (
    podOwnerKey = ".metadata.controller"
    nodePoolKey = "cloud.google.com/gke-nodepool"
)

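// exclusiveGateConditionType identifies the exclusive-placement gate that is
// removed from follower pods once their leader pod has been scheduled.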
var exclusiveGateConditionType = corev1.PodConditionType(jobset.ExclusiveGate)

// PodReconciler reconciles Pod objects, assigning the follower pods of a Job
// to the same node pool as their leader pod.
type PodReconciler struct {
    client.Client
    Scheme *runtime.Scheme
    Record record.EventRecorder
}

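// NewPodReconciler returns a PodReconciler that uses the given client, scheme, and event recorder.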
func NewPodReconciler(client client.Client, scheme *runtime.Scheme, record record.EventRecorder) *PodReconciler {
    return &PodReconciler{Client: client, Scheme: scheme, Record: record}
}

// +kubebuilder:rbac:groups="",resources=events,verbs=create;watch;update;patch
// +kubebuilder:rbac:groups=core,resources=pods,verbs=get;list;watch;create;update;patch;delete
// +kubebuilder:rbac:groups=core,resources=nodes,verbs=get;list;watch;create;update;patch;delete
// Reconcile is part of the main kubernetes reconciliation loop which aims to
// move the current state of the cluster closer to the desired state.
func (r *PodReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
    // Get Pod from apiserver.
    var pod corev1.Pod
    if err := r.Get(ctx, req.NamespacedName, &pod); err != nil {
        // We'll ignore not-found errors, since there is nothing we can do here.
        return ctrl.Result{}, client.IgnoreNotFound(err)
    }

    // TODO: delete this logging once we have confirmed it works, as it will be too noisy.
    log := ctrl.LoggerFrom(ctx).WithValues("pod", klog.KObj(&pod))
    ctx = ctrl.LoggerInto(ctx, log)
    log.V(2).Info("Reconciling Pod")

    // Check if this is the "leader" pod (completion index 0) and if it has been scheduled.
    if pod.Annotations[batchv1.JobCompletionIndexAnnotation] == "0" && pod.Spec.NodeName != "" {
        nodePool, err := r.nodePoolFromNode(ctx, pod.Spec.NodeName, pod.Namespace)
        if err != nil {
            return ctrl.Result{}, err
        }
        if err := r.assignGatedPodsToNodePool(ctx, &pod, nodePool); err != nil {
            return ctrl.Result{}, err
        }
    }
    return ctrl.Result{}, nil
}

// SetupWithManager sets up the controller with the Manager.
func (r *PodReconciler) SetupWithManager(mgr ctrl.Manager) error {
    return ctrl.NewControllerManagedBy(mgr).
        For(&corev1.Pod{}).
        Complete(r)
}

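// SetupPodReconcilerIndexes registers a field index so that Pods can be listed
// by the name of the Job that controls them.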
func SetupPodReconcilerIndexes(ctx context.Context, indexer client.FieldIndexer) error {
    return indexer.IndexField(ctx, &corev1.Pod{}, podOwnerKey, func(obj client.Object) []string {
        o := obj.(*corev1.Pod)
        owner := metav1.GetControllerOf(o)
        if owner == nil {
            return nil
        }
        // Make sure it's a Pod owned by a Job.
        if owner.Kind != "Job" || owner.APIVersion != batchv1.SchemeGroupVersion.String() {
            return nil
        }
        return []string{owner.Name}
    })
}

// nodePoolFromNode gets the node pool name that the given node is part of.
func (r *PodReconciler) nodePoolFromNode(ctx context.Context, nodeName, ns string) (string, error) {
    log := ctrl.LoggerFrom(ctx)

    var node corev1.Node
    if err := r.Get(ctx, types.NamespacedName{Name: nodeName, Namespace: ns}, &node); err != nil {
        // We'll ignore not-found errors, since there is nothing we can do here.
        log.Error(err, fmt.Sprintf("getting node %s", nodeName))
        return "", client.IgnoreNotFound(err)
    }
    nodePool, exists := node.Labels[nodePoolKey]
    if !exists {
        log.V(2).Error(nil, fmt.Sprintf("missing node pool label: %s", nodePoolKey))
    }
    return nodePool, nil
}

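// assignGatedPodsToNodePool pins every follower pod owned by the leader pod's
// Job to the given node pool and removes its exclusive-placement gate.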
func (r *PodReconciler) assignGatedPodsToNodePool(ctx context.Context, leaderPod *corev1.Pod, nodePool string) error {
    // Get name of job that owns the leader pod.
    jobName, err := jobNameFromPod(leaderPod)
    if err != nil {
        return err
    }

    // Get all pods owned by this job.
    var podList corev1.PodList
    if err := r.List(ctx, &podList, client.InNamespace(leaderPod.Namespace), client.MatchingFields{podOwnerKey: jobName}); err != nil {
        return err
    }

    // For every pod besides pod index 0, add a node selector for this node pool and remove the scheduling gate.
    for _, pod := range podList.Items {
        if pod.Annotations[batchv1.JobCompletionIndexAnnotation] == "0" {
            continue
        }
        addNodePoolNodeSelector(&pod, nodePool)
        removeSchedulingGate(&pod)
        // Persist the changes; without an update they only exist on the local copy.
        if err := r.Update(ctx, &pod); err != nil {
            return err
        }
    }
    return nil
}

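// jobNameFromPod returns the name of the Job that owns the given pod, based on
// its owner references.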
func jobNameFromPod(pod *corev1.Pod) (string, error) {
    var jobName string
    for _, ownerReference := range pod.OwnerReferences {
        if ownerReference.Kind == "Job" {
            jobName = ownerReference.Name
        }
    }
    if jobName == "" {
        return "", fmt.Errorf("unable to find name of job owning pod %s", pod.Name)
    }
    return jobName, nil
}

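// addNodePoolNodeSelector constrains the pod to nodes in the given node pool via a node selector.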
func addNodePoolNodeSelector(pod *corev1.Pod, nodePool string) {
    // Add node selector for node pool label.
    if pod.Spec.NodeSelector == nil {
        pod.Spec.NodeSelector = make(map[string]string)
    }
    pod.Spec.NodeSelector[nodePoolKey] = nodePool
}

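// removeSchedulingGate removes the exclusive-placement gate from the pod, if present.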
func removeSchedulingGate(pod *corev1.Pod) {
    // Remove scheduling gate.
    removeIdx := -1
    for i, gate := range pod.Spec.ReadinessGates {
        if gate.ConditionType == exclusiveGateConditionType {
            removeIdx = i
            break
        }
    }
    // If the gate has already been removed, there is nothing to do.
    if removeIdx == -1 {
        return
    }
    pod.Spec.ReadinessGates = collections.Remove(pod.Spec.ReadinessGates, removeIdx)
}
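
For context, here is a minimal sketch of how this reconciler could be wired into a controller-runtime manager. It is not part of this commit: the import path sigs.k8s.io/jobset/pkg/controllers, the "pod-controller" recorder name, and the bare os.Exit error handling are illustrative assumptions, and only standard controller-runtime and client-go calls are used.

package main

import (
    "context"
    "os"

    "k8s.io/apimachinery/pkg/runtime"
    clientgoscheme "k8s.io/client-go/kubernetes/scheme"
    ctrl "sigs.k8s.io/controller-runtime"

    // Assumed import path for the controllers package shown above.
    "sigs.k8s.io/jobset/pkg/controllers"
)

func main() {
    // Register the core/batch types the reconciler reads and writes.
    scheme := runtime.NewScheme()
    if err := clientgoscheme.AddToScheme(scheme); err != nil {
        os.Exit(1)
    }

    mgr, err := ctrl.NewManager(ctrl.GetConfigOrDie(), ctrl.Options{Scheme: scheme})
    if err != nil {
        os.Exit(1)
    }

    // Register the ".metadata.controller" field index before starting the manager,
    // so List calls using MatchingFields{podOwnerKey: jobName} are served from the cache.
    if err := controllers.SetupPodReconcilerIndexes(context.Background(), mgr.GetFieldIndexer()); err != nil {
        os.Exit(1)
    }

    podReconciler := controllers.NewPodReconciler(mgr.GetClient(), mgr.GetScheme(), mgr.GetEventRecorderFor("pod-controller"))
    if err := podReconciler.SetupWithManager(mgr); err != nil {
        os.Exit(1)
    }

    if err := mgr.Start(ctrl.SetupSignalHandler()); err != nil {
        os.Exit(1)
    }
}

The field indexes must be registered before the manager starts, since the reconciler's List call in assignGatedPodsToNodePool relies on the podOwnerKey index being available in the cache.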