10 changes: 10 additions & 0 deletions Makefile
@@ -175,6 +175,16 @@ test-e2e: kustomize manifests fmt vet envtest ginkgo kind-image-build
test-e2e-cert-manager: kustomize manifests fmt vet envtest ginkgo kind-image-build
USE_CERT_MANAGER=true CERT_MANAGER_VERSION=$(CERT_MANAGER_VERSION) E2E_KIND_VERSION=$(E2E_KIND_VERSION) KIND_CLUSTER_NAME=$(KIND_CLUSTER_NAME) KIND=$(KIND) KUBECTL=$(KUBECTL) KUSTOMIZE=$(KUSTOMIZE) GINKGO=$(GINKGO) USE_EXISTING_CLUSTER=$(USE_EXISTING_CLUSTER) IMAGE_TAG=$(IMG) ARTIFACTS=$(ARTIFACTS) ./hack/e2e-test.sh

# Gang scheduling E2E tests with different schedulers
VOLCANO_VERSION ?= v1.12.1

.PHONY: test-e2e-gang-scheduling
test-e2e-gang-scheduling: test-e2e-gang-scheduling-volcano ## Run all gang scheduling E2E tests

.PHONY: test-e2e-gang-scheduling-volcano
test-e2e-gang-scheduling-volcano: kustomize manifests fmt vet envtest ginkgo kind-image-build ## Run gang scheduling E2E tests with Volcano
SCHEDULER_PROVIDER=volcano VOLCANO_VERSION=$(VOLCANO_VERSION) E2E_KIND_VERSION=$(E2E_KIND_VERSION) KIND_CLUSTER_NAME=$(KIND_CLUSTER_NAME) KIND=$(KIND) KUBECTL=$(KUBECTL) KUSTOMIZE=$(KUSTOMIZE) GINKGO=$(GINKGO) USE_EXISTING_CLUSTER=$(USE_EXISTING_CLUSTER) IMAGE_TAG=$(IMG) ARTIFACTS=$(ARTIFACTS) ./hack/e2e-test.sh

.PHONY: lint
lint: golangci-lint ## Run golangci-lint linter & yamllint
$(GOLANGCI_LINT) run --timeout 15m0s
1 change: 1 addition & 0 deletions README.md
@@ -22,6 +22,7 @@ Read the [documentation](https://lws.sigs.k8s.io/docs/) or watch the LWS-related
- **Group of Pods as a unit:** Supports a tightly managed group of pods that represent a “super pod”
- **Unique pod identity:** Each pod in the group has a unique index from 0 to n-1.
- **Parallel creation:** Pods in the group will have the same lifecycle and be created in parallel.
- **Gang Scheduling:** Each replica, a group of pods, can be scheduled in an all-or-nothing manner (Alpha; the API may change in the future).
- **Dual-template, one for leader and one for the workers:** A replica is a group of a single leader and a set of workers, and allow to specify a template for the workers and optionally use a second one for the leader pod.
- **Multiple groups with identical specifications:** Supports creating multiple “replicas” of the above mentioned group. Each group is a single unit for rolling update, scaling, and maps to a single exclusive topology for placement.
- **A scale subresource:** A scale endpoint is exposed for HPA to dynamically scale the number replicas (aka number of groups)
9 changes: 9 additions & 0 deletions api/config/v1alpha1/configuration_types.go
@@ -34,6 +34,9 @@ type Configuration struct {
// InternalCertManagement is configuration for internalCertManagement
InternalCertManagement *InternalCertManagement `json:"internalCertManagement,omitempty"`

// GangSchedulingManagement is configuration for gang scheduling management.
GangSchedulingManagement *GangSchedulingManagement `json:"gangSchedulingManagement,omitempty"`

// ClientConnection is configuration of the client while connecting to API Server
ClientConnection *ClientConnection `json:"clientConnection,omitempty"`
}
@@ -128,3 +131,9 @@ type ClientConnection struct {
// Burst allows extra queries to accumulate when a client is exceeding its rate.
Burst *int32 `json:"burst,omitempty"`
}

// GangSchedulingManagement defines the gang scheduling management configuration.
type GangSchedulingManagement struct {
// SchedulerProvider is the name of the scheduler that provides gang-scheduling capabilities.
SchedulerProvider *string `json:"schedulerProvider,omitempty"`
}
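
Only `volcano` is wired up as a provider in this change, so a natural follow-up is to reject unknown values when the configuration is loaded. The sketch below is illustrative only; the helper name and the accepted provider set are assumptions, not code from this PR.

```go
// Sketch only: fail fast on an unsupported schedulerProvider value.
package v1alpha1

import "fmt"

func validateGangSchedulingManagement(c *Configuration) error {
	if c.GangSchedulingManagement == nil || c.GangSchedulingManagement.SchedulerProvider == nil {
		// Gang scheduling is disabled; nothing to validate.
		return nil
	}
	switch *c.GangSchedulingManagement.SchedulerProvider {
	case "volcano":
		return nil
	default:
		return fmt.Errorf("unsupported schedulerProvider %q", *c.GangSchedulingManagement.SchedulerProvider)
	}
}
```
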
25 changes: 25 additions & 0 deletions api/config/v1alpha1/zz_generated.deepcopy.go

Some generated files are not rendered by default.

43 changes: 22 additions & 21 deletions charts/lws/README.md
@@ -65,24 +65,25 @@ for more information on installing LWS with metrics using our Helm chart.

The following table lists the configurable parameters of the LWS chart and their default values.

| Parameter | Description | Default |
|---------------------------------------------|------------------------------------------------|--------------------------------------|
| `nameOverride` | nameOverride | `` |
| `fullnameOverride` | fullnameOverride | `` |
| `enablePrometheus` | enable Prometheus | `false` |
| `enableCertManager` | enable CertManager | `false` |
| `imagePullSecrets` | Image pull secrets | `[]` |
| `image.manager.repository` | Repository for manager image | `us-central1-docker.pkg.dev/k8s-staging-images/lws` |
| `image.manager.tag` | Tag for manager image | `main` |
| `image.manager.pullPolicy` | Pull policy for manager image | `IfNotPresent` |
| `podAnnotations` | Annotations for pods | `{}` |
| `podSecurityContext.runAsNonRoot` | Run pod as non-root user | `true` |
| `securityContext.allowPrivilegeEscalation` | Allow privilege escalation in security context | `false` |
| `securityContext.capabilities.drop` | Drop all capabilities in security context | `["ALL"]` |
| `service.type` | Type of lws controller service | `ClusterIP` |
| `service.port` | Lws controller service port | `9443` |
| `resources.requests.cpu` | CPU request for resources | `1` |
| `resources.requests.memory` | Memory request for resources | `1Gi` |
| `nodeSelector` | Node selector | `{}` |
| `tolerations` | Tolerations | `{}` |
| `affinity` | Affinity | `{}` |
| Parameter | Description | Default |
|--------------------------------------------|------------------------------------------------|-----------------------------------------------------|
| `nameOverride` | nameOverride | `` |
| `fullnameOverride` | fullnameOverride | `` |
| `enablePrometheus` | enable Prometheus | `false` |
| `enableCertManager` | enable CertManager | `false` |
| `imagePullSecrets` | Image pull secrets | `[]` |
| `image.manager.repository` | Repository for manager image | `us-central1-docker.pkg.dev/k8s-staging-images/lws` |
| `image.manager.tag` | Tag for manager image | `main` |
| `image.manager.pullPolicy` | Pull policy for manager image | `IfNotPresent` |
| `podAnnotations` | Annotations for pods | `{}` |
| `podSecurityContext.runAsNonRoot` | Run pod as non-root user | `true` |
| `securityContext.allowPrivilegeEscalation` | Allow privilege escalation in security context | `false` |
| `securityContext.capabilities.drop` | Drop all capabilities in security context | `["ALL"]` |
| `service.type` | Type of lws controller service | `ClusterIP` |
| `service.port` | Lws controller service port | `9443` |
| `resources.requests.cpu` | CPU request for resources | `1` |
| `resources.requests.memory` | Memory request for resources | `1Gi` |
| `nodeSelector` | Node selector | `{}` |
| `tolerations` | Tolerations | `{}` |
| `affinity` | Affinity | `{}` |
| `gangSchedulingManagement` | Configuration for gang scheduling. | `{}` |
4 changes: 4 additions & 0 deletions charts/lws/templates/manager/configmap.yaml
@@ -14,3 +14,7 @@ data:
leaderElect: true
internalCertManagement:
enable: {{ not .Values.enableCertManager }}
{{- if .Values.gangSchedulingManagement }}
gangSchedulingManagement:
{{ toYaml .Values.gangSchedulingManagement | indent 6 }}
{{- end }}
11 changes: 11 additions & 0 deletions charts/lws/templates/rbac/clusterrole.yaml
@@ -128,6 +128,17 @@ rules:
- get
- patch
- update
{{- if and .Values.gangSchedulingManagement (hasKey .Values.gangSchedulingManagement "schedulerProvider") (eq .Values.gangSchedulingManagement.schedulerProvider "volcano") }}
- apiGroups:
- scheduling.volcano.sh
resources:
- podgroups
verbs:
- create
- get
- list
- watch
{{- end }}
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
6 changes: 6 additions & 0 deletions charts/lws/values.yaml
@@ -48,3 +48,9 @@ resources:
nodeSelector: {}
tolerations: []
affinity: {}

# gangSchedulingManagement is configuration for gang scheduling management.
# For example:
# gangSchedulingManagement:
# schedulerProvider: volcano
gangSchedulingManagement: {}
25 changes: 21 additions & 4 deletions cmd/main.go
@@ -34,11 +34,14 @@ import (
"sigs.k8s.io/controller-runtime/pkg/log/zap"
"sigs.k8s.io/controller-runtime/pkg/metrics/filters"
metricsserver "sigs.k8s.io/controller-runtime/pkg/metrics/server"
volcanov1beta1 "volcano.sh/apis/pkg/apis/scheduling/v1beta1"

configapi "sigs.k8s.io/lws/api/config/v1alpha1"
leaderworkersetv1 "sigs.k8s.io/lws/api/leaderworkerset/v1"
"sigs.k8s.io/lws/pkg/cert"
"sigs.k8s.io/lws/pkg/config"
"sigs.k8s.io/lws/pkg/controllers"
"sigs.k8s.io/lws/pkg/schedulerprovider"
"sigs.k8s.io/lws/pkg/utils"
"sigs.k8s.io/lws/pkg/utils/useragent"
"sigs.k8s.io/lws/pkg/version"
@@ -57,6 +60,7 @@ func init() {

utilruntime.Must(leaderworkersetv1.AddToScheme(scheme))
utilruntime.Must(configapi.AddToScheme(scheme))
utilruntime.Must(volcanov1beta1.AddToScheme(scheme))
//+kubebuilder:scaffold:scheme
}

@@ -160,7 +164,7 @@ func main() {
// Cert won't be ready until manager starts, so start a goroutine here which
// will block until the cert is ready before setting up the controllers.
// Controllers who register after manager starts will start directly.
go setupControllers(mgr, certsReady)
go setupControllers(mgr, certsReady, cfg)

setupHealthzAndReadyzCheck(mgr)
setupLog.Info("starting manager")
@@ -171,7 +175,7 @@
}

}
func setupControllers(mgr ctrl.Manager, certsReady chan struct{}) {
func setupControllers(mgr ctrl.Manager, certsReady chan struct{}, cfg configapi.Configuration) {
// The controllers won't work until the webhooks are operating,
// and the webhook won't work until the certs are all in places.
setupLog.Info("waiting for the cert generation to complete")
@@ -186,18 +190,31 @@ func setupControllers(mgr ctrl.Manager, certsReady chan struct{}) {
setupLog.Error(err, "unable to create controller", "controller", "LeaderWorkerSet")
os.Exit(1)
}
// Set up scheduler provider
var sp schedulerprovider.SchedulerProvider
if cfg.GangSchedulingManagement != nil {
var err error
sp, err = schedulerprovider.NewSchedulerProvider(schedulerprovider.ProviderType(*cfg.GangSchedulingManagement.SchedulerProvider), mgr.GetClient())
if err != nil {
setupLog.Error(err, "unable to create scheduler provider", "provider", *cfg.GangSchedulingManagement.SchedulerProvider)
os.Exit(1)
}
setupLog.Info("Gang scheduling enabled", "provider", *cfg.GangSchedulingManagement.SchedulerProvider)
}
// Set up pod reconciler.
podController := controllers.NewPodReconciler(mgr.GetClient(), mgr.GetScheme(), mgr.GetEventRecorderFor("leaderworkerset"))
podController := controllers.NewPodReconciler(mgr.GetClient(), mgr.GetScheme(), mgr.GetEventRecorderFor("leaderworkerset"), sp)
if err := podController.SetupWithManager(mgr); err != nil {
setupLog.Error(err, "unable to create controller", "controller", "Pod")
os.Exit(1)
}
// Set up webhooks
if os.Getenv("ENABLE_WEBHOOKS") != "false" {
if err := webhooks.SetupLeaderWorkerSetWebhook(mgr); err != nil {
setupLog.Error(err, "unable to create leaderworkerset webhook", "webhook", "LeaderWorkerSet")
os.Exit(1)
}
if err := webhooks.SetupPodWebhook(mgr); err != nil {
pw := webhooks.NewPodWebhook(sp)
if err := pw.Setup(mgr); err != nil {
setupLog.Error(err, "unable to create pod webhook", "webhook", "LeaderWorkerSet")
os.Exit(1)
}
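
The `pkg/schedulerprovider` package referenced above is not part of this diff, so its exact API is unknown here. The following is a minimal sketch of the shape it plausibly exposes, inferred only from how `cmd/main.go` calls it; every name and method below is an assumption, not the actual implementation.

```go
// Sketch only: an assumed shape for pkg/schedulerprovider, inferred from its
// callers in cmd/main.go. The real package may differ substantially.
package schedulerprovider

import (
	"context"
	"fmt"

	corev1 "k8s.io/api/core/v1"
	"sigs.k8s.io/controller-runtime/pkg/client"
)

// ProviderType names a gang-scheduling backend ("volcano" is the only one in this PR).
type ProviderType string

const VolcanoProviderType ProviderType = "volcano"

// SchedulerProvider is consumed by the pod reconciler and the pod webhook.
// The single method here is illustrative, not the actual interface.
type SchedulerProvider interface {
	// CreatePodGroupIfNotExists would ensure the scheduler-specific group
	// object (a Volcano PodGroup) exists for the replica led by leaderPod.
	CreatePodGroupIfNotExists(ctx context.Context, leaderPod *corev1.Pod) error
}

// volcanoProvider is a stub used only to make this sketch self-contained.
type volcanoProvider struct {
	client client.Client
}

func (v *volcanoProvider) CreatePodGroupIfNotExists(ctx context.Context, leaderPod *corev1.Pod) error {
	// A real implementation would build a scheduling.volcano.sh/v1beta1
	// PodGroup from the LeaderWorkerSet replica and create it if absent.
	return nil
}

// NewSchedulerProvider mirrors the constructor called from cmd/main.go.
func NewSchedulerProvider(p ProviderType, c client.Client) (SchedulerProvider, error) {
	switch p {
	case VolcanoProviderType:
		return &volcanoProvider{client: c}, nil
	default:
		return nil, fmt.Errorf("unknown scheduler provider: %q", p)
	}
}
```
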
79 changes: 79 additions & 0 deletions docs/examples/sample/gang-scheduling/README.md
@@ -0,0 +1,79 @@
# Enabling Gang Scheduling with Different Schedulers

This document describes how to enable gang scheduling capabilities in LeaderWorkerSet with different schedulers.

## Supported Schedulers

| Custom Scheduler Community | GitHub ID | Email |
| -------------------------- |-----------------|---------------------------------|
| Volcano (`@volcano-sh`) | `@JesseStutler` | `jesseincomparable@hotmail.com` |
| Your-Scheduler-Here | `@your-id` | `your@email.com` |

We welcome integrations with more custom schedulers that support gang scheduling capabilities. To get started:

1. Open an issue on the [LeaderWorkerSet GitHub repository](https://github.com/kubernetes-sigs/lws/issues) to discuss your proposal, tagging the LWS maintainers.
2. In the same pull request that adds the scheduler integration, update the maintainer matrix above with your contact information. This lets us reach the maintainer when the custom scheduler dependencies need to be upgraded.

## Using Volcano for Gang Scheduling

### Installation

To enable gang scheduling, you must configure gang scheduling management and specify `volcano` as the provider when installing the LeaderWorkerSet controller.

- **With Helm:**

Refer to the [install-by-helm](https://lws.sigs.k8s.io/docs/installation/#install-by-helm) guide. When installing, add the following flag to your `helm install` command:
```sh
helm install lws oci://ghcr.io/kubernetes-sigs/lws-charts/lws \
--set gangSchedulingManagement.schedulerProvider=volcano
```

- **With kubectl:**

Refer to the [install-by-kubectl](https://lws.sigs.k8s.io/docs/installation/#install-by-kubectl) guide. Gang scheduling is **disabled by default**. To enable it, you need to:

1. **Update the configuration ConfigMap** to enable gang scheduling settings:
```yaml
apiVersion: v1
kind: ConfigMap
metadata:
  name: lws-manager-config
  namespace: lws-system
data:
  controller_manager_config.yaml: |
    apiVersion: config.lws.x-k8s.io/v1alpha1
    kind: Configuration
    leaderElection:
      leaderElect: true
    internalCertManagement:
      enable: true
    # Add gang scheduling configuration
    gangSchedulingManagement:
      schedulerProvider: volcano
```

2. **Restart the lws-controller-manager** to apply the new configuration:
```sh
kubectl rollout restart deployment/lws-controller-manager -n lws-system
```

Alternatively, you can delete the pod directly to trigger a restart:
```sh
kubectl delete pod -l control-plane=controller-manager -n lws-system
```

### Startup Policy Differences

The `startupPolicy` in your LeaderWorkerSet spec determines the gang scheduling behavior by setting the `minMember` of the auto-generated `PodGroup` (see the sketch after the list below).

- **`LeaderCreated` Policy (Default):** `minMember` is set to the full replica size (1 leader + (size-1) workers). All pods in the replica must be scheduled together.
- **`LeaderReady` Policy:** `minMember` is set to 1, so the leader pod can be scheduled immediately and is not blocked by the gang scheduler.
However, the `minResources` required by the `PodGroup` is still calculated based on the entire group.
This ensures that resources for all pods are reserved before the leader starts, guaranteeing that the workers can eventually be scheduled.
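
As a rough illustration of the rule above, the `minMember` choice can be expressed as a small function. This is a sketch only, using a hypothetical helper name; it is not the actual LWS implementation.

```go
// Sketch only: derive the PodGroup minMember from the startup policy.
func minMemberFor(startupPolicy string, size int32) int32 {
	if startupPolicy == "LeaderReady" {
		// Only the leader is gated by the gang; minResources still
		// covers the entire group so the workers can follow.
		return 1
	}
	// LeaderCreated (default): the whole replica (leader + workers) is gated.
	return size
}
```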

### Specifying a Custom Queue

To assign all groups of a LeaderWorkerSet to a specific Volcano queue, add the `scheduling.volcano.sh/queue-name` annotation to the LeaderWorkerSet metadata.
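
Conceptually, the provider only needs to read this annotation when it builds the `PodGroup`. A hedged sketch of that lookup, with an assumed helper name, is shown below.

```go
// Sketch only: read the Volcano queue name from the LeaderWorkerSet annotations.
const queueNameAnnotation = "scheduling.volcano.sh/queue-name"

func queueFromAnnotations(annotations map[string]string) string {
	// An empty result leaves Volcano's default queue in effect.
	return annotations[queueNameAnnotation]
}
```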

For a complete example, please refer to the [lws-sample-volcano.yaml](./lws-sample-volcano.yaml) file.

23 changes: 23 additions & 0 deletions docs/examples/sample/gang-scheduling/lws-sample-volcano.yaml
@@ -0,0 +1,23 @@
apiVersion: leaderworkerset.x-k8s.io/v1
kind: LeaderWorkerSet
metadata:
  # annotations:
  #   scheduling.volcano.sh/queue-name: lws-queue # Uncomment the annotation to set the queue name if needed
  name: lws-sample-volcano
spec:
  replicas: 2
  leaderWorkerTemplate:
    size: 4
    workerTemplate:
      spec:
        schedulerName: volcano # The scheduler name must be set to volcano
        containers:
          - name: nginx
            image: nginxinc/nginx-unprivileged:1.27
            resources:
              limits:
                cpu: "100m"
              requests:
                cpu: "50m"
            ports:
              - containerPort: 8080