Skip to content

Commit 2e8f532

Browse files
[RayService][Bug] Serve Service May Select Pods That Are Actually Unready for Serving Traffic (#1856)
1 parent 0216b33 commit 2e8f532

File tree

3 files changed

+23
-13
lines changed

3 files changed

+23
-13
lines changed

ray-operator/controllers/ray/common/pod.go

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -293,12 +293,17 @@ func initLivenessAndReadinessProbe(rayContainer *corev1.Container, rayNodeType r
293293

294294
// BuildPod builds a pod config
295295
func BuildPod(podTemplateSpec corev1.PodTemplateSpec, rayNodeType rayv1.RayNodeType, rayStartParams map[string]string, headPort string, enableRayAutoscaler *bool, creator string, fqdnRayIP string, enableServeService bool) (aPod corev1.Pod) {
296+
// For Worker Pod: Traffic readiness is determined by the readiness probe.
297+
// Therefore, the RayClusterServingServiceLabelKey label is not utilized and should always be set to true.
298+
// For Head Pod: Traffic readiness is determined by the value of the RayClusterServingServiceLabelKey label.
299+
// Initially, set the label to false and let the RayService controller manage its value.
296300
if enableServeService {
297-
// TODO (kevin85421): In the current RayService implementation, we only add this label to a Pod after
298-
// it passes the health check. The other option is to use the readiness probe to control it. This
299-
// logic always add the label to the Pod no matter whether it is ready or not.
300301
podTemplateSpec.Labels[utils.RayClusterServingServiceLabelKey] = utils.EnableRayClusterServingServiceTrue
302+
if rayNodeType == rayv1.HeadNode {
303+
podTemplateSpec.Labels[utils.RayClusterServingServiceLabelKey] = utils.EnableRayClusterServingServiceFalse
304+
}
301305
}
306+
302307
pod := corev1.Pod{
303308
TypeMeta: metav1.TypeMeta{
304309
APIVersion: "v1",

ray-operator/controllers/ray/common/pod_test.go

Lines changed: 10 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -333,7 +333,11 @@ func TestBuildPod(t *testing.T) {
333333
// Test head pod
334334
podName := strings.ToLower(cluster.Name + utils.DashSymbol + string(rayv1.HeadNode) + utils.DashSymbol + utils.FormatInt32(0))
335335
podTemplateSpec := DefaultHeadPodTemplate(*cluster, cluster.Spec.HeadGroupSpec, podName, "6379")
336-
pod := BuildPod(podTemplateSpec, rayv1.HeadNode, cluster.Spec.HeadGroupSpec.RayStartParams, "6379", nil, "", "", false)
336+
pod := BuildPod(podTemplateSpec, rayv1.HeadNode, cluster.Spec.HeadGroupSpec.RayStartParams, "6379", nil, "", "", true)
337+
338+
val, ok := pod.Labels[utils.RayClusterServingServiceLabelKey]
339+
assert.True(t, ok, "Expected serve label is not present")
340+
assert.Equal(t, utils.EnableRayClusterServingServiceFalse, val, "Wrong serve label value")
337341

338342
// Check environment variables
339343
rayContainer := pod.Spec.Containers[utils.RayContainerIndex]
@@ -386,7 +390,11 @@ func TestBuildPod(t *testing.T) {
386390
podName = cluster.Name + utils.DashSymbol + string(rayv1.WorkerNode) + utils.DashSymbol + worker.GroupName + utils.DashSymbol + utils.FormatInt32(0)
387391
fqdnRayIP := utils.GenerateFQDNServiceName(*cluster, cluster.Namespace)
388392
podTemplateSpec = DefaultWorkerPodTemplate(*cluster, worker, podName, fqdnRayIP, "6379")
389-
pod = BuildPod(podTemplateSpec, rayv1.WorkerNode, worker.RayStartParams, "6379", nil, "", fqdnRayIP, false)
393+
pod = BuildPod(podTemplateSpec, rayv1.WorkerNode, worker.RayStartParams, "6379", nil, "", fqdnRayIP, true)
394+
395+
val, ok = pod.Labels[utils.RayClusterServingServiceLabelKey]
396+
assert.True(t, ok, "Expected serve label is not present")
397+
assert.Equal(t, utils.EnableRayClusterServingServiceTrue, val, "Wrong serve label value")
390398

391399
// Check environment variables
392400
rayContainer = pod.Spec.Containers[utils.RayContainerIndex]
@@ -407,12 +415,6 @@ func TestBuildPod(t *testing.T) {
407415
// Check Envs
408416
rayContainer = pod.Spec.Containers[utils.RayContainerIndex]
409417
checkContainerEnv(t, rayContainer, "TEST_ENV_NAME", "TEST_ENV_VALUE")
410-
411-
// Try to build pod for serve
412-
pod = BuildPod(podTemplateSpec, rayv1.HeadNode, cluster.Spec.HeadGroupSpec.RayStartParams, "6379", nil, "", "", true)
413-
val, ok := pod.Labels[utils.RayClusterServingServiceLabelKey]
414-
assert.True(t, ok, "Expected serve label is not present")
415-
assert.Equal(t, utils.EnableRayClusterServingServiceTrue, val, "Wrong serve label value")
416418
}
417419

418420
func TestBuildPod_WithOverwriteCommand(t *testing.T) {

ray-operator/controllers/ray/common/service.go

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -178,9 +178,12 @@ func BuildServeService(rayService rayv1.RayService, rayCluster rayv1.RayCluster,
178178
utils.RayOriginatedFromCRDLabelKey: utils.RayOriginatedFromCRDLabelValue(crdType),
179179
utils.RayClusterServingServiceLabelKey: utils.GenerateServeServiceLabel(name),
180180
}
181+
181182
selectorLabels := map[string]string{
182-
utils.RayClusterLabelKey: rayCluster.Name,
183-
utils.RayClusterServingServiceLabelKey: utils.EnableRayClusterServingServiceTrue,
183+
utils.RayClusterLabelKey: rayCluster.Name,
184+
}
185+
if isRayService {
186+
selectorLabels[utils.RayClusterServingServiceLabelKey] = utils.EnableRayClusterServingServiceTrue
184187
}
185188

186189
default_name := utils.GenerateServeServiceName(name)

0 commit comments

Comments
 (0)