Skip to content

Commit

Permalink
[RayService][Bug] Serve Service May Select Pods That Are Actually Unr…
Browse files Browse the repository at this point in the history
…eady for Serving Traffic (#1856)
  • Loading branch information
Yicheng-Lu-llll authored Jan 25, 2024
1 parent 0216b33 commit 2e8f532
Show file tree
Hide file tree
Showing 3 changed files with 23 additions and 13 deletions.
11 changes: 8 additions & 3 deletions ray-operator/controllers/ray/common/pod.go
Original file line number Diff line number Diff line change
Expand Up @@ -293,12 +293,17 @@ func initLivenessAndReadinessProbe(rayContainer *corev1.Container, rayNodeType r

// BuildPod a pod config
func BuildPod(podTemplateSpec corev1.PodTemplateSpec, rayNodeType rayv1.RayNodeType, rayStartParams map[string]string, headPort string, enableRayAutoscaler *bool, creator string, fqdnRayIP string, enableServeService bool) (aPod corev1.Pod) {
// For Worker Pod: Traffic readiness is determined by the readiness probe.
// Therefore, the RayClusterServingServiceLabelKey label is not utilized and should always be set to true.
// For Head Pod: Traffic readiness is determined by the value of the RayClusterServingServiceLabelKey label.
// Initially, set the label to false and let the rayservice controller to manage its value.
if enableServeService {
// TODO (kevin85421): In the current RayService implementation, we only add this label to a Pod after
// it passes the health check. The other option is to use the readiness probe to control it. This
// logic always add the label to the Pod no matter whether it is ready or not.
podTemplateSpec.Labels[utils.RayClusterServingServiceLabelKey] = utils.EnableRayClusterServingServiceTrue
if rayNodeType == rayv1.HeadNode {
podTemplateSpec.Labels[utils.RayClusterServingServiceLabelKey] = utils.EnableRayClusterServingServiceFalse
}
}

pod := corev1.Pod{
TypeMeta: metav1.TypeMeta{
APIVersion: "v1",
Expand Down
18 changes: 10 additions & 8 deletions ray-operator/controllers/ray/common/pod_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -333,7 +333,11 @@ func TestBuildPod(t *testing.T) {
// Test head pod
podName := strings.ToLower(cluster.Name + utils.DashSymbol + string(rayv1.HeadNode) + utils.DashSymbol + utils.FormatInt32(0))
podTemplateSpec := DefaultHeadPodTemplate(*cluster, cluster.Spec.HeadGroupSpec, podName, "6379")
pod := BuildPod(podTemplateSpec, rayv1.HeadNode, cluster.Spec.HeadGroupSpec.RayStartParams, "6379", nil, "", "", false)
pod := BuildPod(podTemplateSpec, rayv1.HeadNode, cluster.Spec.HeadGroupSpec.RayStartParams, "6379", nil, "", "", true)

val, ok := pod.Labels[utils.RayClusterServingServiceLabelKey]
assert.True(t, ok, "Expected serve label is not present")
assert.Equal(t, utils.EnableRayClusterServingServiceFalse, val, "Wrong serve label value")

// Check environment variables
rayContainer := pod.Spec.Containers[utils.RayContainerIndex]
Expand Down Expand Up @@ -386,7 +390,11 @@ func TestBuildPod(t *testing.T) {
podName = cluster.Name + utils.DashSymbol + string(rayv1.WorkerNode) + utils.DashSymbol + worker.GroupName + utils.DashSymbol + utils.FormatInt32(0)
fqdnRayIP := utils.GenerateFQDNServiceName(*cluster, cluster.Namespace)
podTemplateSpec = DefaultWorkerPodTemplate(*cluster, worker, podName, fqdnRayIP, "6379")
pod = BuildPod(podTemplateSpec, rayv1.WorkerNode, worker.RayStartParams, "6379", nil, "", fqdnRayIP, false)
pod = BuildPod(podTemplateSpec, rayv1.WorkerNode, worker.RayStartParams, "6379", nil, "", fqdnRayIP, true)

val, ok = pod.Labels[utils.RayClusterServingServiceLabelKey]
assert.True(t, ok, "Expected serve label is not present")
assert.Equal(t, utils.EnableRayClusterServingServiceTrue, val, "Wrong serve label value")

// Check environment variables
rayContainer = pod.Spec.Containers[utils.RayContainerIndex]
Expand All @@ -407,12 +415,6 @@ func TestBuildPod(t *testing.T) {
// Check Envs
rayContainer = pod.Spec.Containers[utils.RayContainerIndex]
checkContainerEnv(t, rayContainer, "TEST_ENV_NAME", "TEST_ENV_VALUE")

// Try to build pod for serve
pod = BuildPod(podTemplateSpec, rayv1.HeadNode, cluster.Spec.HeadGroupSpec.RayStartParams, "6379", nil, "", "", true)
val, ok := pod.Labels[utils.RayClusterServingServiceLabelKey]
assert.True(t, ok, "Expected serve label is not present")
assert.Equal(t, utils.EnableRayClusterServingServiceTrue, val, "Wrong serve label value")
}

func TestBuildPod_WithOverwriteCommand(t *testing.T) {
Expand Down
7 changes: 5 additions & 2 deletions ray-operator/controllers/ray/common/service.go
Original file line number Diff line number Diff line change
Expand Up @@ -178,9 +178,12 @@ func BuildServeService(rayService rayv1.RayService, rayCluster rayv1.RayCluster,
utils.RayOriginatedFromCRDLabelKey: utils.RayOriginatedFromCRDLabelValue(crdType),
utils.RayClusterServingServiceLabelKey: utils.GenerateServeServiceLabel(name),
}

selectorLabels := map[string]string{
utils.RayClusterLabelKey: rayCluster.Name,
utils.RayClusterServingServiceLabelKey: utils.EnableRayClusterServingServiceTrue,
utils.RayClusterLabelKey: rayCluster.Name,
}
if isRayService {
selectorLabels[utils.RayClusterServingServiceLabelKey] = utils.EnableRayClusterServingServiceTrue
}

default_name := utils.GenerateServeServiceName(name)
Expand Down

0 comments on commit 2e8f532

Please sign in to comment.