Skip to content

[RayService][Bug] Serve Service May Select Pods That Are Actually Unready for Serving Traffic #1856

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 6 commits into from
Jan 25, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 8 additions & 3 deletions ray-operator/controllers/ray/common/pod.go
Original file line number Diff line number Diff line change
Expand Up @@ -293,12 +293,17 @@ func initLivenessAndReadinessProbe(rayContainer *corev1.Container, rayNodeType r

// BuildPod a pod config
func BuildPod(podTemplateSpec corev1.PodTemplateSpec, rayNodeType rayv1.RayNodeType, rayStartParams map[string]string, headPort string, enableRayAutoscaler *bool, creator string, fqdnRayIP string, enableServeService bool) (aPod corev1.Pod) {
// For Worker Pod: Traffic readiness is determined by the readiness probe.
// Therefore, the RayClusterServingServiceLabelKey label is not utilized and should always be set to true.
// For Head Pod: Traffic readiness is determined by the value of the RayClusterServingServiceLabelKey label.
// Initially, set the label to false and let the rayservice controller to manage its value.
if enableServeService {
// TODO (kevin85421): In the current RayService implementation, we only add this label to a Pod after
// it passes the health check. The other option is to use the readiness probe to control it. This
// logic always add the label to the Pod no matter whether it is ready or not.
podTemplateSpec.Labels[utils.RayClusterServingServiceLabelKey] = utils.EnableRayClusterServingServiceTrue
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Discuss offline: Removing the label would result in the worker Pods lacking this label, preventing them from attaching to the K8s service managed by the RayService.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thank you for pointing out! I have correct it in 6de9391. Below is the test:

kubectl apply -f /home/ubuntu/kuberay/ray-operator/config/samples/ray_v1alpha1_rayservice.yaml
kubectl describe rayservice | grep "Service Status"
# Service Status:  Running
kubectl describe svc rayservice-sample-serve-svc
# Endpoints:         10.244.0.6:8000,10.244.0.7:8000
kubectl describe $(kubectl get pods -o=name | grep worker) | grep "ray.io/serve"
# ray.io/serve=true
kubectl describe $(kubectl get pods -o=name | grep head) | grep "ray.io/serve"
# ray.io/serve=true

if rayNodeType == rayv1.HeadNode {
podTemplateSpec.Labels[utils.RayClusterServingServiceLabelKey] = utils.EnableRayClusterServingServiceFalse
}
}

pod := corev1.Pod{
TypeMeta: metav1.TypeMeta{
APIVersion: "v1",
Expand Down
18 changes: 10 additions & 8 deletions ray-operator/controllers/ray/common/pod_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -333,7 +333,11 @@ func TestBuildPod(t *testing.T) {
// Test head pod
podName := strings.ToLower(cluster.Name + utils.DashSymbol + string(rayv1.HeadNode) + utils.DashSymbol + utils.FormatInt32(0))
podTemplateSpec := DefaultHeadPodTemplate(*cluster, cluster.Spec.HeadGroupSpec, podName, "6379")
pod := BuildPod(podTemplateSpec, rayv1.HeadNode, cluster.Spec.HeadGroupSpec.RayStartParams, "6379", nil, "", "", false)
pod := BuildPod(podTemplateSpec, rayv1.HeadNode, cluster.Spec.HeadGroupSpec.RayStartParams, "6379", nil, "", "", true)

val, ok := pod.Labels[utils.RayClusterServingServiceLabelKey]
assert.True(t, ok, "Expected serve label is not present")
assert.Equal(t, utils.EnableRayClusterServingServiceFalse, val, "Wrong serve label value")

// Check environment variables
rayContainer := pod.Spec.Containers[utils.RayContainerIndex]
Expand Down Expand Up @@ -386,7 +390,11 @@ func TestBuildPod(t *testing.T) {
podName = cluster.Name + utils.DashSymbol + string(rayv1.WorkerNode) + utils.DashSymbol + worker.GroupName + utils.DashSymbol + utils.FormatInt32(0)
fqdnRayIP := utils.GenerateFQDNServiceName(*cluster, cluster.Namespace)
podTemplateSpec = DefaultWorkerPodTemplate(*cluster, worker, podName, fqdnRayIP, "6379")
pod = BuildPod(podTemplateSpec, rayv1.WorkerNode, worker.RayStartParams, "6379", nil, "", fqdnRayIP, false)
pod = BuildPod(podTemplateSpec, rayv1.WorkerNode, worker.RayStartParams, "6379", nil, "", fqdnRayIP, true)

val, ok = pod.Labels[utils.RayClusterServingServiceLabelKey]
assert.True(t, ok, "Expected serve label is not present")
assert.Equal(t, utils.EnableRayClusterServingServiceTrue, val, "Wrong serve label value")

// Check environment variables
rayContainer = pod.Spec.Containers[utils.RayContainerIndex]
Expand All @@ -407,12 +415,6 @@ func TestBuildPod(t *testing.T) {
// Check Envs
rayContainer = pod.Spec.Containers[utils.RayContainerIndex]
checkContainerEnv(t, rayContainer, "TEST_ENV_NAME", "TEST_ENV_VALUE")

// Try to build pod for serve
pod = BuildPod(podTemplateSpec, rayv1.HeadNode, cluster.Spec.HeadGroupSpec.RayStartParams, "6379", nil, "", "", true)
val, ok := pod.Labels[utils.RayClusterServingServiceLabelKey]
assert.True(t, ok, "Expected serve label is not present")
assert.Equal(t, utils.EnableRayClusterServingServiceTrue, val, "Wrong serve label value")
}

func TestBuildPod_WithOverwriteCommand(t *testing.T) {
Expand Down
7 changes: 5 additions & 2 deletions ray-operator/controllers/ray/common/service.go
Original file line number Diff line number Diff line change
Expand Up @@ -178,9 +178,12 @@ func BuildServeService(rayService rayv1.RayService, rayCluster rayv1.RayCluster,
utils.RayOriginatedFromCRDLabelKey: utils.RayOriginatedFromCRDLabelValue(crdType),
utils.RayClusterServingServiceLabelKey: utils.GenerateServeServiceLabel(name),
}

selectorLabels := map[string]string{
utils.RayClusterLabelKey: rayCluster.Name,
utils.RayClusterServingServiceLabelKey: utils.EnableRayClusterServingServiceTrue,
utils.RayClusterLabelKey: rayCluster.Name,
}
if isRayService {
selectorLabels[utils.RayClusterServingServiceLabelKey] = utils.EnableRayClusterServingServiceTrue
}

default_name := utils.GenerateServeServiceName(name)
Expand Down