Open
Description
What happened + What you expected to happen
I'm serving ML models using Ray serve on Google Cloud.
After upgrading Ray from version 2.6.3 to 2.7.1, a bug occurs when launching new worker nodes.
The "file_mounts" are not being applied, and the "initialization_commands" are not being executed on the worker node.
Despite this, the launch of the new node proceeds regardless of its state. However, the problem arises when I assign a Serve actor to the node: since the necessary files and setup are absent, the actor crashes.
Strangely, after several failed attempts to launch fresh nodes, the ssh commands are eventually executed successfully on an intermittent basis, and the actor is launched.
Versions / Dependencies
Python == 3.10
Ray == 2.7.1
Reproduction script
The Ray config is generated as follows:
# Ray cluster config built as a Python dict for the GCP provider.
# Values vary with RAY_STAGE ("release" vs. every other stage) and with
# environment variables read from os.environ at the moment the dict is built.
# NOTE: the "#####" placeholders below appear to be values redacted by the
# reporter; real values must be filled in before this snippet can run.
data = {
"cluster_name": f"stable-diffusion-{RAY_STAGE}",
"max_workers": 20,
"upscaling_speed": 1.0,
# NOTE(review): both branches evaluate to 1, so this conditional currently has
# no effect — confirm whether "release" was meant to use a different timeout.
"idle_timeout_minutes": 1 if RAY_STAGE != "release" else 1,
"provider": {
"type": "gcp",
"region": "asia-northeast3",
"availability_zone": DIFFUSION_GOOGLE_CLOUD_ZONE,
"cache_stopped_nodes": False,
"project_id": #####
},
"auth": {
"ssh_user": "ubuntu",
},
"available_node_types": {
# Head node type; referenced by "head_node_type" further down.
"ray_head_default": {
"resources": {"Head": 1.0},
"node_config": {
# Machine type is e2-highcpu-16 for the "release" stage, e2-standard-2 otherwise.
"machineType": "e2-standard-2"
if RAY_STAGE != "release"
else "e2-highcpu-16",
"disks": [
{
"boot": True,
"autoDelete": True,
"type": "PERSISTENT",
"initializeParams": {
"diskSizeGb": 20,
"sourceImage": #####
}
}
],
},
},
"ray_worker_default": { # This value can only contain lowercase letters, numeric characters, underscores and dashes.
# "release" keeps between 1 and 4 workers of this type; other stages between 0 and 1.
"min_workers": 0 if RAY_STAGE != "release" else 1,
"max_workers": 1 if RAY_STAGE != "release" else 4,
"resources": {"CPU": 12.0, "GPU": 1.0},
"node_config": {
"machineType": "a2-highgpu-1g",
"disks": [
{
"boot": True,
"autoDelete": True,
"type": "PERSISTENT",
"initializeParams": {
"diskSizeGb": 40,
"sourceImage": #####
}
}
],
"scheduling": {
"onHostMaintenance": "TERMINATE"
},
"serviceAccounts": [
{
"email": "ray-autoscaler-sa-v1@#####.iam.gserviceaccount.com",
"scopes": [
"https://www.googleapis.com/auth/cloud-platform"
]
}
]
},
},
},
"head_node_type": "ray_head_default",
# Presumably remote path -> local path (Ray's file_mounts convention) — TODO confirm.
# The setup commands below copy from ./service, so they rely on this mount.
"file_mounts": {
"~/service": "./service",
},
"cluster_synced_files": [],
"file_mounts_sync_continuously": False,
"rsync_exclude": [
"**/.git",
"**/.git/**",
],
"rsync_filter": [
".gitignore",
],
# Empty in this snippet: no initialization commands are configured here.
"initialization_commands": [
],
# Empty: all setup is done in the head/worker-specific lists below.
"setup_commands": [
],
"head_setup_commands": [
# Appends STAGE, RAY_STAGE, EVENT_QUEUE and TRACE_ROUTE exports to ~/.bashrc.
# os.environ[...] raises KeyError while building this dict if EVENT_QUEUE or
# RAY_TRACE_ROUTE is not set in the local environment.
f'echo "export STAGE={RAY_STAGE}" >> ~/.bashrc && echo "export RAY_STAGE={RAY_STAGE}" >> ~/.bashrc && echo "export EVENT_QUEUE={os.environ["EVENT_QUEUE"]}" >> ~/.bashrc && echo "export TRACE_ROUTE={os.environ["RAY_TRACE_ROUTE"]}" >> ~/.bashrc',
# Query the GCE metadata endpoint for this instance's name and external IP and
# persist the results as exports in ~/.bashrc.
'echo "export HEAD_INSTANCE_NAME=$(curl -H "Metadata-Flavor: Google" "http://metadata.google.internal/computeMetadata/v1/instance/name")" >> ~/.bashrc',
'echo "export INSTANCE_IP=$(curl -H "Metadata-Flavor: Google" "http://metadata.google.internal/computeMetadata/v1/instance/network-interfaces/0/access-configs/0/external-ip")" >> ~/.bashrc',
# The doubled braces escape the f-string, so the shell receives ${INSTANCE_IP}
# and expands it when the echo runs.
f'echo "export RAY_GRAFANA_IFRAME_HOST=http://${{INSTANCE_IP}}:3000" >> ~/.bashrc',
"source ~/.bashrc",
# Copies the contents of ./service into the current working directory.
"cp -rp ./service/* ./",
],
"worker_setup_commands": [
# Same four exports as on the head node; same KeyError caveat for os.environ[...].
f'echo "export STAGE={RAY_STAGE}" >> ~/.bashrc && echo "export RAY_STAGE={RAY_STAGE}" >> ~/.bashrc && echo "export EVENT_QUEUE={os.environ["EVENT_QUEUE"]}" >> ~/.bashrc && echo "export TRACE_ROUTE={os.environ["RAY_TRACE_ROUTE"]}" >> ~/.bashrc',
"source ~/.bashrc",
# Copies the contents of ./service into the current working directory.
"cp -rp ./service/* ./",
# Persists the instance name from the GCE metadata endpoint as INSTANCE_NAME;
# the worker start command below references $INSTANCE_NAME.
'echo "export INSTANCE_NAME=$(curl -H "Metadata-Flavor: Google" "http://metadata.google.internal/computeMetadata/v1/instance/name")" >> ~/.bashrc',
],
"head_start_ray_commands": [
"ray stop",
"RAY_ROTATION_MAX_BYTES=256000 RAY_ROTATION_BACKUP_COUNT=0 ray start --head --port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml --dashboard-host=0.0.0.0",
# Installs the Prometheus and Grafana systemd unit files, reloads systemd,
# then restarts both services.
"sudo cp ray-prometheus-server.service /etc/systemd/system/ray-prometheus-server.service && sudo cp ray-grafana-server.service /etc/systemd/system/ray-grafana-server.service && sudo systemctl daemon-reload",
"sudo systemctl stop ray-prometheus-server.service && sudo systemctl start ray-prometheus-server.service",
"sudo systemctl stop ray-grafana-server.service && sudo systemctl start ray-grafana-server.service",
],
"worker_start_ray_commands": [
"ray stop",
"RAY_ROTATION_MAX_BYTES=256000 RAY_ROTATION_BACKUP_COUNT=0 ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076",
# Adds this worker to the unmanaged instance group; "|| true" keeps a failed
# registration from failing the command.
f'gcloud compute instance-groups unmanaged add-instances {DIFFUSION_TARGET_INSTANCE_GROUP} --instances=$INSTANCE_NAME --zone={DIFFUSION_GOOGLE_CLOUD_ZONE} || true'
],
}
Issue Severity
High: It blocks me from completing my task.
Activity