Skip to content

Commit 119ac68

Browse files
authored
Disabling CloudDNS upgrades, while ensuring backward compatibility. (#241)
* Disabling CloudDNS upgrades, while ensuring backward compatibility. * Update kubectl port-forward instructions. * Remove clouddns checks also.
1 parent c49bf61 commit 119ac68

File tree

5 files changed

+19
-204
lines changed

5 files changed

+19
-204
lines changed

README.md

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -421,7 +421,14 @@ will fail the cluster creation process because Vertex AI Tensorboard is not supp
421421
--cluster xpk-pw-test
422422
```
423423
Executing the command above would provide the address of the proxy that the user job should connect to.
424-
Specify `JAX_PLATFORMS=proxy` and `JAX_BACKEND_TARGET=<proxy address from above>` and `import previewutilies` to establish this connection between the user's JAX code and the Pathways proxy. Execute Pathways workloads interactively on Vertex AI notebooks!
424+
```shell
425+
kubectl get pods
426+
kubectl port-forward pod/<proxy-pod-name> 29000:29000
427+
```
428+
```shell
429+
JAX_PLATFORMS=proxy JAX_BACKEND_TARGET=grpc://127.0.0.1:29000 python -c 'import pathwaysutils; import jax; print(jax.devices())'
430+
```
431+
Specify `JAX_PLATFORMS=proxy` and `JAX_BACKEND_TARGET=<proxy address from above>` and `import pathwaysutils` to establish this connection between the user's JAX code and the Pathways proxy. Execute Pathways workloads interactively on Vertex AI notebooks!
425432
426433
### Set `max-restarts` for production jobs
427434

src/xpk/commands/cluster.py

Lines changed: 2 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,6 @@
3535
run_gke_node_pool_create_command,
3636
set_jobset_on_cluster,
3737
set_up_cluster_network_for_gpu,
38-
update_cluster_with_clouddns_if_necessary,
3938
zone_to_region,
4039
)
4140
from ..core.cluster_private import authorize_private_cluster_access_if_necessary
@@ -101,13 +100,7 @@ def cluster_create(args) -> None:
101100
if authorize_private_cluster_access_command_code != 0:
102101
xpk_exit(authorize_private_cluster_access_command_code)
103102

104-
# Update Pathways clusters with CloudDNS if not enabled already.
105-
if args.enable_pathways:
106-
update_cluster_command_code = update_cluster_with_clouddns_if_necessary(
107-
args
108-
)
109-
if update_cluster_command_code != 0:
110-
xpk_exit(update_cluster_command_code)
103+
# ToDo(roshanin@) - Re-enable CloudDNS on Pathways clusters conditionally.
111104

112105
set_cluster_command_code = set_cluster_command(args)
113106
if set_cluster_command_code != 0:
@@ -506,12 +499,7 @@ def run_gke_cluster_create_command(
506499

507500
if args.enable_pathways:
508501
enable_ip_alias = True
509-
command += (
510-
f' --create-subnetwork name={args.cluster}-subnetwork'
511-
' --cluster-dns=clouddns'
512-
' --cluster-dns-scope=vpc'
513-
f' --cluster-dns-domain={args.cluster}-domain'
514-
)
502+
command += f' --create-subnetwork name={args.cluster}-subnetwork'
515503

516504
if enable_ip_alias:
517505
command += ' --enable-ip-alias'

src/xpk/commands/workload.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,6 @@
3939
get_gpu_volume,
4040
get_user_workload_container,
4141
get_volumes,
42-
is_cluster_using_clouddns,
4342
parse_env_config,
4443
wait_for_job_completion,
4544
xpk_current_version,
@@ -325,12 +324,13 @@ def workload_create(args) -> None:
325324
"""
326325
add_zone_and_project(args)
327326

328-
if args.headless and not is_cluster_using_clouddns(args):
327+
if args.headless:
329328
xpk_print(
330-
'Please run xpk cluster create-pathways first, to upgrade and enable'
331-
' CloudDNS on your cluster.'
329+
'Please use kubectl port forwarding to connect to the Pathways proxy.'
330+
' kubectl get pods kubectl port-forward <proxy-pod-name> 29000:29000'
331+
' JAX_PLATFORMS=proxy JAX_BACKEND_TARGET=grpc://127.0.0.1:29000 python'
332+
" -c 'import pathwaysutils; import jax; print(jax.devices())'"
332333
)
333-
xpk_exit(1)
334334

335335
set_cluster_command_code = set_cluster_command(args)
336336
if set_cluster_command_code != 0:

src/xpk/core/core.py

Lines changed: 0 additions & 171 deletions
Original file line numberDiff line numberDiff line change
@@ -333,110 +333,6 @@ def get_total_chips_requested_from_args(
333333
return num_chips
334334

335335

336-
def update_gke_cluster_with_clouddns(args) -> int:
337-
"""Run the GKE cluster update command for existing clusters and enable CloudDNS.
338-
339-
Args:
340-
args: user provided arguments for running the command.
341-
342-
Returns:
343-
0 if successful and 1 otherwise.
344-
"""
345-
command = (
346-
'gcloud container clusters update'
347-
f' {args.cluster} --project={args.project}'
348-
f' --region={zone_to_region(args.zone)}'
349-
' --cluster-dns=clouddns'
350-
' --cluster-dns-scope=vpc'
351-
f' --cluster-dns-domain={args.cluster}-domain'
352-
' --quiet'
353-
)
354-
xpk_print('Updating GKE cluster to use Cloud DNS, may take a while!')
355-
return_code = run_command_with_updates(
356-
command, 'GKE Cluster Update to enable Cloud DNS', args
357-
)
358-
if return_code != 0:
359-
xpk_print(f'GKE Cluster Update request returned ERROR {return_code}')
360-
return 1
361-
return 0
362-
363-
364-
def upgrade_gke_control_plane_version(args, default_rapid_gke_version) -> int:
365-
"""Upgrade GKE cluster's control plane version before updating nodepools to use CloudDNS.
366-
367-
Args:
368-
args: user provided arguments for running the command.
369-
default_rapid_gke_version: Rapid default version for the upgrade.
370-
371-
Returns:
372-
0 if successful and 1 otherwise.
373-
"""
374-
command = (
375-
'gcloud container clusters upgrade'
376-
f' {args.cluster} --project={args.project}'
377-
f' --region={zone_to_region(args.zone)}'
378-
f' --cluster-version={default_rapid_gke_version}'
379-
' --master'
380-
' --quiet'
381-
)
382-
xpk_print("Updating GKE cluster's control plane version, may take a while!")
383-
return_code = run_command_with_updates(
384-
command,
385-
'GKE Cluster control plane version update to enable Cloud DNS',
386-
args,
387-
)
388-
if return_code != 0:
389-
xpk_print(
390-
"GKE cluster's control plane version update request returned"
391-
f' ERROR {return_code}'
392-
)
393-
return 1
394-
return 0
395-
396-
397-
def upgrade_gke_nodepools_version(args, default_rapid_gke_version) -> int:
398-
"""Upgrade nodepools in the cluster to default rapid gke version. Recreates the nodes.
399-
400-
Args:
401-
args: user provided arguments for running the command.
402-
default_rapid_gke_version: Rapid default version for the upgrade.
403-
404-
Returns:
405-
0 if successful and 1 otherwise.
406-
"""
407-
existing_node_pool_names, return_code = get_all_nodepools_programmatic(args)
408-
if return_code != 0:
409-
xpk_print('Listing all node pools failed!')
410-
return return_code
411-
412-
# Batch execution to upgrade node pools simultaneously
413-
commands = []
414-
task_names = []
415-
for node_pool_name in existing_node_pool_names:
416-
commands.append(
417-
'gcloud container clusters upgrade'
418-
f' {args.cluster} --project={args.project}'
419-
f' --region={zone_to_region(args.zone)}'
420-
f' --cluster-version={default_rapid_gke_version}'
421-
f' --node-pool={node_pool_name}'
422-
' --quiet'
423-
)
424-
task_names.append(f'Upgrading node pool {node_pool_name}.')
425-
426-
for i, command in enumerate(commands):
427-
xpk_print(f'To complete {task_names[i]} we are executing {command}')
428-
max_return_code = run_commands(
429-
commands, 'Update GKE node pools to default RAPID GKE version', task_names
430-
)
431-
if max_return_code != 0:
432-
xpk_print(
433-
'GKE node pools update to default RAPID GKE version returned ERROR:'
434-
f' {max_return_code}'
435-
)
436-
return max_return_code
437-
return 0
438-
439-
440336
def set_up_cluster_network_for_gpu(args, system: SystemCharacteristics) -> int:
441337
"""Set up GKE Cluster networks, subnets and firewall rules for A3/A3+.
442338
Note: there are 4 NICs for GPU-GPU bw and 1 NIC for host in an A3 node,
@@ -1019,73 +915,6 @@ def get_all_clusters_programmatic(args) -> tuple[list[str], int]:
1019915
return raw_cluster_output.splitlines(), 0
1020916

1021917

1022-
def is_cluster_using_clouddns(args) -> bool:
1023-
"""Checks if cluster is using CloudDNS.
1024-
Args:
1025-
args: user provided arguments for running the command.
1026-
1027-
Returns:
1028-
True if cluster is using CloudDNS and False otherwise.
1029-
"""
1030-
command = (
1031-
f'gcloud container clusters describe {args.cluster}'
1032-
f' --project={args.project} --region={zone_to_region(args.zone)}'
1033-
' | grep "clusterDns: CLOUD_DNS"'
1034-
)
1035-
return_code, _ = run_command_for_value(
1036-
command,
1037-
'Check if Cloud DNS is enabled in cluster describe.',
1038-
args,
1039-
)
1040-
if return_code == 0:
1041-
xpk_print('Cloud DNS is enabled on the cluster, no update needed.')
1042-
return True
1043-
return False
1044-
1045-
1046-
def update_cluster_with_clouddns_if_necessary(args) -> int:
1047-
"""Updates a GKE cluster to use CloudDNS, if not enabled already.
1048-
1049-
Args:
1050-
args: user provided arguments for running the command.
1051-
1052-
Returns:
1053-
0 if successful and error code otherwise.
1054-
"""
1055-
all_clusters, return_code = get_all_clusters_programmatic(args)
1056-
if return_code > 0:
1057-
xpk_print('Listing all clusters failed!')
1058-
return 1
1059-
if args.cluster in all_clusters:
1060-
# If cluster is already using clouddns, no update necessary!
1061-
if is_cluster_using_clouddns(args):
1062-
return 0
1063-
cluster_update_return_code = update_gke_cluster_with_clouddns(args)
1064-
if cluster_update_return_code > 0:
1065-
xpk_print('Updating GKE cluster to use CloudDNS failed!')
1066-
return cluster_update_return_code
1067-
1068-
# Find default rapid control plane version and update the control plane to the same.
1069-
server_config_return_code, gke_server_config = get_gke_server_config(args)
1070-
if server_config_return_code != 0:
1071-
xpk_exit(server_config_return_code)
1072-
upgrade_master_return_code = upgrade_gke_control_plane_version(
1073-
args, gke_server_config.default_rapid_gke_version
1074-
)
1075-
if upgrade_master_return_code > 0:
1076-
xpk_print("Updating GKE cluster's control plane upgrade failed!")
1077-
return upgrade_master_return_code
1078-
1079-
# Upgrade nodepools version after the master upgrade.
1080-
node_pool_update_code = upgrade_gke_nodepools_version(
1081-
args, gke_server_config.default_rapid_gke_version
1082-
)
1083-
if node_pool_update_code > 0:
1084-
xpk_print('Upgrading nodepools version failed!')
1085-
return node_pool_update_code
1086-
return 0
1087-
1088-
1089918
def get_nodepool_zone(args, nodepool_name) -> tuple[int, str]:
1090919
"""Return zone in which nodepool exists in the cluster.
1091920

src/xpk/core/pathways.py

Lines changed: 4 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,6 @@
1919
AcceleratorType,
2020
get_all_nodepools_programmatic,
2121
get_user_workload_container,
22-
is_cluster_using_clouddns,
2322
zone_to_region,
2423
)
2524
from .system_characteristics import SystemCharacteristics
@@ -257,34 +256,26 @@ def get_user_workload_for_pathways(args, system: SystemCharacteristics) -> str:
257256

258257

259258
def get_rm_address(args) -> str:
260-
"""Generates the Pathways resource manager address based on whether CloudDNS is enabled or not.
259+
"""Generates the Pathways resource manager address.
261260
Args:
262261
args: user provided arguments for running the command.
263262
264263
Returns:
265264
str: Fully qualified RM address.
266265
"""
267-
suffix = ''
268-
if is_cluster_using_clouddns(args):
269-
suffix = f'.default.svc.{args.cluster}-domain.'
270-
rm_address = f'{args.workload}-rm-0-0.{args.workload}{suffix}:29001'
266+
rm_address = f'{args.workload}-rm-0-0.{args.workload}:29001'
271267
return rm_address
272268

273269

274270
def get_proxy_address(args) -> str:
275-
"""Generates the Pathways proxy address based on whether CloudDNS is enabled or not.
271+
"""Generates the Pathways proxy address.
276272
Args:
277273
args: user provided arguments for running the command.
278274
279275
Returns:
280276
str: Fully qualified proxy address.
281277
"""
282-
suffix = ''
283-
if is_cluster_using_clouddns(args):
284-
suffix = f'.default.svc.{args.cluster}-domain.'
285-
proxy_address = (
286-
f'grpc://{args.workload}-proxy-0-0.{args.workload}{suffix}:29000'
287-
)
278+
proxy_address = f'grpc://{args.workload}-proxy-0-0.{args.workload}:29000'
288279
return proxy_address
289280

290281

0 commit comments

Comments
 (0)