From e060dc2de2251cf883de9b43dda9c73e448e6cbd Mon Sep 17 00:00:00 2001 From: Chen Jing Date: Tue, 24 Oct 2023 00:42:36 +0800 Subject: [PATCH] [cluster launcher] [vSphere Provider] Fix vc conn timout issue (#40516) Fixed the issue using SessionOrientedStub. A session-oriented stub adapter that will relogin to the destination if a session-oriented exception is thrown. --------- Signed-off-by: Chen Jing --- .../_private/vsphere/node_provider.py | 50 +++------- .../_private/vsphere/pyvmomi_sdk_provider.py | 99 +++++++++++-------- .../vsphere/test_vsphere_node_provider.py | 2 +- 3 files changed, 73 insertions(+), 78 deletions(-) diff --git a/python/ray/autoscaler/_private/vsphere/node_provider.py b/python/ray/autoscaler/_private/vsphere/node_provider.py index 21c37fa6e769a..009ba972a1107 100755 --- a/python/ray/autoscaler/_private/vsphere/node_provider.py +++ b/python/ray/autoscaler/_private/vsphere/node_provider.py @@ -89,15 +89,9 @@ def check_frozen_vm_status(self, frozen_vm_name): existing and in the frozen state. If the frozen VM is existing and off, this function will also help to power on the frozen VM and wait until it is frozen. """ - vm = self.pyvmomi_sdk_provider.get_pyvmomi_obj_by_name( + vm = self.pyvmomi_sdk_provider.get_pyvmomi_obj( [vim.VirtualMachine], frozen_vm_name ) - if vm is None: - raise ValueError( - "The frozen VM {} doesn't exist on vSphere, please contact the VI " - "admin".format(frozen_vm_name) - ) - logger.info(f"Found frozen VM with name: {vm._moId}") if vm.runtime.powerState == vim.VirtualMachinePowerState.poweredOff: logger.debug(f"Frozen VM {vm._moId} is off. Powering it ON") @@ -188,8 +182,8 @@ def node_tags(self, node_id): def external_ip(self, node_id): # Return the external IP of the VM # Fetch vSphere VM object - vm = self.pyvmomi_sdk_provider.get_pyvmomi_obj_by_moid( - [vim.VirtualMachine], node_id + vm = self.pyvmomi_sdk_provider.get_pyvmomi_obj( + [vim.VirtualMachine], obj_id=node_id ) # Get the external IP address of this VM for ipaddr in vm.guest.net[0].ipAddress: @@ -339,7 +333,7 @@ def remove_tag_from_vm(self, tag_key_to_remove, vm_id): break def get_frozen_vm_obj(self): - vm = self.pyvmomi_sdk_provider.get_pyvmomi_obj_by_name( + vm = self.pyvmomi_sdk_provider.get_pyvmomi_obj( [vim.VirtualMachine], self.frozen_vm_name ) return vm @@ -375,7 +369,7 @@ def create_instant_clone_node(self, source_vm, vm_name_target, node_config, tags # If resource pool is not provided in the config yaml, then the resource pool # of the frozen VM will also be the resource pool of the new VM. resource_pool = ( - self.pyvmomi_sdk_provider.get_pyvmomi_obj_by_name( + self.pyvmomi_sdk_provider.get_pyvmomi_obj( [vim.ResourcePool], node_config["resource_pool"] ) if "resource_pool" in node_config and node_config["resource_pool"] @@ -384,7 +378,7 @@ def create_instant_clone_node(self, source_vm, vm_name_target, node_config, tags # If datastore is not provided in the config yaml, then the datastore # of the frozen VM will also be the resource pool of the new VM. datastore = ( - self.pyvmomi_sdk_provider.get_pyvmomi_obj_by_name( + self.pyvmomi_sdk_provider.get_pyvmomi_obj( [vim.Datastore], node_config["datastore"] ) if "datastore" in node_config and node_config["datastore"] @@ -413,7 +407,7 @@ def create_instant_clone_node(self, source_vm, vm_name_target, node_config, tags threading.Thread(target=self.tag_vm, args=(vm_name_target, tags)).start() WaitForTask(parent_vm.InstantClone_Task(spec=instant_clone_spec)) - cloned_vm = self.pyvmomi_sdk_provider.get_pyvmomi_obj_by_name( + cloned_vm = self.pyvmomi_sdk_provider.get_pyvmomi_obj( [vim.VirtualMachine], vm_name_target ) @@ -448,7 +442,7 @@ def create_frozen_vm_on_each_host(self, node_config, name, wait_until_frozen=Fal exception_happened = False vm_names = [] - res_pool = self.pyvmomi_sdk_provider.get_pyvmomi_obj_by_name( + res_pool = self.pyvmomi_sdk_provider.get_pyvmomi_obj( [vim.ResourcePool], node_config["frozen_vm"]["resource_pool"] ) # In vSphere, for any user-created resource pool, the cluster object is the @@ -506,13 +500,9 @@ def create_frozen_vm_from_ovf( "The datastore name must be provided when deploying frozen" "VM from OVF" ) - datastore_mo = self.pyvmomi_sdk_provider.get_pyvmomi_obj_by_name( + datastore_mo = self.pyvmomi_sdk_provider.get_pyvmomi_obj( [vim.Datastore], datastore_name ) - if not datastore_mo: - raise ValueError( - f"Cannot find the vSphere datastore by name {datastore_name}" - ) datastore_id = datastore_mo._moId if node_config.get("frozen_vm").get("resource_pool"): rp_filter_spec = ResourcePool.FilterSpec( @@ -534,13 +524,9 @@ def create_frozen_vm_from_ovf( "The cluster name must be provided when deploying a single frozen" " VM from OVF" ) - cluster_mo = self.pyvmomi_sdk_provider.get_pyvmomi_obj_by_name( + cluster_mo = self.pyvmomi_sdk_provider.get_pyvmomi_obj( [vim.ClusterComputeResource], cluster_name ) - if not cluster_mo: - raise ValueError( - f"Cannot find the vSphere cluster by name {cluster_name}" - ) node_config["host_id"] = cluster_mo.host[0]._moId resource_pool_id = cluster_mo.resourcePool._moId @@ -627,9 +613,7 @@ def create_frozen_vm_from_ovf( # Get the created vm object vm = self.get_vm(result.resource_id.id) - vm_mo = self.pyvmomi_sdk_provider.get_pyvmomi_obj_by_name( - [vim.VirtualMachine], vm.name - ) + vm_mo = self.pyvmomi_sdk_provider.get_pyvmomi_obj([vim.VirtualMachine], vm.name) if wait_until_frozen: self.wait_until_vm_is_frozen(vm_mo) @@ -669,17 +653,9 @@ def initialize_frozen_vm_scheduler(self, frozen_vm_config): if "schedule_policy" in self.vsphere_config else "" ) - self.frozen_vms_resource_pool = ( - self.pyvmomi_sdk_provider.get_pyvmomi_obj_by_name( - [vim.ResourcePool], self.frozen_vm_resource_pool_name - ) + self.frozen_vms_resource_pool = self.pyvmomi_sdk_provider.get_pyvmomi_obj( + [vim.ResourcePool], self.frozen_vm_resource_pool_name ) - - if self.frozen_vms_resource_pool is None: - raise RuntimeError( - f"Resource Pool {self.frozen_vm_resource_pool_name} could not be found." - ) - # Make all frozen vms on resource pool are power on and frozen self.check_frozen_vms_status(self.frozen_vms_resource_pool) diff --git a/python/ray/autoscaler/_private/vsphere/pyvmomi_sdk_provider.py b/python/ray/autoscaler/_private/vsphere/pyvmomi_sdk_provider.py index f6c7816154da2..ce7bc37ae89b2 100755 --- a/python/ray/autoscaler/_private/vsphere/pyvmomi_sdk_provider.py +++ b/python/ray/autoscaler/_private/vsphere/pyvmomi_sdk_provider.py @@ -1,29 +1,51 @@ import atexit import ssl -from pyVim.connect import Disconnect, SmartConnect +from pyVim.connect import Disconnect, SmartStubAdapter, VimSessionOrientedStub +from pyVmomi import vim from ray.autoscaler._private.vsphere.utils import Constants class PyvmomiSdkProvider: - def __init__(self, server, user, password, session_type: Constants.SessionType): + def __init__( + self, + server, + user, + password, + session_type: Constants.SessionType, + port: int = 443, + ): + # Instance variables self.server = server self.user = user self.password = password self.session_type = session_type + self.port = port + # Instance parameters if self.session_type == Constants.SessionType.UNVERIFIED: context_obj = ssl._create_unverified_context() + self.timeout = 0 - smart_connect_obj = SmartConnect( + # Connect using a session oriented connection + # Ref. https://github.com/vmware/pyvmomi/issues/347 + self.pyvmomi_sdk_client = None + credentials = VimSessionOrientedStub.makeUserLoginMethod(user, password) + smart_stub = SmartStubAdapter( host=server, - user=user, - pwd=password, + port=port, sslContext=context_obj, + connectionPoolTimeout=self.timeout, ) - atexit.register(Disconnect, smart_connect_obj) - self.pyvmomi_sdk_client = smart_connect_obj.content + self.session_stub = VimSessionOrientedStub(smart_stub, credentials) + self.pyvmomi_sdk_client = vim.ServiceInstance( + "ServiceInstance", self.session_stub + ) + + if not self.pyvmomi_sdk_client: + raise ValueError("Could not connect to the specified host") + atexit.register(Disconnect, self.pyvmomi_sdk_client) def get_pyvmomi_obj_by_moid(self, vimtype, moid): """ @@ -35,52 +57,49 @@ def get_pyvmomi_obj_by_moid(self, vimtype, moid): """ obj = None if self.pyvmomi_sdk_client is None: - raise ValueError("Must init pyvmomi_sdk_client first.") + raise RuntimeError("Must init pyvmomi_sdk_client first.") + + if not moid: + raise ValueError("Invalid argument for moid") - container = self.pyvmomi_sdk_client.viewManager.CreateContainerView( - self.pyvmomi_sdk_client.rootFolder, vimtype, True + container = self.pyvmomi_sdk_client.content.viewManager.CreateContainerView( + self.pyvmomi_sdk_client.content.rootFolder, vimtype, True ) for c in container.view: - if moid: - if moid in str(c): - obj = c - break - else: + if moid in str(c): obj = c break - if not obj: - raise RuntimeError( - f"Unexpected: cannot find vSphere object {vimtype} with moid: {moid}" - ) + return obj - def get_pyvmomi_obj_by_name(self, vimtype, name): + def get_pyvmomi_obj(self, vimtype, name=None, obj_id=None): """ - This function finds the vSphere object by the object name and the object type. - The object type can be "VM", "Host", "Datastore", etc. - The object name is a unique name under the vCenter server. + This function will return the vSphere object. + The argument for `vimtype` can be "vim.VM", "vim.Host", "vim.Datastore", etc. + Then either the name or the object id need to be provided. To check all such object information, you can go to the managed object board page of your vCenter Server, such as: https://