Skip to content

Commit

Permalink
implement aks gpu instance profile (Azure#3895)
Browse files Browse the repository at this point in the history
* implement aks gpu instance profile

* exclude test from live run due to feature registration

* wrap enum opts with variable

* fix bad merge

* fix ordering
  • Loading branch information
alexeldeib authored Oct 2, 2021
1 parent 0277e85 commit 995397c
Show file tree
Hide file tree
Showing 7 changed files with 2,241 additions and 3 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,8 @@
"test_aks_enable_monitoring_with_aad_auth_uai",
"test_aks_create_and_update_with_managed_nat_gateway_outbound",
"test_aks_create_with_http_proxy_config",
"test_aks_nodepool_add_with_workload_runtime"
"test_aks_nodepool_add_with_workload_runtime",
"test_aks_nodepool_add_with_gpu_instance_profile"
]
}
}
6 changes: 6 additions & 0 deletions src/aks-preview/azext_aks_preview/_consts.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,3 +105,9 @@

CONST_MANAGED_IDENTITY_OPERATOR_ROLE = 'Managed Identity Operator'
CONST_MANAGED_IDENTITY_OPERATOR_ROLE_ID = 'f1a07417-d97a-45cb-824c-7a7467783830'

CONST_GPU_INSTANCE_PROFILE_MIG1_G = "MIG1g"
CONST_GPU_INSTANCE_PROFILE_MIG2_G = "MIG2g"
CONST_GPU_INSTANCE_PROFILE_MIG3_G = "MIG3g"
CONST_GPU_INSTANCE_PROFILE_MIG4_G = "MIG4g"
CONST_GPU_INSTANCE_PROFILE_MIG7_G = "MIG7g"
6 changes: 6 additions & 0 deletions src/aks-preview/azext_aks_preview/_help.py
Original file line number Diff line number Diff line change
Expand Up @@ -355,6 +355,9 @@
- name: --workload-runtime
type: string
short-summary: Determines the type of workload a node can run. Defaults to OCIContainer.
- name: --gpu-instance-profile
type: string
short-summary: GPU instance profile to partition multi-gpu Nvidia GPUs.
examples:
- name: Create a Kubernetes cluster with an existing SSH public key.
text: az aks create -g MyResourceGroup -n MyManagedCluster --ssh-key-value /path/to/publickey
Expand Down Expand Up @@ -942,6 +945,9 @@
- name: --workload-runtime
type: string
short-summary: Determines the type of workload a node can run. Defaults to OCIContainer.
- name: --gpu-instance-profile
type: string
short-summary: GPU instance profile to partition multi-gpu Nvidia GPUs.
examples:
- name: Create a nodepool in an existing AKS cluster with ephemeral os enabled.
text: az aks nodepool add -g MyResourceGroup -n nodepool1 --cluster-name MyManagedCluster --node-osdisk-type Ephemeral --node-osdisk-size 48
Expand Down
4 changes: 4 additions & 0 deletions src/aks-preview/azext_aks_preview/_params.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,8 +32,10 @@
CONST_OS_DISK_TYPE_MANAGED, CONST_OS_DISK_TYPE_EPHEMERAL, \
CONST_RAPID_UPGRADE_CHANNEL, CONST_STABLE_UPGRADE_CHANNEL, CONST_PATCH_UPGRADE_CHANNEL, CONST_NODE_IMAGE_UPGRADE_CHANNEL, CONST_NONE_UPGRADE_CHANNEL, \
CONST_WORKLOAD_RUNTIME_OCI_CONTAINER, CONST_WORKLOAD_RUNTIME_WASM_WASI
from ._consts import CONST_GPU_INSTANCE_PROFILE_MIG1_G, CONST_GPU_INSTANCE_PROFILE_MIG2_G, CONST_GPU_INSTANCE_PROFILE_MIG3_G, CONST_GPU_INSTANCE_PROFILE_MIG4_G, CONST_GPU_INSTANCE_PROFILE_MIG7_G

workload_runtimes = [CONST_WORKLOAD_RUNTIME_OCI_CONTAINER, CONST_WORKLOAD_RUNTIME_WASM_WASI]
gpu_instance_profiles = [CONST_GPU_INSTANCE_PROFILE_MIG1_G, CONST_GPU_INSTANCE_PROFILE_MIG2_G, CONST_GPU_INSTANCE_PROFILE_MIG3_G, CONST_GPU_INSTANCE_PROFILE_MIG4_G, CONST_GPU_INSTANCE_PROFILE_MIG7_G]


def load_arguments(self, _):
Expand Down Expand Up @@ -143,6 +145,7 @@ def load_arguments(self, _):
c.argument('enable_secret_rotation', action='store_true')
c.argument('assign_kubelet_identity', type=str, validator=validate_assign_kubelet_identity)
c.argument('disable_local_accounts', action='store_true')
c.argument('gpu_instance_profile', arg_type=get_enum_type(gpu_instance_profiles))
c.argument('yes', options_list=['--yes', '-y'], help='Do not prompt for confirmation.', action='store_true')
c.argument('workload_runtime', arg_type=get_enum_type(workload_runtimes), default=CONST_WORKLOAD_RUNTIME_OCI_CONTAINER)

Expand Down Expand Up @@ -238,6 +241,7 @@ def load_arguments(self, _):
c.argument('enable_encryption_at_host', options_list=['--enable-encryption-at-host'], action='store_true')
c.argument('enable_ultra_ssd', action='store_true')
c.argument('workload_runtime', arg_type=get_enum_type(workload_runtimes), default=CONST_WORKLOAD_RUNTIME_OCI_CONTAINER)
c.argument('gpu_instance_profile', arg_type=get_enum_type(gpu_instance_profiles))

for scope in ['aks nodepool show', 'aks nodepool delete', 'aks nodepool scale', 'aks nodepool upgrade', 'aks nodepool update']:
with self.argument_context(scope) as c:
Expand Down
8 changes: 6 additions & 2 deletions src/aks-preview/azext_aks_preview/custom.py
Original file line number Diff line number Diff line change
Expand Up @@ -810,6 +810,7 @@ def aks_create(cmd, # pylint: disable=too-many-locals,too-many-statements,to
no_wait=False,
assign_kubelet_identity=None,
workload_runtime=None,
gpu_instance_profile=None,
yes=False):
if not no_ssh_key:
try:
Expand Down Expand Up @@ -870,7 +871,8 @@ def aks_create(cmd, # pylint: disable=too-many-locals,too-many-statements,to
enable_ultra_ssd=enable_ultra_ssd,
max_pods=int(max_pods) if max_pods else None,
type=vm_set_type,
workload_runtime=workload_runtime
workload_runtime=workload_runtime,
gpu_instance_profile=gpu_instance_profile
)

if node_osdisk_size:
Expand Down Expand Up @@ -2378,6 +2380,7 @@ def aks_agentpool_add(cmd, # pylint: disable=unused-argument,too-many-local
enable_encryption_at_host=False,
enable_ultra_ssd=False,
workload_runtime=None,
gpu_instance_profile=None,
no_wait=False):
instances = client.list(resource_group_name, cluster_name)
for agentpool_profile in instances:
Expand Down Expand Up @@ -2432,7 +2435,8 @@ def aks_agentpool_add(cmd, # pylint: disable=unused-argument,too-many-local
enable_encryption_at_host=enable_encryption_at_host,
enable_ultra_ssd=enable_ultra_ssd,
mode=mode,
workload_runtime=workload_runtime
workload_runtime=workload_runtime,
gpu_instance_profile=gpu_instance_profile
)

if priority == CONST_SCALE_SET_PRIORITY_SPOT:
Expand Down
Loading

0 comments on commit 995397c

Please sign in to comment.