Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

implement aks gpu instance profile #3895

Merged
merged 5 commits into from
Oct 2, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,8 @@
"test_aks_enable_monitoring_with_aad_auth_uai",
"test_aks_create_and_update_with_managed_nat_gateway_outbound",
"test_aks_create_with_http_proxy_config",
"test_aks_nodepool_add_with_workload_runtime"
"test_aks_nodepool_add_with_workload_runtime",
"test_aks_nodepool_add_with_gpu_instance_profile"
]
}
}
6 changes: 6 additions & 0 deletions src/aks-preview/azext_aks_preview/_consts.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,3 +105,9 @@

CONST_MANAGED_IDENTITY_OPERATOR_ROLE = 'Managed Identity Operator'
CONST_MANAGED_IDENTITY_OPERATOR_ROLE_ID = 'f1a07417-d97a-45cb-824c-7a7467783830'

# GPU instance profile names (values accepted by --gpu-instance-profile).
CONST_GPU_INSTANCE_PROFILE_MIG1_G = 'MIG1g'
CONST_GPU_INSTANCE_PROFILE_MIG2_G = 'MIG2g'
CONST_GPU_INSTANCE_PROFILE_MIG3_G = 'MIG3g'
CONST_GPU_INSTANCE_PROFILE_MIG4_G = 'MIG4g'
CONST_GPU_INSTANCE_PROFILE_MIG7_G = 'MIG7g'
6 changes: 6 additions & 0 deletions src/aks-preview/azext_aks_preview/_help.py
Original file line number Diff line number Diff line change
Expand Up @@ -355,6 +355,9 @@
- name: --workload-runtime
type: string
short-summary: Determines the type of workload a node can run. Defaults to OCIContainer.
- name: --gpu-instance-profile
type: string
short-summary: GPU instance profile to partition multi-gpu Nvidia GPUs.
examples:
- name: Create a Kubernetes cluster with an existing SSH public key.
text: az aks create -g MyResourceGroup -n MyManagedCluster --ssh-key-value /path/to/publickey
Expand Down Expand Up @@ -942,6 +945,9 @@
- name: --workload-runtime
type: string
short-summary: Determines the type of workload a node can run. Defaults to OCIContainer.
- name: --gpu-instance-profile
type: string
short-summary: GPU instance profile to partition multi-gpu Nvidia GPUs.
examples:
- name: Create a nodepool in an existing AKS cluster with ephemeral os enabled.
text: az aks nodepool add -g MyResourceGroup -n nodepool1 --cluster-name MyManagedCluster --node-osdisk-type Ephemeral --node-osdisk-size 48
Expand Down
4 changes: 4 additions & 0 deletions src/aks-preview/azext_aks_preview/_params.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,8 +32,10 @@
CONST_OS_DISK_TYPE_MANAGED, CONST_OS_DISK_TYPE_EPHEMERAL, \
CONST_RAPID_UPGRADE_CHANNEL, CONST_STABLE_UPGRADE_CHANNEL, CONST_PATCH_UPGRADE_CHANNEL, CONST_NODE_IMAGE_UPGRADE_CHANNEL, CONST_NONE_UPGRADE_CHANNEL, \
CONST_WORKLOAD_RUNTIME_OCI_CONTAINER, CONST_WORKLOAD_RUNTIME_WASM_WASI
from ._consts import CONST_GPU_INSTANCE_PROFILE_MIG1_G, CONST_GPU_INSTANCE_PROFILE_MIG2_G, CONST_GPU_INSTANCE_PROFILE_MIG3_G, CONST_GPU_INSTANCE_PROFILE_MIG4_G, CONST_GPU_INSTANCE_PROFILE_MIG7_G

# Choices offered for the `workload_runtime` argument (wrapped with get_enum_type).
workload_runtimes = [
    CONST_WORKLOAD_RUNTIME_OCI_CONTAINER,
    CONST_WORKLOAD_RUNTIME_WASM_WASI,
]
# Choices offered for the `gpu_instance_profile` argument (wrapped with get_enum_type).
gpu_instance_profiles = [
    CONST_GPU_INSTANCE_PROFILE_MIG1_G,
    CONST_GPU_INSTANCE_PROFILE_MIG2_G,
    CONST_GPU_INSTANCE_PROFILE_MIG3_G,
    CONST_GPU_INSTANCE_PROFILE_MIG4_G,
    CONST_GPU_INSTANCE_PROFILE_MIG7_G,
]


def load_arguments(self, _):
Expand Down Expand Up @@ -143,6 +145,7 @@ def load_arguments(self, _):
c.argument('enable_secret_rotation', action='store_true')
c.argument('assign_kubelet_identity', type=str, validator=validate_assign_kubelet_identity)
c.argument('disable_local_accounts', action='store_true')
c.argument('gpu_instance_profile', arg_type=get_enum_type(gpu_instance_profiles))
c.argument('yes', options_list=['--yes', '-y'], help='Do not prompt for confirmation.', action='store_true')
c.argument('workload_runtime', arg_type=get_enum_type(workload_runtimes), default=CONST_WORKLOAD_RUNTIME_OCI_CONTAINER)

Expand Down Expand Up @@ -238,6 +241,7 @@ def load_arguments(self, _):
c.argument('enable_encryption_at_host', options_list=['--enable-encryption-at-host'], action='store_true')
c.argument('enable_ultra_ssd', action='store_true')
c.argument('workload_runtime', arg_type=get_enum_type(workload_runtimes), default=CONST_WORKLOAD_RUNTIME_OCI_CONTAINER)
c.argument('gpu_instance_profile', arg_type=get_enum_type(gpu_instance_profiles))

for scope in ['aks nodepool show', 'aks nodepool delete', 'aks nodepool scale', 'aks nodepool upgrade', 'aks nodepool update']:
with self.argument_context(scope) as c:
Expand Down
8 changes: 6 additions & 2 deletions src/aks-preview/azext_aks_preview/custom.py
Original file line number Diff line number Diff line change
Expand Up @@ -810,6 +810,7 @@ def aks_create(cmd, # pylint: disable=too-many-locals,too-many-statements,to
no_wait=False,
assign_kubelet_identity=None,
workload_runtime=None,
gpu_instance_profile=None,
yes=False):
if not no_ssh_key:
try:
Expand Down Expand Up @@ -870,7 +871,8 @@ def aks_create(cmd, # pylint: disable=too-many-locals,too-many-statements,to
enable_ultra_ssd=enable_ultra_ssd,
max_pods=int(max_pods) if max_pods else None,
type=vm_set_type,
workload_runtime=workload_runtime
workload_runtime=workload_runtime,
gpu_instance_profile=gpu_instance_profile
)

if node_osdisk_size:
Expand Down Expand Up @@ -2378,6 +2380,7 @@ def aks_agentpool_add(cmd, # pylint: disable=unused-argument,too-many-local
enable_encryption_at_host=False,
enable_ultra_ssd=False,
workload_runtime=None,
gpu_instance_profile=None,
no_wait=False):
instances = client.list(resource_group_name, cluster_name)
for agentpool_profile in instances:
Expand Down Expand Up @@ -2432,7 +2435,8 @@ def aks_agentpool_add(cmd, # pylint: disable=unused-argument,too-many-local
enable_encryption_at_host=enable_encryption_at_host,
enable_ultra_ssd=enable_ultra_ssd,
mode=mode,
workload_runtime=workload_runtime
workload_runtime=workload_runtime,
gpu_instance_profile=gpu_instance_profile
)

if priority == CONST_SCALE_SET_PRIORITY_SPOT:
Expand Down
Loading