Skip to content

Commit

Permalink
support AMD and Intel GPUs
Browse files Browse the repository at this point in the history
  • Loading branch information
zubenkoivan committed Apr 29, 2024
1 parent afcb476 commit 742c6f1
Show file tree
Hide file tree
Showing 34 changed files with 1,470 additions and 474 deletions.
12 changes: 8 additions & 4 deletions CLI.md
Original file line number Diff line number Diff line change
Expand Up @@ -680,12 +680,14 @@ neuro admin add-resource-preset [OPTIONS] PRESET_NAME
Name | Description|
|----|------------|
|_--help_|Show this message and exit.|
|_\--amd-gpu NUMBER_|Number of AMD GPUs|
|_\-c, --cpu NUMBER_|Number of CPUs \[default: 0.1]|
|_\--credits-per-hour AMOUNT_|Price of running job of this preset for an hour in credits \[default: 0]|
|_\-g, --gpu NUMBER_|Number of GPUs|
|_\--gpu-model MODEL_|GPU model|
|_\--intel-gpu NUMBER_|Number of Intel GPUs|
|_\-m, --memory AMOUNT_|Memory amount \[default: 1GB]|
|_\-g, --nvidia-gpu NUMBER_|Number of Nvidia GPUs|
|_\--preemptible-node / --non-preemptible-node_|Use a lower-cost preemptible instance \[default: non-preemptible-node]|
|_\-r, --resource-pool TEXT_|Name of the resource pool where job will be scheduled \(multiple values are supported)|
|_\-p, --scheduler / -P, --no-scheduler_|Use round robin scheduler for jobs \[default: no-scheduler]|
|_\--tpu-sw-version VERSION_|TPU software version|
|_\--tpu-type TYPE_|TPU type|
Expand Down Expand Up @@ -1318,12 +1320,14 @@ neuro admin update-resource-preset [OPTIONS] PRESET_NAME
Name | Description|
|----|------------|
|_--help_|Show this message and exit.|
|_\--amd-gpu NUMBER_|Number of AMD GPUs|
|_\-c, --cpu NUMBER_|Number of CPUs|
|_\--credits-per-hour AMOUNT_|Price of running job of this preset for an hour in credits|
|_\-g, --gpu NUMBER_|Number of GPUs|
|_\--gpu-model MODEL_|GPU model|
|_\--intel-gpu NUMBER_|Number of Intel GPUs|
|_\-m, --memory AMOUNT_|Memory amount|
|_\-g, --nvidia-gpu NUMBER_|Number of Nvidia GPUs|
|_\--preemptible-node / --non-preemptible-node_|Use a lower-cost preemptible instance|
|_\-r, --resource-pool TEXT_|Name of the resource pool where job will be scheduled \(multiple values are supported)|
|_\-p, --scheduler / -P, --no-scheduler_|Use round robin scheduler for jobs|
|_\--tpu-sw-version VERSION_|TPU software version|
|_\--tpu-type TYPE_|TPU type|
Expand Down
12 changes: 8 additions & 4 deletions neuro-cli/docs/admin.md
Original file line number Diff line number Diff line change
Expand Up @@ -271,12 +271,14 @@ Add new resource preset
| Name | Description |
| :--- | :--- |
| _--help_ | Show this message and exit. |
| _--amd-gpu NUMBER_ | Number of AMD GPUs |
| _-c, --cpu NUMBER_ | Number of CPUs _\[default: 0.1\]_ |
| _--credits-per-hour AMOUNT_ | Price of running job of this preset for an hour in credits _\[default: 0\]_ |
| _-g, --gpu NUMBER_ | Number of GPUs |
| _--gpu-model MODEL_ | GPU model |
| _--intel-gpu NUMBER_ | Number of Intel GPUs |
| _-m, --memory AMOUNT_ | Memory amount _\[default: 1GB\]_ |
| _-g, --nvidia-gpu NUMBER_ | Number of Nvidia GPUs |
| _--preemptible-node / --non-preemptible-node_ | Use a lower-cost preemptible instance _\[default: non-preemptible-node\]_ |
| _-r, --resource-pool TEXT_ | Name of the resource pool where job will be scheduled \(multiple values are supported\) |
| _-p, --scheduler / -P, --no-scheduler_ | Use round robin scheduler for jobs _\[default: no-scheduler\]_ |
| _--tpu-sw-version VERSION_ | TPU software version |
| _--tpu-type TYPE_ | TPU type |
Expand Down Expand Up @@ -980,12 +982,14 @@ Update existing resource preset
| Name | Description |
| :--- | :--- |
| _--help_ | Show this message and exit. |
| _--amd-gpu NUMBER_ | Number of AMD GPUs |
| _-c, --cpu NUMBER_ | Number of CPUs |
| _--credits-per-hour AMOUNT_ | Price of running job of this preset for an hour in credits |
| _-g, --gpu NUMBER_ | Number of GPUs |
| _--gpu-model MODEL_ | GPU model |
| _--intel-gpu NUMBER_ | Number of Intel GPUs |
| _-m, --memory AMOUNT_ | Memory amount |
| _-g, --nvidia-gpu NUMBER_ | Number of Nvidia GPUs |
| _--preemptible-node / --non-preemptible-node_ | Use a lower-cost preemptible instance |
| _-r, --resource-pool TEXT_ | Name of the resource pool where job will be scheduled \(multiple values are supported\) |
| _-p, --scheduler / -P, --no-scheduler_ | Use round robin scheduler for jobs |
| _--tpu-sw-version VERSION_ | TPU software version |
| _--tpu-type TYPE_ | TPU type |
Expand Down
88 changes: 66 additions & 22 deletions neuro-cli/src/neuro_cli/admin.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import pathlib
from dataclasses import replace
from decimal import Decimal, InvalidOperation
from typing import IO, Dict, Optional, Tuple
from typing import IO, Any, Dict, Optional, Sequence, Tuple

import click
import yaml
Expand Down Expand Up @@ -955,15 +955,22 @@ async def add_user_credits(
)
@option(
"-g",
"--gpu",
"--nvidia-gpu",
metavar="NUMBER",
type=int,
help="Number of GPUs",
help="Number of Nvidia GPUs",
)
@option(
"--gpu-model",
metavar="MODEL",
help="GPU model",
"--amd-gpu",
metavar="NUMBER",
type=int,
help="Number of AMD GPUs",
)
@option(
"--intel-gpu",
metavar="NUMBER",
type=int,
help="Number of Intel GPUs",
)
@option("--tpu-type", metavar="TYPE", type=str, help="TPU type")
@option(
Expand All @@ -986,18 +993,30 @@ async def add_user_credits(
default=False,
show_default=True,
)
@option(
"resource_pool_names",
"-r",
"--resource-pool",
help=(
"Name of the resource pool where job will be scheduled "
"(multiple values are supported)"
),
multiple=True,
)
async def add_resource_preset(
root: Root,
preset_name: str,
credits_per_hour: str,
cpu: float,
memory: int,
gpu: Optional[int],
gpu_model: Optional[str],
nvidia_gpu: Optional[int],
amd_gpu: Optional[int],
intel_gpu: Optional[int],
tpu_type: Optional[str],
tpu_software_version: Optional[str],
scheduler: bool,
preemptible_node: bool,
resource_pool_names: Sequence[str],
) -> None:
"""
Add new resource preset
Expand All @@ -1014,11 +1033,13 @@ async def add_resource_preset(
credits_per_hour=_parse_finite_decimal(credits_per_hour),
cpu=cpu,
memory=memory,
gpu=gpu,
gpu_model=gpu_model,
nvidia_gpu=nvidia_gpu,
amd_gpu=amd_gpu,
intel_gpu=intel_gpu,
tpu=tpu_preset,
scheduler_enabled=scheduler,
preemptible_node=preemptible_node,
resource_pool_names=resource_pool_names,
)
await root.client._clusters.add_resource_preset(
root.client.config.cluster_name, preset
Expand Down Expand Up @@ -1056,15 +1077,22 @@ async def add_resource_preset(
)
@option(
"-g",
"--gpu",
"--nvidia-gpu",
metavar="NUMBER",
type=int,
help="Number of GPUs",
help="Number of Nvidia GPUs",
)
@option(
"--gpu-model",
metavar="MODEL",
help="GPU model",
"--amd-gpu",
metavar="NUMBER",
type=int,
help="Number of AMD GPUs",
)
@option(
"--intel-gpu",
metavar="NUMBER",
type=int,
help="Number of Intel GPUs",
)
@option("--tpu-type", metavar="TYPE", type=str, help="TPU type")
@option(
Expand All @@ -1085,18 +1113,30 @@ async def add_resource_preset(
help="Use a lower-cost preemptible instance",
default=None,
)
@option(
"resource_pool_names",
"-r",
"--resource-pool",
help=(
"Name of the resource pool where job will be scheduled "
"(multiple values are supported)"
),
multiple=True,
)
async def update_resource_preset(
root: Root,
preset_name: str,
credits_per_hour: Optional[str],
cpu: Optional[float],
memory: Optional[int],
gpu: Optional[int],
gpu_model: Optional[str],
nvidia_gpu: Optional[int],
amd_gpu: Optional[int],
intel_gpu: Optional[int],
tpu_type: Optional[str],
tpu_software_version: Optional[str],
scheduler: Optional[bool],
preemptible_node: Optional[bool],
resource_pool_names: Sequence[str],
) -> None:
"""
Update existing resource preset
Expand All @@ -1107,18 +1147,20 @@ async def update_resource_preset(
except KeyError:
raise ValueError(f"Preset '{preset_name}' does not exists")

kwargs = {
kwargs: Dict[str, Any] = {
"credits_per_hour": _parse_finite_decimal(credits_per_hour)
if credits_per_hour is not None
else None,
"cpu": cpu,
"memory": memory,
"gpu": gpu,
"gpu_model": gpu_model,
"nvidia_gpu": nvidia_gpu,
"amd_gpu": amd_gpu,
"intel_gpu": intel_gpu,
"tpu_type": tpu_type,
"tpu_software_version": tpu_software_version,
"scheduler_enabled": scheduler,
"preemptible_node": preemptible_node,
"resource_pool_names": resource_pool_names,
}
kwargs = {key: value for key, value in kwargs.items() if value is not None}
preset = replace(preset, **kwargs)
Expand All @@ -1135,11 +1177,13 @@ async def update_resource_preset(
credits_per_hour=preset.credits_per_hour,
cpu=preset.cpu,
memory=preset.memory,
gpu=preset.gpu,
gpu_model=preset.gpu_model,
nvidia_gpu=preset.nvidia_gpu,
amd_gpu=preset.amd_gpu,
intel_gpu=preset.intel_gpu,
tpu=tpu_preset,
scheduler_enabled=preset.scheduler_enabled,
preemptible_node=preset.preemptible_node,
resource_pool_names=preset.resource_pool_names,
),
)
await root.client.config.fetch()
Expand Down
50 changes: 31 additions & 19 deletions neuro-cli/src/neuro_cli/formatters/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,9 +88,11 @@ def __call__(
name: Union[str, Text] = cluster.name or ""
pre = " "
org_names: List[Text] = [
Text(org or OrgType.NO_ORG_STR, style="u")
if org == default_org and cluster.name == default_cluster
else Text(org or OrgType.NO_ORG_STR)
(
Text(org or OrgType.NO_ORG_STR, style="u")
if org == default_org and cluster.name == default_cluster
else Text(org or OrgType.NO_ORG_STR)
)
for org in cluster.orgs
]
if cluster.name == default_cluster:
Expand All @@ -108,11 +110,10 @@ def _format_presets(
presets: Mapping[str, Preset],
available_jobs_counts: Optional[Mapping[str, int]],
) -> Table:
has_tpu = False
for preset in presets.values():
if preset.tpu_type:
has_tpu = True
break
has_nvidia_gpu = any(p.nvidia_gpu for p in presets.values())
has_amd_gpu = any(p.amd_gpu for p in presets.values())
has_intel_gpu = any(p.intel_gpu for p in presets.values())
has_tpu = any(p.tpu_type for p in presets.values())

table = Table(
title="Resource Presets:",
Expand All @@ -125,37 +126,48 @@ def _format_presets(
table.add_column("Memory", justify="right")
table.add_column("Round Robin", justify="center")
table.add_column("Preemptible Node", justify="center")
table.add_column("GPU", justify="left")
if available_jobs_counts:
table.add_column("Jobs Avail", justify="right")
if has_nvidia_gpu:
table.add_column("Nvidia GPU", justify="center")
if has_amd_gpu:
table.add_column("AMD GPU", justify="center")
if has_intel_gpu:
table.add_column("Intel GPU", justify="center")
if has_tpu:
table.add_column("TPU", justify="left")
table.add_column("Resource Pools", justify="left")

if available_jobs_counts:
table.add_column("Jobs Avail", justify="right")
table.add_column("Credits per hour", justify="left")

for name, preset in presets.items():
gpu = ""
if preset.gpu:
gpu = f"{preset.gpu} x {preset.gpu_model}"
row = [
name,
str(preset.cpu),
format_size(preset.memory),
"√" if preset.scheduler_enabled else "×",
"√" if preset.preemptible_node else "×",
gpu,
]
if has_nvidia_gpu:
row.append(str(preset.nvidia_gpu) if preset.nvidia_gpu else "")
if has_amd_gpu:
row.append(str(preset.amd_gpu) if preset.amd_gpu else "")
if has_intel_gpu:
row.append(str(preset.intel_gpu) if preset.intel_gpu else "")
if has_tpu:
tpu = (
f"{preset.tpu_type}/{preset.tpu_software_version}"
if preset.tpu_type
else ""
)
row.append(tpu)
row.append("\n".join(preset.resource_pool_names))
if available_jobs_counts:
if name in available_jobs_counts:
row.append(str(available_jobs_counts[name]))
else:
row.append("")
row.append(
str(available_jobs_counts[name])
if name in available_jobs_counts
else ""
)
row.append(str(preset.credits_per_hour))
table.add_row(*row)

Expand Down
14 changes: 10 additions & 4 deletions neuro-cli/src/neuro_cli/formatters/jobs.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,11 +126,17 @@ def __call__(self, job_status: JobDescription) -> RenderableType:
resources.add_column(style="bold", justify="right")
resources.add_row("Memory", format_size(job_status.container.resources.memory))
resources.add_row("CPU", f"{job_status.container.resources.cpu:0.1f}")
if job_status.container.resources.gpu:
if job_status.container.resources.nvidia_gpu:
resources.add_row(
"GPU",
f"{job_status.container.resources.gpu:0.1f} x "
f"{job_status.container.resources.gpu_model}",
"Nvidia GPU", f"{job_status.container.resources.nvidia_gpu:0.1f}"
)
if job_status.container.resources.amd_gpu:
resources.add_row(
"AMD GPU", f"{job_status.container.resources.amd_gpu:0.1f}"
)
if job_status.container.resources.intel_gpu:
resources.add_row(
"Intel GPU", f"{job_status.container.resources.intel_gpu:0.1f}"
)

if job_status.container.resources.tpu_type:
Expand Down
Loading

0 comments on commit 742c6f1

Please sign in to comment.