Skip to content

Commit

Permalink
[launcher/runner] respect CUDA_VISIBLE_DEVICES for a single node (#960)
Browse files Browse the repository at this point in the history
Co-authored-by: Jeff Rasley <jerasley@microsoft.com>
Co-authored-by: Olatunji Ruwase <olruwase@microsoft.com>
  • Loading branch information
3 people authored Nov 17, 2021
1 parent 938449e commit e3c2d7b
Showing 1 changed file with 16 additions and 1 deletion.
17 changes: 16 additions & 1 deletion deepspeed/launcher/runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -280,12 +280,27 @@ def run_autotuning(args, active_resources):
def main(args=None):
args = parse_args(args)

resource_pool = fetch_hostfile(args.hostfile)

# respect CUDA_VISIBLE_DEVICES for a single node and no explicit resource filters
cuda_visible_devices = os.environ.get("CUDA_VISIBLE_DEVICES", "")
if not resource_pool and len(cuda_visible_devices):
detected_str = f"Detected CUDA_VISIBLE_DEVICES={cuda_visible_devices}"
if len(args.include) or len(
args.exclude) or args.num_nodes > 1 or args.num_gpus > 0:
print(
f"{detected_str} but ignoring it because one or several of --include/--exclude/--num_gpus/--num_nodes cl args were used. If you want to use CUDA_VISIBLE_DEVICES don't pass any of these arguments to deepspeed."
)
else:
args.include = f"localhost:{cuda_visible_devices}"
print(f"{detected_str}: setting --include={args.include}")
del os.environ["CUDA_VISIBLE_DEVICES"]

if args.num_nodes >= 0 or args.num_gpus >= 0:
if args.include != "" or args.exclude != "":
raise ValueError("Cannot specify num_nodes/gpus with include/exclude")

multi_node_exec = True
resource_pool = fetch_hostfile(args.hostfile)
if not resource_pool:
resource_pool = {}
device_count = torch.cuda.device_count()
Expand Down

0 comments on commit e3c2d7b

Please sign in to comment.