Skip to content

Commit f081a00

Browse files
authored
Slurm job fixes
1 parent c5eb858 commit f081a00

File tree

1 file changed

+3
-1
lines changed

1 file changed

+3
-1
lines changed

src/torchrunx/environment.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,8 @@ def slurm_workers() -> int:
39 39
if "SLURM_JOB_GPUS" in os.environ:
40 40
# TODO: is it possible to allocate uneven GPUs across nodes?
41 41
return len(os.environ["SLURM_JOB_GPUS"].split(","))
42+
elif "SLURM_GPUS_PER_NODE" in os.environ:
43+
return int(os.environ['SLURM_GPUS_PER_NODE'])
42 44
else:
43 45
# TODO: should we assume that we plan to do one worker per CPU?
44 46
return int(os.environ["SLURM_CPUS_ON_NODE"])
@@ -52,7 +54,7 @@ def auto_hosts() -> list[str]:
52 54
:rtype: list[str]
53 55
"""
54 56
if in_slurm_job():
55-
slurm_hosts()
57+
return slurm_hosts()
56 58

57 59
return ["localhost"]
58 60

0 commit comments

Comments
 (0)