Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 0 additions & 2 deletions .circleci/config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,6 @@ run_pytorch_container: &run_pytorch_container
docker run --gpus=all --rm -itd --shm-size 16G -v ${wd}:/ignite -w /ignite --name pthd << pipeline.parameters.pytorch_stable_image >>
docker exec -it pthd nvidia-smi
docker exec -it pthd ls
docker exec -it pthd /bin/bash -c "$update_pth_cmd"

run_pytorch_devel_container: &run_pytorch_devel_container
- run:
Expand All @@ -97,7 +96,6 @@ run_pytorch_devel_container: &run_pytorch_devel_container
docker run --gpus=all --rm -itd --shm-size 16G -v ${wd}:/ignite -w /ignite --name pthd << pipeline.parameters.pytorch_stable_image_devel >>
docker exec -it pthd nvidia-smi
docker exec -it pthd ls
docker exec -it pthd /bin/bash -c "$update_pth_cmd"

install_dependencies: &install_dependencies
- run:
Expand Down
11 changes: 5 additions & 6 deletions ignite/distributed/comp_models/native.py
Original file line number Diff line number Diff line change
Expand Up @@ -150,12 +150,11 @@ def _init_from_context(self) -> None:

def _compute_nproc_per_node(self) -> int:
    """Compute the number of processes running on this node.

    Performs a MAX all-reduce of ``local_rank + 1`` across all processes:
    the largest local rank plus one equals the per-node process count
    (assumes a homogeneous setup — same nproc on every node).

    Returns:
        int: number of processes per node.
    """
    local_rank = self.get_local_rank()
    # Create a new CPU (gloo) group for this collective so we avoid using
    # a badly configured NCCL backend — an all_reduce over CUDA tensors can
    # hang if the CUDA device was not set to the local rank beforehand.
    gloo_group = dist.new_group(backend="gloo")
    tensor = torch.tensor([local_rank + 1]).to("cpu")
    dist.all_reduce(tensor, op=dist.ReduceOp.MAX, group=gloo_group)
    return int(tensor.item())

def _get_all_hostnames(self) -> List[Tuple[str, ...]]:
Expand Down