-
Notifications
You must be signed in to change notification settings - Fork 75
/
main.py
60 lines (44 loc) · 1.63 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
import os
from pathlib import Path
from typing import List, Union
import hydra
from omegaconf import DictConfig, OmegaConf
import torch
from torch.distributed import init_process_group, destroy_process_group
import torch.multiprocessing as mp
from trainer import Trainer
from utils import skip_if_run_is_over
# Allow `${eval:...}` expressions in YAML configs.
# NOTE(review): `eval` executes arbitrary Python — safe only if configs are
# fully trusted; never feed user-supplied config through this resolver.
OmegaConf.register_new_resolver("eval", eval)
@hydra.main(config_path="../config", config_name="trainer", version_base="1.3")
def main(cfg: DictConfig) -> None:
    """Entry point: run training directly, or spawn one DDP worker per GPU.

    Device visibility is restricted first (via CUDA_VISIBLE_DEVICES), so the
    subsequent `device_count()` reflects only the requested devices.
    """
    setup_visible_cuda_devices(cfg.common.devices)
    num_gpus = torch.cuda.device_count()
    # Hydra changes the cwd; recover the launch directory for the trainer.
    project_root = Path(hydra.utils.get_original_cwd())
    if num_gpus >= 2:
        # Multi-GPU: fork one process per device, each calling main_ddp(rank, ...).
        mp.spawn(main_ddp, args=(num_gpus, cfg, project_root), nprocs=num_gpus)
    else:
        # Single GPU or CPU: no process group needed.
        run(cfg, project_root)
def main_ddp(rank: int, world_size: int, cfg: DictConfig, root_dir: Path) -> None:
    """Per-process DDP worker: join the process group, train, then tear down.

    Invoked by `mp.spawn`, which prepends `rank` to the argument tuple.
    """
    setup_ddp(rank, world_size)
    run(cfg, root_dir)
    # Clean shutdown of the NCCL process group once training completes.
    destroy_process_group()
@skip_if_run_is_over
def run(cfg: DictConfig, root_dir: Path) -> None:
    """Build a Trainer from the config and execute it (no-op if the run is already over)."""
    Trainer(cfg, root_dir).run()
def setup_ddp(rank: int, world_size: int) -> None:
    """Initialize the NCCL process group for this worker.

    Args:
        rank: This process's rank in [0, world_size).
        world_size: Total number of DDP processes.

    Uses `setdefault` so that MASTER_ADDR / MASTER_PORT already provided by
    the environment (e.g. a cluster launcher such as torchrun) are respected
    instead of being clobbered; falls back to localhost:6006 otherwise.
    """
    os.environ.setdefault("MASTER_ADDR", "localhost")
    os.environ.setdefault("MASTER_PORT", "6006")
    init_process_group(backend="nccl", rank=rank, world_size=world_size)
def setup_visible_cuda_devices(devices: Union[str, int, List[int]]) -> None:
    """Restrict which CUDA devices are visible via CUDA_VISIBLE_DEVICES.

    Args:
        devices: One of:
            - "all": leave CUDA_VISIBLE_DEVICES untouched (all devices visible);
            - "cpu": hide every GPU (sets the variable to an empty string);
            - an int: expose that single device index;
            - a list of ints: expose exactly those device indices.

    Raises:
        ValueError: If a string other than "all" or "cpu" is given.
    """
    if isinstance(devices, str):
        if devices == "all":
            # Nothing to restrict; keep whatever visibility is already set.
            return
        if devices != "cpu":
            # Raise explicitly instead of `assert`, which is stripped under
            # `python -O` and would silently fall through with no effect.
            raise ValueError(f"Unknown devices specifier: {devices!r}")
        devices = []
    elif isinstance(devices, int):
        devices = [devices]
    os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(map(str, devices))
# Script entry point; hydra parses the CLI and supplies `cfg`.
if __name__ == "__main__":
    main()