Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add MLU devices for rng state saving and loading. #2940

Merged
merged 36 commits into from
Jul 31, 2024
Merged
Changes from 1 commit
Commits
Show all changes
36 commits
Select commit Hold shift + click to select a range
bc5ccfb
Add Cambricon MLU accelerator support
huismiling Mar 13, 2024
3ad38dc
up mlu support for test
huismiling Mar 13, 2024
be32c91
fix mlu device MULTI_MLU
huismiling Mar 13, 2024
421c142
Update src/accelerate/utils/imports.py
huismiling Mar 14, 2024
78cd1cb
up mlu for quality check
huismiling Mar 14, 2024
3abd038
fix mlu device longTensor error
huismiling Mar 15, 2024
0542987
fix mlu device tensor dtype check
huismiling Mar 19, 2024
e024276
fix mlu device send_to_device with torch dynamo error
huismiling Mar 19, 2024
a50b0d9
Refactor AcceleratorState
muellerzr Mar 21, 2024
ff628be
Should be near complete now
muellerzr Mar 21, 2024
a1aac83
Last missing piece
muellerzr Mar 21, 2024
31ea8cc
Make my way to the acceleratorstate
muellerzr Mar 21, 2024
47b60ca
Include update to global var
muellerzr Mar 21, 2024
2082a9a
Don't use global
muellerzr Mar 21, 2024
26c484e
gpu -> cuda
muellerzr Mar 21, 2024
5ac5d56
Don't use update for dict, easier to read
muellerzr Mar 21, 2024
2baa5c3
Fix tests
muellerzr Mar 21, 2024
d709f66
stash
muellerzr Mar 21, 2024
ac24315
Getting closer...
muellerzr Mar 21, 2024
1628898
Needed to spawn at the very end after env was setup
muellerzr Mar 21, 2024
6958e1b
Explain set_device before deepspeed
muellerzr Mar 22, 2024
2b9d339
Make docstring more accurate
muellerzr Mar 22, 2024
194db93
Early return instead
muellerzr Mar 22, 2024
31201d3
Delineate blocks
muellerzr Mar 22, 2024
eef1aa0
Make prepare_backend return state + backend for clarity/less magic
muellerzr Mar 22, 2024
37d0edc
Merge branch 'huggingface:main' into main
huismiling Mar 25, 2024
0fc1df3
Merge remote-tracking branch 'hf-acc/refactor-state'
huismiling Mar 25, 2024
b09003c
Merge branch 'huggingface:main' into main
huismiling May 8, 2024
92dc4bc
merge from hf
huismiling May 8, 2024
124331a
fix mlu longtensor.to() bugs.
huismiling May 8, 2024
36f35e8
Merge branch 'huggingface:main' into main
huismiling May 20, 2024
48d2c0c
Merge branch 'huggingface:main' into main
huismiling May 23, 2024
900efd0
Merge branch 'huggingface:main' into main
huismiling May 29, 2024
b3a1aed
Merge branch 'huggingface:main' into main
huismiling Jun 25, 2024
ef86bf2
Merge branch 'huggingface:main' into main
huismiling Jul 18, 2024
012f7a3
fix MLU devices rng state save and load.
huismiling Jul 18, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Needed to spawn at the very end after env was setup
  • Loading branch information
muellerzr committed Mar 22, 2024
commit 16288982cc0826efbcd27e9e4f7b2a0d45ad4d3d
83 changes: 40 additions & 43 deletions src/accelerate/state.py
Original file line number Diff line number Diff line change
Expand Up @@ -186,48 +186,47 @@ def __init__(self, cpu: bool = False, **kwargs):
torch.distributed.init_process_group(backend=self.backend, **kwargs)
# XPU and CPU require special env configs to be set
if self.distributed_type in (DistributedType.MULTI_XPU, DistributedType.MULTI_CPU):
if not torch.distributed.is_initialized():
raise ValueError()
dist_information = get_cpu_distributed_information()
os.environ["RANK"] = str(dist_information["rank"])
os.environ["WORLD_SIZE"] = str(dist_information["world_size"])
os.environ["LOCAL_RANK"] = str(dist_information["local_rank"])
os.environ["LOCAL_WORLD_SIZE"] = str(dist_information["local_world_size"])
if self.backend == "ccl" and self.distributed_type == DistributedType.MULTI_XPU:
os.environ["CCL_PROCESS_LAUNCHER"] = "none"
os.environ["CCL_LOCAL_SIZE"] = os.environ["LOCAL_WORLD_SIZE"]
os.environ["CCL_LOCAL_RANK"] = os.environ["LOCAL_RANK"]
if not os.environ.get("MASTER_PORT", None):
os.environ["MASTER_PORT"] = "29500"
if (
not os.environ.get("MASTER_ADDR", None)
and dist_information["local_world_size"] != dist_information["world_size"]
and self.backend != "mpi"
):
raise ValueError(
"Tried to launch on distributed with multinode, but `MASTER_ADDR` env was not set, "
"please try exporting rank 0's hostname as `MASTER_ADDR`"
)
kwargs["rank"] = dist_information["rank"]
kwargs["world_size"] = dist_information["world_size"]

if (
self.distributed_type == DistributedType.MULTI_CPU
and get_int_from_env(["OMP_NUM_THREADS", "OMP_NUM_THREADS"], 0) > 0
):
import psutil

num_cpu_threads_per_process = int(
psutil.cpu_count(logical=False) / dist_information["local_world_size"]
)
if num_cpu_threads_per_process == 0:
num_cpu_threads_per_process = 1
torch.set_num_threads(num_cpu_threads_per_process)
warnings.warn(
f"OMP_NUM_THREADS/MKL_NUM_THREADS unset, we set it at {num_cpu_threads_per_process} to improve oob"
" performance."
)
dist_information = get_cpu_distributed_information()
os.environ["RANK"] = str(dist_information["rank"])
os.environ["WORLD_SIZE"] = str(dist_information["world_size"])
os.environ["LOCAL_RANK"] = str(dist_information["local_rank"])
os.environ["LOCAL_WORLD_SIZE"] = str(dist_information["local_world_size"])
if self.backend == "ccl" and self.distributed_type == DistributedType.MULTI_XPU:
os.environ["CCL_PROCESS_LAUNCHER"] = "none"
os.environ["CCL_LOCAL_SIZE"] = os.environ["LOCAL_WORLD_SIZE"]
os.environ["CCL_LOCAL_RANK"] = os.environ["LOCAL_RANK"]
if not os.environ.get("MASTER_PORT", None):
os.environ["MASTER_PORT"] = "29500"
if (
not os.environ.get("MASTER_ADDR", None)
and dist_information["local_world_size"] != dist_information["world_size"]
and self.backend != "mpi"
):
raise ValueError(
"Tried to launch on distributed with multinode, but `MASTER_ADDR` env was not set, "
"please try exporting rank 0's hostname as `MASTER_ADDR`"
)
kwargs["rank"] = dist_information["rank"]
kwargs["world_size"] = dist_information["world_size"]

if (
self.distributed_type == DistributedType.MULTI_CPU
and get_int_from_env(["OMP_NUM_THREADS", "OMP_NUM_THREADS"], 0) > 0
):
import psutil

num_cpu_threads_per_process = int(
psutil.cpu_count(logical=False) / dist_information["local_world_size"]
)
if num_cpu_threads_per_process == 0:
num_cpu_threads_per_process = 1
torch.set_num_threads(num_cpu_threads_per_process)
warnings.warn(
f"OMP_NUM_THREADS/MKL_NUM_THREADS unset, we set it at {num_cpu_threads_per_process} to improve oob"
" performance."
)

if not torch.distributed.is_initialized():
torch.distributed.init_process_group(backend=self.backend, **kwargs)

# No backend == no distributed training
Expand Down Expand Up @@ -275,7 +274,6 @@ def __init__(self, cpu: bool = False, **kwargs):
affinity_list.reverse() # so core 0 is the 0th element
affinity_to_set = [i for i, e in enumerate(affinity_list) if e != 0]
os.sched_setaffinity(0, affinity_to_set)
raise ValueError(self.backend, self.distributed_type, self.device)

def __repr__(self) -> str:
return (
Expand Down Expand Up @@ -728,7 +726,6 @@ def _prepare_backend(self, cpu: bool = False, sagemaker_dp=False, backend: str =
self.backend = "gloo"
else:
self.distributed_type = DistributedType.NO
raise ValueError(self.backend, self.distributed_type, self.device)

def set_device(self):
"""
Expand Down
Loading