Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add MLU devices for rng state saving and loading. #2940

Merged
merged 36 commits into from
Jul 31, 2024
Merged
Changes from 1 commit
Commits
Show all changes
36 commits
Select commit Hold shift + click to select a range
bc5ccfb
Add Cambricon MLU accelerator support
huismiling Mar 13, 2024
3ad38dc
up mlu support for test
huismiling Mar 13, 2024
be32c91
fix mlu device MULTI_MLU
huismiling Mar 13, 2024
421c142
Update src/accelerate/utils/imports.py
huismiling Mar 14, 2024
78cd1cb
up mlu for quality check
huismiling Mar 14, 2024
3abd038
fix mlu device longTensor error
huismiling Mar 15, 2024
0542987
fix mlu device tensor dtype check
huismiling Mar 19, 2024
e024276
fix mlu device send_to_device with torch dynamo error
huismiling Mar 19, 2024
a50b0d9
Refactor AcceleratorState
muellerzr Mar 21, 2024
ff628be
Should be near complete now
muellerzr Mar 21, 2024
a1aac83
Last missing piece
muellerzr Mar 21, 2024
31ea8cc
Make my way to the acceleratorstate
muellerzr Mar 21, 2024
47b60ca
Include update to global var
muellerzr Mar 21, 2024
2082a9a
Don't use global
muellerzr Mar 21, 2024
26c484e
gpu -> cuda
muellerzr Mar 21, 2024
5ac5d56
Don't use update for dict, easier to read
muellerzr Mar 21, 2024
2baa5c3
Fix tests
muellerzr Mar 21, 2024
d709f66
stash
muellerzr Mar 21, 2024
ac24315
Getting closer...
muellerzr Mar 21, 2024
1628898
Needed to spawn at the very end after env was setup
muellerzr Mar 21, 2024
6958e1b
Explain set_device before deepspeed
muellerzr Mar 22, 2024
2b9d339
Make docstring more accurate
muellerzr Mar 22, 2024
194db93
Early return insteaD
muellerzr Mar 22, 2024
31201d3
Delineat blocks
muellerzr Mar 22, 2024
eef1aa0
Make prepare_backend return state + backend for clarity/less magic
muellerzr Mar 22, 2024
37d0edc
Merge branch 'huggingface:main' into main
huismiling Mar 25, 2024
0fc1df3
Merge remote-tracking branch 'hf-acc/refactor-state'
huismiling Mar 25, 2024
b09003c
Merge branch 'huggingface:main' into main
huismiling May 8, 2024
92dc4bc
merge from hf
huismiling May 8, 2024
124331a
fix mlu longtensor.to() bugs.
huismiling May 8, 2024
36f35e8
Merge branch 'huggingface:main' into main
huismiling May 20, 2024
48d2c0c
Merge branch 'huggingface:main' into main
huismiling May 23, 2024
900efd0
Merge branch 'huggingface:main' into main
huismiling May 29, 2024
b3a1aed
Merge branch 'huggingface:main' into main
huismiling Jun 25, 2024
ef86bf2
Merge branch 'huggingface:main' into main
huismiling Jul 18, 2024
012f7a3
fix MLU devices rng state save and load.
huismiling Jul 18, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Should be near complete now
  • Loading branch information
muellerzr committed Mar 22, 2024
commit ff628be71ee2091fbcc516e4607492cd72c15615
34 changes: 34 additions & 0 deletions src/accelerate/state.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,8 @@

import torch

from accelerate.utils.imports import is_deepspeed_available

from .utils import (
DistributedType,
DynamoBackend,
Expand Down Expand Up @@ -158,6 +160,8 @@ def __init__(self, cpu: bool = False, **kwargs):
if not cpu:
# Deal with XLA
if is_torch_xla_available():
self.device = xm.xla_device()
xm.set_replication(self.device, xm.get_xla_supported_devices())
self.num_processes = xm.xrt_world_size()
self.process_index = xm.get_ordinal()
if is_torch_xla_available(check_is_tpu=True):
Expand All @@ -168,6 +172,10 @@ def __init__(self, cpu: bool = False, **kwargs):
if not torch.distributed.is_initialized():
if DISTRIBUTED_LOCAL_RANK:
if USE_DEEPSPEED:
if not is_deepspeed_available():
raise ImportError(
"DeepSpeed is not available => install it using `pip3 install deepspeed` or build it from source"
)
from deepspeed import comm as dist

if is_xpu_available and is_ccl_available():
Expand All @@ -179,6 +187,7 @@ def __init__(self, cpu: bool = False, **kwargs):
}
)
dist.init_distributed(dist_backend=self.backend, auto_mpi_discovery=False, **kwargs)
self.set_device()
self.distributed_type = DistributedType.DEEPSPEED
# Deal with all backends but XPU and CPU, that gets handled special later
elif self.distributed_type not in (DistributedType.MULTI_XPU, DistributedType.MULTI_CPU):
Expand Down Expand Up @@ -250,6 +259,7 @@ def __init__(self, cpu: bool = False, **kwargs):
self.num_processes = torch.distributed.get_world_size()
self.process_index = torch.distributed.get_rank()
self.local_process_index = DISTRIBUTED_LOCAL_RANK
self.set_device()

self.fork_launched = parse_flag_from_env("FORK_LAUNCHED", 0)

Expand Down Expand Up @@ -722,6 +732,30 @@ def _prepare_backend(self, cpu: bool = False, sagemaker_dp=False, backend: str =
else:
self.backend = "gloo"

def set_device(self):
    """
    Sets the device in `self.device` to the current distributed environment.

    No-op if `self.device` was already set. With a single process, the device is
    `"cpu"` when `self._cpu` is truthy, otherwise `self.default_device`. With
    multiple processes, the device type is derived from `self.distributed_type`
    (e.g. ``MULTI_GPU`` -> ``gpu``), bound to `self.local_process_index`, and the
    matching backend's current device is set.

    Raises:
        ValueError: If `self.distributed_type` does not map to a supported
            multi-device backend (gpu/mlu/npu/xpu).
    """
    if self.device is not None:
        # A device was already chosen (by the user or an earlier code path);
        # also avoids referencing the not-yet-computed `device` string below.
        return
    if self.num_processes == 1:
        self.device = torch.device("cpu") if self._cpu else self.default_device
        return
    # e.g. "MULTI_GPU" -> "gpu"; anything without a MULTI_ prefix falls through
    # to the validation below.
    device = str(self.distributed_type).replace("MULTI_", "").lower()
    if device not in ("gpu", "mlu", "npu", "xpu"):
        raise ValueError(
            f"Can't set device for {self.distributed_type}, verify we should be calling `_set_device()` for it!"
        )
    if device == "gpu":
        # PyTorch has no "gpu" device type; CUDA devices are spelled "cuda".
        device = "cuda"
    self.device = torch.device(device, self.local_process_index)
    # Pin this process to its local device on the corresponding backend.
    # NOTE(review): torch.mlu/torch.npu come from vendor extensions (e.g.
    # torch_mlu, torch_npu) that register themselves onto torch when imported.
    if device == "xpu":
        torch.xpu.set_device(self.device)
    elif device == "mlu":
        torch.mlu.set_device(self.device)
    elif device == "npu":
        torch.npu.set_device(self.device)
    else:
        torch.cuda.set_device(self.device)


class AcceleratorState:
"""
Expand Down