You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
Traceback (most recent call last):
File "/data1/Semantic_team/speech/chatgpt/llama/example.py", line 119, in
Traceback (most recent call last):
File "/data1/Semantic_team/speech/chatgpt/llama/example.py", line 119, in
fire.Fire(main)
File "/data1/anaconda3/envs/llama_py310/lib/python3.10/site-packages/fire/core.py", line 141, in Fire
fire.Fire(main)
File "/data1/anaconda3/envs/llama_py310/lib/python3.10/site-packages/fire/core.py", line 141, in Fire
component_trace = _Fire(component, args, parsed_flag_args, context, name)
File "/data1/anaconda3/envs/llama_py310/lib/python3.10/site-packages/fire/core.py", line 475, in _Fire
component_trace = _Fire(component, args, parsed_flag_args, context, name)
File "/data1/anaconda3/envs/llama_py310/lib/python3.10/site-packages/fire/core.py", line 475, in _Fire
Traceback (most recent call last):
File "/data1/Semantic_team/speech/chatgpt/llama/example.py", line 119, in
component, remaining_args = _CallAndUpdateTrace(
File "/data1/anaconda3/envs/llama_py310/lib/python3.10/site-packages/fire/core.py", line 691, in _CallAndUpdateTrace
File "/data1/anaconda3/envs/llama_py310/lib/python3.10/site-packages/fire/core.py", line 691, in _CallAndUpdateTrace
fire.Fire(main)
File "/data1/anaconda3/envs/llama_py310/lib/python3.10/site-packages/fire/core.py", line 141, in Fire
component = fn(*varargs, **kwargs)
component_trace = _Fire(component, args, parsed_flag_args, context, name)
File "/data1/Semantic_team/speech/chatgpt/llama/example.py", line 78, in main
File "/data1/anaconda3/envs/llama_py310/lib/python3.10/site-packages/fire/core.py", line 475, in _Fire
component = fn(*varargs, **kwargs)
File "/data1/Semantic_team/speech/chatgpt/llama/example.py", line 78, in main
generator = load(
File "/data1/Semantic_team/speech/chatgpt/llama/example.py", line 42, in load
generator = load(
File "/data1/Semantic_team/speech/chatgpt/llama/example.py", line 42, in load
assert world_size == len(
AssertionError: Loading a checkpoint for MP=1 but world size is 4
assert world_size == len(
File "/data1/anaconda3/envs/llama_py310/lib/python3.10/site-packages/fire/core.py", line 691, in _CallAndUpdateTrace
AssertionError: Loading a checkpoint for MP=1 but world size is 4
component = fn(*varargs, **kwargs)
File "/data1/Semantic_team/speech/chatgpt/llama/example.py", line 78, in main
generator = load(
File "/data1/Semantic_team/speech/chatgpt/llama/example.py", line 42, in load
assert world_size == len(
AssertionError: Loading a checkpoint for MP=1 but world size is 4
Traceback (most recent call last):
File "/data1/Semantic_team/speech/chatgpt/llama/example.py", line 119, in
fire.Fire(main)
File "/data1/anaconda3/envs/llama_py310/lib/python3.10/site-packages/fire/core.py", line 141, in Fire
component_trace = _Fire(component, args, parsed_flag_args, context, name)
File "/data1/anaconda3/envs/llama_py310/lib/python3.10/site-packages/fire/core.py", line 475, in _Fire
component, remaining_args = _CallAndUpdateTrace(
File "/data1/anaconda3/envs/llama_py310/lib/python3.10/site-packages/fire/core.py", line 691, in _CallAndUpdateTrace
component = fn(*varargs, **kwargs)
File "/data1/Semantic_team/speech/chatgpt/llama/example.py", line 78, in main
generator = load(
File "/data1/Semantic_team/speech/chatgpt/llama/example.py", line 42, in load
assert world_size == len(
AssertionError: Loading a checkpoint for MP=1 but world size is 4
ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 0 (pid: 8748) of binary: /data1/anaconda3/envs/llama_py310/bin/python3.10
Traceback (most recent call last):
File "/data1/anaconda3/envs/llama_py310/bin/torchrun", line 8, in
sys.exit(main())
File "/data1/anaconda3/envs/llama_py310/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 346, in wrapper
return f(*args, **kwargs)
File "/data1/anaconda3/envs/llama_py310/lib/python3.10/site-packages/torch/distributed/run.py", line 794, in main
run(args)
File "/data1/anaconda3/envs/llama_py310/lib/python3.10/site-packages/torch/distributed/run.py", line 785, in run
elastic_launch(
File "/data1/anaconda3/envs/llama_py310/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 134, in __call__
return launch_agent(self._config, self._entrypoint, list(args))
File "/data1/anaconda3/envs/llama_py310/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 250, in launch_agent
raise ChildFailedError(
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
The text was updated successfully, but these errors were encountered:
""" AssertionError: Loading a checkpoint for MP=1 but world size is 4
ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 0 (pid: 8748) of binary: /data1/anaconda3/envs/llama_py310/bin/python3."""
I ran into this issue on an Apple M2 Max. My nproc_per_node is 1, and these are the prominent error messages:
File "/Users//Developer/python39_env/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py", line 895, in init_process_group
default_pg = _new_process_group_helper(
File "/Users//Developer/python39_env/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py", line 998, in _new_process_group_helper
raise RuntimeError("Distributed package doesn't have NCCL " "built in")
File "/Users//Developer/python39_env/lib/python3.9/site-packages/torch/distributed/launcher/api.py", line 134, in __call__
return launch_agent(self._config, self._entrypoint, list(args))
File "/Users//Developer/python39_env/lib/python3.9/site-packages/torch/distributed/launcher/api.py", line 250, in launch_agent
raise ChildFailedError(
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
torchrun --nproc_per_node gpu example.py --ckpt_dir pyllama_data/7B --tokenizer_path pyllama_data/tokenizer.model
Traceback (most recent call last):
File "/data1/Semantic_team/speech/chatgpt/llama/example.py", line 119, in
Traceback (most recent call last):
File "/data1/Semantic_team/speech/chatgpt/llama/example.py", line 119, in
fire.Fire(main)
File "/data1/anaconda3/envs/llama_py310/lib/python3.10/site-packages/fire/core.py", line 141, in Fire
fire.Fire(main)
File "/data1/anaconda3/envs/llama_py310/lib/python3.10/site-packages/fire/core.py", line 141, in Fire
component_trace = _Fire(component, args, parsed_flag_args, context, name)
File "/data1/anaconda3/envs/llama_py310/lib/python3.10/site-packages/fire/core.py", line 475, in _Fire
component_trace = _Fire(component, args, parsed_flag_args, context, name)
File "/data1/anaconda3/envs/llama_py310/lib/python3.10/site-packages/fire/core.py", line 475, in _Fire
Traceback (most recent call last):
File "/data1/Semantic_team/speech/chatgpt/llama/example.py", line 119, in
component, remaining_args = _CallAndUpdateTrace(
File "/data1/anaconda3/envs/llama_py310/lib/python3.10/site-packages/fire/core.py", line 691, in _CallAndUpdateTrace
File "/data1/anaconda3/envs/llama_py310/lib/python3.10/site-packages/fire/core.py", line 691, in _CallAndUpdateTrace
fire.Fire(main)
File "/data1/anaconda3/envs/llama_py310/lib/python3.10/site-packages/fire/core.py", line 141, in Fire
component = fn(*varargs, **kwargs)
component_trace = _Fire(component, args, parsed_flag_args, context, name)
File "/data1/Semantic_team/speech/chatgpt/llama/example.py", line 78, in main
File "/data1/anaconda3/envs/llama_py310/lib/python3.10/site-packages/fire/core.py", line 475, in _Fire
component = fn(*varargs, **kwargs)
File "/data1/Semantic_team/speech/chatgpt/llama/example.py", line 78, in main
generator = load(
File "/data1/Semantic_team/speech/chatgpt/llama/example.py", line 42, in load
generator = load(
File "/data1/Semantic_team/speech/chatgpt/llama/example.py", line 42, in load
assert world_size == len(
AssertionError: Loading a checkpoint for MP=1 but world size is 4
assert world_size == len(
File "/data1/anaconda3/envs/llama_py310/lib/python3.10/site-packages/fire/core.py", line 691, in _CallAndUpdateTrace
AssertionError: Loading a checkpoint for MP=1 but world size is 4
component = fn(*varargs, **kwargs)
File "/data1/Semantic_team/speech/chatgpt/llama/example.py", line 78, in main
generator = load(
File "/data1/Semantic_team/speech/chatgpt/llama/example.py", line 42, in load
assert world_size == len(
AssertionError: Loading a checkpoint for MP=1 but world size is 4
Traceback (most recent call last):
File "/data1/Semantic_team/speech/chatgpt/llama/example.py", line 119, in
fire.Fire(main)
File "/data1/anaconda3/envs/llama_py310/lib/python3.10/site-packages/fire/core.py", line 141, in Fire
component_trace = _Fire(component, args, parsed_flag_args, context, name)
File "/data1/anaconda3/envs/llama_py310/lib/python3.10/site-packages/fire/core.py", line 475, in _Fire
component, remaining_args = _CallAndUpdateTrace(
File "/data1/anaconda3/envs/llama_py310/lib/python3.10/site-packages/fire/core.py", line 691, in _CallAndUpdateTrace
component = fn(*varargs, **kwargs)
File "/data1/Semantic_team/speech/chatgpt/llama/example.py", line 78, in main
generator = load(
File "/data1/Semantic_team/speech/chatgpt/llama/example.py", line 42, in load
assert world_size == len(
AssertionError: Loading a checkpoint for MP=1 but world size is 4
ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 0 (pid: 8748) of binary: /data1/anaconda3/envs/llama_py310/bin/python3.10
Traceback (most recent call last):
File "/data1/anaconda3/envs/llama_py310/bin/torchrun", line 8, in
sys.exit(main())
File "/data1/anaconda3/envs/llama_py310/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 346, in wrapper
return f(*args, **kwargs)
File "/data1/anaconda3/envs/llama_py310/lib/python3.10/site-packages/torch/distributed/run.py", line 794, in main
run(args)
File "/data1/anaconda3/envs/llama_py310/lib/python3.10/site-packages/torch/distributed/run.py", line 785, in run
elastic_launch(
File "/data1/anaconda3/envs/llama_py310/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 134, in __call__
return launch_agent(self._config, self._entrypoint, list(args))
File "/data1/anaconda3/envs/llama_py310/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 250, in launch_agent
raise ChildFailedError(
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
The text was updated successfully, but these errors were encountered: