You signed in with another tab or window. Reload to refresh your session. You signed out in another tab or window. Reload to refresh your session. You switched accounts on another tab or window. Reload to refresh your session. Dismiss alert
[2024-08-14 21:57:58,409][root][INFO] - rank: 3, dataloader start from step: 0, batch_num: 65, after: 65
[2024-08-14 21:57:58,412][root][INFO] - rank: 5, dataloader start from step: 0, batch_num: 65, after: 65
[2024-08-14 21:57:58,413][root][INFO] - rank: 1, dataloader start from step: 0, batch_num: 65, after: 65
[2024-08-14 21:57:58,414][root][INFO] - rank: 4, dataloader start from step: 0, batch_num: 65, after: 65
[2024-08-14 21:57:58,414][root][INFO] - rank: 0, dataloader start from step: 0, batch_num: 65, after: 65
[2024-08-14 21:57:58,417][root][INFO] - rank: 7, dataloader start from step: 0, batch_num: 65, after: 65
[2024-08-14 21:57:58,418][root][INFO] - rank: 2, dataloader start from step: 0, batch_num: 65, after: 65
[2024-08-14 21:57:58,419][root][INFO] - rank: 6, dataloader start from step: 0, batch_num: 65, after: 65
[2024-08-14 21:58:59,814][root][INFO] - rank: 5, dataloader start from step: 0, batch_num: 65, after: 65
[2024-08-14 21:59:02,055][root][INFO] - rank: 3, dataloader start from step: 0, batch_num: 65, after: 65
[2024-08-14 21:59:02,374][root][INFO] - rank: 6, dataloader start from step: 0, batch_num: 65, after: 65
[2024-08-14 21:59:03,529][root][INFO] - rank: 4, dataloader start from step: 0, batch_num: 65, after: 65
[2024-08-14 21:59:03,584][root][INFO] - rank: 1, dataloader start from step: 0, batch_num: 65, after: 65
[2024-08-14 21:59:05,045][root][INFO] - rank: 0, dataloader start from step: 0, batch_num: 65, after: 65
[2024-08-14 21:59:05,611][root][INFO] - rank: 2, dataloader start from step: 0, batch_num: 65, after: 65
[rank7]: File "/lxc-data/FunASR/examples/industrial_data_pretraining/paraformer/../../../funasr/bin/train.py", line 270, in <module>
[rank7]: main_hydra()
[rank7]: File "/lxc-data/minianconda3/envs/python39/lib/python3.9/site-packages/hydra/main.py", line 94, in decorated_main
[rank7]: _run_hydra(
[rank7]: File "/lxc-data/minianconda3/envs/python39/lib/python3.9/site-packages/hydra/_internal/utils.py", line 394, in _run_hydra
[rank7]: _run_app(
[rank7]: File "/lxc-data/minianconda3/envs/python39/lib/python3.9/site-packages/hydra/_internal/utils.py", line 457, in _run_app
[rank7]: run_and_report(
[rank7]: File "/lxc-data/minianconda3/envs/python39/lib/python3.9/site-packages/hydra/_internal/utils.py", line 223, in run_and_report
[rank7]: raise ex
[rank7]: File "/lxc-data/minianconda3/envs/python39/lib/python3.9/site-packages/hydra/_internal/utils.py", line 220, in run_and_report
[rank7]: return func()
[rank7]: File "/lxc-data/minianconda3/envs/python39/lib/python3.9/site-packages/hydra/_internal/utils.py", line 458, in
[rank7]: lambda: hydra.run(
[rank7]: File "/lxc-data/minianconda3/envs/python39/lib/python3.9/site-packages/hydra/_internal/hydra.py", line 132, in run
[rank7]: _ = ret.return_value
[rank7]: File "/lxc-data/minianconda3/envs/python39/lib/python3.9/site-packages/hydra/core/utils.py", line 260, in return_value
[rank7]: raise self._return_value
[rank7]: File "/lxc-data/minianconda3/envs/python39/lib/python3.9/site-packages/hydra/core/utils.py", line 186, in run_job
[rank7]: ret.return_value = task_function(task_cfg)
[rank7]: File "/lxc-data/FunASR/examples/industrial_data_pretraining/paraformer/../../../funasr/bin/train.py", line 53, in main_hydra
[rank7]: main(**kwargs)
[rank7]: File "/lxc-data/FunASR/examples/industrial_data_pretraining/paraformer/../../../funasr/bin/train.py", line 215, in main
[rank7]: trainer.train_epoch(
[rank7]: File "/lxc-data/FunASR/funasr/train_utils/trainer.py", line 499, in train_epoch
[rank7]: self.validate_epoch(
[rank7]: File "/lxc-data/FunASR/funasr/train_utils/trainer.py", line 562, in validate_epoch
[rank7]: for batch_idx, batch in enumerate(dataloader_val):
[rank7]: File "/lxc-data/minianconda3/envs/python39/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 439, in __iter__
[rank7]: return self._get_iterator()
[rank7]: File "/lxc-data/minianconda3/envs/python39/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 387, in _get_iterator
[rank7]: return _MultiProcessingDataLoaderIter(self)
[rank7]: File "/lxc-data/minianconda3/envs/python39/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 1085, in __init__
[rank7]: self._reset(loader, first_iter=True)
[rank7]: File "/lxc-data/minianconda3/envs/python39/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 1088, in _reset
[rank7]: super()._reset(loader, first_iter)
[rank7]: File "/lxc-data/minianconda3/envs/python39/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 611, in _reset
[rank7]: self._sampler_iter = iter(self._index_sampler)
[rank7]: File "/lxc-data/FunASR/funasr/datasets/audio_datasets/samplers.py", line 400, in __iter__
[rank7]: potential_batch_length = max(max_len_in_batch, sample_length) * (len(batch) + 1)
[rank7]: File "/lxc-data/minianconda3/envs/python39/lib/python3.9/site-packages/torch/utils/data/_utils/signal_handling.py", line 66, in handler
[rank7]: _error_if_any_worker_fails()
[rank7]: RuntimeError: DataLoader worker (pid 6044) is killed by signal: Killed.
Error executing job with overrides: ['++model=/lxc-data/FunASR/examples/industrial_data_pretraining/paraformer/modelscope_models/iic/speech_paraformer-large_big_asr_nat-zh-cn-16k-common-vocab8404-pytorch', '++train_data_set_list=../../../data/10w_list/15Wdata_nochat_5s_train.jsonl', '++valid_data_set_list=../../../data/10w_list/2Wdata_nochat_test.jsonl', '++dataset=AudioDataset', '++dataset_conf.index_ds=IndexDSJsonl', '++dataset_conf.data_split_num=5', '++dataset_conf.batch_sampler=BatchSampler', '++dataset_conf.batch_size=60000', '++dataset_conf.sort_size=1024', '++dataset_conf.batch_type=token', '++dataset_conf.num_workers=12', '++dataset_conf.max_token_length=1024', '++train_conf.max_epoch=500', '++train_conf.log_interval=100', '++train_conf.resume=true', '++train_conf.validate_interval=4000', '++train_conf.save_checkpoint_interval=4000', '++train_conf.keep_nbest_models=20', '++train_conf.avg_nbest_model=10', '++train_conf.use_deepspeed=false', '++train_conf.deepspeed_config=', '++optim_conf.lr=0.0006', '++output_dir=./15Wdata_train_big_large']
[rank1]: Traceback (most recent call last):
[rank1]: File "/lxc-data/minianconda3/envs/python39/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 1133, in _try_get_data
[rank1]: data = self._data_queue.get(timeout=timeout)
[rank1]: File "/lxc-data/minianconda3/envs/python39/lib/python3.9/queue.py", line 180, in get
[rank1]: self.not_empty.wait(remaining)
[rank1]: File "/lxc-data/minianconda3/envs/python39/lib/python3.9/threading.py", line 316, in wait
[rank1]: gotit = waiter.acquire(True, timeout)
[rank1]: File "/lxc-data/minianconda3/envs/python39/lib/python3.9/site-packages/torch/utils/data/_utils/signal_handling.py", line 66, in handler
[rank1]: _error_if_any_worker_fails()
[rank1]: RuntimeError: DataLoader worker (pid 6070) is killed by signal: Killed.
[rank1]: The above exception was the direct cause of the following exception:
[rank1]: Traceback (most recent call last):
[rank1]: File "/lxc-data/FunASR/examples/industrial_data_pretraining/paraformer/../../../funasr/bin/train.py", line 270, in
[rank1]: main_hydra()
[rank1]: File "/lxc-data/minianconda3/envs/python39/lib/python3.9/site-packages/hydra/main.py", line 94, in decorated_main
[rank1]: _run_hydra(
[rank1]: File "/lxc-data/minianconda3/envs/python39/lib/python3.9/site-packages/hydra/_internal/utils.py", line 394, in _run_hydra
[rank1]: _run_app(
[rank1]: File "/lxc-data/minianconda3/envs/python39/lib/python3.9/site-packages/hydra/_internal/utils.py", line 457, in _run_app
[rank1]: run_and_report(
[rank1]: File "/lxc-data/minianconda3/envs/python39/lib/python3.9/site-packages/hydra/_internal/utils.py", line 223, in run_and_report
[rank1]: raise ex
[rank1]: File "/lxc-data/minianconda3/envs/python39/lib/python3.9/site-packages/hydra/_internal/utils.py", line 220, in run_and_report
[rank1]: return func()
[rank1]: File "/lxc-data/minianconda3/envs/python39/lib/python3.9/site-packages/hydra/_internal/utils.py", line 458, in
[rank1]: lambda: hydra.run(
[rank1]: File "/lxc-data/minianconda3/envs/python39/lib/python3.9/site-packages/hydra/_internal/hydra.py", line 132, in run
[rank1]: _ = ret.return_value
[rank1]: File "/lxc-data/minianconda3/envs/python39/lib/python3.9/site-packages/hydra/core/utils.py", line 260, in return_value
[rank1]: raise self._return_value
[rank1]: File "/lxc-data/minianconda3/envs/python39/lib/python3.9/site-packages/hydra/core/utils.py", line 186, in run_job
[rank1]: ret.return_value = task_function(task_cfg)
[rank1]: File "/lxc-data/FunASR/examples/industrial_data_pretraining/paraformer/../../../funasr/bin/train.py", line 53, in main_hydra
[rank1]: main(**kwargs)
[rank1]: File "/lxc-data/FunASR/examples/industrial_data_pretraining/paraformer/../../../funasr/bin/train.py", line 215, in main
[rank1]: trainer.train_epoch(
[rank1]: File "/lxc-data/FunASR/funasr/train_utils/trainer.py", line 499, in train_epoch
[rank1]: self.validate_epoch(
[rank1]: File "/lxc-data/FunASR/funasr/train_utils/trainer.py", line 562, in validate_epoch
[rank1]: for batch_idx, batch in enumerate(dataloader_val):
[rank1]: File "/lxc-data/minianconda3/envs/python39/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 631, in __next__
[rank1]: data = self._next_data()
[rank1]: File "/lxc-data/minianconda3/envs/python39/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 1329, in _next_data
[rank1]: idx, data = self._get_data()
[rank1]: File "/lxc-data/minianconda3/envs/python39/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 1285, in _get_data
[rank1]: success, data = self._try_get_data()
[rank1]: File "/lxc-data/minianconda3/envs/python39/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 1146, in _try_get_data
[rank1]: raise RuntimeError(f'DataLoader worker (pid(s) {pids_str}) exited unexpectedly') from e
[rank1]: RuntimeError: DataLoader worker (pid(s) 6030, 6038, 6047, 6070) exited unexpectedly
What have you tried?
What's your environment?
cuda12.2
linux
4* A100-8
OS (e.g., Linux):
FunASR Version (e.g., 1.0.0):
ModelScope Version (e.g., 1.11.0):
PyTorch Version (e.g., 2.0.0):
How you installed funasr (pip, source):
Python version:
GPU (e.g., V100M32)
CUDA/cuDNN version (e.g., cuda11.7):
Docker version (e.g., funasr-runtime-sdk-cpu-0.4.1)
Any other relevant information:
The text was updated successfully, but these errors were encountered:
Notice: In order to resolve issues more efficiently, please raise issue following the template.
(注意:为了更加高效率解决您遇到的问题,请按照模板提问,补充细节)
❓ Questions and Help
在训练paraformer 使用train.py进行大规模预训练时候,cpu内存会不断增加,最后达到100%,进而数据加载报错,应该如何解决
(Translation: When using train.py for large-scale pre-training of Paraformer, CPU memory keeps growing until it reaches 100%, after which data loading fails with an error. How can this be resolved?)
Before asking:
What is your question?
在训练paraformer 使用train.py进行大规模预训练时候,cpu内存会不断增加,最后达到100%,进而数据加载报错,应该如何解决
Code
[2024-08-14 21:57:58,231][root][INFO] - train, rank: 1, epoch: 0/500, data_slice: 0/5, step_in_slice: 44000/156088, step_in_epoch: 80000, total step: 80000, (loss_avg_rank: 0.679), (loss_avg_slice: 0.623), (ppl_avg_slice: 1.865e+00), (acc_avg_slice: 0.694), (lr: 6.124e-04), [('loss_att', 0.453), ('acc', 0.758), ('loss_pre', 0.048), ('loss', 0.501), ('batch_size', 172)], {'data_load': '0.001', 'forward_time': '0.324', 'backward_and_AllReaduce_time': '0.407', 'optim_time': '0.075', 'total_time': '0.808'}, GPU, memory: usage: 4.356 GB, peak: 39.970 GB, cache: 40.660 GB, cache_peak: 40.660 GB
[2024-08-14 21:57:58,231][root][INFO] - train, rank: 7, epoch: 0/500, data_slice: 0/5, step_in_slice: 44000/156088, step_in_epoch: 80000, total step: 80000, (loss_avg_rank: 0.338), (loss_avg_slice: 0.623), (ppl_avg_slice: 1.865e+00), (acc_avg_slice: 0.694), (lr: 6.124e-04), [('loss_att', 0.845), ('acc', 0.616), ('loss_pre', 0.148), ('loss', 0.993), ('batch_size', 200)], {'data_load': '0.000', 'forward_time': '0.339', 'backward_and_AllReaduce_time': '0.235', 'optim_time': '0.242', 'total_time': '0.816'}, GPU, memory: usage: 3.791 GB, peak: 38.241 GB, cache: 38.902 GB, cache_peak: 38.902 GB
[2024-08-14 21:57:58,232][root][INFO] - train, rank: 0, epoch: 0/500, data_slice: 0/5, step_in_slice: 44000/156088, step_in_epoch: 80000, total step: 80000, (loss_avg_rank: 0.639), (loss_avg_slice: 0.623), (ppl_avg_slice: 1.865e+00), (acc_avg_slice: 0.694), (lr: 6.124e-04), [('loss_att', 0.393), ('acc', 0.801), ('loss_pre', 0.047), ('loss', 0.441), ('batch_size', 185)], {'data_load': '0.000', 'forward_time': '0.331', 'backward_and_AllReaduce_time': '0.397', 'optim_time': '0.084', 'total_time': '0.813'}, GPU, memory: usage: 4.306 GB, peak: 41.477 GB, cache: 42.143 GB, cache_peak: 42.143 GB
[2024-08-14 21:57:58,235][root][INFO] - train, rank: 2, epoch: 0/500, data_slice: 0/5, step_in_slice: 44000/156088, step_in_epoch: 80000, total step: 80000, (loss_avg_rank: 0.544), (loss_avg_slice: 0.623), (ppl_avg_slice: 1.865e+00), (acc_avg_slice: 0.694), (lr: 6.124e-04), [('loss_att', 0.358), ('acc', 0.787), ('loss_pre', 0.044), ('loss', 0.402), ('batch_size', 155)], {'data_load': '0.000', 'forward_time': '0.338', 'backward_and_AllReaduce_time': '0.386', 'optim_time': '0.093', 'total_time': '0.819'}, GPU, memory: usage: 4.394 GB, peak: 55.395 GB, cache: 55.916 GB, cache_peak: 55.916 GB
[2024-08-14 21:57:58,235][root][INFO] - train, rank: 4, epoch: 0/500, data_slice: 0/5, step_in_slice: 44000/156088, step_in_epoch: 80000, total step: 80000, (loss_avg_rank: 0.550), (loss_avg_slice: 0.623), (ppl_avg_slice: 1.865e+00), (acc_avg_slice: 0.694), (lr: 6.124e-04), [('loss_att', 0.424), ('acc', 0.697), ('loss_pre', 0.062), ('loss', 0.486), ('batch_size', 103)], {'data_load': '0.000', 'forward_time': '0.334', 'backward_and_AllReaduce_time': '0.405', 'optim_time': '0.073', 'total_time': '0.813'}, GPU, memory: usage: 4.703 GB, peak: 67.206 GB, cache: 67.770 GB, cache_peak: 67.770 GB
[2024-08-14 21:57:58,237][root][INFO] - train, rank: 6, epoch: 0/500, data_slice: 0/5, step_in_slice: 44000/156088, step_in_epoch: 80000, total step: 80000, (loss_avg_rank: 0.294), (loss_avg_slice: 0.623), (ppl_avg_slice: 1.865e+00), (acc_avg_slice: 0.694), (lr: 6.124e-04), [('loss_att', 0.788), ('acc', 0.614), ('loss_pre', 0.162), ('loss', 0.95), ('batch_size', 199)], {'data_load': '0.000', 'forward_time': '0.329', 'backward_and_AllReaduce_time': '0.239', 'optim_time': '0.246', 'total_time': '0.814'}, GPU, memory: usage: 3.786 GB, peak: 60.709 GB, cache: 61.479 GB, cache_peak: 61.479 GB
[2024-08-14 21:57:58,237][root][INFO] - train, rank: 3, epoch: 0/500, data_slice: 0/5, step_in_slice: 44000/156088, step_in_epoch: 80000, total step: 80000, (loss_avg_rank: 0.547), (loss_avg_slice: 0.623), (ppl_avg_slice: 1.865e+00), (acc_avg_slice: 0.694), (lr: 6.124e-04), [('loss_att', 0.376), ('acc', 0.759), ('loss_pre', 0.041), ('loss', 0.417), ('batch_size', 138)], {'data_load': '0.000', 'forward_time': '0.333', 'backward_and_AllReaduce_time': '0.398', 'optim_time': '0.083', 'total_time': '0.815'}, GPU, memory: usage: 4.479 GB, peak: 47.688 GB, cache: 48.240 GB, cache_peak: 48.240 GB
[2024-08-14 21:57:58,246][root][INFO] - Validate epoch: 0, rank: 3
[2024-08-14 21:57:58,248][root][INFO] - Validate epoch: 0, rank: 5
[2024-08-14 21:57:58,250][root][INFO] - Validate epoch: 0, rank: 1
[2024-08-14 21:57:58,250][root][INFO] - Validate epoch: 0, rank: 7
[2024-08-14 21:57:58,251][root][INFO] - Validate epoch: 0, rank: 0
[2024-08-14 21:57:58,253][root][INFO] - Validate epoch: 0, rank: 4
[2024-08-14 21:57:58,254][root][INFO] - Validate epoch: 0, rank: 2
[2024-08-14 21:57:58,255][root][INFO] - Validate epoch: 0, rank: 6
[2024-08-14 21:57:58,409][root][INFO] - rank: 3, dataloader start from step: 0, batch_num: 65, after: 65
[2024-08-14 21:57:58,412][root][INFO] - rank: 5, dataloader start from step: 0, batch_num: 65, after: 65
[2024-08-14 21:57:58,413][root][INFO] - rank: 1, dataloader start from step: 0, batch_num: 65, after: 65
[2024-08-14 21:57:58,414][root][INFO] - rank: 4, dataloader start from step: 0, batch_num: 65, after: 65
[2024-08-14 21:57:58,414][root][INFO] - rank: 0, dataloader start from step: 0, batch_num: 65, after: 65
[2024-08-14 21:57:58,417][root][INFO] - rank: 7, dataloader start from step: 0, batch_num: 65, after: 65
[2024-08-14 21:57:58,418][root][INFO] - rank: 2, dataloader start from step: 0, batch_num: 65, after: 65
[2024-08-14 21:57:58,419][root][INFO] - rank: 6, dataloader start from step: 0, batch_num: 65, after: 65
[2024-08-14 21:58:59,814][root][INFO] - rank: 5, dataloader start from step: 0, batch_num: 65, after: 65
[2024-08-14 21:59:02,055][root][INFO] - rank: 3, dataloader start from step: 0, batch_num: 65, after: 65
[2024-08-14 21:59:02,374][root][INFO] - rank: 6, dataloader start from step: 0, batch_num: 65, after: 65
[2024-08-14 21:59:03,529][root][INFO] - rank: 4, dataloader start from step: 0, batch_num: 65, after: 65
[2024-08-14 21:59:03,584][root][INFO] - rank: 1, dataloader start from step: 0, batch_num: 65, after: 65
[2024-08-14 21:59:05,045][root][INFO] - rank: 0, dataloader start from step: 0, batch_num: 65, after: 65
[2024-08-14 21:59:05,611][root][INFO] - rank: 2, dataloader start from step: 0, batch_num: 65, after: 65
[rank7]: File "/lxc-data/FunASR/examples/industrial_data_pretraining/paraformer/../../../funasr/bin/train.py", line 270, in
[rank7]: main_hydra()
[rank7]: File "/lxc-data/minianconda3/envs/python39/lib/python3.9/site-packages/hydra/main.py", line 94, in decorated_main
[rank7]: _run_hydra(
[rank7]: File "/lxc-data/minianconda3/envs/python39/lib/python3.9/site-packages/hydra/_internal/utils.py", line 394, in _run_hydra
[rank7]: _run_app(
[rank7]: File "/lxc-data/minianconda3/envs/python39/lib/python3.9/site-packages/hydra/_internal/utils.py", line 457, in _run_app
[rank7]: run_and_report(
[rank7]: File "/lxc-data/minianconda3/envs/python39/lib/python3.9/site-packages/hydra/_internal/utils.py", line 223, in run_and_report
[rank7]: raise ex
[rank7]: File "/lxc-data/minianconda3/envs/python39/lib/python3.9/site-packages/hydra/_internal/utils.py", line 220, in run_and_report
[rank7]: return func()
[rank7]: File "/lxc-data/minianconda3/envs/python39/lib/python3.9/site-packages/hydra/_internal/utils.py", line 458, in
[rank7]: lambda: hydra.run(
[rank7]: File "/lxc-data/minianconda3/envs/python39/lib/python3.9/site-packages/hydra/_internal/hydra.py", line 132, in run
[rank7]: _ = ret.return_value
[rank7]: File "/lxc-data/minianconda3/envs/python39/lib/python3.9/site-packages/hydra/core/utils.py", line 260, in return_value
[rank7]: raise self._return_value
[rank7]: File "/lxc-data/minianconda3/envs/python39/lib/python3.9/site-packages/hydra/core/utils.py", line 186, in run_job
[rank7]: ret.return_value = task_function(task_cfg)
[rank7]: File "/lxc-data/FunASR/examples/industrial_data_pretraining/paraformer/../../../funasr/bin/train.py", line 53, in main_hydra
[rank7]: main(**kwargs)
[rank7]: File "/lxc-data/FunASR/examples/industrial_data_pretraining/paraformer/../../../funasr/bin/train.py", line 215, in main
[rank7]: trainer.train_epoch(
[rank7]: File "/lxc-data/FunASR/funasr/train_utils/trainer.py", line 499, in train_epoch
[rank7]: self.validate_epoch(
[rank7]: File "/lxc-data/FunASR/funasr/train_utils/trainer.py", line 562, in validate_epoch
[rank7]: for batch_idx, batch in enumerate(dataloader_val):
[rank7]: File "/lxc-data/minianconda3/envs/python39/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 439, in __iter__
[rank7]: return self._get_iterator()
[rank7]: File "/lxc-data/minianconda3/envs/python39/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 387, in _get_iterator
[rank7]: return _MultiProcessingDataLoaderIter(self)
[rank7]: File "/lxc-data/minianconda3/envs/python39/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 1085, in __init__
[rank7]: self._reset(loader, first_iter=True)
[rank7]: File "/lxc-data/minianconda3/envs/python39/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 1088, in _reset
[rank7]: super()._reset(loader, first_iter)
[rank7]: File "/lxc-data/minianconda3/envs/python39/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 611, in _reset
[rank7]: self._sampler_iter = iter(self._index_sampler)
[rank7]: File "/lxc-data/FunASR/funasr/datasets/audio_datasets/samplers.py", line 400, in __iter__
[rank7]: potential_batch_length = max(max_len_in_batch, sample_length) * (len(batch) + 1)
[rank7]: File "/lxc-data/minianconda3/envs/python39/lib/python3.9/site-packages/torch/utils/data/_utils/signal_handling.py", line 66, in handler
[rank7]: _error_if_any_worker_fails()
[rank7]: RuntimeError: DataLoader worker (pid 6044) is killed by signal: Killed.
Error executing job with overrides: ['++model=/lxc-data/FunASR/examples/industrial_data_pretraining/paraformer/modelscope_models/iic/speech_paraformer-large_big_asr_nat-zh-cn-16k-common-vocab8404-pytorch', '++train_data_set_list=../../../data/10w_list/15Wdata_nochat_5s_train.jsonl', '++valid_data_set_list=../../../data/10w_list/2Wdata_nochat_test.jsonl', '++dataset=AudioDataset', '++dataset_conf.index_ds=IndexDSJsonl', '++dataset_conf.data_split_num=5', '++dataset_conf.batch_sampler=BatchSampler', '++dataset_conf.batch_size=60000', '++dataset_conf.sort_size=1024', '++dataset_conf.batch_type=token', '++dataset_conf.num_workers=12', '++dataset_conf.max_token_length=1024', '++train_conf.max_epoch=500', '++train_conf.log_interval=100', '++train_conf.resume=true', '++train_conf.validate_interval=4000', '++train_conf.save_checkpoint_interval=4000', '++train_conf.keep_nbest_models=20', '++train_conf.avg_nbest_model=10', '++train_conf.use_deepspeed=false', '++train_conf.deepspeed_config=', '++optim_conf.lr=0.0006', '++output_dir=./15Wdata_train_big_large']
[rank1]: Traceback (most recent call last):
[rank1]: File "/lxc-data/minianconda3/envs/python39/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 1133, in _try_get_data
[rank1]: data = self._data_queue.get(timeout=timeout)
[rank1]: File "/lxc-data/minianconda3/envs/python39/lib/python3.9/queue.py", line 180, in get
[rank1]: self.not_empty.wait(remaining)
[rank1]: File "/lxc-data/minianconda3/envs/python39/lib/python3.9/threading.py", line 316, in wait
[rank1]: gotit = waiter.acquire(True, timeout)
[rank1]: File "/lxc-data/minianconda3/envs/python39/lib/python3.9/site-packages/torch/utils/data/_utils/signal_handling.py", line 66, in handler
[rank1]: _error_if_any_worker_fails()
[rank1]: RuntimeError: DataLoader worker (pid 6070) is killed by signal: Killed.
[rank1]: The above exception was the direct cause of the following exception:
[rank1]: Traceback (most recent call last):
[rank1]: File "/lxc-data/FunASR/examples/industrial_data_pretraining/paraformer/../../../funasr/bin/train.py", line 270, in
[rank1]: main_hydra()
[rank1]: File "/lxc-data/minianconda3/envs/python39/lib/python3.9/site-packages/hydra/main.py", line 94, in decorated_main
[rank1]: _run_hydra(
[rank1]: File "/lxc-data/minianconda3/envs/python39/lib/python3.9/site-packages/hydra/_internal/utils.py", line 394, in _run_hydra
[rank1]: _run_app(
[rank1]: File "/lxc-data/minianconda3/envs/python39/lib/python3.9/site-packages/hydra/_internal/utils.py", line 457, in _run_app
[rank1]: run_and_report(
[rank1]: File "/lxc-data/minianconda3/envs/python39/lib/python3.9/site-packages/hydra/_internal/utils.py", line 223, in run_and_report
[rank1]: raise ex
[rank1]: File "/lxc-data/minianconda3/envs/python39/lib/python3.9/site-packages/hydra/_internal/utils.py", line 220, in run_and_report
[rank1]: return func()
[rank1]: File "/lxc-data/minianconda3/envs/python39/lib/python3.9/site-packages/hydra/_internal/utils.py", line 458, in
[rank1]: lambda: hydra.run(
[rank1]: File "/lxc-data/minianconda3/envs/python39/lib/python3.9/site-packages/hydra/_internal/hydra.py", line 132, in run
[rank1]: _ = ret.return_value
[rank1]: File "/lxc-data/minianconda3/envs/python39/lib/python3.9/site-packages/hydra/core/utils.py", line 260, in return_value
[rank1]: raise self._return_value
[rank1]: File "/lxc-data/minianconda3/envs/python39/lib/python3.9/site-packages/hydra/core/utils.py", line 186, in run_job
[rank1]: ret.return_value = task_function(task_cfg)
[rank1]: File "/lxc-data/FunASR/examples/industrial_data_pretraining/paraformer/../../../funasr/bin/train.py", line 53, in main_hydra
[rank1]: main(**kwargs)
[rank1]: File "/lxc-data/FunASR/examples/industrial_data_pretraining/paraformer/../../../funasr/bin/train.py", line 215, in main
[rank1]: trainer.train_epoch(
[rank1]: File "/lxc-data/FunASR/funasr/train_utils/trainer.py", line 499, in train_epoch
[rank1]: self.validate_epoch(
[rank1]: File "/lxc-data/FunASR/funasr/train_utils/trainer.py", line 562, in validate_epoch
[rank1]: for batch_idx, batch in enumerate(dataloader_val):
[rank1]: File "/lxc-data/minianconda3/envs/python39/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 631, in __next__
[rank1]: data = self._next_data()
[rank1]: File "/lxc-data/minianconda3/envs/python39/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 1329, in _next_data
[rank1]: idx, data = self._get_data()
[rank1]: File "/lxc-data/minianconda3/envs/python39/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 1285, in _get_data
[rank1]: success, data = self._try_get_data()
[rank1]: File "/lxc-data/minianconda3/envs/python39/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 1146, in _try_get_data
[rank1]: raise RuntimeError(f'DataLoader worker (pid(s) {pids_str}) exited unexpectedly') from e
[rank1]: RuntimeError: DataLoader worker (pid(s) 6030, 6038, 6047, 6070) exited unexpectedly
What have you tried?
What's your environment?
cuda12.2
linux
4* A100-8
pip
The text was updated successfully, but these errors were encountered: