[BUGFIX] Fix hang when training and evaluating in multi-GPU mode (#1681)

* fix: fix hang in multi-GPU training

* remove the "del dataloader" statement
gongenlei authored Mar 21, 2022
1 parent eb5cf37 commit 8fecd6b
Showing 1 changed file with 11 additions and 11 deletions.
examples/language_model/ernie-m/run_classifier.py
@@ -345,17 +345,17 @@ def do_train(args):
                     evaluate(model, loss_fct, metric, test_data_loader,
                              language)
                     print("eval done total : %s s" % (time.time() - tic_eval))
-                    if paddle.distributed.get_rank() == 0:
-                        output_dir = os.path.join(
-                            args.output_dir,
-                            "ernie_m_ft_model_%d.pdparams" % (global_step))
-                        if not os.path.exists(output_dir):
-                            os.makedirs(output_dir)
-                        # Need better way to get inner model of DataParallel
-                        model_to_save = model._layers if isinstance(
-                            model, paddle.DataParallel) else model
-                        model_to_save.save_pretrained(output_dir)
-                        tokenizer.save_pretrained(output_dir)
+                if paddle.distributed.get_rank() == 0:
+                    output_dir = os.path.join(args.output_dir,
+                                              "ernie_m_ft_model_%d.pdparams" %
+                                              (global_step))
+                    if not os.path.exists(output_dir):
+                        os.makedirs(output_dir)
+                    # Need better way to get inner model of DataParallel
+                    model_to_save = model._layers if isinstance(
+                        model, paddle.DataParallel) else model
+                    model_to_save.save_pretrained(output_dir)
+                    tokenizer.save_pretrained(output_dir)
                 if global_step >= num_training_steps:
                     break
         if global_step >= num_training_steps:
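For context, the hang this commit addresses is the usual failure mode of rank-guarded code in data-parallel training: every process must reach each collective operation (the gradient all-reduce, and any cross-GPU reduction during evaluation) the same number of times, so code behind a get_rank() check should perform only local side effects such as writing files. The fixed diff follows that discipline: evaluation still runs on every rank, while the checkpoint block is hoisted out of the per-language loop and guarded by the rank check. Below is a minimal sketch of the same pattern, assuming only that Paddle (and, for the tokenizer, PaddleNLP) is installed; save_on_rank0 is a hypothetical helper written for this illustration, not code from the repository.

import os

import paddle
import paddle.distributed as dist


def save_on_rank0(model, tokenizer, output_dir):
    """Write a checkpoint from rank 0 only.

    The body does purely local filesystem work (no collective ops), so
    ranks 1..N-1 may skip it without desynchronizing: they meet rank 0
    again at the next collective, e.g. the gradient all-reduce.
    """
    if dist.get_rank() != 0:
        return
    os.makedirs(output_dir, exist_ok=True)
    # paddle.DataParallel wraps the user's Layer; unwrap it so the saved
    # state dict carries the original parameter names.
    inner = model._layers if isinstance(model, paddle.DataParallel) else model
    paddle.save(inner.state_dict(), os.path.join(output_dir, "model.pdparams"))
    tokenizer.save_pretrained(output_dir)


# Usage (all ranks call it; only rank 0 writes):
#   save_on_rank0(model, tokenizer, "checkpoints/step_%d" % global_step)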
