
Bert #158

Merged: 45 commits, merged on Jul 20, 2023

Commits (45)
f738bac  init (May 19, 2023)
b494ba5  add efficientnet (May 24, 2023)
acfde41  modify config (May 24, 2023)
b4e9627  modify config (May 24, 2023)
c6fbea3  modify config (May 24, 2023)
fce71f2  add efficientnet (May 24, 2023)
ef390bc  modify config (May 24, 2023)
51847e1  add efficientnet (May 25, 2023)
3f904db  bug fix (May 25, 2023)
48e835d  add efficientnet (May 25, 2023)
8eaa8a5  Merge branch 'FlagOpen:main' into efficientnet (ScoThunder, May 25, 2023)
37d78be  add efficientnet (May 25, 2023)
98361a5  fix code style (May 26, 2023)
e6005bf  fix code style (May 26, 2023)
ae86109  fix code style (May 29, 2023)
fe6a418  Revert "fix code style" (May 29, 2023)
6684a5d  fix code style (May 29, 2023)
746377a  fix code style (May 29, 2023)
b3d9786  fix code style (May 29, 2023)
a70db8d  fix code style (May 29, 2023)
b672228  fix code style (May 29, 2023)
df3a2b2  Merge branch 'FlagOpen:main' into efficientnet (ScoThunder, May 30, 2023)
565a35d  Merge branch 'FlagOpen:main' into main (ScoThunder, May 31, 2023)
b21dde9  Merge branch 'FlagOpen:main' into main (ScoThunder, Jun 2, 2023)
d9c089b  Merge branch 'FlagOpen:main' into main (ScoThunder, Jun 5, 2023)
d3b3e57  bug fix (Jun 6, 2023)
ffda7ad  Merge branch 'FlagOpen:main' into main (ScoThunder, Jun 6, 2023)
cb97a53  Merge branch 'FlagOpen:main' into main (ScoThunder, Jun 7, 2023)
10faf97  Merge branch 'FlagOpen:main' into main (ScoThunder, Jun 8, 2023)
d679833  Merge branch 'FlagOpen:main' into main (ScoThunder, Jun 20, 2023)
2fad460  add kunlunxin readme (Jun 21, 2023)
9db4cab  Merge branch 'FlagOpen:main' into main (ScoThunder, Jun 28, 2023)
ba5aa39  Merge branch 'FlagOpen:main' into main (ScoThunder, Jul 4, 2023)
01a1106  fix bert on gpu (Jul 11, 2023)
a61cc9a  fix bert on gpu (Jul 11, 2023)
c602432  fix bert on gpu (Jul 11, 2023)
072d81c  bug fix (Jul 13, 2023)
59c77e5  bug fix (Jul 13, 2023)
44715ca  bug fix (Jul 14, 2023)
e793bb6  fix bert on gpu (Jul 14, 2023)
714eaf5  fix bert on gpu (Jul 14, 2023)
7242cb5  Merge remote-tracking branch 'upgrade/main' into bert (Jul 19, 2023)
b467d58  fix bert on xpu (Jul 19, 2023)
1aac59d  fix bert on xpu (Jul 19, 2023)
2bc4922  fix bert on xpu (Jul 20, 2023)

3 changes: 0 additions & 3 deletions training/benchmarks/bert/pytorch/train/evaluator.py
@@ -83,9 +83,6 @@ def evaluate(self, trainer):
total_masked += num_masked
#torch.cuda.synchronize()
dist_pytorch.barrier(config.vendor)
if config.vendor == 'kunlunxin':
import torch_xmlir.core.xpu_model as xm
xm.mark_step()
Comment (Contributor Author): Graph mode previously needed this xm.mark_step; we now run in eager mode, so it is no longer required.

trainer.model.train()

if torch.distributed.is_initialized():
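
For context on the comment above, here is a minimal sketch of why xm.mark_step() matters only in graph (lazy) mode. Only the torch_xmlir.core.xpu_model import and its mark_step() call come from the diff; the eval_loop helper and the lazy_mode flag are illustrative assumptions.

```python
# Illustrative sketch, not repository code: lazy_mode and eval_loop are hypothetical.
import torch

try:
    import torch_xmlir.core.xpu_model as xm  # present only on Kunlunxin XPU machines
except ImportError:
    xm = None

def eval_loop(model, batches, lazy_mode=False):
    model.eval()
    with torch.no_grad():
        for batch in batches:
            _ = model(batch)
            if lazy_mode and xm is not None:
                # In graph/lazy mode, ops are only traced; mark_step() cuts the
                # accumulated graph and forces it to execute on the device.
                xm.mark_step()
    # In eager mode every op runs immediately, so no mark_step() is needed,
    # which is why the call was dropped from evaluator.py.
    model.train()
```
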
24 changes: 13 additions & 11 deletions training/benchmarks/bert/pytorch/train/trainer_adapter.py
@@ -3,31 +3,32 @@

import torch
import torch.distributed as dist
try:
import amp_C
import apex_C
from apex import amp
from apex.amp import _amp_state
from apex.contrib.optimizers.distributed_fused_lamb import DistributedFusedLAMB
from apex.optimizers import FusedLAMB
from apex.parallel import DistributedDataParallel as APEX_DDP
from apex.parallel.distributed import flat_dist_call
except ImportError:
print("import apex error")
Comment (Contributor Author): Kunlunxin machines do not have apex, so these imports raise an error; wrapped them in try/except.


import amp_C
import apex_C
from apex import amp
from apex.amp import _amp_state
from apex.contrib.optimizers.distributed_fused_lamb import DistributedFusedLAMB
from apex.optimizers import FusedLAMB
from apex.parallel import DistributedDataParallel as APEX_DDP
from apex.parallel.distributed import flat_dist_call
from torch.cuda.amp import GradScaler
from torch.nn.parallel import DistributedDataParallel as NativeDDP
from torch.optim import Optimizer

import utils
import config
#from converter import convert_model
from .distributed_fused_lamb import _pipeline_block_reductions_patched, _pipeline_step_patched
Comment (Contributor Author): Same as above (this import also depends on apex, so it is moved inside the distributed_lamb branch below).


BERT_MODEL = torch.nn.Module

BERT_MODEL = torch.nn.Module

def convert_model(model: BERT_MODEL) -> BERT_MODEL:
return model


def create_optimizer(model: BERT_MODEL) -> Optimizer:
param_optimizer = list(model.named_parameters())

@@ -46,6 +47,7 @@ def create_optimizer(model: BERT_MODEL) -> Optimizer:
}]

if config.distributed_lamb:
from .distributed_fused_lamb import _pipeline_block_reductions_patched, _pipeline_step_patched
DistributedFusedLAMB._pipeline_block_reductions = _pipeline_block_reductions_patched
DistributedFusedLAMB._pipeline_step = _pipeline_step_patched
optimizer = DistributedFusedLAMB(
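
The two changes above (wrapping the apex imports in try/except and moving the distributed_fused_lamb patch import inside the distributed_lamb branch) follow a common pattern for optional, CUDA-only dependencies. A minimal sketch under that assumption; build_optimizer and use_apex_lamb are hypothetical names, not this repository's API:

```python
# Guard an optional dependency at import time and defer its use to the code
# path that actually needs it.
import torch

try:
    from apex.optimizers import FusedLAMB  # CUDA-only; absent on Kunlunxin hosts
    HAVE_APEX = True
except ImportError:
    HAVE_APEX = False

def build_optimizer(params, lr: float, use_apex_lamb: bool) -> torch.optim.Optimizer:
    if use_apex_lamb:
        if not HAVE_APEX:
            raise RuntimeError("apex is required for the fused LAMB path")
        return FusedLAMB(params, lr=lr)
    # Fall back to a stock optimizer when apex is unavailable; the Kunlunxin
    # adapter (training/kunlunxin/bert-pytorch/extern/trainer_adapter.py)
    # instead defines its own create_optimizer with torch_xmlir's FusedLAMB.
    return torch.optim.AdamW(params, lr=lr)
```
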
35 changes: 0 additions & 35 deletions training/kunlunxin/bert-pytorch/config/config_R200x1x8.py

This file was deleted.

@@ -1,27 +1,23 @@
from config_common import *

fp16 = False
ddp_type = "apex"
#dist_backend = "nccl"
dist_backend = "xccl"
use_xpu = True
use_env = True
gradient_accumulation_steps = 1
train_batch_size = 12
max_steps = 1000000
gradient_accumulation_steps = 56
train_batch_size = 8
max_steps = 240000
start_warmup_step = 0
warmup_proportion = 0
warmup_steps = 1000
warmup_steps = 0

distributed_lamb = False
learning_rate = 1.4e-4
learning_rate = 0.00035
weight_decay_rate = 0.01
opt_lamb_beta_1 = 0.9
opt_lamb_beta_2 = 0.999

eval_batch_size = train_batch_size
max_samples_termination = 45000000
cache_eval_data = True
cache_eval_data = False

fused_gelu_bias = False
fused_mha = False
@@ -33,3 +29,6 @@
dwu_num_blocks = 1

seed = 9031

from torch_xmlir.amp import GradScaler
grad_scaler = GradScaler(enabled=False)
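
These Kunlunxin configs run in fp32 (fp16 = False) but still hand the trainer a grad_scaler, so they construct it with enabled=False. With torch's GradScaler, a disabled scaler is a pass-through: scale() returns the loss unchanged, step() just calls optimizer.step(), and update() is a no-op; the torch_xmlir.amp.GradScaler used here is assumed to mirror that interface. A minimal CPU-runnable sketch:

```python
# Why GradScaler(enabled=False) is safe in an fp32 run (sketch uses the stock
# torch.cuda.amp.GradScaler; torch_xmlir's scaler is assumed to behave alike).
import torch
from torch.cuda.amp import GradScaler

model = torch.nn.Linear(4, 2)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
scaler = GradScaler(enabled=False)

x, y = torch.randn(8, 4), torch.randn(8, 2)
loss = torch.nn.functional.mse_loss(model(x), y)

scaler.scale(loss).backward()  # enabled=False: scale() returns the loss unchanged
scaler.step(optimizer)         # reduces to a plain optimizer.step()
scaler.update()                # no-op when the scaler is disabled
optimizer.zero_grad()
```
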
@@ -1,27 +1,23 @@
from config_common import *

fp16 = False
ddp_type = "apex"
#dist_backend = "nccl"
dist_backend = "xccl"
use_xpu = True
use_env = True
gradient_accumulation_steps = 1
train_batch_size = 12
max_steps = 1000000
gradient_accumulation_steps = 7
train_batch_size = 8
max_steps = 30000
start_warmup_step = 0
warmup_proportion = 0
warmup_steps = 1000
warmup_steps = 0

distributed_lamb = False
learning_rate = 3.5e-5
learning_rate = 0.00035
weight_decay_rate = 0.01
opt_lamb_beta_1 = 0.9
opt_lamb_beta_2 = 0.999

eval_batch_size = train_batch_size
max_samples_termination = 45000000
cache_eval_data = True
cache_eval_data = False

fused_gelu_bias = False
fused_mha = False
@@ -33,3 +29,6 @@
dwu_num_blocks = 1

seed = 9031

from torch_xmlir.amp import GradScaler
grad_scaler = GradScaler(enabled=False)
@@ -1,27 +1,23 @@
from config_common import *

fp16 = False
ddp_type = "apex"
#dist_backend = "nccl"
dist_backend = "xccl"
use_xpu = True
use_env = True
gradient_accumulation_steps = 1
train_batch_size = 12
max_steps = 1000000
gradient_accumulation_steps = 4
train_batch_size = 8
max_steps = 30000
start_warmup_step = 0
warmup_proportion = 0
warmup_steps = 1000
warmup_steps = 0

distributed_lamb = False
learning_rate = 1.4e-4
learning_rate = 0.00035
weight_decay_rate = 0.01
opt_lamb_beta_1 = 0.9
opt_lamb_beta_2 = 0.999

eval_batch_size = train_batch_size
max_samples_termination = 45000000
cache_eval_data = True
cache_eval_data = False

fused_gelu_bias = False
fused_mha = False
@@ -33,3 +29,6 @@
dwu_num_blocks = 1

seed = 9031

from torch_xmlir.amp import GradScaler
grad_scaler = GradScaler(enabled=False)
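
Across these configs the per-device micro-batch drops from 12 to 8 while gradient_accumulation_steps grows, which keeps the batch seen by the optimizer large. The usual arithmetic, as a quick illustration (the world sizes are assumptions; the configs only fix the first two factors):

```python
# Effective optimizer batch = micro-batch * accumulation steps * device count.
def effective_batch(train_batch_size: int, grad_accum_steps: int, world_size: int = 1) -> int:
    return train_batch_size * grad_accum_steps * world_size

print(effective_batch(8, 56))    # 448 samples per optimizer step on one device
print(effective_batch(8, 7))     # 56
print(effective_batch(8, 4, 8))  # 256 across 8 devices (world size assumed)
```
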
106 changes: 7 additions & 99 deletions training/kunlunxin/bert-pytorch/extern/trainer_adapter.py
@@ -5,9 +5,9 @@
import torch.distributed as dist

from torch.cuda.amp import GradScaler
from torch.nn.parallel import DistributedDataParallel as NativeDDP
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.optim import Optimizer
from torch_xmlir.optimizer import Lamb
from torch_xmlir.optimizer import FusedLAMB
Comment (Contributor Author): Replaced with the fused optimizer (FusedLAMB).
Comment (Contributor): Please provide a screenshot of a successful run on the Kunlunxin machine.


import torch_xmlir.core.xpu_model as xm

@@ -38,26 +38,7 @@ def create_optimizer(model: BERT_MODEL) -> Optimizer:
0.0
}]

if config.distributed_lamb:
print("distributed")
#DistributedFusedLAMB._pipeline_block_reductions = _pipeline_block_reductions_patched
#DistributedFusedLAMB._pipeline_step = _pipeline_step_patched
#optimizer = DistributedFusedLAMB(optimizer_grouped_parameters, lr=config.learning_rate,
# betas=(config.opt_lamb_beta_1, config.opt_lamb_beta_2),
# eps=1e-6,
# max_grad_norm=1.0,
# overlap_reductions=config.dwu_overlap_reductions,
# dwu_group_size=config.dwu_group_size,
# dwu_num_blocks=config.dwu_num_blocks,
# dwu_num_chunks=config.dwu_num_chunks,
# dwu_num_rs_pg=config.dwu_num_rs_pg,
# dwu_num_ar_pg=config.dwu_num_ar_pg,
# dwu_num_ag_pg=config.dwu_num_ag_pg,
# use_nvlamb=False,
# e5m2_allgather=config.dwu_e5m2_allgather)
#optimizer.set_global_scale(float(os.getenv("INIT_LOSS_SCALE", 2 ** 20)))
else:
optimizer = Lamb(optimizer_grouped_parameters,
Comment (Contributor Author): Removed redundant code.

optimizer = FusedLAMB(optimizer_grouped_parameters,
lr=config.learning_rate,
betas=(config.opt_lamb_beta_1,
config.opt_lamb_beta_2))
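
The optimizer_grouped_parameters built above follows the standard BERT split into weight-decayed parameters and a no-decay group. A minimal sketch of that grouping, with torch.optim.AdamW standing in for the vendor FusedLAMB; the no_decay name list is an assumption based on common BERT setups, not copied from this file:

```python
import torch

def build_grouped_optimizer(model: torch.nn.Module,
                            lr: float = 3.5e-4,
                            weight_decay: float = 0.01) -> torch.optim.Optimizer:
    # Biases and LayerNorm weights are typically excluded from weight decay.
    no_decay = ("bias", "LayerNorm.weight")
    named = list(model.named_parameters())
    groups = [
        {"params": [p for n, p in named if not any(nd in n for nd in no_decay)],
         "weight_decay": weight_decay},
        {"params": [p for n, p in named if any(nd in n for nd in no_decay)],
         "weight_decay": 0.0},
    ]
    # AdamW stands in for FusedLAMB; the real adapter passes lr=config.learning_rate
    # and betas=(config.opt_lamb_beta_1, config.opt_lamb_beta_2).
    return torch.optim.AdamW(groups, lr=lr, betas=(0.9, 0.999))
```
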
@@ -71,97 +52,24 @@ def model_to_fp16(model: BERT_MODEL,


def model_to_ddp(model: BERT_MODEL) -> BERT_MODEL:
use_ddp = dist.is_initialized()
if use_ddp and config.use_xpu:
from torch_xmlir.distributed import DistributedDataParallel as DDP
model = DDP(model)
Comment (Contributor Author): Replaced with PyTorch's native DDP instead of the custom DDP.

if dist.is_available() and dist.is_initialized():
model = DDP(model, device_ids=[config.local_rank])
return model
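
The model_to_ddp change above swaps the torch_xmlir custom DDP for torch.nn.parallel.DistributedDataParallel. A minimal sketch of that wrapping, assuming the process group was already initialized by the launcher; the LOCAL_RANK lookup is illustrative, while the adapter itself uses config.local_rank:

```python
import os
import torch
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP

def wrap_ddp(model: torch.nn.Module) -> torch.nn.Module:
    # Only wrap when torch.distributed is initialized (e.g. launched via torchrun).
    if dist.is_available() and dist.is_initialized():
        local_rank = int(os.environ.get("LOCAL_RANK", "0"))
        # device_ids tells DDP which local device this rank owns, matching the
        # device_ids=[config.local_rank] argument in the diff.
        model = DDP(model, device_ids=[local_rank])
    return model
```
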


def backward(step: int,
loss: torch.Tensor,
optimizer: Optimizer,
grad_scaler: GradScaler = None):
if config.bypass_amp:
loss.backward()
elif config.distributed_lamb:
optimizer._lazy_init_stage1()
grad_scaler.scale(loss).backward()
optimizer._lazy_init_stage2()
else:
with amp.scale_loss(loss,
optimizer,
delay_overflow_check=self.config.
allreduce_post_accumulation) as scaled_loss:
scaled_loss.backward()
Comment (Contributor Author): Removed redundant code.

loss.backward()

update_step = step % config.gradient_accumulation_steps == 0
if update_step:
update_model_params(loss, optimizer, grad_scaler)
else:
xm.mark_step()
Comment (Contributor Author): Eager mode does not need xm.mark_step.



def update_model_params(loss,
optimizer: Optimizer,
grad_scaler: GradScaler = None):
if config.allreduce_post_accumulation and config.use_cuda_graph:
assert False, "code path not tested with cuda graphs"
if config.distributed_lamb:
optimizer.set_global_scale(grad_scaler._get_scale_async())
optimizer.complete_reductions()
grad_scaler.step(optimizer)
grad_scaler.update()

found_inf = optimizer._overflow_buf # GPU tensor

elif config.allreduce_post_accumulation:
# manually allreduce gradients after all accumulation steps
# check for Inf/NaN
# 1. allocate an uninitialized buffer for flattened gradient
# torch.nn.utils.clip_grad_norm_(parameters=amp.master_params(optimizer), max_norm=1.0, norm_type=2.0)
#scaler = _amp_state.loss_scalers[0]
# master_grads = [p.grad for p in amp.master_params(optimizer) if p.grad is not None]
# flat_grad_size = sum(p.numel() for p in master_grads)
# allreduce_dtype = torch.float16 if config.allreduce_post_accumulation_fp16 else torch.float32
# flat_raw = torch.empty(flat_grad_size, device='cpu', dtype=allreduce_dtype)
#flat_raw = torch.empty(flat_grad_size, device='cuda', dtype=allreduce_dtype)
# 2. combine unflattening and predivision of unscaled 'raw' gradient
#allreduced_views = apex_C.unflatten(flat_raw, master_grads)
#self.overflow_buf.zero_()
#amp_C.multi_tensor_scale(65536,
# self.overflow_buf,
# [master_grads, allreduced_views],
# scaler.loss_scale() / (
# torch.distributed.get_world_size() * config.gradient_accumulation_steps))
# 3. sum gradient across ranks. Because of the predivision, this averages the gradient
#torch.distributed.all_reduce(flat_raw)
# 4. combine unscaling and unflattening of allreduced gradient
#self.overflow_buf.zero_()
#amp_C.multi_tensor_scale(65536,
# self.overflow_buf,
# [allreduced_views, master_grads],
# 1. / scaler.loss_scale())
# 5. update loss scale
had_overflow = 0
#scaler = _amp_state.loss_scalers[0]
#old_overflow_buf = scaler._overflow_buf
#scaler._overflow_buf = self.overflow_buf
#had_overflow = scaler.update_scale()
#scaler._overflow_buf = old_overflow_buf
# 6. call optimizer step function
if had_overflow == 0:
# optimizer.step()
xm.optimizer_step(optimizer, barrier=True)
else:
# Overflow detected, print message and clear gradients
if utils.is_main_process():
print("Overflow detected, reduced loss_scaler to %f" %
(scaler.loss_scale()))
if _amp_state.opt_properties.master_weights:
for param in optimizer._amp_stash.all_fp32_from_fp16_params:
param.grad = None
else:
xm.optimizer_step(optimizer, barrier=True)

Comment (Contributor Author): Removed redundant code.

optimizer.step()
optimizer.zero_grad()
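
Taken together, the new backward() and update_model_params() reduce to a plain eager-mode update with gradient accumulation. A minimal end-to-end sketch of that flow; the loop, names, and the loss normalization are illustrative, not this repository's code:

```python
import torch

def train_steps(model, optimizer, batches, grad_accum_steps: int = 56):
    model.train()
    for step, (x, y) in enumerate(batches, start=1):
        loss = torch.nn.functional.mse_loss(model(x), y)
        # Dividing by the accumulation count is a common convention (not taken
        # from this diff) so accumulated gradients match a single large batch.
        (loss / grad_accum_steps).backward()  # eager mode: no xm.mark_step() needed
        if step % grad_accum_steps == 0:
            optimizer.step()       # matches the simplified update_model_params
            optimizer.zero_grad()
```
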