Commit d583c5c
Merge pull request #76 from ScoThunder/cpm
support cpm on xpu
upvenly authored May 25, 2023
2 parents 216001d + 09078ef commit d583c5c
Showing 8 changed files with 113 additions and 14 deletions.
7 changes: 6 additions & 1 deletion training/benchmarks/cpm/pytorch/model/models/transformer.py
@@ -22,7 +22,6 @@

 from model.layers.layers import ColumnParallelLinear
 from model.layers.layers import RowParallelLinear
-from .checkpoint import checkpoint
 
 
 def ensure_divisibility(numerator, denominator):
@@ -451,6 +450,12 @@ def custom_forward(*inputs):
     attention_mask_bk = copy.deepcopy(attention_mask)
     while l < num_layers:
         attention_mask = copy.deepcopy(attention_mask_bk)
+
+        import config
+        if config.vendor == 'kunlunxin':
+            from torch_xmlir.nn.checkpoint import checkpoint
+        else:
+            from .checkpoint import checkpoint
         hidden_states = checkpoint(custom(l, l + chunk_length),
                                    hidden_states, attention_mask)
         l += chunk_length
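The hunk above picks an activation-checkpointing implementation per vendor at run time: Kunlunxin's `torch_xmlir` build when `config.vendor == 'kunlunxin'`, the benchmark's own `checkpoint` otherwise. Re-importing inside the `while` loop is harmless after the first iteration (Python caches modules), but the same dispatch can be resolved once before the loop. A sketch only, using the same names as the diff; `model.models.checkpoint` is assumed here as the absolute form of the relative `.checkpoint` import:

```python
# Sketch only -- not the benchmark's actual code. Resolves the vendor-specific
# activation-checkpointing callable once instead of on every loop iteration.
def select_checkpoint_fn(vendor: str):
    if vendor == 'kunlunxin':
        from torch_xmlir.nn.checkpoint import checkpoint   # Kunlunxin XPU build
    else:
        from model.models.checkpoint import checkpoint     # stock CPM module (assumed path)
    return checkpoint

# usage before the chunked-checkpoint loop:
#     checkpoint = select_checkpoint_fn(config.vendor)
```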
10 changes: 5 additions & 5 deletions training/benchmarks/cpm/pytorch/run_pretraining.py
@@ -44,7 +44,7 @@ def main():

     check.check_config(config)
 
-    dist_pytorch.barrier()
+    dist_pytorch.barrier(config.vendor)
     cpm_driver.event(Event.INIT_START)
     init_start_time = logger.previous_log_time

@@ -80,10 +80,10 @@ def main():

     training_state._trainer = trainer
 
-    dist_pytorch.barrier()
+    dist_pytorch.barrier(config.vendor)
     trainer.init()
 
-    dist_pytorch.barrier()
+    dist_pytorch.barrier(config.vendor)
     init_evaluation_start = time.time()
     training_state.eval_avg_loss, training_state.eval_embedding_average = evaluator.evaluate(
         trainer)
@@ -96,14 +96,14 @@ def main():
     cpm_driver.event(Event.INIT_EVALUATION, init_evaluation_info)
 
     if not config.do_train:
-        return config, training_state, init_evaluation_info["time"]
+        return config, training_state
 
     # training_event.on_init_end()
     cpm_driver.event(Event.INIT_END)
     init_end_time = logger.previous_log_time
     training_state.init_time = (init_end_time - init_start_time) / 1e+3
 
-    dist_pytorch.barrier()
+    dist_pytorch.barrier(config.vendor)
     epoch = -1
     # training_event.on_train_begin()
     cpm_driver.event(Event.TRAIN_START)
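Every `dist_pytorch.barrier()` call in `run_pretraining.py` now receives `config.vendor`, so the synchronization helper can pick a code path that matches the accelerator instead of assuming CUDA. A minimal sketch of what a vendor-aware barrier can look like, written against the public `torch.distributed` API; the branching shown is an assumption for illustration, not FlagPerf's actual `dist_pytorch` implementation:

```python
import torch
import torch.distributed as dist


def barrier(vendor: str = "nvidia") -> None:
    """Sketch of a vendor-aware barrier: synchronize all ranks, touching CUDA
    only when the vendor actually runs on CUDA devices."""
    if not (dist.is_available() and dist.is_initialized()):
        return
    dist.barrier()
    if vendor == "nvidia" and torch.cuda.is_available():
        torch.cuda.synchronize()  # drain outstanding CUDA work on NVIDIA GPUs
```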
11 changes: 3 additions & 8 deletions training/benchmarks/cpm/pytorch/train/trainer_adapter.py
@@ -6,21 +6,14 @@
 from typing import Tuple
 
 from model.models import gpt2_get_params_for_weight_decay_optimization
-from apex.optimizers import FusedAdam as Adam
 
-from apex.normalization.fused_layer_norm import FusedLayerNorm as LayerNorm
 
 from torch.nn.parallel import DistributedDataParallel as NativeDDP
 
-from apex.parallel import DistributedDataParallel as APEX_DDP
 
 from model.fp16 import FP16_Module
 from model.fp16 import FP16_Optimizer
 
 
 def convert_model(config, model: nn.Module) -> nn.Module:
     state_dict = model.state_dict()
 
+    from apex.normalization.fused_layer_norm import FusedLayerNorm as LayerNorm
     for i in range(config.num_layers):
         model.transformer.layers[i].input_layernorm = LayerNorm(
             config.hidden_size, config.layernorm_epsilon)
@@ -35,6 +28,7 @@ def convert_model(config, model: nn.Module) -> nn.Module:

 def create_optimizer(config, model: nn.Module) -> Optimizer:
     param_groups = gpt2_get_params_for_weight_decay_optimization(model)
+    from apex.optimizers import FusedAdam as Adam
     optimizer = Adam(param_groups,
                      lr=config.learning_rate,
                      weight_decay=config.weight_decay_rate)
@@ -69,6 +63,7 @@ def model_to_ddp(config, model: nn.Module) -> nn.Module:
             bucket_cap_mb=100,
             gradient_as_bucket_view=config.use_gradient_as_bucket_view)
     elif config.ddp_type == 'apex':
+        from apex.parallel import DistributedDataParallel as APEX_DDP
         model = APEX_DDP(
             model,
             message_size=250000000,
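The apex imports (`FusedAdam`, `FusedLayerNorm`, `APEX_DDP`) move from module scope into the functions that use them, so merely importing `trainer_adapter` no longer requires apex to be installed; a vendor stack without apex, such as the Kunlunxin XPU one added in this PR, can load the module and override the relevant functions. A self-contained illustration of the same deferred-import idea; the `try/except` fallback to `torch.optim.Adam` mirrors the Kunlunxin `extern/trainer_adapter.py` further down rather than the base adapter, which simply defers the import:

```python
from torch import nn
from torch.optim import Optimizer


def create_optimizer(config, model: nn.Module) -> Optimizer:
    # Illustrative sketch: apex is only imported when this function runs.
    param_groups = model.parameters()  # placeholder for the benchmark's weight-decay groups
    try:
        from apex.optimizers import FusedAdam as Adam   # fused CUDA optimizer, if available
    except ImportError:
        from torch.optim import Adam                    # plain PyTorch fallback
    return Adam(param_groups,
                lr=config.learning_rate,
                weight_decay=config.weight_decay_rate)
```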
33 changes: 33 additions & 0 deletions training/kunlunxin/cpm-pytorch/README.md
@@ -0,0 +1,33 @@
### Model Checkpoint Download
[Model checkpoint download](../../benchmarks/cpm/README.md#模型checkpoint)
### Test Dataset Download
[Test dataset download](../../benchmarks/cpm/README.md#数据集)

### Kunlunxin XPU Configuration and Run Information Reference
#### Environment configuration
- ##### Hardware environment
  - Machine model: Kunlunxin AI accelerator group R480-X8
  - Accelerator card model: Kunlunxin AI accelerator card R300
  - Multi-node network type and bandwidth: InfiniBand, 200 Gb/s

- ##### Software environment
  - OS version: Ubuntu 20.04
  - OS kernel version: 5.4.0-26-generic
  - Accelerator card driver version: 4.0.25
  - Docker image and version: pytorch1.12.1-cpu-ubuntu18.04:v0.04
  - Training framework version: xmlir+e70db8f6
  - Dependency versions: pytorch-1.12.1+cpu


### Run Results
| Training resources | Config file | Run duration (s) | Target accuracy | Converged accuracy | Steps | Performance (samples/s) |
| ------------------ | --------------- | ---------------- | --------------- | ------------------ | ----- | ----------------------- |
| 1 node × 1 card | config_R300x1x1 | | | | | |
| 1 node × 2 cards | config_R300x1x2 | | | | | |
| 1 node × 4 cards | config_R300x1x4 | | | | | |
| 1 node × 8 cards | config_R300x1x8 | | 0.92 | 0.9235 | 632 | |
| 2 nodes × 8 cards | config_R300x2x8 | | | | | |

### License

Apache 2.0 license.
13 changes: 13 additions & 0 deletions training/kunlunxin/cpm-pytorch/config/config_R300x1x8.py
@@ -0,0 +1,13 @@
from config_common import *

dist_backend = "xccl"

train_batch_size = 32
eval_batch_size = train_batch_size
max_steps = 4000
max_samples_termination = 4391260

warmup = 0.2
learning_rate = 0.0005

seed = 23333
7 changes: 7 additions & 0 deletions training/kunlunxin/cpm-pytorch/config/config_common.py
@@ -0,0 +1,7 @@
# DDP type: 'apex' or 'native'.
ddp_type: str = "native"

# disable fp16
fp16 = False

vendor = 'kunlunxin'
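`config_common.py` carries the vendor-wide defaults, and each per-scale file such as `config_R300x1x8.py` above pulls them in with a star import before overriding what it needs. A short sketch of the merged namespace once such a module is imported; the `importlib` call assumes the config directory is on `sys.path` and is only illustrative, not how FlagPerf actually loads its configs:

```python
import importlib

# Illustrative only: load a per-scale config module and read the merged settings.
cfg = importlib.import_module("config_R300x1x8")  # assumes the config dir is on sys.path

print(cfg.vendor)            # 'kunlunxin' -- inherited from config_common
print(cfg.ddp_type)          # 'native'    -- inherited from config_common
print(cfg.fp16)              # False       -- inherited from config_common
print(cfg.dist_backend)      # 'xccl'      -- set in config_R300x1x8
print(cfg.train_batch_size)  # 32          -- set in config_R300x1x8
```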
21 changes: 21 additions & 0 deletions training/kunlunxin/cpm-pytorch/config/environment_variables.sh
@@ -0,0 +1,21 @@
# =================================================
# Export variables
# =================================================
set -x

export XMLIR_F_XPU_ENABLED_BOOL=true
export XMLIR_TORCH_XCCL_ENABLED=true

##===----------------------------------------------------------------------===##
## R480 config
##===----------------------------------------------------------------------===##

# BKCL
topo_file=${WORKSPACE-"."}/topo.txt
touch $topo_file
export XPUSIM_TOPOLOGY_FILE=$(readlink -f $topo_file)

## workaround due to ccix bug
export BKCL_CCIX_RING="1"
export ALLREDUCE_ASYNC="0"
export ALLREDUCE_FUSION="0"
25 changes: 25 additions & 0 deletions training/kunlunxin/cpm-pytorch/extern/trainer_adapter.py
@@ -0,0 +1,25 @@
from model.models import gpt2_get_params_for_weight_decay_optimization

from torch import nn
from torch.optim import Optimizer
from typing import Tuple
from driver.dist_pytorch import main_proc_print


def convert_model(config, model: nn.Module) -> nn.Module:
    # XPU path: keep the stock LayerNorm, no apex FusedLayerNorm swap.
    return model


def create_optimizer(config, model):
    param_groups = gpt2_get_params_for_weight_decay_optimization(model)
    # Plain torch.optim.Adam instead of apex FusedAdam on Kunlunxin XPU.
    from torch.optim import Adam
    optimizer = Adam(param_groups,
                     lr=config.learning_rate,
                     weight_decay=config.weight_decay_rate)

    return optimizer


def model_to_fp16(config, model: nn.Module,
                  optimizer: Optimizer) -> Tuple[nn.Module, Optimizer]:
    # fp16 is disabled in the Kunlunxin configs, so this is a no-op.
    return model, optimizer
