refine cpm #179

Merged · 1 commit · Aug 4, 2023
6 changes: 3 additions & 3 deletions training/benchmarks/cpm/README.md
@@ -24,9 +24,9 @@ Permission is hereby granted, free of charge, to any person obtaining a copy of
> No preprocessing required

#### Model checkpoint
[Download page](https://model.baai.ac.cn/model-detail/100017)
Under the "Files and versions" tab: pytorch_model.bin
Number of parameters: 2.6B
[Download page](https://model.baai.ac.cn/model-detail/100105)
Under the "Files and versions" tab: cpm_model_states_medium.pt
Number of parameters: 0.33B

### Framework and chip support
|            | Pytorch | Paddle | TensorFlow2 |
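For quick verification of the new checkpoint, a minimal sketch (not part of this PR) that loads the downloaded file and counts its parameters; the `module` key is an assumption about the dump layout, not something shown in the diff:

```python
import torch

# Load the medium CPM checkpoint on CPU and count its parameters.
# NOTE: the "module" key is an assumption (Megatron/DeepSpeed-style dumps often
# nest the weights under it); fall back to the top-level dict if it is absent.
state = torch.load("cpm_model_states_medium.pt", map_location="cpu")
weights = state.get("module", state) if isinstance(state, dict) else state

total = sum(v.numel() for v in weights.values() if torch.is_tensor(v))
print(f"tensors: {len(weights)}, parameters: {total / 1e9:.2f}B")  # expect roughly 0.33B
```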
7 changes: 5 additions & 2 deletions training/benchmarks/cpm/pytorch/config/_base.py
@@ -1,6 +1,11 @@
# required parameters
vendor: str = None
# model name
name: str = "CPM"
# Training data dir
data_dir: str = None
cudnn_benchmark: bool = False
cudnn_deterministic: bool = True

# random seed
seed: int = 1234
@@ -91,8 +96,6 @@
no_save_rng: bool = False

## data args
# Training data dir
data_dir: str = "/mnt/data/cpm/train/"

# path used to save/load sentencepiece tokenization models
tokenizer_path: str = "bpe_3w_new/"
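The hunk above promotes `vendor` and `data_dir` to required parameters with `None` defaults (the hard-coded `/mnt/data/cpm/train/` default is dropped further down), so they must be supplied at launch time, e.g. by a vendor config or the command line. A purely illustrative sketch of how a vendor config module could override these module-level attributes; this is an assumption for illustration, not the actual FlagPerf driver code:

```python
import importlib

def apply_vendor_config(base, vendor_module_name: str):
    """Copy public attributes from a vendor config module onto the base config."""
    vendor = importlib.import_module(vendor_module_name)  # e.g. "config_A100x1x1"
    for key in dir(vendor):
        if not key.startswith("_"):
            setattr(base, key, getattr(vendor, key))  # vendor values win

    # both required parameters must be populated before training starts
    assert base.vendor is not None, "vendor must be set"
    assert base.data_dir is not None, "data_dir must be set"
    return base
```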
66 changes: 24 additions & 42 deletions training/benchmarks/cpm/pytorch/run_pretraining.py
@@ -1,18 +1,9 @@
"""CPM Pretraining"""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import argparse
import os
import random
import sys
import time

import numpy as np
import torch

import config
from dataloaders.tokenization_gpt2 import GPT2Tokenizer
from dataloaders.dataloader import load_data
from train.evaluator import Evaluator
@@ -22,38 +13,27 @@

CURR_PATH = os.path.abspath(os.path.dirname(__file__))
sys.path.append(os.path.abspath(os.path.join(CURR_PATH, "../../")))
from driver import Driver, Event, dist_pytorch, check
from driver import Event, dist_pytorch
from driver.helper import InitHelper

logger = None


def main():
import config
from config import mutable_params
global logger
global config

if config.use_env and 'LOCAL_RANK' in os.environ:
config.local_rank = int(os.environ['LOCAL_RANK'])

cpm_driver = Driver(config, config.mutable_params)
cpm_driver.setup_config(argparse.ArgumentParser("CPM"))
cpm_driver.setup_modules(globals(), locals())

init_helper = InitHelper(config)
cpm_driver = init_helper.init_driver(globals(), locals())
logger = cpm_driver.logger
dist_pytorch.init_dist_training_env(config)

check.check_config(config)

dist_pytorch.barrier(config.vendor)
cpm_driver.event(Event.INIT_START)
init_start_time = logger.previous_log_time

random.seed(config.seed)
np.random.seed(config.seed)
torch.manual_seed(config.seed)
if torch.cuda.is_available():
torch.cuda.manual_seed(config.seed)

init_helper.set_seed(config.seed, config.vendor)
# get the tokenizer
base_path = os.path.abspath(os.path.dirname(__file__))
tokenizer = GPT2Tokenizer(
@@ -70,7 +50,6 @@ def main():

evaluator = Evaluator(config, eval_dataloader)
training_state = TrainingState()
# trainer = Trainer(config, training_event, evaluator, training_state, device=device)
trainer = Trainer(driver=cpm_driver,
adapter=trainer_adapter,
evaluator=evaluator,
@@ -92,53 +71,56 @@ def main():
eval_loss=training_state.eval_avg_loss,
eval_embedding_average=training_state.eval_embedding_average,
time=init_evaluation_end - init_evaluation_start)
# training_event.on_init_evaluate(init_evaluation_info)
cpm_driver.event(Event.INIT_EVALUATION, init_evaluation_info)

if not config.do_train:
return config, training_state

# training_event.on_init_end()
cpm_driver.event(Event.INIT_END)
init_end_time = logger.previous_log_time
training_state.init_time = (init_end_time - init_start_time) / 1e+3

dist_pytorch.barrier(config.vendor)
epoch = -1
# training_event.on_train_begin()

cpm_driver.event(Event.TRAIN_START)
raw_train_start_time = logger.previous_log_time
train_start_time = time.time()
epoch = 0
while training_state.global_steps < config.max_steps and not training_state.end_training:
epoch += 1
training_state.epoch = epoch
trainer.train_one_epoch(train_dataloader)
# training_event.on_train_end()
epoch += 1
cpm_driver.event(Event.TRAIN_END)
raw_train_end_time = logger.previous_log_time
training_state.raw_train_time = (raw_train_end_time -
raw_train_start_time) / 1e+3
training_state.raw_train_time = time.time() - train_start_time
return config, training_state


if __name__ == "__main__":
now = time.time()
config, state = main()
config_updated, state = main()

if not dist_pytorch.is_main_process():
exit()

e2e_time = time.time() - now
training_perf = (dist_pytorch.global_batch_size(config) *
training_perf = (dist_pytorch.global_batch_size(config_updated) *
state.global_steps) / state.raw_train_time
if config.do_train:
if config_updated.do_train:
finished_info = {
"e2e_time": e2e_time,
"training_sequences_per_second": training_perf,
"converged": state.converged,
"final_loss": state.eval_avg_loss,
"final_mlm_accuracy": state.eval_embedding_average,
"raw_train_time": state.raw_train_time,
"init_time": state.init_time,
"raw_train_time": state.raw_train_time,
"train_no_eval_time": state.no_eval_time,
"pure_training_computing_time": state.pure_compute_time,
"throughput(ips)_raw":
state.num_trained_samples / state.raw_train_time,
"throughput(ips)_no_eval":
state.num_trained_samples / state.no_eval_time,
"throughput(ips)_pure_compute":
state.num_trained_samples / state.pure_compute_time,
}
else:
finished_info = {"e2e_time": e2e_time}
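With the counters added in this file, `finished_info` now reports three nested throughput figures. A small stand-alone sketch of how they relate, using a stand-in for the relevant `TrainingState` fields and hypothetical numbers:

```python
from dataclasses import dataclass

@dataclass
class TimingFields:
    """Stand-in for the TrainingState fields used by finished_info."""
    num_trained_samples: int
    raw_train_time: float     # whole training loop (wall clock)
    no_eval_time: float       # loop time minus periodic evaluation
    pure_compute_time: float  # forward/backward only, after host-to-device copies

def throughputs(s: TimingFields) -> dict:
    # raw_train_time >= no_eval_time >= pure_compute_time,
    # so ips_raw <= ips_no_eval <= ips_pure_compute
    return {
        "throughput(ips)_raw": s.num_trained_samples / s.raw_train_time,
        "throughput(ips)_no_eval": s.num_trained_samples / s.no_eval_time,
        "throughput(ips)_pure_compute": s.num_trained_samples / s.pure_compute_time,
    }

# hypothetical numbers, not taken from a benchmark run
print(throughputs(TimingFields(100_000, 170.0, 120.0, 95.0)))
```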
@@ -14,7 +14,6 @@
# limitations under the License.
"""PyTorch DataLoader for TFRecords"""

import torch
from torch.optim.lr_scheduler import _LRScheduler
import math

7 changes: 0 additions & 7 deletions training/benchmarks/cpm/pytorch/train/evaluator.py
@@ -1,16 +1,9 @@
import os
import sys
import imp
import torch

import numpy as np
from train.metrics import average_corpus_level
from model.losses.cross_entropy import cross_entropy
from torch.nn import CrossEntropyLoss
from model.fp16 import FP16_Module

CURR_PATH = os.path.abspath(os.path.dirname(__file__))
sys.path.append(os.path.abspath(os.path.join(CURR_PATH, "../../../")))
from driver import dist_pytorch


6 changes: 0 additions & 6 deletions training/benchmarks/cpm/pytorch/train/metrics.py
@@ -1,12 +1,6 @@
from __future__ import division
from __future__ import absolute_import
from __future__ import unicode_literals
from __future__ import print_function

import numpy as np
import collections

import torch

__all__ = [
"CorpusLevelScore",
29 changes: 12 additions & 17 deletions training/benchmarks/cpm/pytorch/train/trainer.py
@@ -1,7 +1,5 @@
import math
import time
import os
import sys

import torch
from torch.types import Device
@@ -13,9 +11,6 @@
from train.training_state import TrainingState
from model.losses.cross_entropy import cross_entropy
from model.fp16 import FP16_Module

CURR_PATH = os.path.abspath(os.path.dirname(__file__))
sys.path.append(os.path.abspath(os.path.join(CURR_PATH, "../../../")))
from driver import Driver, Event, dist_pytorch


@@ -66,22 +61,22 @@ def _init_model(self, model, device):
def train_one_epoch(self, dataloader):
state = self.training_state
driver = self.driver
#training_event = self.training_event
driver.event(Event.EPOCH_BEGIN, state.epoch)
#training_event.on_epoch_begin(state.epoch)

step_start_time = time.time()

for _, data in enumerate(dataloader):
no_eval_start_time = time.time()
batch, no_model_batch = data[0], data[1]

state.global_steps += 1
state.num_trained_samples = state.global_steps * dist_pytorch.global_batch_size(
self.config)

#self.training_event.on_step_begin(state.global_steps)
driver.event(Event.STEP_BEGIN, step=state.global_steps)
self.train_one_step(batch, no_model_batch)

self.training_state.no_eval_time += time.time(
) - no_eval_start_time
other_state = dict()
if state.global_steps % self.config.gradient_accumulation_steps == 0:
step_end_time = time.time()
@@ -107,20 +102,17 @@ def train_one_epoch(self, dataloader):
end_training = self.detect_training_status(state)

step_info = state.to_dict(**other_state)
#self.training_event.on_step_end(state.global_steps, result=step_info)
driver.event(Event.STEP_END,
message=step_info,
step=state.global_steps,
loss=state.loss)

if eval_result is not None:
#self.training_event.on_evaluate(eval_result)
driver.event(Event.EVALUATE, eval_result)

if end_training:
break

#training_event.on_epoch_end(state.epoch)
driver.event(Event.EPOCH_END, state.epoch)

def train_one_step(self, batch, no_model_batch):
@@ -129,6 +121,7 @@ def train_one_step(self, batch, no_model_batch):
for k in no_model_batch:
no_model_batch[k] = no_model_batch[k].to(self.device)

pure_compute_start_time = time.time()
state = self.training_state
self.model.train()

@@ -142,6 +135,12 @@ def train_one_step(self, batch, no_model_batch):
loss = torch.sum(losses.view(-1) * loss_mask) / loss_mask.sum()
state.loss = loss

self.adapter.backward(self.config, state.global_steps, state.loss,
self.optimizer)
self.training_state.pure_compute_time += time.time(
) - pure_compute_start_time

# calculate output
preds = torch.argmax(output, -1)
if isinstance(self.model.module, FP16_Module):
embeddings = self.model.module.module.word_embeddings.weight
@@ -155,11 +154,7 @@ def train_one_step(self, batch, no_model_batch):
embeddings.cpu().detach(),
no_model_batch["loss_mask"].cpu().detach())
state.embedding_average = float(embedding_average.mean)
#loss.backward()
#self.optimizer.step()
self.adapter.backward(self.config, state.global_steps, state.loss,
self.optimizer)
#self.training_event.on_backward(state.global_steps, state.loss, self.optimizer)

self.driver.event(Event.BACKWARD, state.global_steps, state.loss,
self.optimizer)
self.lr_scheduler.step()
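The trainer changes above thread two per-step timers through the epoch loop. A condensed sketch of that pattern (driver events, metric bookkeeping, and periodic evaluation are elided):

```python
import time

def timed_epoch_sketch(state, dataloader, run_step):
    """How no_eval_time and pure_compute_time accumulate per step (simplified)."""
    for batch, no_model_batch in dataloader:
        no_eval_start = time.time()       # everything except end-of-step evaluation

        # ... host-to-device copies of batch / no_model_batch happen here ...

        compute_start = time.time()
        run_step(batch, no_model_batch)   # forward, loss, adapter.backward
        state.pure_compute_time += time.time() - compute_start

        # ... accuracy / embedding-average bookkeeping happens here ...
        state.no_eval_time += time.time() - no_eval_start

        # ... periodic evaluation (counted in neither timer) would follow ...
```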
2 changes: 2 additions & 0 deletions training/benchmarks/cpm/pytorch/train/training_state.py
@@ -26,6 +26,8 @@ class TrainingState:

init_time = 0
raw_train_time = 0
no_eval_time = 0
pure_compute_time = 0

def status(self):
if self.converged:
33 changes: 26 additions & 7 deletions training/nvidia/cpm-pytorch/README.md
@@ -22,10 +22,29 @@

#### Run results

| Training resources | Config file | Runtime (s) | Target accuracy | Converged accuracy | Steps | Performance (samples/s) |
| ------------------ | --------------- | ----------- | --------------- | ------------------ | ----- | ----------------------- |
| 1 machine, 1 GPU | config_A100x1x1 | 2016.20 | 0.8 | 0.8041 | 4375 | 77.65 |
| 1 machine, 2 GPUs | config_A100x1x2 | 1767.69 | 0.8 | 0.8010 | 3756 | 151.41 |
| 1 machine, 4 GPUs | config_A100x1x4 | 1651.28 | 0.8 | 0.8017 | 3454 | 298.22 |
| 1 machine, 8 GPUs | config_A100x1x8 | 1648.99 | 0.92 | 0.9201 | 3397 | 586.92 |
| 2 machines, 8 GPUs each | config_A100x2x8 | 1453.51 | 0.92 | 0.9208 | 2760 | 1092.23 |

* Common metrics

| Metric | Value | Notes |
| -------------- | ------------------------------ | ------------------------------------------- |
| Task type | text classification, text generation | |
| Model | cpm | |
| Dataset | CPM-Finetune-data | |
| Data precision | precision, see "Performance metrics" | one of fp32/amp/fp16 |
| Hyperparameter changes | fix_hp, see "Performance metrics" | special hyperparameters needed to saturate the hardware when measuring throughput |
| Hardware device | nvidia A100 | |
| Hardware memory usage | mem (actual/total), see "Performance metrics" | commonly called "device memory", in GiB |
| End-to-end time | e2e_time, see "Performance metrics" | total time, including Perf initialization etc. |
| Overall throughput | p_whole, see "Performance metrics" | actual trained samples divided by total time (performance_whole) |
| Training throughput | p_train, see "Performance metrics" | excludes the evaluation time at the end of each epoch |
| **Compute throughput** | **p_core, see "Performance metrics"** | also excludes data-IO time (p_core > p_train > p_whole) |
| Training result | acc, see "Performance metrics" | classification accuracy (mlm_accuracy) |
| Additional changes | none | |

* Performance metrics

| Config | precision | fix_hp | e2e_time | p_whole | p_train | p_core | acc | mem |
| ------------------- | --------- | ---------------- | -------- | ------- | ------- | ------ | ----- | --------- |
| A100, 1 machine x 8 GPUs (1x8) | fp16 | / | 1641 | 587 | 835 | 1059 | 0.92 | 12.9/40.0 |
| A100, 1 machine x 8 GPUs (1x8) | fp16 | bs=128,lr=0.002 | 5469 | 771 | 1090 | 1292 | 0.918 | 23.1/40.0 |
| A100, single GPU (1x1) | fp16 | bs=192,lr=0.0005 | | 78.4 | 111.9 | 127.2 | | 34.8/40.0 |
| A100, 2 machines x 8 GPUs (2x8) | fp16 | bs=192,lr=0.0005 | | 1583 | 2221 | 2583.8 | | 29.9/40.0 |
2 changes: 1 addition & 1 deletion training/nvidia/cpm-pytorch/config/config_A100x1x1.py
@@ -9,7 +9,7 @@

train_batch_size = 32
eval_batch_size = train_batch_size
max_steps = 4000000
max_steps = 60000
max_samples_termination = 439126000

warmup = 0.2
4 changes: 2 additions & 2 deletions training/nvidia/cpm-pytorch/config/config_A100x2x8.py
@@ -8,9 +8,9 @@

gradient_accumulation_steps = 1

train_batch_size = 32
train_batch_size = 192
eval_batch_size = train_batch_size
max_steps = 10000
max_steps = 2000

warmup = 0.2
learning_rate = 0.0005