refine cpm #179

Merged · 1 commit · Aug 4, 2023
6 changes: 3 additions & 3 deletions training/benchmarks/cpm/README.md
@@ -24,9 +24,9 @@ Permission is hereby granted, free of charge, to any person obtaining a copy of
> No preprocessing required

#### Model checkpoint
[Download page](https://model.baai.ac.cn/model-detail/100017)
Under the "Files and versions" tab: pytorch_model.bin
Number of parameters: 2.6B
[Download page](https://model.baai.ac.cn/model-detail/100105)
Under the "Files and versions" tab: cpm_model_states_medium.pt
Number of parameters: 0.33B

### Framework and chip support
|            | Pytorch | Paddle | TensorFlow2 |
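For quick verification of the new checkpoint, a minimal sketch (not part of this PR) that loads the downloaded file and counts its parameters; the `module` key is an assumption about the dump layout, not something shown in the diff:

```python
import torch

# Load the medium CPM checkpoint on CPU and count its parameters.
# NOTE: the "module" key is an assumption (Megatron/DeepSpeed-style dumps often
# nest the weights under it); fall back to the top-level dict if it is absent.
state = torch.load("cpm_model_states_medium.pt", map_location="cpu")
weights = state.get("module", state) if isinstance(state, dict) else state

total = sum(v.numel() for v in weights.values() if torch.is_tensor(v))
print(f"tensors: {len(weights)}, parameters: {total / 1e9:.2f}B")  # expect roughly 0.33B
```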
7 changes: 5 additions & 2 deletions training/benchmarks/cpm/pytorch/config/_base.py
@@ -1,6 +1,11 @@
# required parameters
vendor: str = None
# model name
name: str = "CPM"
# Training data dir
data_dir: str = None
cudnn_benchmark: bool = False
cudnn_deterministic: bool = True

# random seed
seed: int = 1234
@@ -91,8 +96,6 @@
no_save_rng: bool = False

## data args
# Training data dir
data_dir: str = "/mnt/data/cpm/train/"

# path used to save/load sentencepiece tokenization models
tokenizer_path: str = "bpe_3w_new/"
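The hunk above promotes `vendor` and `data_dir` to required parameters with `None` defaults (the hard-coded `/mnt/data/cpm/train/` default is dropped further down), so they must be supplied at launch time, e.g. by a vendor config or the command line. A purely illustrative sketch of how a vendor config module could override these module-level attributes; this is an assumption for illustration, not the actual FlagPerf driver code:

```python
import importlib

def apply_vendor_config(base, vendor_module_name: str):
    """Copy public attributes from a vendor config module onto the base config."""
    vendor = importlib.import_module(vendor_module_name)  # e.g. "config_A100x1x1"
    for key in dir(vendor):
        if not key.startswith("_"):
            setattr(base, key, getattr(vendor, key))  # vendor values win

    # both required parameters must be populated before training starts
    assert base.vendor is not None, "vendor must be set"
    assert base.data_dir is not None, "data_dir must be set"
    return base
```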
66 changes: 24 additions & 42 deletions training/benchmarks/cpm/pytorch/run_pretraining.py
@@ -1,18 +1,9 @@
"""CPM Pretraining"""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import argparse
import os
import random
import sys
import time

import numpy as np
import torch

import config
from dataloaders.tokenization_gpt2 import GPT2Tokenizer
from dataloaders.dataloader import load_data
from train.evaluator import Evaluator
@@ -22,38 +13,27 @@

CURR_PATH = os.path.abspath(os.path.dirname(__file__))
sys.path.append(os.path.abspath(os.path.join(CURR_PATH, "../../")))
from driver import Driver, Event, dist_pytorch, check
from driver import Event, dist_pytorch
from driver.helper import InitHelper

logger = None


def main():
import config
from config import mutable_params
global logger
global config

if config.use_env and 'LOCAL_RANK' in os.environ:
config.local_rank = int(os.environ['LOCAL_RANK'])

cpm_driver = Driver(config, config.mutable_params)
cpm_driver.setup_config(argparse.ArgumentParser("CPM"))
cpm_driver.setup_modules(globals(), locals())

init_helper = InitHelper(config)
cpm_driver = init_helper.init_driver(globals(), locals())
logger = cpm_driver.logger
dist_pytorch.init_dist_training_env(config)

check.check_config(config)

dist_pytorch.barrier(config.vendor)
cpm_driver.event(Event.INIT_START)
init_start_time = logger.previous_log_time

random.seed(config.seed)
np.random.seed(config.seed)
torch.manual_seed(config.seed)
if torch.cuda.is_available():
torch.cuda.manual_seed(config.seed)

init_helper.set_seed(config.seed, config.vendor)
# get the tokenizer
base_path = os.path.abspath(os.path.dirname(__file__))
tokenizer = GPT2Tokenizer(
@@ -70,7 +50,6 @@ def main():

evaluator = Evaluator(config, eval_dataloader)
training_state = TrainingState()
# trainer = Trainer(config, training_event, evaluator, training_state, device=device)
trainer = Trainer(driver=cpm_driver,
adapter=trainer_adapter,
evaluator=evaluator,
@@ -92,53 +71,56 @@ def main():
eval_loss=training_state.eval_avg_loss,
eval_embedding_average=training_state.eval_embedding_average,
time=init_evaluation_end - init_evaluation_start)
# training_event.on_init_evaluate(init_evaluation_info)
cpm_driver.event(Event.INIT_EVALUATION, init_evaluation_info)

if not config.do_train:
return config, training_state

# training_event.on_init_end()
cpm_driver.event(Event.INIT_END)
init_end_time = logger.previous_log_time
training_state.init_time = (init_end_time - init_start_time) / 1e+3

dist_pytorch.barrier(config.vendor)
epoch = -1
# training_event.on_train_begin()

cpm_driver.event(Event.TRAIN_START)
raw_train_start_time = logger.previous_log_time
train_start_time = time.time()
epoch = 0
while training_state.global_steps < config.max_steps and not training_state.end_training:
epoch += 1
training_state.epoch = epoch
trainer.train_one_epoch(train_dataloader)
# training_event.on_train_end()
epoch += 1
cpm_driver.event(Event.TRAIN_END)
raw_train_end_time = logger.previous_log_time
training_state.raw_train_time = (raw_train_end_time -
raw_train_start_time) / 1e+3
training_state.raw_train_time = time.time() - train_start_time
return config, training_state


if __name__ == "__main__":
now = time.time()
config, state = main()
config_updated, state = main()

if not dist_pytorch.is_main_process():
exit()

e2e_time = time.time() - now
training_perf = (dist_pytorch.global_batch_size(config) *
training_perf = (dist_pytorch.global_batch_size(config_updated) *
state.global_steps) / state.raw_train_time
if config.do_train:
if config_updated.do_train:
finished_info = {
"e2e_time": e2e_time,
"training_sequences_per_second": training_perf,
"converged": state.converged,
"final_loss": state.eval_avg_loss,
"final_mlm_accuracy": state.eval_embedding_average,
"raw_train_time": state.raw_train_time,
"init_time": state.init_time,
"raw_train_time": state.raw_train_time,
"train_no_eval_time": state.no_eval_time,
"pure_training_computing_time": state.pure_compute_time,
"throughput(ips)_raw":
state.num_trained_samples / state.raw_train_time,
"throughput(ips)_no_eval":
state.num_trained_samples / state.no_eval_time,
"throughput(ips)_pure_compute":
state.num_trained_samples / state.pure_compute_time,
}
else:
finished_info = {"e2e_time": e2e_time}
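With the counters added in this file, `finished_info` now reports three nested throughput figures. A small stand-alone sketch of how they relate, using a stand-in for the relevant `TrainingState` fields and hypothetical numbers:

```python
from dataclasses import dataclass

@dataclass
class TimingFields:
    """Stand-in for the TrainingState fields used by finished_info."""
    num_trained_samples: int
    raw_train_time: float     # whole training loop (wall clock)
    no_eval_time: float       # loop time minus periodic evaluation
    pure_compute_time: float  # forward/backward only, after host-to-device copies

def throughputs(s: TimingFields) -> dict:
    # raw_train_time >= no_eval_time >= pure_compute_time,
    # so ips_raw <= ips_no_eval <= ips_pure_compute
    return {
        "throughput(ips)_raw": s.num_trained_samples / s.raw_train_time,
        "throughput(ips)_no_eval": s.num_trained_samples / s.no_eval_time,
        "throughput(ips)_pure_compute": s.num_trained_samples / s.pure_compute_time,
    }

# hypothetical numbers, not taken from a benchmark run
print(throughputs(TimingFields(100_000, 170.0, 120.0, 95.0)))
```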
@@ -14,7 +14,6 @@
# limitations under the License.
"""PyTorch DataLoader for TFRecords"""

import torch
from torch.optim.lr_scheduler import _LRScheduler
import math

7 changes: 0 additions & 7 deletions training/benchmarks/cpm/pytorch/train/evaluator.py
@@ -1,16 +1,9 @@
import os
import sys
import imp
import torch

import numpy as np
from train.metrics import average_corpus_level
from model.losses.cross_entropy import cross_entropy
from torch.nn import CrossEntropyLoss
from model.fp16 import FP16_Module

CURR_PATH = os.path.abspath(os.path.dirname(__file__))
sys.path.append(os.path.abspath(os.path.join(CURR_PATH, "../../../")))
from driver import dist_pytorch


6 changes: 0 additions & 6 deletions training/benchmarks/cpm/pytorch/train/metrics.py
@@ -1,12 +1,6 @@
from __future__ import division
from __future__ import absolute_import
from __future__ import unicode_literals
from __future__ import print_function

import numpy as np
import collections

import torch

__all__ = [
"CorpusLevelScore",
29 changes: 12 additions & 17 deletions training/benchmarks/cpm/pytorch/train/trainer.py
@@ -1,7 +1,5 @@
import math
import time
import os
import sys

import torch
from torch.types import Device
@@ -13,9 +11,6 @@
from train.training_state import TrainingState
from model.losses.cross_entropy import cross_entropy
from model.fp16 import FP16_Module

CURR_PATH = os.path.abspath(os.path.dirname(__file__))
sys.path.append(os.path.abspath(os.path.join(CURR_PATH, "../../../")))
from driver import Driver, Event, dist_pytorch


@@ -66,22 +61,22 @@ def _init_model(self, model, device):
def train_one_epoch(self, dataloader):
state = self.training_state
driver = self.driver
#training_event = self.training_event
driver.event(Event.EPOCH_BEGIN, state.epoch)
#training_event.on_epoch_begin(state.epoch)

step_start_time = time.time()

for _, data in enumerate(dataloader):
no_eval_start_time = time.time()
batch, no_model_batch = data[0], data[1]

state.global_steps += 1
state.num_trained_samples = state.global_steps * dist_pytorch.global_batch_size(
self.config)

#self.training_event.on_step_begin(state.global_steps)
driver.event(Event.STEP_BEGIN, step=state.global_steps)
self.train_one_step(batch, no_model_batch)

self.training_state.no_eval_time += time.time(
) - no_eval_start_time
other_state = dict()
if state.global_steps % self.config.gradient_accumulation_steps == 0:
step_end_time = time.time()
@@ -107,20 +102,17 @@ def train_one_epoch(self, dataloader):
end_training = self.detect_training_status(state)

step_info = state.to_dict(**other_state)
#self.training_event.on_step_end(state.global_steps, result=step_info)
driver.event(Event.STEP_END,
message=step_info,
step=state.global_steps,
loss=state.loss)

if eval_result is not None:
#self.training_event.on_evaluate(eval_result)
driver.event(Event.EVALUATE, eval_result)

if end_training:
break

#training_event.on_epoch_end(state.epoch)
driver.event(Event.EPOCH_END, state.epoch)

def train_one_step(self, batch, no_model_batch):
@@ -129,6 +121,7 @@ def train_one_step(self, batch, no_model_batch):
for k in no_model_batch:
no_model_batch[k] = no_model_batch[k].to(self.device)

pure_compute_start_time = time.time()
state = self.training_state
self.model.train()

@@ -142,6 +135,12 @@ def train_one_step(self, batch, no_model_batch):
loss = torch.sum(losses.view(-1) * loss_mask) / loss_mask.sum()
state.loss = loss

self.adapter.backward(self.config, state.global_steps, state.loss,
self.optimizer)
self.training_state.pure_compute_time += time.time(
) - pure_compute_start_time

# calculate output
preds = torch.argmax(output, -1)
if isinstance(self.model.module, FP16_Module):
embeddings = self.model.module.module.word_embeddings.weight
@@ -155,11 +154,7 @@ def train_one_step(self, batch, no_model_batch):
embeddings.cpu().detach(),
no_model_batch["loss_mask"].cpu().detach())
state.embedding_average = float(embedding_average.mean)
#loss.backward()
#self.optimizer.step()
self.adapter.backward(self.config, state.global_steps, state.loss,
self.optimizer)
#self.training_event.on_backward(state.global_steps, state.loss, self.optimizer)

self.driver.event(Event.BACKWARD, state.global_steps, state.loss,
self.optimizer)
self.lr_scheduler.step()
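The trainer changes above thread two per-step timers through the epoch loop. A condensed sketch of that pattern (driver events, metric bookkeeping, and periodic evaluation are elided):

```python
import time

def timed_epoch_sketch(state, dataloader, run_step):
    """How no_eval_time and pure_compute_time accumulate per step (simplified)."""
    for batch, no_model_batch in dataloader:
        no_eval_start = time.time()       # everything except end-of-step evaluation

        # ... host-to-device copies of batch / no_model_batch happen here ...

        compute_start = time.time()
        run_step(batch, no_model_batch)   # forward, loss, adapter.backward
        state.pure_compute_time += time.time() - compute_start

        # ... accuracy / embedding-average bookkeeping happens here ...
        state.no_eval_time += time.time() - no_eval_start

        # ... periodic evaluation (counted in neither timer) would follow ...
```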
2 changes: 2 additions & 0 deletions training/benchmarks/cpm/pytorch/train/training_state.py
@@ -26,6 +26,8 @@ class TrainingState:

init_time = 0
raw_train_time = 0
no_eval_time = 0
pure_compute_time = 0

def status(self):
if self.converged:
33 changes: 26 additions & 7 deletions training/nvidia/cpm-pytorch/README.md
@@ -22,10 +22,29 @@

#### Run results

| Training resources | Config file | Runtime (s) | Target accuracy | Converged accuracy | Steps | Performance (samples/s) |
| ------------------ | --------------- | ----------- | --------------- | ------------------ | ----- | ----------------------- |
| 1 machine, 1 GPU | config_A100x1x1 | 2016.20 | 0.8 | 0.8041 | 4375 | 77.65 |
| 1 machine, 2 GPUs | config_A100x1x2 | 1767.69 | 0.8 | 0.8010 | 3756 | 151.41 |
| 1 machine, 4 GPUs | config_A100x1x4 | 1651.28 | 0.8 | 0.8017 | 3454 | 298.22 |
| 1 machine, 8 GPUs | config_A100x1x8 | 1648.99 | 0.92 | 0.9201 | 3397 | 586.92 |
| 2 machines, 8 GPUs each | config_A100x2x8 | 1453.51 | 0.92 | 0.9208 | 2760 | 1092.23 |

* Common metrics

| Metric | Value | Notes |
| -------------- | ------------------------------ | ------------------------------------------- |
| Task type | text classification, text generation | |
| Model | cpm | |
| Dataset | CPM-Finetune-data | |
| Data precision | precision, see "Performance metrics" | one of fp32/amp/fp16 |
| Hyperparameter changes | fix_hp, see "Performance metrics" | special hyperparameters needed to saturate the hardware when measuring throughput |
| Hardware device | nvidia A100 | |
| Hardware memory usage | mem (actual/total), see "Performance metrics" | commonly called "device memory", in GiB |
| End-to-end time | e2e_time, see "Performance metrics" | total time, including Perf initialization etc. |
| Overall throughput | p_whole, see "Performance metrics" | actual trained samples divided by total time (performance_whole) |
| Training throughput | p_train, see "Performance metrics" | excludes the evaluation time at the end of each epoch |
| **Compute throughput** | **p_core, see "Performance metrics"** | also excludes data-IO time (p_core > p_train > p_whole) |
| Training result | acc, see "Performance metrics" | classification accuracy (mlm_accuracy) |
| Additional changes | none | |

* Performance metrics

| Config | precision | fix_hp | e2e_time | p_whole | p_train | p_core | acc | mem |
| ------------------- | --------- | ---------------- | -------- | ------- | ------- | ------ | ----- | --------- |
| A100, 1 machine x 8 GPUs (1x8) | fp16 | / | 1641 | 587 | 835 | 1059 | 0.92 | 12.9/40.0 |
| A100, 1 machine x 8 GPUs (1x8) | fp16 | bs=128,lr=0.002 | 5469 | 771 | 1090 | 1292 | 0.918 | 23.1/40.0 |
| A100, single GPU (1x1) | fp16 | bs=192,lr=0.0005 | | 78.4 | 111.9 | 127.2 | | 34.8/40.0 |
| A100, 2 machines x 8 GPUs (2x8) | fp16 | bs=192,lr=0.0005 | | 1583 | 2221 | 2583.8 | | 29.9/40.0 |
2 changes: 1 addition & 1 deletion training/nvidia/cpm-pytorch/config/config_A100x1x1.py
@@ -9,7 +9,7 @@

train_batch_size = 32
eval_batch_size = train_batch_size
max_steps = 4000000
max_steps = 60000
max_samples_termination = 439126000

warmup = 0.2
4 changes: 2 additions & 2 deletions training/nvidia/cpm-pytorch/config/config_A100x2x8.py
@@ -8,9 +8,9 @@

gradient_accumulation_steps = 1

train_batch_size = 32
train_batch_size = 192
eval_batch_size = train_batch_size
max_steps = 10000
max_steps = 2000

warmup = 0.2
learning_rate = 0.0005