Merge pull request #484 from Anhforth/enable_deepspeed

BAAI-OpenPlatform · web-flow · commit f77521ae9bba · 2023-07-06T16:41:21.000+08:00
fixed deepspeed bugs
diff --git a/examples/Aquila/Aquila-chat/Aquila-chat-deepspeed.yaml b/examples/Aquila/Aquila-chat/Aquila-chat-deepspeed.yaml
@@ -0,0 +1,17 @@
+batch_size: 1
+gradient_accumulation_steps: 1
+lr: 3.0e-4
+warm_up: 0.01
+
+bmt_cpu_offload: False
+bmt_pre_load: True
+env_type: 'deepspeed+mpu'
+save_interval: 300
+log_interval: 10
+warm_up_iters: 100
+save_optim: True
+save_rng: True
+lora: False
+eps: 1.0e-8
+enable_sft_dataset_dir: './data/'
+enable_sft_dataset_file: 'convo_samples.jsonl'
diff --git a/examples/Aquila/Aquila-chat/Aquila-chat.yaml b/examples/Aquila/Aquila-chat/Aquila-chat.yaml
@@ -1,4 +1,4 @@
-batch_size: 4
+batch_size: 1
 gradient_accumulation_steps: 1
 lr: 3.0e-4
 warm_up: 0.01
@@ -13,4 +13,4 @@ save_rng: True
 lora: False
 eps: 1.0e-8
 enable_sft_dataset_dir: './data/'
-enable_sft_dataset_file: 'sft_v0.9.4_train.jsonl'
+enable_sft_dataset_file: 'convo_samples.jsonl'
diff --git a/examples/Aquila/Aquila-chat/README.md b/examples/Aquila/Aquila-chat/README.md
@@ -131,7 +131,7 @@ python generate_chat_bminf.py
     aquila-7b                模型名称，注意需要小写
     aquila_experiment        实验名称，可自定义
     ```
-
+    **如果启动deepspeed微调(在单张V100上运行微调为例)，上一步改为运行**
 
     **如果启动LoRA微调(在单张V100上运行微调为例)，上一步改为运行**
     ```
diff --git a/examples/Aquila/Aquila-chat/hostfile b/examples/Aquila/Aquila-chat/hostfile
@@ -1 +1 @@
-192.168.21.4 slots=4
+192.168.21.7 slots=2
diff --git a/examples/Aquila/deepspeed.json b/examples/Aquila/deepspeed.json
@@ -0,0 +1,48 @@
+{
+    "train_micro_batch_size_per_gpu": 64,
+    "gradient_accumulation_steps": 1,
+    "steps_per_print": 100,
+    "gradient_clipping": 1.0,
+    "zero_optimization": {
+      "stage": 2,
+      "contiguous_gradients": false,
+      "overlap_comm": true,
+      "reduce_scatter": true,
+      "reduce_bucket_size": 5e7,
+      "allgather_bucket_size": 5e7,
+      "cpu_offload": true
+    },
+    "scheduler": {
+      "type": "WarmupLR",
+      "params": {
+          "warmup_min_lr": 0,
+          "warmup_max_lr": 1e-5,
+          "warmup_num_steps": 2000
+      }
+   },
+    "zero_allow_untested_optimizer": true,
+    "fp16": {
+      "enabled": true,
+      "loss_scale": 0,
+      "loss_scale_window": 1000,
+      "hysteresis": 2,
+      "min_loss_scale": 1
+    },
+    "optimizer": {
+      "type": "Adam",
+      "params": {
+        "lr": 1e-5,
+        "weight_decay": 0.1,
+        "betas": [
+          0.9,
+          0.98
+        ],
+        "eps": 1e-6
+      }
+    },
+    "activation_checkpointing": {
+      "partition_activations": true,
+      "contiguous_memory_optimization": false
+    },
+    "wall_clock_breakdown": false
+  }
diff --git a/flagai/fp16/fp16.py b/flagai/fp16/fp16.py
@@ -79,8 +79,8 @@ def forward(self, *inputs, **kwargs):
     def named_parameters(self, prefix: str = '', recurse: bool = True):
         return self.module.named_parameters(prefix=prefix, recurse=recurse)
 
-    def parameters(self):
-        return self.module.parameters()
+    def parameters(self, recurse: bool = True):
+        return self.module.parameters(recurse=recurse)
 
     def state_dict(self, destination=None, prefix='', keep_vars=False):
         return self.module.state_dict(destination, prefix, keep_vars)
diff --git a/flagai/model/layers/feedforward.py b/flagai/model/layers/feedforward.py
@@ -42,7 +42,7 @@ def _initialize_affine_weight(weight,
         world_size = 1
     if world_size == 1:
         init_method(weight)
-        print(f"init weight {weight}")
+        # print(f"init weight {weight}")
         if return_master_weight:
             return weight
         return None
diff --git a/setup.py b/setup.py
@@ -18,12 +18,12 @@
     python_requires=">=3.8",
     install_requires=[
         'nltk==3.6.7',
-        'sentencepiece==0.1.96',
+        'sentencepiece>=0.1.96',
         'boto3==1.17.32',
         'pandas==1.3.5',
         'jieba==0.42.1',
         'scikit-learn==1.0.2',
-        'tensorboard==2.9.0',
+        'tensorboard>=2.9.0',
         'transformers>=4.20.1',
         'datasets>=2.0.0',
         'setuptools==66.0.0',
@@ -37,7 +37,7 @@
         'rouge-score',
         'sacrebleu==2.3.1',
         'jsonlines',
-        'accelerate==0.19.0',
+        'accelerate',
         'PyYAML==5.4.1',
         'safetensors',
     ]

| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
|  |  | `@@ -1 +1 @@` |
| `1` |  | `-192.168.21.4 slots=4` |
|  | `1` | `+192.168.21.7 slots=2` |