
Commit 2763b5d

920232796 and marscrazy authored

Opt 30b (#16)

* clean codes

Co-authored-by: Zac Liu <liuguang@baai.ac.cn>

1 parent bcf24a7 · commit 2763b5d

File tree

5 files changed: +18 -65 lines changed


examples/opt/generate_opt_30b.py
Lines changed: 0 additions & 2 deletions

```diff
@@ -1,15 +1,13 @@
 from flagai.model.predictor.predictor import Predictor
 from flagai.auto_model.auto_loader import AutoLoader
 import torch
-device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
 loader = AutoLoader(task_name="lm",
                     model_name="opt-30b-en")
 
 model = loader.get_model()
 tokenizer = loader.get_tokenizer()
 model.eval()
-model.to(device)
 
 text = "The trophy doesn’t fit in the suitcase because "
 predictor = Predictor(model, tokenizer)
```
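For context, FlagAI's single-GPU OPT examples normally finish with a sampling call on this `Predictor`. A minimal sketch of that last step, assuming the usual `predict_generate_randomsample` entry point; the length and sampling values are illustrative, not project defaults:

```python
# Hypothetical continuation of generate_opt_30b.py (not part of this
# commit): sample a completion for the prompt built above.
out = predictor.predict_generate_randomsample(text,
                                              out_max_length=100,
                                              top_k=30,
                                              top_p=0.9)
print(f"out is {out}")
```

The two deletions drop single-device placement: at 30B parameters the fp32 weights alone are roughly 120 GB (30e9 × 4 bytes), more than any single GPU holds, so placement has to happen elsewhere; the multi-GPU script below handles it per rank.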

examples/opt/opt_30b_en_mutigpu.py
Lines changed: 12 additions & 6 deletions

```diff
@@ -1,4 +1,4 @@
-# os.environ["CUDA_VISIBLE_DEVICES"] = "0,2"
+
 import torch
 import os
 import argparse
@@ -7,8 +7,11 @@
 import random
 import numpy as np
 from flagai.model.predictor.predictor import Predictor
+import glob
+import time
+
+# run script : python -m torch.distributed.launch --nproc_per_node=4 --nnodes=1 opt_30b_en_mutigpu.py
 
-# run script : python -m torch.distributed.launch --nproc_per_node=2 --nnodes=1 glm_blank_filling_QA_ch_mutigpu.py
 os.environ["ENV_TYPE"] = "deepspeed+mpu"
 model_parallel_size = 4
 world_size = 4
@@ -58,11 +61,15 @@ def initialize_distributed():
 
 set_random_seed(123)
 
-loader = AutoLoader("lm", model_name="opt-350m-en")
+
+print(f"building model...")
+loader = AutoLoader("lm", model_name="opt-30b-en")
 model = loader.get_model()
-model.half()
 tokenizer = loader.get_tokenizer()
-# model.parallel_output = False
+model.half()
+
+model.parallel_output = False
+
 model.eval()
 model.to(device)
 
@@ -75,4 +82,3 @@ def initialize_distributed():
 if mpu.get_model_parallel_rank() == 0:
     print(f"pred is {out}")
 
-
```
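The hunk headers show the script's enclosing context, `def initialize_distributed():`. For readers without the full file, here is a minimal sketch of what such a helper typically does under `torch.distributed.launch`; the NCCL backend, `env://` rendezvous, and `mpu.initialize_model_parallel` call are assumptions in the Megatron style that FlagAI's mpu follows, not a copy of the repo's implementation:

```python
import os
import torch
from flagai import mpu  # Megatron-style model-parallel utilities

def initialize_distributed(model_parallel_size=4, world_size=4):
    # torch.distributed.launch starts one process per GPU and exports
    # RANK / LOCAL_RANK for each of them.
    rank = int(os.getenv("RANK", "0"))
    local_rank = int(os.getenv("LOCAL_RANK", "0"))
    torch.cuda.set_device(local_rank)  # one process per GPU
    torch.distributed.init_process_group(backend="nccl",
                                         init_method="env://",
                                         world_size=world_size,
                                         rank=rank)
    # carve the world into model-parallel groups of the given size
    mpu.initialize_model_parallel(model_parallel_size)
```

With `model_parallel_size = world_size = 4`, each rank holds a quarter of the parameter matrices; after `model.half()` that is roughly 15 GB of weights per GPU for a 30B model (30e9 × 2 bytes ÷ 4), which is what makes the per-rank `model.to(device)` feasible here.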

flagai/model/gpt2_model.py
Lines changed: 1 addition & 17 deletions

```diff
@@ -11,12 +11,9 @@
 import torch.nn.functional as F
 
 if os.getenv('ENV_TYPE') == 'deepspeed+mpu':
-    from flagai.mpu import get_model_parallel_world_size
-    from flagai.mpu import get_cuda_rng_tracker
     from flagai.mpu.utils import divide
-if os.getenv('ENV_TYPE') == 'deepspeed+mpu':
     from flagai.mpu.random import checkpoint
-    from flagai.mpu import copy_to_model_parallel_region, gather_from_model_parallel_region
+    from flagai.mpu import copy_to_model_parallel_region, gather_from_model_parallel_region, get_model_parallel_world_size, get_cuda_rng_tracker
     from flagai.mpu.cross_entropy import vocab_parallel_cross_entropy
 
 elif os.getenv('ENV_TYPE') == 'deepspeed':
@@ -321,19 +318,6 @@ def forward(
             None,
         }
 
-        # lm_logits = self.lm_head(hidden_states)
-        # return_data = {"logits": lm_logits}
-        # if labels is not None:
-        #     # Shift so that tokens < n predict n
-        #     shift_logits = lm_logits[..., :-1, :].contiguous()
-        #     shift_labels = labels[..., 1:].contiguous()
-        #     loss_fct = nn.CrossEntropyLoss()
-        #     loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)),
-        #                     shift_labels.view(-1))
-        #     return_data["loss"] = loss
-
-        # return return_data
-
     def load_weights(self, checkpoint_path):
         checkpoint = torch.load(checkpoint_path,
                                 map_location=torch.device("cpu"))
```
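The second hunk deletes a commented-out loss computation that the live code path had already replaced. For reference, the pattern that dead code spelled out, the standard causal-LM objective where tokens < n predict token n, looks like this as a self-contained sketch (not code from this repo):

```python
import torch
import torch.nn as nn

def causal_lm_loss(lm_logits: torch.Tensor, labels: torch.Tensor) -> torch.Tensor:
    # Drop the last position's logits and the first label so that the
    # prediction at position i is scored against the token at i + 1.
    shift_logits = lm_logits[..., :-1, :].contiguous()
    shift_labels = labels[..., 1:].contiguous()
    loss_fct = nn.CrossEntropyLoss()
    return loss_fct(shift_logits.view(-1, shift_logits.size(-1)),
                    shift_labels.view(-1))
```

Under `ENV_TYPE='deepspeed+mpu'` the model computes this with `vocab_parallel_cross_entropy` over vocabulary shards instead, which is why that import survives the consolidation in the first hunk.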

flagai/model/opt_model.py
Lines changed: 0 additions & 37 deletions

```diff
@@ -265,43 +265,6 @@ def __init__(self, config, **kwargs):
         # self.config = config
         self.transformer = OPTStack(self.config)
 
-    # def forward(
-    #     self,
-    #     **data,
-    # ):
-    #     input_ids = data.get("input_ids", None)
-    #     # attention_mask = data.get("attention_mask", None)
-    #     # position_ids = data.get("position_ids", None)
-    #     labels = data.get("labels", None)
-    #     use_cache = data.get("use_cache", None)
-    #     output_attentions = data.get("output_attentions", None)
-    #     output_hidden_states = data.get("output_hidden_states", True)
-    #
-    #     transformer_outputs = self.transformer(
-    #         input_ids,
-    #         attention_mask=None,
-    #         position_ids=None,
-    #         use_cache=use_cache,
-    #         output_attentions=output_attentions,
-    #         output_hidden_states=output_hidden_states,
-    #     )
-    #     hidden_states = transformer_outputs
-    #
-    #     lm_logits = self.lm_head(hidden_states)
-    #
-    #     return_data = {"logits": lm_logits}
-    #     if labels is not None:
-    #         # Shift so that tokens < n predict n
-    #         shift_logits = lm_logits[..., :-1, :].contiguous()
-    #         shift_labels = labels[..., 1:].contiguous()
-    #         loss_fct = nn.CrossEntropyLoss()
-    #         loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)),
-    #                         shift_labels.view(-1))
-    #         return_data["loss"] = loss
-    #
-    #     return return_data
-
-
     def load_weights(self, checkpoint_path):
         checkpoint = torch.load(checkpoint_path,
                                 map_location=torch.device("cpu"))
```

flagai/mp_tools.py
Lines changed: 5 additions & 3 deletions

```diff
@@ -219,8 +219,6 @@ def change_pytorch_model_mp_from_1_to_n_new(model_name_brief, checkpoint: str, t
         start = ratio * i
         end = ratio * (i + 1)
         d = torch.load(filenames[i], map_location='cpu')
-        if d.get("module", None) is None:
-            d["module"] = d
 
         for j in range(start, end):
             d_new = {}
@@ -235,7 +233,11 @@ def change_pytorch_model_mp_from_1_to_n_new(model_name_brief, checkpoint: str, t
                 d_new[k] = None
             d_new['module'] = {}
             with torch.no_grad():
-                for k, v in d['module'].items():
+
+                if "module" in d:
+                    d = d["module"]
+
+                for k, v in d.items():
                     assert len(v.shape) < 3
                     flag = 0
                     for keys in trans_keys:
```
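The net effect of the two hunks: the old shim aliased a raw state dict into its own `"module"` key (`d["module"] = d`), so iterating `d['module'].items()` would also visit the `"module"` entry itself, a dict with no `.shape`, and trip the assert. The new code instead unwraps DeepSpeed-style checkpoints once, up front. A standalone sketch of that normalization pattern, using a hypothetical helper name that is not part of `flagai.mp_tools`:

```python
import torch

def load_plain_state_dict(path: str) -> dict:
    """Return a flat {param_name: tensor} mapping, whether the file is a
    raw state dict or a DeepSpeed-style {"module": state_dict} wrapper.
    Illustrative helper, not a function in this repo."""
    d = torch.load(path, map_location="cpu")
    if "module" in d:  # DeepSpeed / engine-wrapped checkpoint
        d = d["module"]
    return d
```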
