@@ -398,7 +398,7 @@ def forward(self, query_layer, key_layer, value_layer, attention_mask, alibi):
                                    key_layer.size(0))
 
         # [sq, b, np, hn] -> [b, np * sq, hn]
-        query_layer = query_layer.permute([1, 2, 0, 3]).reshape(bs, np * sq, -1)
+        query_layer = query_layer.transpose(0, 1).reshape(bs, np * sq, -1)
         # [sk, b, 1, hn] -> [b, hn, sk]
         key_layer = key_layer.squeeze(2).permute(1, 2, 0)
         # [sk, b, 1, hn] -> [sk, b * np, hn]
@@ -439,8 +439,8 @@ def forward(self, query_layer, key_layer, value_layer, attention_mask, alibi):
             key_layer,
             beta=beta, alpha=(1.0 / self.norm_factor))
 
-        # change view to [b, np, sq, sk]
-        attention_scores = matmul_result.view(bs, np, sq, sk)
+        attention_scores = matmul_result.view(bs, sq, np, sk)
+        attention_mask = attention_mask.transpose(1, 2)
 
         # ===========================
         # Attention probs and dropout
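The two hunks above change the flattening order of the query before the batched matmul: the old permute([1, 2, 0, 3]) produced a head-major [b, np * sq, hn] layout, while the new transpose(0, 1) produces a sequence-major [b, sq * np, hn] layout, which is why the raw scores are now viewed as [b, sq, np, sk] and the mask is transposed so its sequence and head dimensions line up with that layout. A minimal sketch of the relationship between the two layouts, with hypothetical sizes and num_heads standing in for np (not code from the commit):

import torch

sq, sk, b, num_heads, hn = 3, 3, 2, 4, 5
query = torch.randn(sq, b, num_heads, hn)   # [sq, b, np, hn]
key = torch.randn(b, hn, sk)                # [b, hn, sk] after squeeze/permute

old_q = query.permute(1, 2, 0, 3).reshape(b, num_heads * sq, hn)  # head-major rows
new_q = query.transpose(0, 1).reshape(b, num_heads * sq, hn)      # sequence-major rows

old_scores = torch.bmm(old_q, key).view(b, num_heads, sq, sk)     # [b, np, sq, sk]
new_scores = torch.bmm(new_q, key).view(b, sq, num_heads, sk)     # [b, sq, np, sk]

# Same scores, only the head and sequence dimensions are swapped.
assert torch.allclose(old_scores, new_scores.transpose(1, 2))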
@@ -482,15 +482,7 @@ def forward(self, query_layer, key_layer, value_layer, attention_mask, alibi):
         context_layer = torch.bmm(attention_probs, value_layer)
 
         # change view [b, np, sq, hn]
-        context_layer = context_layer.view(bs, np, sq, -1)
-
-        # [b, np, sq, hn] --> [sq, b, np, hn]
-        context_layer = context_layer.permute(2, 0, 1, 3).contiguous()
-
-        # [sq, b, np, hn] --> [sq, b, hp]
-        new_context_layer_shape = context_layer.size()[:-2] + \
-            (self.hidden_size_per_partition,)
-        context_layer = context_layer.view(*new_context_layer_shape)
+        context_layer = context_layer.view(bs, sq, -1).transpose(0, 1)
 
         return context_layer
 
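The last hunk collapses the old view -> permute -> contiguous -> view chain into a single view plus transpose. The sketch below (again with hypothetical sizes, not code from the commit) checks that the two recipes agree, under the assumption that the new bmm output is sequence-major (unflattening to [b, sq, np, hn]) where the old one was head-major (unflattening to [b, np, sq, hn]):

import torch

b, num_heads, sq, hn = 2, 4, 3, 5              # num_heads stands in for np
hp = num_heads * hn                            # hidden_size_per_partition

ctx = torch.randn(b, sq, num_heads, hn)        # per-head outputs, sequence-major

# Old chain, fed the head-major flattening of the same values:
old_flat = ctx.permute(0, 2, 1, 3).reshape(b, num_heads * sq, hn)
old_out = (old_flat.view(b, num_heads, sq, hn)
                   .permute(2, 0, 1, 3)
                   .contiguous()
                   .view(sq, b, hp))

# New chain, fed the sequence-major flattening:
new_flat = ctx.reshape(b, sq * num_heads, hn)
new_out = new_flat.view(b, sq, -1).transpose(0, 1)

assert torch.equal(old_out, new_out)           # both are [sq, b, hp]

One observable difference is that the new form returns a non-contiguous tensor (the old chain called .contiguous()), which only matters if the caller requires a contiguous layout.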