Add GLM-4 and Later GLM Model (Draft) #31977

Closed
wants to merge 86 commits
Changes from 4 commits
Commits (86)
9cf74d7
add GLM-4
zRzRzRzRzRzRzR Jul 11, 2024
bef7fd9
GLM-4 FastTokenizer
zRzRzRzRzRzRzR Jul 11, 2024
c986fac
tokenizer fix
zRzRzRzRzRzRzR Jul 11, 2024
2da5d32
rename
zRzRzRzRzRzRzR Jul 11, 2024
675e7a1
pad token
zRzRzRzRzRzRzR Jul 11, 2024
304e4ef
Merge branch 'huggingface:main' into glm-4
zRzRzRzRzRzRzR Jul 11, 2024
0b241f2
Merge branch 'huggingface:main' into glm-4
zRzRzRzRzRzRzR Jul 12, 2024
fa44041
Fix past_key_values
duzx16 Jul 14, 2024
24dec6b
Merge branch 'huggingface:main' into glm-4
zRzRzRzRzRzRzR Jul 14, 2024
5d2bf5e
Merge branch 'glm-4' of github.com:zRzRzRzRzRzRzR/transformers into g…
duzx16 Jul 14, 2024
63d49c9
Fix flash attention
duzx16 Jul 14, 2024
0a5adf3
Merge branch 'huggingface:main' into glm-4
zRzRzRzRzRzRzR Jul 15, 2024
51cbf5d
add update
zRzRzRzRzRzRzR Jul 15, 2024
86b5004
Merge branch 'glm-4' of https://github.com/zRzRzRzRzRzRzR/transformer…
zRzRzRzRzRzRzR Jul 15, 2024
9a553e5
test with glm
zRzRzRzRzRzRzR Jul 15, 2024
4d45b21
fix test
zRzRzRzRzRzRzR Jul 15, 2024
85cfe41
add discription
zRzRzRzRzRzRzR Jul 15, 2024
860c7ee
Merge branch 'huggingface:main' into glm-4
zRzRzRzRzRzRzR Jul 15, 2024
c83ec2d
update glm
zRzRzRzRzRzRzR Jul 16, 2024
2608010
Merge branch 'huggingface:main' into glm-4
zRzRzRzRzRzRzR Jul 16, 2024
1719000
Merge branch 'huggingface:main' into glm-4
zRzRzRzRzRzRzR Jul 18, 2024
3f0452e
rewrite tokenizer
zRzRzRzRzRzRzR Jul 18, 2024
33d2ca3
Merge branch 'huggingface:main' into glm-4
zRzRzRzRzRzRzR Jul 19, 2024
084988e
fix some test
zRzRzRzRzRzRzR Jul 19, 2024
0cb1531
fix testing
zRzRzRzRzRzRzR Jul 19, 2024
e49718f
Fix RMSNorm initialization
duzx16 Jul 20, 2024
a362206
Fix position ids when passing input_embeds
duzx16 Jul 20, 2024
08b43d9
Merge branch 'huggingface:main' into glm-4
zRzRzRzRzRzRzR Jul 20, 2024
3c5322d
Merge branch 'huggingface:main' into glm-4
zRzRzRzRzRzRzR Jul 23, 2024
dd06993
Merge branch 'huggingface:main' into glm-4
zRzRzRzRzRzRzR Jul 24, 2024
8cc0381
Fix dtype error
duzx16 Jul 24, 2024
a35997e
Merge branch 'glm-4' of github.com:zRzRzRzRzRzRzR/transformers into g…
duzx16 Jul 24, 2024
621d32f
Fix output_layer for classification models
duzx16 Jul 24, 2024
48d1704
fix gradient
zRzRzRzRzRzRzR Jul 24, 2024
5881ed5
remove some skip test
zRzRzRzRzRzRzR Jul 24, 2024
c920ad9
fix small test
zRzRzRzRzRzRzR Jul 24, 2024
21781b3
Fix prepare_inputs_for_generation
duzx16 Jul 24, 2024
9599200
Merge branch 'glm-4' of github.com:zRzRzRzRzRzRzR/transformers into g…
duzx16 Jul 24, 2024
a9b1d0d
fix
zRzRzRzRzRzRzR Jul 25, 2024
0631615
Merge branch 'huggingface:main' into glm-4
zRzRzRzRzRzRzR Jul 25, 2024
9f33751
add converter
zRzRzRzRzRzRzR Jul 25, 2024
2663a13
fix PEP 8
zRzRzRzRzRzRzR Jul 25, 2024
aad19db
remove test
zRzRzRzRzRzRzR Jul 25, 2024
1e9183c
index
zRzRzRzRzRzRzR Jul 25, 2024
e8b90a1
fix doctested
zRzRzRzRzRzRzR Jul 25, 2024
65e1996
remove init
zRzRzRzRzRzRzR Jul 25, 2024
266ce77
fix copied error
zRzRzRzRzRzRzR Jul 25, 2024
cd9c304
fix mlp differ
zRzRzRzRzRzRzR Jul 25, 2024
ba30dad
fix copied eerror
zRzRzRzRzRzRzR Jul 25, 2024
afb1423
Merge branch 'huggingface:main' into glm-4
zRzRzRzRzRzRzR Jul 25, 2024
48aaba1
test_hidden_states_output = False
zRzRzRzRzRzRzR Jul 25, 2024
33d976f
Merge branch 'glm-4' of https://github.com/zRzRzRzRzRzRzR/transformer…
zRzRzRzRzRzRzR Jul 25, 2024
0675202
fix
zRzRzRzRzRzRzR Jul 25, 2024
19b0939
Update modeling_glm.py
zRzRzRzRzRzRzR Jul 25, 2024
b2b6c0f
Update __init__.py
zRzRzRzRzRzRzR Jul 25, 2024
6760791
fix glm type error
zRzRzRzRzRzRzR Jul 25, 2024
515d9d9
fix
zRzRzRzRzRzRzR Jul 25, 2024
9951c92
ruff problem
zRzRzRzRzRzRzR Jul 25, 2024
547ac95
Update convert_slow_tokenizer.py
zRzRzRzRzRzRzR Jul 25, 2024
9ba6cf7
Add explanations in English
zRzRzRzRzRzRzR Jul 25, 2024
9fb6405
reformate
zRzRzRzRzRzRzR Jul 25, 2024
e37bb49
Merge branch 'huggingface:main' into glm-4
zRzRzRzRzRzRzR Jul 25, 2024
25aec29
Update configuration_glm.py
zRzRzRzRzRzRzR Jul 25, 2024
58d344a
Merge branch 'glm-4' of https://github.com/zRzRzRzRzRzRzR/transformer…
zRzRzRzRzRzRzR Jul 25, 2024
073b811
fix
zRzRzRzRzRzRzR Jul 25, 2024
c0e6ae9
Merge branch 'huggingface:main' into glm-4
zRzRzRzRzRzRzR Jul 25, 2024
6ac085f
fix glm dummy
zRzRzRzRzRzRzR Jul 25, 2024
f140603
Merge branch 'glm-4' of https://github.com/zRzRzRzRzRzRzR/transformer…
zRzRzRzRzRzRzR Jul 25, 2024
65f471d
add doc
zRzRzRzRzRzRzR Jul 26, 2024
7ad819f
fix init
zRzRzRzRzRzRzR Jul 26, 2024
f86af8e
Update __init__.py
zRzRzRzRzRzRzR Jul 26, 2024
c179377
Update dummy_vision_objects.py
zRzRzRzRzRzRzR Jul 26, 2024
41338d7
add_start_docstrings
zRzRzRzRzRzRzR Jul 26, 2024
dba6d1e
fix GLM_START_DOCSTRING
zRzRzRzRzRzRzR Jul 26, 2024
82b0c7f
1
zRzRzRzRzRzRzR Jul 26, 2024
a6b6f4e
Update perf_infer_gpu_one.md
zRzRzRzRzRzRzR Jul 26, 2024
d1a5ee1
Merge branch 'huggingface:main' into glm-4
zRzRzRzRzRzRzR Jul 26, 2024
c99610e
Merge branch 'huggingface:main' into glm-4
zRzRzRzRzRzRzR Jul 27, 2024
b283adc
flash attn
zRzRzRzRzRzRzR Jul 27, 2024
4cc618e
stiil need fix rotary_emb
zRzRzRzRzRzRzR Jul 27, 2024
b476dd0
fix GLMSelfAttension
zRzRzRzRzRzRzR Jul 27, 2024
aab2386
remove _get_unpad_data
zRzRzRzRzRzRzR Jul 27, 2024
550a692
fix GLMSelfAttention
zRzRzRzRzRzRzR Jul 27, 2024
6492ac3
Merge branch 'huggingface:main' into glm-4
zRzRzRzRzRzRzR Jul 30, 2024
c3d4636
Merge branch 'huggingface:main' into glm-4
zRzRzRzRzRzRzR Aug 9, 2024
70b7ff4
Merge branch 'huggingface:main' into glm-4
zRzRzRzRzRzRzR Aug 21, 2024
23 changes: 12 additions & 11 deletions src/transformers/models/glm/modeling_glm.py
@@ -61,7 +61,6 @@ def _config_to_kwargs(args):
return common_kwargs


# Copied from transformers.models.llama.modeling_llama._get_unpad_data
def _get_unpad_data(attention_mask):
seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
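As an aside on this hunk: the rest of the function body is truncated in this view. For readers who want the shape of the computation, here is a minimal, self-contained sketch of what an unpadding helper like this typically returns for FlashAttention's variable-length kernels, modelled on the Llama helper named in the removed `# Copied from` comment; the function name, example mask, and printed values below are illustrative rather than taken from this PR.

```python
import torch
import torch.nn.functional as F


def _get_unpad_data_sketch(attention_mask: torch.Tensor):
    # Number of real (non-padding) tokens in each sequence of the batch.
    seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
    # Flat indices of the non-padding positions, used to gather/scatter tokens.
    indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
    max_seqlen_in_batch = seqlens_in_batch.max().item()
    # Cumulative sequence lengths with a leading zero: the layout flash-attn's
    # varlen kernels expect for packed (unpadded) batches.
    cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))
    return indices, cu_seqlens, max_seqlen_in_batch


# Example: a batch of two right-padded sequences of lengths 3 and 2.
mask = torch.tensor([[1, 1, 1, 0], [1, 1, 0, 0]])
indices, cu_seqlens, max_len = _get_unpad_data_sketch(mask)
# indices -> tensor([0, 1, 2, 4, 5]); cu_seqlens -> tensor([0, 3, 5]); max_len -> 3
```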
@@ -130,7 +129,6 @@ def forward(self, max_seq_len, offset=0):
)



def split_tensor_along_last_dim(
tensor: torch.Tensor,
num_partitions: int,
@@ -269,7 +267,6 @@ def forward(
# adjust key and value for inference
if past_key_value is not None:
key_layer, value_layer = past_key_value.update(key_layer, value_layer, self.layer_number - 1)

if self.multi_query_attention:
key_layer = key_layer.unsqueeze(2)
key_layer = key_layer.expand(
@@ -285,7 +282,6 @@ def forward(
value_layer = value_layer.contiguous().view(
value_layer.size()[:1] + (self.num_attention_heads_per_partition,) + value_layer.size()[3:]
)

# ==================================
# core attention computation
# ==================================
@@ -456,7 +452,6 @@ def forward(self, query_layer, key_layer, value_layer, attention_mask):
# [b, sq, np, hn] --> [b, sq, hp]
new_context_layer_shape = context_layer.size()[:-2] + (self.hidden_size_per_partition,)
context_layer = context_layer.reshape(*new_context_layer_shape)

return context_layer


@@ -590,13 +585,19 @@ class GLMSdpaAttention(GLMAttention):

def forward(self, query_layer, key_layer, value_layer, attention_mask):
if attention_mask is None and query_layer.shape[2] == key_layer.shape[2]:
context_layer = torch.nn.functional.scaled_dot_product_attention(query_layer, key_layer, value_layer,
is_causal=True,
dropout_p=self.config.attention_dropout if self.training else 0.0)
context_layer = torch.nn.functional.scaled_dot_product_attention(
query_layer,
key_layer,
value_layer,
is_causal=True,
dropout_p=self.config.attention_dropout if self.training else 0.0)
else:
context_layer = torch.nn.functional.scaled_dot_product_attention(query_layer, key_layer, value_layer,
attention_mask,
dropout_p=self.config.attention_dropout if self.training else 0.0)
context_layer = torch.nn.functional.scaled_dot_product_attention(
query_layer,
key_layer,
value_layer,
attention_mask,
dropout_p=self.config.attention_dropout if self.training else 0.0)
context_layer = context_layer.transpose(1, 2).contiguous()
new_context_layer_shape = context_layer.size()[:-2] + (self.hidden_size_per_partition,)
context_layer = context_layer.reshape(*new_context_layer_shape)
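As an aside on the reformatted SDPA calls above: the two branches correspond to a fused causal path (no explicit mask, query and key lengths equal) and an explicit-mask path. Below is a minimal standalone sketch of that pattern; the tensor shapes, dropout value, and mask construction are illustrative assumptions, not values read from the GLM config.

```python
import torch
import torch.nn.functional as F

# Illustrative shapes only: [batch, num_heads, seq_len, head_dim].
batch, num_heads, head_dim = 2, 4, 16
attention_dropout, training = 0.0, False  # assumed; GLM reads dropout from its config

# Path 1: prefill with no explicit mask and matching query/key lengths,
# so SDPA builds the causal mask internally via is_causal=True.
q = torch.randn(batch, num_heads, 8, head_dim)
k = torch.randn(batch, num_heads, 8, head_dim)
v = torch.randn(batch, num_heads, 8, head_dim)
ctx_prefill = F.scaled_dot_product_attention(
    q, k, v, is_causal=True, dropout_p=attention_dropout if training else 0.0
)

# Path 2: decoding against a longer KV cache (or padded inputs), where an
# explicit attention mask is passed instead of is_causal.
q_step = torch.randn(batch, num_heads, 1, head_dim)
k_cache = torch.randn(batch, num_heads, 9, head_dim)
v_cache = torch.randn(batch, num_heads, 9, head_dim)
mask = torch.ones(batch, 1, 1, 9, dtype=torch.bool)  # True = position may be attended
ctx_step = F.scaled_dot_product_attention(
    q_step, k_cache, v_cache, attn_mask=mask, dropout_p=attention_dropout if training else 0.0
)

# Both branches then fold heads back into the hidden dimension,
# [b, heads, sq, hd] -> [b, sq, heads * hd], as the hunk above does.
ctx_prefill = ctx_prefill.transpose(1, 2).contiguous().reshape(batch, 8, num_heads * head_dim)
```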
109 changes: 54 additions & 55 deletions tests/models/glm/test_modeling_glm.py
@@ -17,14 +17,12 @@
import gc
import tempfile
import unittest
from parameterized import parameterized

import pytest

from transformers import AutoTokenizer, GLMConfig, is_torch_available, set_seed
from transformers import AutoTokenizer, GLMConfig, is_torch_available
from transformers.testing_utils import (
backend_empty_cache,
require_bitsandbytes,
require_flash_attn,
require_torch,
require_torch_gpu,
@@ -60,7 +58,7 @@ def __init__(
use_token_type_ids=True,
use_labels=True,
vocab_size=99,
hidden_size=32,
hidden_size=8,
num_hidden_layers=2,
num_attention_heads=4,
num_key_value_heads=2,
@@ -394,61 +392,61 @@ def test_GLM_token_classification_model(self):
(self.model_tester.batch_size, self.model_tester.seq_length, self.model_tester.num_labels),
)

@unittest.skip(reason="GLM buffers include complex numbers, which breaks this test")
def test_save_load_fast_init_from_base(self):
pass

@unittest.skip(reason="GLM uses GQA on all models so the KV cache is a non standard format")
def test_past_key_values_format(self):
pass

@unittest.skip(reason="SQRBound is known to have issues with gc")
def test_training_gradient_checkpointing_use_reentrant_false(self):
pass

def _check_attentions_for_generate(self, *args, **kwargs):
return True # Model does not return attention

@unittest.skip(reason="Past key values are not returned")
def test_prompt_lookup_decoding_matches_greedy_search(self):
pass

@unittest.skip(reason="Past key values are not returned")
def test_model_parallelism(self):
pass
def test_hidden_states_output(self):
def check_hidden_states_output(inputs_dict, config, model_class):
model = model_class(config)
model.to(torch_device)
model.eval()

@unittest.skip(reason="Past key values are not returned")
def test_model_parallel_beam_search(self):
pass
with torch.no_grad():
outputs = model(**self._prepare_for_class(inputs_dict, model_class))

hidden_states = outputs.encoder_hidden_states if config.is_encoder_decoder else outputs.hidden_states

expected_num_layers = getattr(
self.model_tester, "expected_num_hidden_layers", self.model_tester.num_hidden_layers + 1
)

# GLM blocks start with id 1, not 0
self.assertEqual(len(hidden_states), expected_num_layers + 1)

if hasattr(self.model_tester, "encoder_seq_length"):
seq_length = self.model_tester.encoder_seq_length
if hasattr(self.model_tester, "chunk_length") and self.model_tester.chunk_length > 1:
seq_length = seq_length * self.model_tester.chunk_length
else:
seq_length = self.model_tester.seq_length

self.assertListEqual(
list(hidden_states[0].shape[-2:]),
[seq_length, self.model_tester.hidden_size],
)

if config.is_encoder_decoder:
hidden_states = outputs.decoder_hidden_states
self.assertIsInstance(hidden_states, (list, tuple))
self.assertEqual(len(hidden_states), expected_num_layers + 1)
seq_len = getattr(self.model_tester, "seq_length", None)
decoder_seq_length = getattr(self.model_tester, "decoder_seq_length", seq_len)

self.assertListEqual(
list(hidden_states[0].shape[-2:]),
[decoder_seq_length, self.model_tester.hidden_size],
)

def _check_past_key_values_for_generate(self, *args, **kwargs):
return True
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

@unittest.skip(reason="Rely on `past_key_values` to crop the assistant pkv. Not supported")
def test_assisted_decoding_matches_greedy_search(self):
pass
for model_class in self.all_model_classes:
inputs_dict["output_hidden_states"] = True
check_hidden_states_output(inputs_dict, config, model_class)

@unittest.skip(reason="Relies on `past_key_values` returned by the model. Not supported with recurrent GLM")
def test_assisted_decoding_sample(self):
pass
# check that output_hidden_states also work using config
del inputs_dict["output_hidden_states"]
config.output_hidden_states = True

@unittest.skip(
reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
)
def test_training_gradient_checkpointing(self):
pass
check_hidden_states_output(inputs_dict, config, model_class)

@unittest.skip(
reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
)
def test_training_gradient_checkpointing_use_reentrant(self):
pass

@unittest.skip(
reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
)
def test_retain_grad_hidden_states_attentions(self):
pass
@require_flash_attn
@require_torch_gpu
@pytest.mark.flash_attn_test
@@ -483,7 +481,6 @@ def test_flash_attn_2_generate_padding_right(self):
dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=1, do_sample=False
)


@require_flash_attn
@require_torch_gpu
@pytest.mark.flash_attn_test
@@ -529,15 +526,13 @@ def test_flash_attn_2_generate_use_cache(self):
use_cache=True,
)


@require_flash_attn
@require_torch_gpu
@pytest.mark.flash_attn_test
@slow
def test_flash_attn_2_inference_equivalence_right_padding(self):
self.skipTest(reason="GLM flash attention does not support right padding")


@slow
@require_torch
class GLMIntegrationTest(unittest.TestCase):
@@ -584,3 +579,7 @@ def test_glm_instruct_generation(self):
"[gMASK] <sop> <|system|> \nYou are a helpful digital assistant. Please provide safe, ethical and accurate information to the user. <|user|> \nTell me the answer of 1 plus 1? <|assistant|> \nThe answer to 1 plus 1 is 2. <|user|>"
]
self.assertListEqual(output_text, EXPECTED_OUTPUT)
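For context, a hedged sketch of how a transcript like `EXPECTED_OUTPUT` above could be reproduced once GLM support is merged; the checkpoint id, `max_new_tokens`, and device handling are assumptions and are not taken from the test itself.

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "THUDM/glm-4-9b-chat"  # assumed GLM-4 chat checkpoint, not named in this PR
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto")

messages = [
    {"role": "system", "content": "You are a helpful digital assistant. Please provide safe, ethical and accurate information to the user."},
    {"role": "user", "content": "Tell me the answer of 1 plus 1?"},
]
# The chat template is what prepends the [gMASK] <sop> tokens and the
# <|system|>/<|user|>/<|assistant|> role tags visible in EXPECTED_OUTPUT.
input_ids = tokenizer.apply_chat_template(
    messages, add_generation_prompt=True, return_tensors="pt"
).to(model.device)
output_ids = model.generate(input_ids, max_new_tokens=32, do_sample=False)
print(tokenizer.decode(output_ids[0]))
```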

@unittest.skip(reason="Gemma uses GQA on all models so the KV cache is a non standard format")
def test_past_key_values_format(self):
pass
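
To illustrate the skip reason above: under grouped-query attention the cached key/value tensors carry fewer heads than the queries, so the cache layout does not match what the generic `test_past_key_values_format` check expects. A small sketch, with the head counts taken from the tester config in this PR (`num_attention_heads=4`, `num_key_value_heads=2`) and the remaining sizes assumed:

```python
import torch

batch, seq_len, head_dim = 2, 5, 16   # illustrative sizes
num_attention_heads = 4               # query heads (from the tester config above)
num_key_value_heads = 2               # fewer KV heads under grouped-query attention

# Queries use all heads; the cached key/value tensors only carry the KV heads,
# so their head dimension differs from the standard num_attention_heads layout.
query = torch.randn(batch, num_attention_heads, seq_len, head_dim)
cached_key = torch.randn(batch, num_key_value_heads, seq_len, head_dim)
cached_value = torch.randn(batch, num_key_value_heads, seq_len, head_dim)

# At attention time the KV heads are broadcast to match the query heads,
# mirroring the key_layer.unsqueeze(2).expand(...) pattern in the modeling diff.
repeats = num_attention_heads // num_key_value_heads
expanded_key = cached_key.repeat_interleave(repeats, dim=1)
assert expanded_key.shape == query.shape
```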