From 5bdf75105e721dd288ddb6e5f6ebbf0b5295c31d Mon Sep 17 00:00:00 2001
From: DrownFish19
Date: Wed, 12 Jun 2024 14:03:37 +0800
Subject: [PATCH] [LLM] Fix Qwen2 (#8584)

* fix output_router_logits

* fix with __future__
---
 paddlenlp/transformers/qwen2/modeling.py | 19 ++++++-------------
 1 file changed, 6 insertions(+), 13 deletions(-)

diff --git a/paddlenlp/transformers/qwen2/modeling.py b/paddlenlp/transformers/qwen2/modeling.py
index 6cc4b83a359a..c6dac689382b 100644
--- a/paddlenlp/transformers/qwen2/modeling.py
+++ b/paddlenlp/transformers/qwen2/modeling.py
@@ -18,6 +18,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Paddle Qwen2 model."""
+from __future__ import annotations
 
 import math
 import warnings
@@ -187,11 +188,11 @@ def scaled_dot_product_attention(
     else:
         # [ bz, seqlen, nhead, head_dim] -> [bs, nhead, seq_len, head_dim]
         query_states = paddle.transpose(query_states, [0, 2, 1, 3])
-        # merge with the next tranpose
+        # merge with the next transpose
         key_states = paddle.transpose(key_states, [0, 2, 1, 3])
         value_states = paddle.transpose(value_states, [0, 2, 1, 3])
 
-        # matmul and devide by sqrt(head_dim)
+        # matmul and divide by sqrt(head_dim)
         attn_weights = paddle.matmul(query_states / math.sqrt(head_dim), key_states.transpose([0, 1, 3, 2]))
 
         if attn_weights.shape != [bsz, num_heads, q_len, kv_seq_len]:
@@ -1127,7 +1128,7 @@ def forward(self, prediction_scores, masked_lm_labels):
         if self.enable_parallel_cross_entropy:
             if prediction_scores.shape[-1] == self.config.vocab_size:
                 warnings.warn(
-                    f"enable_parallel_cross_entropy, the vocab_size should be splited: {prediction_scores.shape[-1]}, {self.config.vocab_size}"
+                    f"enable_parallel_cross_entropy, the vocab_size should be splitted: {prediction_scores.shape[-1]}, {self.config.vocab_size}"
                 )
                 self.loss_func = paddle.nn.CrossEntropyLoss(reduction="none", ignore_index=self.ignore_index)
 
@@ -1202,14 +1203,7 @@ def get_decoder(self):
         return self.qwen2
 
     def prepare_inputs_for_generation(
-        self,
-        input_ids,
-        use_cache=False,
-        past_key_values=None,
-        attention_mask=None,
-        inputs_embeds=None,
-        output_router_logits=False,
-        **kwargs
+        self, input_ids, use_cache=False, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs
     ):
         batch_size, seq_length = input_ids.shape
         position_ids = kwargs.get("position_ids", paddle.arange(seq_length).expand((batch_size, seq_length)))
@@ -1230,7 +1224,6 @@ def prepare_inputs_for_generation(
                 "past_key_values": past_key_values,
                 "use_cache": use_cache,
                 "attention_mask": attention_mask,
-                "output_router_logits": output_router_logits,
             }
         )
         return model_inputs
@@ -1325,7 +1318,7 @@ def forward(
         hidden_states = outputs[0]
 
         # if labels is None,means we need full output, instead of tensor_parallel_output
-        # tensor_parallel_output is togather with ParallelCrossEntropy
+        # tensor_parallel_output is together with ParallelCrossEntropy
         tensor_parallel_output = (
             self.config.tensor_parallel_output and labels is not None and self.config.tensor_parallel_degree > 1
        )
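
Note on the "fix with __future__" hunk: the specific annotation in modeling.py that motivated it is not shown in this patch, but the effect of the import itself is well defined: every annotation in the module is stored as a string and evaluated lazily. A minimal, self-contained sketch with hypothetical names (not PaddleNLP code):

# With the __future__ import, annotations are not evaluated at definition time, so
# "Cache | None" works on Python < 3.10 and Cache may be defined after its first use.
from __future__ import annotations


def prepare(past_key_values: Cache | None = None) -> dict:
    # Without the import, this annotation would be evaluated eagerly and raise
    # (NameError for the forward reference, TypeError for "|" on Python < 3.10).
    return {"past_key_values": past_key_values}


class Cache:  # hypothetical stand-in type, defined after the annotated function
    pass


print(prepare())  # {'past_key_values': None}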
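
For reference, the math described by the corrected comments in the scaled_dot_product_attention hunk, as a standalone sketch (illustrative shapes and names only, not the repository's code path):

import math

import paddle
import paddle.nn.functional as F

bsz, q_len, num_heads, head_dim = 2, 8, 4, 16

# [bsz, seq_len, num_heads, head_dim] -> [bsz, num_heads, seq_len, head_dim]
query_states = paddle.transpose(paddle.randn([bsz, q_len, num_heads, head_dim]), [0, 2, 1, 3])
key_states = paddle.transpose(paddle.randn([bsz, q_len, num_heads, head_dim]), [0, 2, 1, 3])
value_states = paddle.transpose(paddle.randn([bsz, q_len, num_heads, head_dim]), [0, 2, 1, 3])

# matmul and divide by sqrt(head_dim): scores are [bsz, num_heads, q_len, kv_seq_len]
attn_weights = paddle.matmul(query_states / math.sqrt(head_dim), key_states.transpose([0, 1, 3, 2]))
attn_weights = F.softmax(attn_weights, axis=-1)
attn_output = paddle.matmul(attn_weights, value_states)  # [bsz, num_heads, q_len, head_dim]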
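
The output_router_logits removal is the functional part of the fix: router logits belong to the MoE variant, and the dense Qwen2 forward presumably has no such parameter, so threading the flag through prepare_inputs_for_generation breaks generation. A toy illustration of that failure mode (hypothetical class, not PaddleNLP code):

# The generation loop feeds prepare_inputs_for_generation()'s dict straight into forward(),
# so any key that forward() does not accept raises a TypeError.
class ToyDenseModel:
    def forward(self, input_ids, attention_mask=None, use_cache=False, past_key_values=None):
        return input_ids  # stand-in for logits

    def prepare_inputs_for_generation(self, input_ids, **kwargs):
        # Adding "output_router_logits": False here would crash forward() below.
        return {"input_ids": input_ids, "use_cache": True}

    def generate_step(self, input_ids):
        model_inputs = self.prepare_inputs_for_generation(input_ids)
        return self.forward(**model_inputs)


print(ToyDenseModel().generate_step([1, 2, 3]))  # [1, 2, 3]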