Remove additional float/clone() for perf #1374

Open · wants to merge 2 commits into base: transformers_future
16 changes: 7 additions & 9 deletions optimum/habana/transformers/generation/utils.py
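At a glance, the PR drops the `.float()` upcasts (and one `.clone()`) that upstream transformers applies to logits before sampling. Below is a minimal sketch of the pattern being removed, with illustrative shapes and dtype (not taken from the PR itself):

```python
import torch

# Illustrative shapes/dtype; the real tensors come from the model forward pass.
batch_size, vocab_size = 8, 32000
logits = torch.randn(batch_size, 1, vocab_size, dtype=torch.bfloat16)

# Upstream pattern: upcast for precision. This allocates a new float32
# [batch_size, vocab_size] tensor on every decoding step.
next_token_logits_fp32 = logits[:, -1, :].float()

# Pattern after this PR: keep the model's dtype, no extra copy.
next_token_logits = logits[:, -1, :]
assert next_token_logits.dtype == torch.bfloat16
```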
@@ -705,7 +705,6 @@ def _prepare_cache_for_generation(
Changes:
- change the default from DynamicCache to tuples
"""

cache_name = "past_key_values" if "mamba" not in self.__class__.__name__.lower() else "cache_params"
requires_cross_attention_cache = (
self.config.is_encoder_decoder or model_kwargs.get("encoder_outputs") is not None
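The docstring change above refers to the two KV-cache layouts in transformers. A brief sketch of the difference, assuming the `transformers.cache_utils.DynamicCache` API and illustrative tensor shapes:

```python
import torch
from transformers.cache_utils import DynamicCache

num_layers, batch, heads, seq_len, head_dim = 2, 1, 8, 16, 64

# Legacy/tuple format: one (key, value) pair of tensors per layer.
legacy_cache = tuple(
    (torch.zeros(batch, heads, seq_len, head_dim),
     torch.zeros(batch, heads, seq_len, head_dim))
    for _ in range(num_layers)
)

# DynamicCache wraps the same data in an object; transformers provides
# converters between the two representations.
cache = DynamicCache.from_legacy_cache(legacy_cache)
round_tripped = cache.to_legacy_cache()
```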
@@ -1801,7 +1800,7 @@ def _contrastive_search(
logit_for_next_step = torch.index_select(outputs.logits, -2, token_idx - 1).squeeze(-2)
else:
# .float() is needed to retain precision for later logits manipulations
-logit_for_next_step = outputs.logits[:, -1, :].float()
+logit_for_next_step = outputs.logits[:, -1, :]

model_kwargs = self._update_model_kwargs_for_generation(
outputs,
@@ -1968,7 +1967,7 @@ def _contrastive_search(
full_hidden_states = outputs.hidden_states

# .float() is needed to retain precision for later logits manipulations
-logits = outputs.logits[:, -1, :].float()
+logits = outputs.logits[:, -1, :]
context_hidden = last_hidden_states.repeat_interleave(top_k, dim=0)

# compute the degeneration penalty and re-rank the candidates based on the degeneration penalty and the
@@ -2355,7 +2354,7 @@ def _sample(
if token_idx is not None and outputs.logits.shape[-2] > 1:
# case1 (w/o KV caching): outputs.logits.shape: [batch_size, max_length, vocab_size]
if self.config.is_encoder_decoder:
-next_token_logits = outputs.logits[:, token_idx - 1, :].float()
+next_token_logits = outputs.logits[:, token_idx - 1, :]
next_token_scores = logits_processor(input_ids[:, :token_idx], next_token_logits)
else:
if model_kwargs.get("num_virtual_tokens", 0) > 0:
@@ -2370,7 +2369,7 @@ def _sample(
next_token_scores = logits_processor(input_ids, next_token_logits)
else:
# .float() is needed to retain precision for later logits manipulations
-next_token_logits = outputs.logits[:, -1, :].float()
+next_token_logits = outputs.logits[:, -1, :]

Review comment (Collaborator): consider keeping this
if token_idx is not None and self.config.is_encoder_decoder:
# case2 (with KV caching): outputs.logits.shape: [batch_size, 1, vocab_size]
next_token_scores = logits_processor(input_ids[:, :token_idx], next_token_logits)
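The review comment above suggests keeping the upcast here. The precision concern it points at can be reproduced in isolation; a small sketch (with an illustrative vocab size) comparing log-softmax in bfloat16 versus float32:

```python
import torch

torch.manual_seed(0)
x = torch.randn(1, 32000)  # illustrative vocab size

lp_fp32 = torch.log_softmax(x, dim=-1)
lp_bf16 = torch.log_softmax(x.to(torch.bfloat16), dim=-1).float()

# bfloat16 keeps only an 8-bit mantissa, so per-token log-probs can
# differ noticeably from the float32 reference.
print((lp_fp32 - lp_bf16).abs().max())
```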
@@ -2814,7 +2813,7 @@ def expand_if_needed(tensor, new_size, value, dim=-1):
else:
next_token_logits = torch.index_select(outputs.logits, -2, token_idx - 1).squeeze(-2)
else:
-next_token_logits = outputs.logits[:, -1, :].float()
+next_token_logits = outputs.logits[:, -1, :]

Review comment (Collaborator): we don't normally run to here

next_token_scores = torch.nn.functional.log_softmax(
next_token_logits, dim=-1
@@ -3260,7 +3259,7 @@ def _constrained_beam_search(
else:
next_token_logits = torch.index_select(outputs.logits, -2, token_idx - 1).squeeze(-2)
else:
-next_token_logits = outputs.logits[:, -1, :].float()
+next_token_logits = outputs.logits[:, -1, :]

next_token_scores = torch.nn.functional.log_softmax(
next_token_logits, dim=-1
@@ -3580,8 +3579,7 @@ def _assisted_decoding(

# 2.3. Process the new logits
# .float() is needed to retain precision for later logits manipulations
Review comment (Contributor): well, I think you should at least remove this comment

-new_logits = outputs.logits[:, -candidate_length - 1 :].float()  # excludes the input prompt if present
-next_token_logits = new_logits.clone()
+new_logits = outputs.logits[:, -candidate_length - 1 :]
if len(logits_processor) > 0:
for i in range(candidate_length + 1):
new_logits[:, i, :] = logits_processor(candidate_input_ids[:, : cur_len + i], new_logits[:, i, :])
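One behavioral note on this last hunk: `outputs.logits[:, -candidate_length - 1 :]` is a basic-indexing slice, so with the `.clone()` removed, the in-place writes in the loop above go through to `outputs.logits` itself. A standalone sketch of that aliasing, using illustrative tensors rather than real model outputs:

```python
import torch

outputs_logits = torch.randn(2, 5, 10)  # [batch, seq, vocab], illustrative
candidate_length = 3

new_logits = outputs_logits[:, -candidate_length - 1 :]  # a view, not a copy
new_logits[:, 0, :] = 0.0  # in-place edit, like the logits_processor loop

# The edit is visible through the original tensor:
assert outputs_logits[:, -candidate_length - 1, :].abs().sum() == 0
```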