Hotfixing qwen2 and starcoder2 (which also get clamping). (huggingfac…
Narsil authored and yuanwu2017 committed Sep 24, 2024
1 parent bc5a792 commit d580215
Showing 2 changed files with 2 additions and 2 deletions.
@@ -368,7 +368,7 @@ def forward(
         elif self.max_past is not None:
             # Clamp in decode mode as paged attention requires clamped values whereas the flash attention
             # kernel requires the true values
-            input_lengths = torch.clamp(input_lengths, max=self.max_past_tensor)
+            input_lengths = input_lengths.clamp(max=self.max_past_tensor)
 
         hidden_states = self.model(
             input_ids,
@@ -534,7 +534,7 @@ def forward(
         elif self.max_past is not None:
             # Clamp in decode mode as paged attention requires clamped values whereas the flash attention
             # kernel requires the true values
-            input_lengths = torch.clamp(input_lengths, max=self.max_past_tensor)
+            input_lengths = input_lengths.clamp(max=self.max_past_tensor)
 
         hidden_states = self.model(
             input_ids,
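For reference, both hunks replace the functional call torch.clamp(input_lengths, max=self.max_past_tensor) with the equivalent method form input_lengths.clamp(max=...). Below is a minimal, hypothetical sketch of the operation the decode path relies on: clamping a batch of per-sequence lengths against a tensor upper bound. The names input_lengths and max_past (standing in for self.max_past_tensor) and the values are illustrative, not taken from the commit.

import torch

# Hypothetical stand-ins for the model's values: per-sequence lengths in a
# decode batch, and the sliding-window size (self.max_past_tensor in the
# model) as a 0-dim tensor.
input_lengths = torch.tensor([120, 4096, 9000])
max_past = torch.tensor(4096)

# Method form used by the hotfix: caps each length at the window size so the
# paged-attention kernel only ever sees lengths within the cached window,
# while the flash-attention prefill path keeps the true lengths.
clamped = input_lengths.clamp(max=max_past)
print(clamped)  # tensor([ 120, 4096, 4096])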
