Use the index_copy method to update the static cache in place and avoid recompilation on each iteration in XLA

huzama · huzama · commit 1ad0a9af4206 · 2024-05-30T12:19:01.000+09:00
diff --git a/src/transformers/cache_utils.py b/src/transformers/cache_utils.py
@@ -7,7 +7,7 @@
 import torch
 
 from .configuration_utils import PretrainedConfig
-from .utils import is_hqq_available, is_quanto_available, logging
+from .utils import is_hqq_available, is_quanto_available, logging, is_torch_xla_available
 
 if is_quanto_available():
     from quanto import QBitsTensor, qint2, qint4
@@ -791,6 +791,23 @@ def update(
         k_out = self.key_cache[layer_idx]
         v_out = self.value_cache[layer_idx]
 
+        if is_torch_xla_available(): # If torch_xla is available, do out-of-place operation on KV_Cache and create a new list
+            k_out = k_out.index_copy(2, cache_position, key_states)
+            v_out = v_out.index_copy(2, cache_position, value_states)
+
+            updated_key_cache = [
+                k_out if i == layer_idx else self.key_cache[i] for i in range(len(self.key_cache))
+            ]
+
+            updated_value_cache = [
+                v_out if i == layer_idx else self.value_cache[i] for i in range(len(self.value_cache))
+            ]
+
+            self.key_cache = updated_key_cache
+            self.value_cache = updated_value_cache
+
+            return k_out, v_out
+
         k_out.index_copy_(2, cache_position, key_states)
         v_out.index_copy_(2, cache_position, value_states)