feat(cache): SlidingWindowCache uses index_copy_ to avoid useless copy

tengomucho · tengomucho · commit 53e99d185cbf · 2024-07-10T10:21:43.000+02:00
Apply the same change that was already made in `StaticCache`.
diff --git a/src/transformers/cache_utils.py b/src/transformers/cache_utils.py
@@ -969,8 +969,11 @@ def update(
         k_out = k_out[:, :, indices]
         v_out = v_out[:, :, indices]
 
-        k_out[:, :, cache_position] = key_states
-        v_out[:, :, cache_position] = value_states
+        # Note: `tensor.index_copy_(dim, index, source)` is equivalent to `tensor[:, :, index] = source` when
+        # `dim == 2`, but it is compile-friendly and explicitly in-place, which avoids extra copies and uses
+        # less memory.
+        k_out.index_copy_(2, cache_position, key_states)
+        v_out.index_copy_(2, cache_position, value_states)
 
         # `_.zero()` followed by `+=` is equivalent `=`, but compile-friendly (without graph breaks due to assignment)
         self.key_cache[layer_idx].zero_()