Commit ed93bf8

get_seq_length in StaticCacheXLA uses an out-of-place index_select operation.
This is necessary for XLA, as the tensors are not materialized yet.
1 parent 670d432 commit ed93bf8

File tree

1 file changed (+7, -1)

src/transformers/cache_utils.py

Lines changed: 7 additions & 1 deletion
@@ -1020,7 +1020,13 @@ def get_seq_length(self, layer_idx: Optional[int] = 0) -> int:
         # Occupied cache == any slot in the 3rd dim (sequence length) holds a non-zero value. To save on compute, let's
         # limit the check to the first batch member and head dimension.
         # TODO: deprecate this function in favor of `cache_position`
-        raise NotImplementedError("StaticCacheXLA is not implemented yet")
+        key_cache = self.key_cache[layer_idx]
+        device = key_cache.device
+
+        item = key_cache.index_select(0, torch.tensor(0, device=device))
+        head = item.index_select(1, torch.tensor(0, device=device))
+
+        return head.any(dim=-1).sum()
 
     def get_max_length(self) -> Optional[int]:
         """Returns the maximum sequence length of the cached states."""
