Commit 5b37524

helunwencser authored and facebook-github-bot committed
Add customized static cache implementation (#4490)
Summary: Pull Request resolved: #4490 (imported using ghimport)

Test Plan: Imported from OSS

Reviewed By: iseeyuan

Differential Revision: D60554455

Pulled By: helunwencser

fbshipit-source-id: defc2953afb265b5e21b2fa540c3b1eb2e90d0a8
1 parent 1114539 commit 5b37524

File tree

3 files changed: +56 -1 lines changed

examples/models/phi-3-mini/__init__.py

Whitespace-only changes.

examples/models/phi-3-mini/eager.py

Lines changed: 14 additions & 1 deletion
@@ -14,6 +14,8 @@
 
 from transformers import AutoTokenizer, Phi3ForCausalLM
 
+from .static_cache import ETStaticCache
+
 end_of_text_token = 32000
 
 
@@ -40,7 +42,18 @@ def _generate_token(args, model, prompt_tokens):
 def _generate_token_with_kv_cache(args, model, prompt_tokens):
     print("Generating tokens:", end="", flush=True)
 
-    result = model.forward(input_ids=prompt_tokens, use_cache=True, return_dict=True)
+    result = model.forward(
+        input_ids=prompt_tokens,
+        use_cache=True,
+        return_dict=True,
+        past_key_values=ETStaticCache(
+            model.config,
+            prompt_tokens.shape[0],
+            args.seq_len + prompt_tokens.shape[-1],
+            device=model.device,
+            dtype=model.dtype,
+        ),
+    )
 
     current_token = torch.argmax(result.logits[:, -1, :], dim=-1).item()
     current_key_value = result.past_key_values
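
The hunk above sizes the cache for the prompt plus up to args.seq_len generated tokens, then runs the prefill pass. For context, here is a minimal sketch of the greedy decode loop such a prefill typically feeds into, assuming the same ETStaticCache instance is passed back on every step so new keys and values land in the preallocated buffers; decode_loop and its parameters are illustrative names, not code from this PR (the EOS value 32000 matches end_of_text_token in the file above):

import torch

def decode_loop(model, cache, first_token, max_new_tokens, eos_token=32000):
    # Greedy decode: feed one token at a time; earlier context lives in the
    # static cache, so input_ids only carries the newest token.
    tokens = [first_token]
    current = first_token
    while len(tokens) < max_new_tokens and current != eos_token:
        result = model.forward(
            input_ids=torch.tensor([[current]], dtype=torch.long, device=model.device),
            use_cache=True,
            return_dict=True,
            past_key_values=cache,  # same preallocated ETStaticCache every step
        )
        current = torch.argmax(result.logits[:, -1, :], dim=-1).item()
        tokens.append(current)
    return tokens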
examples/models/phi-3-mini/static_cache.py

Lines changed: 42 additions & 0 deletions
@@ -0,0 +1,42 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+
+from typing import Optional
+
+import torch
+from transformers import PretrainedConfig, StaticCache
+
+
+class ETStaticCache(StaticCache):
+    """
+    A customized static cache implementation, which overrides a few methods to make it exportable to ExecuTorch.
+    This can be removed once transformers supports static cache for Phi3 properly.
+    """
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        max_batch_size: int,
+        max_cache_len: int,
+        device,
+        dtype=torch.float32,
+    ) -> None:
+        super().__init__(
+            config=config,
+            max_batch_size=max_batch_size,
+            max_cache_len=max_cache_len,
+            device=device,
+            dtype=dtype,
+        )
+
+    def get_seq_length(self, layer_idx: Optional[int] = 0) -> int:
+        return (self.key_cache[layer_idx][0, 0].any(dim=-1)).sum().item()
+
+    def get_usable_length(
+        self, new_seq_length: int, layer_idx: Optional[int] = 0
+    ) -> int:
+        return self.get_seq_length(layer_idx)
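
The get_seq_length override is what makes this cache export-friendly: rather than consulting Python-side state, it derives the number of filled slots from the cache contents themselves. StaticCache preallocates zero-filled key/value tensors of shape (batch, num_heads, max_cache_len, head_dim), so counting the positions whose key vector contains any nonzero element yields the current sequence length. A toy sketch of that trick on a bare tensor (the shapes here are made up for illustration):

import torch

# Zero-initialized cache slice for one batch element and one head:
# (batch, num_heads, max_cache_len, head_dim)
max_cache_len, head_dim = 8, 4
key_cache = torch.zeros(1, 1, max_cache_len, head_dim)

# Simulate three tokens having been written into the first three slots.
key_cache[0, 0, :3] = torch.randn(3, head_dim)

# any(dim=-1) marks slots whose key vector has a nonzero entry;
# summing the marks counts how many slots are occupied.
seq_length = key_cache[0, 0].any(dim=-1).sum().item()
print(seq_length)  # -> 3

This relies on a written key vector never being exactly all zeros, which holds in practice for floating-point activations; get_usable_length then simply reuses that count.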
