quic · asmigosw · Jul 3, 2025
@@ -6,6 +6,7 @@
 # -----------------------------------------------------------------------------
 
 
+import os
 from typing import Any, Dict, List, Optional, Tuple
 
 import torch
@@ -23,6 +24,19 @@
 )
 
 
+class CacheManager:
+    """Chooses which QEfficient cache class wraps a legacy ``past_key_values`` object."""
+
+    @staticmethod
+    def cache_manager(config, past_key_values):
+        """Build a QEffDynamicCache if env DYNAMIC_CACHE is "true" (any case), else a QEffHybridChunkedCache."""
+        # Declared static: the function takes no self/cls and is invoked as CacheManager.cache_manager(...).
+        is_dynamic = os.getenv("DYNAMIC_CACHE", "False")
+        if is_dynamic.lower() == "true":
+            return QEffDynamicCache.from_legacy_cache(past_key_values)
+        # Unset or any other value: fall back to the hybrid chunked cache, which also takes `config`.
+        return QEffHybridChunkedCache.from_legacy_cache(config, past_key_values)
+
 class QEffDynamicCache(DynamicCache):
     """
     A cache that grows dynamically as more tokens are generated. This is the default for generative models.

@@ -32,7 +32,7 @@
     repeat_kv,
 )
 
-from QEfficient.transformers.cache_utils import QEffHybridChunkedCache
+from QEfficient.transformers.cache_utils import CacheManager
 from QEfficient.transformers.modeling_attn_mask_utils import _create_causal_mask
 from QEfficient.utils import constants
 from QEfficient.utils._utils import IOInfo
@@ -638,7 +638,7 @@ def forward(
         return_legacy_cache = False
         if use_cache and not isinstance(past_key_values, Cache):
             return_legacy_cache = True
-            past_key_values = QEffHybridChunkedCache.from_legacy_cache(self.config, past_key_values)
+            past_key_values = CacheManager.cache_manager(self.config, past_key_values)
 
         if cache_position is None:
             past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0