Skip to content

Commit 7c0a3e2

Browse files
committed
fix vllmkvcache
Change-Id: I2916bb6d9c1c6b70be115d6d2b78959a0681f63f Signed-off-by: Yi Liu <yiliu4@habana.ai>
1 parent 1f8bcd6 commit 7c0a3e2

File tree

2 files changed

+27
-10
lines changed

2 files changed

+27
-10
lines changed

neural_compressor/torch/algorithms/fp8_quant/_core/measure.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -108,9 +108,9 @@ def prepare_model(model, mod_list=None):
108108
d_shapes = None
109109
gmod_list.extend(mod_list)
110110
generate_model_info(model)
111-
logger.info(f"generated model info")
112-
for mod, name in parent_child_mod_dict.items():
113-
logger.info(f"mod: {mod}, name: {name}")
111+
# logger.info(f"generated model info")
112+
# for mod, name in parent_child_mod_dict.items():
113+
# logger.info(f"mod: {mod}, name: {name}")
114114
register_patched_measure_modules(model, mod_list, observer_class, d_shapes)
115115

116116

@@ -160,7 +160,7 @@ def register_patched_measure_modules(model, mod_list, observer_class, d_shapes=N
160160
)
161161
logger.info(f"Patching measure module {name} {mod.__class__} ")
162162
pmod = patch_module_measure(mod, mod_extra_config, mod_default_dict)
163-
logger.info(f"Pacthed module pmod: {pmod}")
163+
# logger.info(f"Patched module pmod: {pmod}")
164164
if pmod._mod_extra_config:
165165
for param_name in pmod._mod_extra_config.params:
166166
param = getattr(pmod, param_name)

neural_compressor/torch/algorithms/fp8_quant/_quant_common/helper_modules.py

Lines changed: 23 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -878,26 +878,43 @@ def forward_qdq(self, input, *args, **kwargs):
878878
output_cache = self.orig_mod(qinput, *args, **kwargs)
879879
return output_cache
880880

881-
def forward_quant(self, input, *args, **kwargs):
882-
qinput = self.quant_input(input)
883-
output_cache = self.orig_mod(qinput, *args, **kwargs)
884-
return self.dequant_output(output_cache)
881+
# def forward_quant(self, input, *args, **kwargs):
882+
# qinput = self.quant_input(input)
883+
# output_cache = self.orig_mod(qinput, *args, **kwargs)
884+
# return self.dequant_output(output_cache)
885885

886886
def forward_measure(self, input, *args, **kwargs):
887887
measure_input((input), self._mod_extra_config.inputs)
888888
output_cache = self.orig_mod(input, *args, **kwargs)
889889
measure_output((output_cache), self._mod_extra_config.outputs)
890890
return output_cache
891891

892-
def fetch_from_cache(self, cache, blocks, permutations=None):
893-
quant_cache = self.quant_input(cache)
892+
# def fetch_from_cache(self, cache, blocks, permutations=None):
893+
# # quant_cache = self.quant_input(cache)
894+
# quant_cache = cache
895+
# if permutations:
896+
# output_cache = self.orig_mod.fetch_from_cache(quant_cache, blocks, permutations)
897+
# for i in range(len(output_cache)):
898+
# output_cache[i] = self.dequant_output(output_cache[i])
899+
# return output_cache
900+
# output_cache = self.orig_mod.fetch_from_cache(quant_cache, blocks)
901+
# return self.dequant_output(output_cache)
902+
903+
def forward_quant(self, input, *args, **kwargs):
904+
qinput = self.quant_input(input)
905+
return self.orig_mod(qinput, *args, **kwargs)
906+
907+
def fetch_from_cache(self, quant_cache, blocks, permutations=None):
894908
if permutations:
895909
output_cache = self.orig_mod.fetch_from_cache(quant_cache, blocks, permutations)
896910
for i in range(len(output_cache)):
897911
output_cache[i] = self.dequant_output(output_cache[i])
898912
return output_cache
899913
output_cache = self.orig_mod.fetch_from_cache(quant_cache, blocks)
900914
return self.dequant_output(output_cache)
915+
916+
def extra_repr(self) -> str:
917+
return f"PatchedVLLMKVCache"
901918

902919
def init_conv(instance, mod_extra_config):
903920
if instance.quantization_mode in [QuantMode.QUANTIZE, QuantMode.LOAD]:

0 commit comments

Comments
 (0)