
Commit b05d2c4

SunMarc and MekkCyber authored
Fix dtype quantizer (#42882)
* fix dtype quantizer
* fix
* rm print
* fix
* style
* fix
* revert
* bitnet
* fix
* gogo
* Update src/transformers/modeling_utils.py

Co-authored-by: Mohamed Mekkouri <93391238+MekkCyber@users.noreply.github.com>

* warn instead
* fix
* fix

---------

Co-authored-by: Mohamed Mekkouri <93391238+MekkCyber@users.noreply.github.com>
1 parent 0001b3e commit b05d2c4

38 files changed: +124 −281 lines

src/transformers/configuration_utils.py

Lines changed: 0 additions & 4 deletions
@@ -1019,10 +1019,6 @@ def _remove_keys_not_serialized(self, d: dict[str, Any]) -> None:
         Checks and removes if there are any keys in the dict that should not be serialized when saving the config.
         Runs recursive check on the dict, to remove from all sub configs.
         """
-        if hasattr(self, "quantization_config"):
-            # Pop the `_pre_quantization_dtype` as torch.dtypes are not serializable.
-            _ = d.pop("_pre_quantization_dtype", None)
-
         if "_auto_class" in d:
             del d["_auto_class"]
         if "_output_attentions" in d:

src/transformers/integrations/bitsandbytes.py

Lines changed: 14 additions & 12 deletions
@@ -233,7 +233,7 @@ def replace_with_bnb_linear(
 
 
 # Copied from PEFT: https://github.com/huggingface/peft/blob/47b3712898539569c02ec5b3ed4a6c36811331a1/src/peft/utils/integrations.py#L41
-def dequantize_bnb_weight(weight: "torch.nn.Parameter", dtype: "torch.dtype", state=None):
+def dequantize_bnb_weight(weight: "torch.nn.Parameter", state=None):
     """
     Helper function to dequantize 4bit or 8bit bnb weights.
 
@@ -248,10 +248,7 @@ def dequantize_bnb_weight(weight: "torch.nn.Parameter", dtype: "torch.dtype", state=None):
 
     if cls_name == "Params4bit":
         output_tensor = bnb.functional.dequantize_4bit(weight.data, weight.quant_state)
-        logger.warning_once(
-            f"The model is going to be dequantized in {output_tensor.dtype} - if you want to upcast it to another dtype, make sure to pass the desired dtype when quantizing the model through `bnb_4bit_quant_type` argument of `BitsAndBytesConfig`"
-        )
-        return output_tensor.to(dtype)
+        return output_tensor
 
     if state.SCB is None:
         state.SCB = weight.SCB
@@ -263,7 +260,7 @@ def dequantize_bnb_weight(weight: "torch.nn.Parameter", dtype: "torch.dtype", state=None):
     # Multiply by (scale/127) to dequantize.
     dequantized = weight.data * state.SCB.view(-1, 1) * 7.874015718698502e-3
 
-    return dequantized.to(dtype)
+    return dequantized
 
 
 def _create_accelerate_new_hook(old_hook):
@@ -283,10 +280,7 @@ def _create_accelerate_new_hook(old_hook):
     return new_hook
 
 
-def dequantize_and_replace(
-    model,
-    quantization_config=None,
-):
+def dequantize_and_replace(model, quantization_config=None, dtype=None):
     """
     Converts a quantized model into its dequantized original version. The newly converted model will have
     some performance drop compared to the original model before quantization - use it only for specific usecases
@@ -297,14 +291,22 @@
     quant_method = quantization_config.quantization_method()
 
     target_cls = bnb.nn.Linear8bitLt if quant_method == "llm_int8" else bnb.nn.Linear4bit
-
     for module_name, module in model.named_modules():
         if isinstance(module, target_cls):
             with init_empty_weights():
                 bias = getattr(module, "bias", None)
                 new_module = torch.nn.Linear(module.in_features, module.out_features, bias=bias is not None)
             state = module.state if quant_method == "llm_int8" else None
-            new_module.weight = torch.nn.Parameter(dequantize_bnb_weight(module.weight, model.dtype, state))
+            new_module.weight = torch.nn.Parameter(dequantize_bnb_weight(module.weight, state))
+            weight = dequantize_bnb_weight(module.weight, state)
+            if dtype is None:
+                logger.warning_once(
+                    f"The modules are dequantized in {weight.dtype}. If you want to change the dtype, please specify `dtype` in `dequantize`. "
+                )
+            else:
+                logger.warning_once(f"The modules are dequantized in {weight.dtype} and casted to {dtype}.")
+                weight = weight.to(dtype)
+            new_module.weight = torch.nn.Parameter(weight)
             if bias is not None:
                 new_module.bias = bias
             if hasattr(module, "_hf_hook"):

src/transformers/integrations/flash_attention.py

Lines changed: 2 additions & 2 deletions
@@ -20,8 +20,8 @@ def get_target_dtype(query: torch.Tensor, module: torch.nn.Module) -> torch.dtype:
                 else torch.get_autocast_gpu_dtype()
             )
         # Handle the case where the model is quantized
-        elif hasattr(module.config, "_pre_quantization_dtype"):
-            return module.config._pre_quantization_dtype
+        elif hasattr(module.config, "quantization_config"):
+            return module.config.dtype
         else:
             return next(layer for layer in module.modules() if isinstance(layer, torch.nn.Linear)).weight.dtype
     return None
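For quantized models, the flash-attention upcast target now comes from `config.dtype` whenever a `quantization_config` is present, instead of the removed `_pre_quantization_dtype`. A minimal sketch of that resolution order (the helper and the `SimpleNamespace` stand-in for a model config are illustrative assumptions):

from types import SimpleNamespace
import torch

def resolve_target_dtype(config, fallback_linear_dtype: torch.dtype) -> torch.dtype:
    # Quantized model: trust the dtype recorded on the config.
    if hasattr(config, "quantization_config"):
        return config.dtype
    # Otherwise fall back to the first nn.Linear weight dtype, as get_target_dtype does.
    return fallback_linear_dtype

quantized_cfg = SimpleNamespace(quantization_config={"load_in_4bit": True}, dtype=torch.bfloat16)
print(resolve_target_dtype(quantized_cfg, torch.float32))  # torch.bfloat16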

src/transformers/modeling_utils.py

Lines changed: 23 additions & 19 deletions
@@ -792,6 +792,7 @@ def _get_dtype(
     sharded_metadata: Optional[dict],
     state_dict: Optional[dict],
     weights_only: bool,
+    hf_quantizer: Optional[HfQuantizer] = None,
 ) -> tuple[PreTrainedConfig, torch.dtype]:
     """Find the correct `dtype` to use based on provided arguments. Also update the `config` based on the
     inferred dtype. We do the following:
@@ -840,6 +841,9 @@ def _get_dtype(
         # set torch.get_default_dtype() (usually fp32) as the default dtype if `None` is provided
         dtype = torch.get_default_dtype()
 
+    if hf_quantizer is not None:
+        hf_quantizer.update_dtype(dtype)
+
     # Get the main dtype
     if isinstance(dtype, dict):
         main_dtype = dtype.get("", torch.get_default_dtype())
@@ -1433,7 +1437,7 @@ def tp_plan(self, plan: dict[str, str] | None):
     def pp_plan(self, plan: dict[str, tuple[str, str]]):
         self._pp_plan = plan
 
-    def dequantize(self):
+    def dequantize(self, dtype=None):
         """
         Potentially dequantize the model in case it has been quantized by a quantization method that support
         dequantization.
@@ -1443,7 +1447,7 @@ def dequantize(self):
         if hf_quantizer is None:
             raise ValueError("You need to first quantize your model in order to dequantize it")
 
-        return hf_quantizer.dequantize(self)
+        return hf_quantizer.dequantize(self, dtype=dtype)
 
     def _backward_compatibility_gradient_checkpointing(self):
         if self.supports_gradient_checkpointing and getattr(self.config, "gradient_checkpointing", False):
@@ -3875,8 +3879,8 @@ def from_pretrained(
         if "attn_implementation" in kwargs:
             config._attn_implementation = kwargs.pop("attn_implementation")
 
-        hf_quantizer, config, dtype, device_map = get_hf_quantizer(
-            config, quantization_config, dtype, device_map, weights_only, user_agent
+        hf_quantizer, config, device_map = get_hf_quantizer(
+            config, quantization_config, device_map, weights_only, user_agent
         )
 
         if gguf_file:
@@ -3923,7 +3927,9 @@ def from_pretrained(
         ]
 
         # Find the correct dtype based on current state
-        config, dtype = _get_dtype(dtype, checkpoint_files, config, sharded_metadata, state_dict, weights_only)
+        config, dtype = _get_dtype(
+            dtype, checkpoint_files, config, sharded_metadata, state_dict, weights_only, hf_quantizer
+        )
 
         config.name_or_path = pretrained_model_name_or_path
         model_init_context = cls.get_init_context(dtype, is_quantized, _is_ds_init_called)
@@ -3932,22 +3938,18 @@ def from_pretrained(
         # Let's make sure we don't run the init function of buffer modules
         model = cls(config, *model_args, **model_kwargs)
 
+        if hf_quantizer is not None:  # replace module with quantized modules (does not touch weights)
+            hf_quantizer.preprocess_model(
+                model=model,
+                dtype=dtype,
+                device_map=device_map,
+                checkpoint_files=checkpoint_files,
+                use_kernels=use_kernels,
+            )
+
         # Obtain the weight conversion mapping for this model if any are registered
         weight_conversions = get_model_conversion_mapping(model, key_mapping, hf_quantizer)
 
-        # make sure we use the model's config since the __init__ call might have copied it
-        config = model.config
-
-        if hf_quantizer is not None:  # replace module with quantized modules (does not touch weights)
-            hf_quantizer.preprocess_model(
-                model=model,
-                device_map=device_map,
-                keep_in_fp32_modules=model._keep_in_fp32_modules,  # TODO prob no longer needed?
-                config=config,
-                checkpoint_files=checkpoint_files,
-                use_kernels=use_kernels,
-            )
-
         if _torch_distributed_available and device_mesh is not None:  # add hooks to nn.Modules: no weights
             model = distribute_model(model, tp_plan, distributed_config, device_mesh, tp_size)
 
@@ -3994,7 +3996,9 @@ def from_pretrained(
 
         if hf_quantizer is not None:
             model.hf_quantizer = hf_quantizer
-            hf_quantizer.postprocess_model(model, config=config)  # usually a no-op but sometimes needed
+            hf_quantizer.postprocess_model(
+                model
+            )  # usually a no-op but sometimes needed, e.g to remove the quant config when dequantizing
 
         if _adapter_model_path is not None:
             adapter_kwargs["key_mapping"] = key_mapping

src/transformers/models/diffllama/modeling_diffllama.py

Lines changed: 2 additions & 2 deletions
@@ -361,8 +361,8 @@ def forward(
                     else torch.get_autocast_gpu_dtype()
                 )
             # Handle the case where the model is quantized
-            elif hasattr(self.config, "_pre_quantization_dtype"):
-                target_dtype = self.config._pre_quantization_dtype
+            elif hasattr(self.config, "quantization_config"):
+                target_dtype = self.config.dtype
             else:
                 target_dtype = self.q_proj.weight.dtype

src/transformers/models/diffllama/modular_diffllama.py

Lines changed: 2 additions & 2 deletions
@@ -236,8 +236,8 @@ def forward(
                     else torch.get_autocast_gpu_dtype()
                 )
             # Handle the case where the model is quantized
-            elif hasattr(self.config, "_pre_quantization_dtype"):
-                target_dtype = self.config._pre_quantization_dtype
+            elif hasattr(self.config, "quantization_config"):
+                target_dtype = self.config.dtype
             else:
                 target_dtype = self.q_proj.weight.dtype
243243

src/transformers/models/falcon/modeling_falcon.py

Lines changed: 2 additions & 2 deletions
@@ -521,8 +521,8 @@ def forward(
                     else torch.get_autocast_gpu_dtype()
                 )
             # Handle the case where the model is quantized
-            elif hasattr(self.config, "_pre_quantization_dtype"):
-                target_dtype = self.config._pre_quantization_dtype
+            elif hasattr(self.config, "quantization_config"):
+                target_dtype = self.config.dtype
             else:
                 target_dtype = self.query_key_value.weight.dtype

src/transformers/models/falcon_mamba/modeling_falcon_mamba.py

Lines changed: 1 addition & 1 deletion
@@ -345,7 +345,7 @@ def cuda_kernels_forward(
 
         # In case the model has been quantized, we need a hack to properly call the `nn.Linear` module
         # at the price of a small overhead.
-        if hasattr(self.config, "_pre_quantization_dtype"):
+        if hasattr(self.config, "quantization_config"):
            discrete_time_step = (self.dt_proj(time_step) - self.dt_proj.bias).transpose(1, 2)
         else:
            discrete_time_step = self.dt_proj.weight @ time_step.transpose(1, 2)
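The condition changes (quantization is now detected via `quantization_config`), but the hack itself is unchanged: a quantized `dt_proj` cannot be used through its packed `weight`, so the module is called and its bias subtracted. The sketch below only illustrates that, for a plain `nn.Linear`, the two paths agree (shapes are illustrative, not from the model):

import torch
from torch import nn

dt_proj = nn.Linear(8, 16, bias=True)
time_step = torch.randn(2, 5, 8)  # (batch, seq_len, dt_rank), illustrative

via_module = (dt_proj(time_step) - dt_proj.bias).transpose(1, 2)   # quantized-model path
via_weight = dt_proj.weight @ time_step.transpose(1, 2)            # direct matmul path

print(torch.allclose(via_module, via_weight, atol=1e-6))  # True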

src/transformers/models/falcon_mamba/modular_falcon_mamba.py

Lines changed: 1 addition & 1 deletion
@@ -357,7 +357,7 @@ def cuda_kernels_forward(
 
         # In case the model has been quantized, we need a hack to properly call the `nn.Linear` module
         # at the price of a small overhead.
-        if hasattr(self.config, "_pre_quantization_dtype"):
+        if hasattr(self.config, "quantization_config"):
            discrete_time_step = (self.dt_proj(time_step) - self.dt_proj.bias).transpose(1, 2)
         else:
            discrete_time_step = self.dt_proj.weight @ time_step.transpose(1, 2)

src/transformers/models/gpt_neo/modeling_gpt_neo.py

Lines changed: 2 additions & 2 deletions
@@ -237,8 +237,8 @@ def forward(
                     else torch.get_autocast_gpu_dtype()
                 )
             # Handle the case where the model is quantized
-            elif hasattr(self.config, "_pre_quantization_dtype"):
-                target_dtype = self.config._pre_quantization_dtype
+            elif hasattr(self.config, "quantization_config"):
+                target_dtype = self.config.dtype
             else:
                 target_dtype = self.q_proj.weight.dtype
