diff --git a/examples/benchmark/generation_speed.py b/examples/benchmark/generation_speed.py
index 1f83f329..8f051376 100644
--- a/examples/benchmark/generation_speed.py
+++ b/examples/benchmark/generation_speed.py
@@ -176,7 +176,6 @@ def load_model_tokenizer(
         model_basename=model_basename,
         use_safetensors=use_safetensors,
         trust_remote_code=trust_remote_code,
-        warmup_triton=False,
         backend=backend,
     )
 
@@ -279,11 +278,6 @@ def main():
     logger.info(f"model quantized: {model.quantized}")
     logger.info(f"quantize config: {model.quantize_config.to_dict()}")
     logger.info(f"model device map: {model.hf_device_map}")
-
-    if args.backend == BACKEND.TRITON:
-        logger.info("warmup triton, this may take a while.")
-        model.warmup_triton()
-
     logger.info("loading data")
     examples = load_data(
         tokenizer,
diff --git a/gptqmodel/models/auto.py b/gptqmodel/models/auto.py
index 94074515..76277723 100644
--- a/gptqmodel/models/auto.py
+++ b/gptqmodel/models/auto.py
@@ -116,7 +116,6 @@ def from_quantized(
         model_basename: Optional[str] = None,
         use_safetensors: bool = True,
         trust_remote_code: bool = False,
-        warmup_triton: bool = False,
         # verify weight files matches predefined hash during loading
         # usage: hash_format:hash_value, example: md5:ugkdh232
         # supports all hashlib hash methods
@@ -136,7 +135,6 @@ def from_quantized(
             model_basename=model_basename,
             use_safetensors=use_safetensors,
             trust_remote_code=trust_remote_code,
-            warmup_triton=warmup_triton,
             verify_hash=verify_hash,
             **kwargs,
         )
diff --git a/gptqmodel/models/base.py b/gptqmodel/models/base.py
index 3e080749..bb7b8cca 100644
--- a/gptqmodel/models/base.py
+++ b/gptqmodel/models/base.py
@@ -153,21 +153,18 @@ def quantize(
         self,
         calibration_dataset: List[Dict[str, Union[List[int], torch.LongTensor]]],
         batch_size: int = 1,
-        autotune_warmup_after_quantized: bool = False,
         calibration_enable_gpu_cache: bool = True,
     ):
         if isinstance(self.quantize_config, AutoRoundQuantizeConfig):
-            self._quantize(calibration_dataset, batch_size, autotune_warmup_after_quantized,
-                           calibration_enable_gpu_cache)
+            self._quantize(calibration_dataset, batch_size, calibration_enable_gpu_cache)
         else:
             with torch.inference_mode():
-                self._quantize(calibration_dataset, batch_size, autotune_warmup_after_quantized, calibration_enable_gpu_cache)
+                self._quantize(calibration_dataset, batch_size, calibration_enable_gpu_cache)
 
     def _quantize(
         self,
         calibration_dataset: List[Dict[str, Union[List[int], torch.LongTensor]]],
         batch_size: int = 1,
-        autotune_warmup_after_quantized: bool = False,
         calibration_enable_gpu_cache: bool = True,
     ):
         if self.quantized:
@@ -551,7 +548,6 @@ def tmp(_, inp, out):
             # triton can support 2, 4, 8bits while exllama packer only supports 4bits
             backend=BACKEND.TRITON if not isinstance(self.quantize_config, AutoRoundQuantizeConfig) and self.quantize_config.format in [FORMAT.GPTQ, FORMAT.GPTQ_V2] and self.quantize_config.bits != 4 else BACKEND.AUTO,
             desc_act=self.quantize_config.desc_act,
-            warmup_triton=autotune_warmup_after_quantized,
             force_layer_back_to_cpu=force_layer_back_to_cpu,
             format=self.quantize_config.format,
         )
@@ -879,7 +875,6 @@ def from_quantized(
         model_basename: Optional[str] = None,
         use_safetensors: bool = True,
         trust_remote_code: bool = False,
-        warmup_triton: bool = False,
         format: Optional[FORMAT] = None,
         allow_unsafe_loading: bool = False,
         verify_hash: Optional[Union[str, List[str]]] = None,
@@ -1248,12 +1243,6 @@ def skip(*args, **kwargs):
 
         model.eval()
-
-        # == step6: (optional) warmup triton == #
-        if backend == BACKEND.TRITON and warmup_triton:
-            from ..nn_modules.qlinear.qlinear_tritonv2 import TritonV2QuantLinear
-
-            TritonV2QuantLinear.warmup(model, seqlen=model.seqlen)
-
         return cls(
             model,
             quantized=True,
@@ -1261,14 +1250,6 @@ def skip(*args, **kwargs):
             qlinear_kernel=qlinear_kernel,
         )
 
-    def warmup_triton(self, enabled: bool = True):
-        if not enabled:
-            return
-
-        from ..nn_modules.qlinear.qlinear_tritonv2 import TritonV2QuantLinear
-
-        TritonV2QuantLinear.warmup(self.model, seqlen=self.model.seqlen)
-
     def __getattr__(self, item):
         try:
             return super().__getattr__(item)
diff --git a/gptqmodel/nn_modules/qlinear/qlinear_tritonv2.py b/gptqmodel/nn_modules/qlinear/qlinear_tritonv2.py
index 2d8db692..ecb756d9 100644
--- a/gptqmodel/nn_modules/qlinear/qlinear_tritonv2.py
+++ b/gptqmodel/nn_modules/qlinear/qlinear_tritonv2.py
@@ -135,48 +135,5 @@ def forward(self, x):
         out = out + self.bias if self.bias is not None else out
         return out
 
-    @classmethod
-    def warmup(cls, model, transpose=False, seqlen=2048):
-        """
-        Pre-tunes the quantized kernel
-        """
-        from tqdm import tqdm
-
-        kn_values = {}
-
-        for _, m in model.named_modules():
-            if not isinstance(m, cls):
-                continue
-
-            k = m.infeatures
-            n = m.outfeatures
-
-            if (k, n) not in kn_values:
-                kn_values[(k, n)] = (
-                    m.qweight,
-                    m.scales,
-                    m.qzeros,
-                    m.g_idx,
-                    m.bits,
-                    m.maxq,
-                )
-
-        logger.info(f"Found {len(kn_values)} unique KN Linear values.")
-        logger.info("Warming up autotune cache ...")
-        with torch.no_grad():
-            for m in tqdm(range(0, math.ceil(math.log2(seqlen)) + 1)):
-                m = 2**m
-                for (k, n), (
-                    qweight,
-                    scales,
-                    qzeros,
-                    g_idx,
-                    bits,
-                    maxq,
-                ) in kn_values.items():
-                    a = torch.randn(m, k, dtype=torch.float16, device=model.device)
-                    quant_matmul_248(a, qweight, scales, qzeros, g_idx, bits, maxq)
-        del kn_values
-
 
 __all__ = ["TritonV2QuantLinear"]
diff --git a/gptqmodel/nn_modules/triton_utils/custom_autotune.py b/gptqmodel/nn_modules/triton_utils/custom_autotune.py
index 68f8c3c5..fde5ca2c 100644
--- a/gptqmodel/nn_modules/triton_utils/custom_autotune.py
+++ b/gptqmodel/nn_modules/triton_utils/custom_autotune.py
@@ -141,16 +141,7 @@ def prune_configs(self, kwargs):
         return pruned_configs
 
     def warmup(self, *args, **kwargs):
-        self.nargs = dict(zip(self.arg_names, args))
-        for config in self.prune_configs(kwargs):
-            self.fn.warmup(
-                *args,
-                num_warps=config.num_warps,
-                num_stages=config.num_stages,
-                **kwargs,
-                **config.kwargs,
-            )
-        self.nargs = None
+        pass
 
 
 def autotune(configs, key, prune_configs_by=None, reset_to_zero=None, nearest_power_of_two=False):
diff --git a/gptqmodel/utils/model.py b/gptqmodel/utils/model.py
index 4bb33d70..a905afdf 100644
--- a/gptqmodel/utils/model.py
+++ b/gptqmodel/utils/model.py
@@ -258,7 +258,6 @@ def pack_model(
     format: str,
     desc_act=False,
     sym: bool = True,
-    warmup_triton: bool = False,
     force_layer_back_to_cpu: bool = False,
 ):
     QuantLinear = select_quant_linear_with_pack(
@@ -313,11 +312,6 @@ def pack_model(
 
     logger.info("Model packed.")
 
-    if backend == BACKEND.TRITON and warmup_triton:
-        logger.warning(
-            "using autotune_warmup will move model to GPU, make sure you have enough VRAM to load the whole model."
-        )
-        QuantLinear.warmup(model.to(CUDA_0), seqlen=model.seqlen)
     return QuantLinear
 
 def verify_model_hash(file_path: str, verify_hash: str):
diff --git a/tests/test_triton.py b/tests/test_triton.py
index cda7c713..ba0bf828 100644
--- a/tests/test_triton.py
+++ b/tests/test_triton.py
@@ -66,12 +66,9 @@ def get_model_and_tokenizer(
 
     model = GPTQModel.from_quantized(
         model_id,
-        disable_exllamav2=True,
-        disable_exllama=True,
         **model_kwargs,
     )
 
-    model.warmup_triton()
     return model, tokenizer