
Commit

Merge branch 'main' into trainer
yafshar committed Oct 17, 2024
2 parents cebf5d6 + f4cb594 commit 782d580
Showing 28 changed files with 470 additions and 148 deletions.
1 change: 1 addition & 0 deletions Makefile
@@ -105,6 +105,7 @@ slow_tests_diffusers: test_installs

# Run text-generation non-regression tests
slow_tests_text_generation_example: test_installs
BUILD_CUDA_EXT=0 python -m pip install -vvv --no-build-isolation git+https://github.com/HabanaAI/AutoGPTQ.git
python -m pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.18.0
python -m pytest tests/test_text_generation_example.py tests/test_encoder_decoder.py -v -s --token $(TOKEN)

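For context, the updated target is invoked like any other Make goal, with the Hugging Face token passed through the `TOKEN` variable; a minimal sketch (the token value is a placeholder):

```bash
# Runs the text-generation non-regression tests; the target itself installs the
# Habana AutoGPTQ and DeepSpeed forks before invoking pytest.
make slow_tests_text_generation_example TOKEN=<your_hf_token>
```
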
2 changes: 1 addition & 1 deletion examples/image-to-text/README.md
@@ -93,7 +93,7 @@ python3 run_pipeline.py \
```

### Inference with FP8
Inference for Llava-1.5-7b, Llava-1.5-13b, Llava-v1.6-mistral-7b and Llava-v1.6-vicuna-13b in FP8 precision is enabled using [Intel Neural Compressor (INC)](https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Inference_Using_FP8.html), which provides model measurement and quantization capabilities in PyTorch. INC is used by default for measurement and quantization. Habana Quantization Toolkit (HQT), which was used earlier, will be removed in future releases. To use HQT, disable INC by setting the following environment variable: `USE_INC=0`.
Inference for Llava-1.5-7b, Llava-1.5-13b, Llava-v1.6-mistral-7b and Llava-v1.6-vicuna-13b in FP8 precision is enabled using [Intel Neural Compressor (INC)](https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Inference_Using_FP8.html), which provides model measurement and quantization capabilities in PyTorch.

More information on enabling FP8 in SynapseAI is available here:
https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Inference_Using_FP8.html
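
The typical INC flow is a two-step recipe selected through the `QUANT_CONFIG` environment variable: a measurement run followed by a quantized inference run. A minimal sketch, assuming the measurement and quantization JSON configs shipped with the example (config paths, model name and remaining flags are placeholders mirroring the commands above):

```bash
# 1) Measurement pass: collect tensor statistics with the measurement config.
QUANT_CONFIG=./quantization_config/maxabs_measure.json python3 run_pipeline.py \
    --model_name_or_path llava-hf/llava-1.5-7b-hf \
    --use_hpu_graphs \
    --bf16

# 2) Quantized inference: reuse the statistics with the quantization config.
QUANT_CONFIG=./quantization_config/maxabs_quant.json python3 run_pipeline.py \
    --model_name_or_path llava-hf/llava-1.5-7b-hf \
    --use_hpu_graphs \
    --bf16
```
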
41 changes: 10 additions & 31 deletions examples/image-to-text/run_pipeline.py
@@ -37,43 +37,21 @@


def setup_quantization(model, args):
if os.getenv("USE_INC", "1") != "0":
try:
from neural_compressor.torch.quantization import FP8Config, convert, prepare
except ImportError:
raise ImportError(
"Module neural_compressor is missing. Please use a newer Synapse version to use quantization, or set the environment variable to USE_INC=0"
)

config = FP8Config.from_json_file(args.quant_config)
if config.measure:
model = prepare(model, config)
elif config.quantize:
model = convert(model, config)
else:
import habana_frameworks.torch.core as htcore
import habana_quantization_toolkit
from neural_compressor.torch.quantization import FP8Config, convert, prepare

habana_quantization_toolkit.prep_model(model)
htcore.hpu_initialize(model)
config = FP8Config.from_json_file(args.quant_config)
if config.measure:
model = prepare(model, config)
elif config.quantize:
model = convert(model, config)

return model


def finalize_quantization(model):
if os.getenv("USE_INC", "1") != "0":
try:
from neural_compressor.torch.quantization import finalize_calibration
except ImportError:
raise ImportError(
"Module neural_compressor is missing. Please use a newer Synapse version to use quantization, or set the environment variable to USE_INC=0"
)

finalize_calibration(model)
else:
import habana_quantization_toolkit
from neural_compressor.torch.quantization import finalize_calibration

habana_quantization_toolkit.finish_measurements(model)
finalize_calibration(model)


def main():
@@ -151,7 +129,7 @@ def main():

# set args.quant_config with env variable if it is set
args.quant_config = os.getenv("QUANT_CONFIG", "")

os.environ.setdefault("EXPERIMENTAL_WEIGHT_SHARING", "FALSE")
adapt_transformers_to_gaudi()

model_type = AutoConfig.from_pretrained(args.model_name_or_path).model_type
@@ -227,6 +205,7 @@ def main():

if args.quant_config:
generator.model = setup_quantization(generator.model, args)
htcore.hpu_initialize(generator.model)

# warm up
for i in range(args.warmup):
3 changes: 2 additions & 1 deletion examples/language-modeling/README.md
@@ -868,7 +868,8 @@ python3 ../gaudi_spawn.py --world_size 8 --use_mpi peft_poly_seq2seq_with_genera
--per_device_eval_batch_size 4 \
--bf16 \
--use_hpu_graphs_for_inference \
--use_hpu_graphs_for_training
--use_hpu_graphs_for_training \
--trust_remote_code
```


10 changes: 10 additions & 0 deletions examples/language-modeling/run_lora_clm.py
@@ -172,6 +172,10 @@ class ModelArguments:
)
},
)
flash_attention_fp8: bool = field(
default=False,
metadata={"help": ("Whether to enable flash attention in FP8.")},
)
use_fused_rope: bool = field(
default=True,
metadata={
@@ -509,6 +513,7 @@ def main():
"trust_remote_code": True if model_args.trust_remote_code else None,
"use_cache": False if training_args.gradient_checkpointing else model_args.use_cache,
"token": model_args.token,
"flash_attention_fp8": model_args.flash_attention_fp8,
}
if model_args.config_name:
config = AutoConfig.from_pretrained(model_args.config_name, **config_kwargs)
@@ -705,6 +710,11 @@ def main():
model.generation_config.use_flash_attention = True
model.generation_config.flash_attention_recompute = model_args.flash_attention_recompute
model.generation_config.flash_attention_causal_mask = model_args.flash_attention_causal_mask

if model_args.flash_attention_fp8:
import habana_frameworks.torch.hpu as hthpu

assert hthpu.get_device_name() == "GAUDI3", "Flash attention in FP8 is supported only on Gaudi3"
if not model_args.use_fused_rope:
model.generation_config.use_fused_rope = False

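For reference, `flash_attention_fp8` is exposed through `ModelArguments`, so it can be switched on from the command line like the other flash-attention options. A hedged sketch of such a run, assuming a Gaudi3 device (per the assertion above); the model, dataset, and output names are placeholders, and other required arguments of the script may be elided:

```bash
# Hypothetical invocation illustrating only the new flag; requires Gaudi3
# since FP8 flash attention is asserted above.
python3 run_lora_clm.py \
    --model_name_or_path meta-llama/Llama-2-7b-hf \
    --dataset_name tatsu-lab/alpaca \
    --do_train \
    --bf16 True \
    --use_habana \
    --use_lazy_mode \
    --use_flash_attention True \
    --flash_attention_fp8 True \
    --output_dir ./lora_fp8_out
```
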
6 changes: 6 additions & 0 deletions examples/summarization/run_summarization.py
@@ -375,6 +375,12 @@ def main():
token=model_args.token,
)

if training_args.do_train and training_args.use_compiled_autograd:
from habana_frameworks.torch.dynamo.compile_backend.experimental import enable_compiled_autograd

enable_compiled_autograd()
torch._C._set_autograd_fallback_mode("nothing")

# Log on each process the small summary:
mixed_precision = training_args.bf16 or gaudi_config.use_torch_autocast
logger.warning(
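
Because the new branch is gated on `training_args.use_compiled_autograd`, it only fires when that training argument is passed explicitly. A hedged sketch of such a run (model, dataset, Gaudi config and output paths are placeholders/assumptions; compiled autograd is only meaningful together with `torch.compile`-based, non-lazy training):

```bash
# Placeholder model/dataset/output; the Habana/t5 Gaudi config and hpu_backend
# compile backend are assumptions. --use_compiled_autograd enables the branch
# added above.
python run_summarization.py \
    --model_name_or_path t5-small \
    --dataset_name cnn_dailymail \
    --dataset_config "3.0.0" \
    --source_prefix "summarize: " \
    --gaudi_config_name Habana/t5 \
    --do_train \
    --use_habana \
    --use_lazy_mode False \
    --torch_compile_backend hpu_backend \
    --torch_compile \
    --use_compiled_autograd \
    --bf16 \
    --output_dir ./summarization_out
```
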
69 changes: 67 additions & 2 deletions examples/text-generation/README.md
@@ -470,6 +470,10 @@ cards 0-3 and cards 4-7 will be unified in two different measurement files. All
More information on the usage of the unifier script can be found in the Habana FP8 documentation: https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Inference_Using_FP8.html


> [!NOTE]
> `unify_measurements.py` does not support PCQ mode; only the default PTQ mode is supported.


### CPU memory reduction on single card

@@ -502,7 +506,7 @@ python run_generation.py \

### Loading 4 Bit Checkpoints from Hugging Face

You can load pre-quantized 4bit models with the argument `--load_quantized_model`.
You can load pre-quantized 4bit models with the argument `--load_quantized_model_with_inc`.
Currently, only uint4 checkpoints and single-device configurations are supported.
More information on enabling 4 bit inference in SynapseAI is available here:
https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Inference_Using_UINT4.html.
@@ -524,7 +528,35 @@ python run_lm_eval.py \
--attn_softmax_bf16 \
--bucket_size=128 \
--bucket_internal \
--load_quantized_model
--load_quantized_model_with_inc
```

### Loading 4 Bit Checkpoints from Neural Compressor (INC)

You can load a pre-quantized 4-bit checkpoint with the argument `--local_quantized_inc_model_path`, together with the original model specified via the argument `--model_name_or_path`.
Currently, only uint4 checkpoints and single-device configurations are supported.
**Note:** In this process, you can load a checkpoint that has been quantized using INC.
More information on enabling 4-bit inference in SynapseAI is available here:
https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Inference_Using_UINT4.html?highlight=inference%20using%20int4#enabling-and-running-uint4-in-pytorch-models.

Below is an example of loading a Llama-2-7b model with a 4-bit checkpoint quantized with INC.
Please note that the local checkpoint path is denoted as `<local_model_path_from_inc>`.
Additionally, the following environment variables are used for performance optimizations and are planned to be removed in future versions:
`SRAM_SLICER_SHARED_MME_INPUT_EXPANSION_ENABLED=false ENABLE_EXPERIMENTAL_FLAGS=1`
```bash
SRAM_SLICER_SHARED_MME_INPUT_EXPANSION_ENABLED=false ENABLE_EXPERIMENTAL_FLAGS=1 \
python run_lm_eval.py \
-o acc_load_uint4_model.txt \
--model_name_or_path meta-llama/Llama-2-7b-hf \
--use_hpu_graphs \
--use_kv_cache \
--trim_logits \
--batch_size 1 \
--bf16 \
--attn_softmax_bf16 \
--bucket_size=128 \
--bucket_internal \
--local_quantized_inc_model_path <local_model_path_from_inc>
```

### Using Habana Flash Attention
@@ -555,6 +587,37 @@ python ../gaudi_spawn.py --use_deepspeed --world_size 8 run_generation.py \

For more details see [documentation](https://docs.habana.ai/en/latest/PyTorch/Model_Optimization_PyTorch/Optimization_in_PyTorch_Models.html#using-fused-sdpa).

### Running with UINT4 weight quantization using AutoGPTQ


Llama2-7b in UINT4 weight-only quantization is enabled using the [AutoGPTQ Fork](https://github.com/HabanaAI/AutoGPTQ), which provides quantization capabilities in PyTorch.
Currently, only UINT4 inference of pre-quantized models is supported.

You can run a *UINT4 weight quantized* model using AutoGPTQ by setting the following environment variables:
`SRAM_SLICER_SHARED_MME_INPUT_EXPANSION_ENABLED=false ENABLE_EXPERIMENTAL_FLAGS=true` before running the command,
and by adding the argument `--load_quantized_model_with_autogptq`.

***Note:***
Setting the above environment variables improves performance. These variables will be removed in future releases.


Here is an example of running a quantized model `<quantized_gptq_model>`:
```bash
SRAM_SLICER_SHARED_MME_INPUT_EXPANSION_ENABLED=false \
ENABLE_EXPERIMENTAL_FLAGS=true python run_generation.py \
--attn_softmax_bf16 \
--model_name_or_path <quantized_gptq_model> \
--use_hpu_graphs \
--limit_hpu_graphs \
--use_kv_cache \
--bucket_size 128 \
--bucket_internal \
--trim_logits \
--max_new_tokens 128 \
--batch_size 1 \
--bf16 \
--load_quantized_model_with_autogptq
```

## Language Model Evaluation Harness

@@ -574,6 +637,8 @@ First, you should install the requirements:
pip install -r requirements_lm_eval.txt
```

> [!NOTE]
> If custom models from the Hub are used, set the environment variable `HF_DATASETS_TRUST_REMOTE_CODE=true` instead of the `--trust_remote_code` argument, with the installed `lm_eval` version and its `datasets==2.21.0` dependency.
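
For instance, a hedged invocation against a custom Hub model (the model name and output file are placeholders; the remaining flags mirror the examples above):

```bash
# <custom_hub_model> is a placeholder for a model repo that requires remote code.
HF_DATASETS_TRUST_REMOTE_CODE=true python run_lm_eval.py \
    -o acc_custom_model.txt \
    --model_name_or_path <custom_hub_model> \
    --use_hpu_graphs \
    --use_kv_cache \
    --batch_size 1 \
    --bf16
```
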
### Examples

@@ -3,7 +3,5 @@
"mode": "QUANTIZE",
"observer": "maxabs",
"scale_method": "ACT_MAXABS_POW2_WEIGHTS_PCS_OPT_POW2",
"allowlist": {"types": [], "names": []},
"blocklist": {"types": [], "names": []},
"dump_stats_path": "./hqt_output/measure"
}
@@ -2,8 +2,5 @@
"method": "HOOKS",
"mode": "MEASURE",
"observer": "maxabs",
"allowlist": {"types": [], "names": []},
"blocklist": {"types": [], "names": []},
"dump_stats_path": "./hqt_output/measure",
"dump_stats_xlsx_path": "./hqt_output/measure/fp8stats.xlsx"
}
"dump_stats_path": "./hqt_output/measure"
}
@@ -3,7 +3,5 @@
"mode": "MEASURE",
"observer": "maxabs",
"measure_exclude": "NONE",
"allowlist": {"types": [], "names": []},
"blocklist": {"types": [], "names": []},
"dump_stats_path": "./hqt_output/measure"
}
@@ -3,7 +3,5 @@
"mode": "QUANTIZE",
"observer": "maxabs",
"scale_method": "maxabs_hw",
"allowlist": {"types": [], "names": []},
"blocklist": {"types": [], "names": []},
"dump_stats_path": "./hqt_output/measure"
}
@@ -3,7 +3,6 @@
"mode": "QUANTIZE",
"observer": "maxabs",
"scale_method": "maxabs_hw",
"allowlist": {"types": [], "names": []},
"blocklist": {"types": [], "names": [
"matmul_qk",
"matmul_av",
@@ -0,0 +1,8 @@
{
"method": "HOOKS",
"mode": "QUANTIZE",
"observer": "maxabs",
"scale_method": "maxabs_hw",
"dump_stats_path": "./hqt_output/measure",
"scale_format": "scalar"
}
@@ -3,7 +3,5 @@
"mode": "QUANTIZE",
"observer": "maxabs",
"scale_method": "unit_scale",
"allowlist": {"types": [], "names": []},
"blocklist": {"types": [], "names": []},
"dump_stats_path": "./hqt_output/measure"
}
34 changes: 30 additions & 4 deletions examples/text-generation/quantization_tools/unify_measurements.py
@@ -79,9 +79,24 @@ def unify_measurements(
for i in range(0, len(max_inputs)):
max_inputs[i] = max(measurement_json[node_name]["inputs"][i], max_inputs[i])
if max_outputs is not None:
max_outputs = max(measurement_json[node_name]["outputs"], max_outputs)
if isinstance(max_outputs[0], list):
for i in range(0, len(max_outputs)):
for j in range(0, len(max_outputs[i])):
max_outputs[i][j] = max(
measurement_json[node_name]["outputs"][i][j], max_outputs[i][j]
)
else:
for i in range(0, len(max_outputs)):
max_outputs[i] = max(measurement_json[node_name]["outputs"][i], max_outputs[i])
if max_weight is not None:
max_weight = max(measurement_json[node_name]["params"]["weight"], max_weight)
if isinstance(max_weight, dict):
for key, values in max_weight.items():
for i in range(0, len(values)):
max_weight[key][i] = max(
measurement_json[node_name]["params"]["weight"][key][i], max_weight[key][i]
)
else:
max_weight = max(measurement_json[node_name]["params"]["weight"], max_weight)
else:
for measurement_json in measurements_jsons:
for i in range(0, len(max_inputs)):
@@ -99,9 +114,20 @@
for i in range(0, len(max_inputs)):
unified_json["Nodes"][node_name]["inputs"][i] = max_inputs[i]
if max_outputs is not None:
unified_json["Nodes"][node_name]["outputs"] = max_outputs
if isinstance(max_outputs[0], list):
for i in range(0, len(max_outputs)):
for j in range(0, len(max_outputs[i])):
unified_json["Nodes"][node_name]["outputs"][i][j] = max_outputs[i][j]
else:
for i in range(0, len(max_outputs)):
unified_json["Nodes"][node_name]["outputs"][i] = max_outputs[i]
if max_weight is not None:
unified_json["Nodes"][node_name]["params"]["weight"] = max_weight
if isinstance(max_weight, dict):
for key, values in max_weight.items():
for i in range(0, len(values)):
unified_json["Nodes"][node_name]["params"]["weight"][key][i] = max_weight[key][i]
else:
unified_json["Nodes"][node_name]["params"]["weight"] = max_weight
else:
for i in range(0, len(max_inputs)):
for j in range(0, len(max_inputs[i])):
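
The change above generalizes the unification step from scalar maxima to element-wise maxima over nested lists (per-output-tensor ranges) and dicts of lists (per-channel weight statistics). A standalone sketch of the same idea, independent of the measurement JSON layout (the helper name and sample data are illustrative, not part of the script):

```python
# Illustrative helper: element-wise maximum over values that may be scalars,
# (nested) lists, or dicts of lists -- mirroring how unify_measurements.py now
# merges per-card "outputs" and "weight" statistics.
def elementwise_max(a, b):
    if isinstance(a, dict):
        return {key: elementwise_max(a[key], b[key]) for key in a}
    if isinstance(a, list):
        return [elementwise_max(x, y) for x, y in zip(a, b)]
    return max(a, b)


# Example: merging the measurements of one node taken on two different cards.
card0 = {"outputs": [[0.5, 1.0], [2.0]], "weight": {"scale": [0.1, 0.9]}}
card1 = {"outputs": [[0.7, 0.8], [1.5]], "weight": {"scale": [0.2, 0.4]}}
unified = {key: elementwise_max(card0[key], card1[key]) for key in card0}
print(unified)  # {'outputs': [[0.7, 1.0], [2.0]], 'weight': {'scale': [0.2, 0.9]}}
```
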
1 change: 1 addition & 0 deletions examples/text-generation/requirements_lm_eval.txt
@@ -1 +1,2 @@
https://github.com/EleutherAI/lm-evaluation-harness/archive/0bf683b4e6a9df359b3156ba9ba8d62bdd47e0c0.zip
datasets==2.21.0
