
Commit

Merge branch 'main' into trainer
yafshar committed Oct 17, 2024
2 parents cebf5d6 + f4cb594 commit 782d580
Showing 28 changed files with 470 additions and 148 deletions.
1 change: 1 addition & 0 deletions Makefile
@@ -105,6 +105,7 @@ slow_tests_diffusers: test_installs

# Run text-generation non-regression tests
slow_tests_text_generation_example: test_installs
BUILD_CUDA_EXT=0 python -m pip install -vvv --no-build-isolation git+https://github.com/HabanaAI/AutoGPTQ.git
python -m pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.18.0
python -m pytest tests/test_text_generation_example.py tests/test_encoder_decoder.py -v -s --token $(TOKEN)

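For context, the updated target is invoked like any other Make goal, with the Hugging Face token passed through the `TOKEN` variable; a minimal sketch (the token value is a placeholder):

```bash
# Runs the text-generation non-regression tests; the target itself installs the
# Habana AutoGPTQ and DeepSpeed forks before invoking pytest.
make slow_tests_text_generation_example TOKEN=<your_hf_token>
```
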
2 changes: 1 addition & 1 deletion examples/image-to-text/README.md
@@ -93,7 +93,7 @@ python3 run_pipeline.py \
```

### Inference with FP8
Inference for Llava-1.5-7b, Llava-1.5-13b, Llava-v1.6-mistral-7b and Llava-v1.6-vicuna-13b in FP8 precision is enabled using [Intel Neural Compressor (INC)](https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Inference_Using_FP8.html), which provides model measurement and quantization capabilities in PyTorch. INC is used by default for measurement and quantization. Habana Quantization Toolkit (HQT), which was used earlier, will be removed in future releases. To use HQT, disable INC by setting the following environment variable: `USE_INC=0`.
Inference for Llava-1.5-7b, Llava-1.5-13b, Llava-v1.6-mistral-7b and Llava-v1.6-vicuna-13b in FP8 precision is enabled using [Intel Neural Compressor (INC)](https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Inference_Using_FP8.html), which provides model measurement and quantization capabilities in PyTorch.

More information on enabling FP8 in SynapseAI is available here:
https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Inference_Using_FP8.html
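
The typical INC flow is a two-step recipe selected through the `QUANT_CONFIG` environment variable: a measurement run followed by a quantized inference run. A minimal sketch, assuming the measurement and quantization JSON configs shipped with the example (config paths, model name and remaining flags are placeholders mirroring the commands above):

```bash
# 1) Measurement pass: collect tensor statistics with the measurement config.
QUANT_CONFIG=./quantization_config/maxabs_measure.json python3 run_pipeline.py \
    --model_name_or_path llava-hf/llava-1.5-7b-hf \
    --use_hpu_graphs \
    --bf16

# 2) Quantized inference: reuse the statistics with the quantization config.
QUANT_CONFIG=./quantization_config/maxabs_quant.json python3 run_pipeline.py \
    --model_name_or_path llava-hf/llava-1.5-7b-hf \
    --use_hpu_graphs \
    --bf16
```
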
41 changes: 10 additions & 31 deletions examples/image-to-text/run_pipeline.py
@@ -37,43 +37,21 @@


def setup_quantization(model, args):
if os.getenv("USE_INC", "1") != "0":
try:
from neural_compressor.torch.quantization import FP8Config, convert, prepare
except ImportError:
raise ImportError(
"Module neural_compressor is missing. Please use a newer Synapse version to use quantization, or set the environment variable to USE_INC=0"
)

config = FP8Config.from_json_file(args.quant_config)
if config.measure:
model = prepare(model, config)
elif config.quantize:
model = convert(model, config)
else:
import habana_frameworks.torch.core as htcore
import habana_quantization_toolkit
from neural_compressor.torch.quantization import FP8Config, convert, prepare

habana_quantization_toolkit.prep_model(model)
htcore.hpu_initialize(model)
config = FP8Config.from_json_file(args.quant_config)
if config.measure:
model = prepare(model, config)
elif config.quantize:
model = convert(model, config)

return model


def finalize_quantization(model):
if os.getenv("USE_INC", "1") != "0":
try:
from neural_compressor.torch.quantization import finalize_calibration
except ImportError:
raise ImportError(
"Module neural_compressor is missing. Please use a newer Synapse version to use quantization, or set the environment variable to USE_INC=0"
)

finalize_calibration(model)
else:
import habana_quantization_toolkit
from neural_compressor.torch.quantization import finalize_calibration

habana_quantization_toolkit.finish_measurements(model)
finalize_calibration(model)


def main():
@@ -151,7 +129,7 @@ def main():

# set args.quant_config with env variable if it is set
args.quant_config = os.getenv("QUANT_CONFIG", "")

os.environ.setdefault("EXPERIMENTAL_WEIGHT_SHARING", "FALSE")
adapt_transformers_to_gaudi()

model_type = AutoConfig.from_pretrained(args.model_name_or_path).model_type
@@ -227,6 +205,7 @@ def main():

if args.quant_config:
generator.model = setup_quantization(generator.model, args)
htcore.hpu_initialize(generator.model)

# warm up
for i in range(args.warmup):
3 changes: 2 additions & 1 deletion examples/language-modeling/README.md
@@ -868,7 +868,8 @@ python3 ../gaudi_spawn.py --world_size 8 --use_mpi peft_poly_seq2seq_with_genera
--per_device_eval_batch_size 4 \
--bf16 \
--use_hpu_graphs_for_inference \
--use_hpu_graphs_for_training
--use_hpu_graphs_for_training \
--trust_remote_code
```


10 changes: 10 additions & 0 deletions examples/language-modeling/run_lora_clm.py
@@ -172,6 +172,10 @@ class ModelArguments:
)
},
)
flash_attention_fp8: bool = field(
default=False,
metadata={"help": ("Whether to enable flash attention in FP8.")},
)
use_fused_rope: bool = field(
default=True,
metadata={
@@ -509,6 +513,7 @@ def main():
"trust_remote_code": True if model_args.trust_remote_code else None,
"use_cache": False if training_args.gradient_checkpointing else model_args.use_cache,
"token": model_args.token,
"flash_attention_fp8": model_args.flash_attention_fp8,
}
if model_args.config_name:
config = AutoConfig.from_pretrained(model_args.config_name, **config_kwargs)
@@ -705,6 +710,11 @@ def main():
model.generation_config.use_flash_attention = True
model.generation_config.flash_attention_recompute = model_args.flash_attention_recompute
model.generation_config.flash_attention_causal_mask = model_args.flash_attention_causal_mask

if model_args.flash_attention_fp8:
import habana_frameworks.torch.hpu as hthpu

assert hthpu.get_device_name() == "GAUDI3", "Flash attention in FP8 is supported only on Gaudi3"
if not model_args.use_fused_rope:
model.generation_config.use_fused_rope = False

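For reference, `flash_attention_fp8` is exposed through `ModelArguments`, so it can be switched on from the command line like the other flash-attention options. A hedged sketch of such a run, assuming a Gaudi3 device (per the assertion above); the model, dataset, and output names are placeholders, and other required arguments of the script may be elided:

```bash
# Hypothetical invocation illustrating only the new flag; requires Gaudi3
# since FP8 flash attention is asserted above.
python3 run_lora_clm.py \
    --model_name_or_path meta-llama/Llama-2-7b-hf \
    --dataset_name tatsu-lab/alpaca \
    --do_train \
    --bf16 True \
    --use_habana \
    --use_lazy_mode \
    --use_flash_attention True \
    --flash_attention_fp8 True \
    --output_dir ./lora_fp8_out
```
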
6 changes: 6 additions & 0 deletions examples/summarization/run_summarization.py
@@ -375,6 +375,12 @@ def main():
token=model_args.token,
)

if training_args.do_train and training_args.use_compiled_autograd:
from habana_frameworks.torch.dynamo.compile_backend.experimental import enable_compiled_autograd

enable_compiled_autograd()
torch._C._set_autograd_fallback_mode("nothing")

# Log on each process the small summary:
mixed_precision = training_args.bf16 or gaudi_config.use_torch_autocast
logger.warning(
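
Because the new branch is gated on `training_args.use_compiled_autograd`, it only fires when that training argument is passed explicitly. A hedged sketch of such a run (model, dataset, Gaudi config and output paths are placeholders/assumptions; compiled autograd is only meaningful together with `torch.compile`-based, non-lazy training):

```bash
# Placeholder model/dataset/output; the Habana/t5 Gaudi config and hpu_backend
# compile backend are assumptions. --use_compiled_autograd enables the branch
# added above.
python run_summarization.py \
    --model_name_or_path t5-small \
    --dataset_name cnn_dailymail \
    --dataset_config "3.0.0" \
    --source_prefix "summarize: " \
    --gaudi_config_name Habana/t5 \
    --do_train \
    --use_habana \
    --use_lazy_mode False \
    --torch_compile_backend hpu_backend \
    --torch_compile \
    --use_compiled_autograd \
    --bf16 \
    --output_dir ./summarization_out
```
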
69 changes: 67 additions & 2 deletions examples/text-generation/README.md
@@ -470,6 +470,10 @@ cards 0-3 and cards 4-7 will be unified in two different measurement files. All
More information on the usage of the unifier script can be found in the Habana FP8 documentation: https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Inference_Using_FP8.html


> [!NOTE]
> `unify_measurements.py` does not support PCQ mode; only the default PTQ mode is supported.


### CPU memory reduction on single card

@@ -502,7 +506,7 @@ python run_generation.py \

### Loading 4 Bit Checkpoints from Hugging Face

You can load pre-quantized 4bit models with the argument `--load_quantized_model`.
You can load pre-quantized 4bit models with the argument `--load_quantized_model_with_inc`.
Currently, only uint4 checkpoints and single-device configurations are supported.
More information on enabling 4 bit inference in SynapseAI is available here:
https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Inference_Using_UINT4.html.
@@ -524,7 +528,35 @@ python run_lm_eval.py \
--attn_softmax_bf16 \
--bucket_size=128 \
--bucket_internal \
--load_quantized_model
--load_quantized_model_with_inc
```

### Loading 4 Bit Checkpoints from Neural Compressor (INC)

You can load a pre-quantized 4-bit checkpoint with the argument `--local_quantized_inc_model_path`, together with the original model specified via the argument `--model_name_or_path`.
Currently, only uint4 checkpoints and single-device configurations are supported.
**Note:** In this process, you can load a checkpoint that has been quantized using INC.
More information on enabling 4-bit inference in SynapseAI is available here:
https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Inference_Using_UINT4.html?highlight=inference%20using%20int4#enabling-and-running-uint4-in-pytorch-models.

Below is an example of loading a Llama-2-7b model with a 4-bit checkpoint quantized with INC.
Please note that the local checkpoint path is denoted as `<local_model_path_from_inc>`.
Additionally, the following environment variables are used for performance optimizations and are planned to be removed in future versions:
`SRAM_SLICER_SHARED_MME_INPUT_EXPANSION_ENABLED=false ENABLE_EXPERIMENTAL_FLAGS=1`
```bash
SRAM_SLICER_SHARED_MME_INPUT_EXPANSION_ENABLED=false ENABLE_EXPERIMENTAL_FLAGS=1 \
python run_lm_eval.py \
-o acc_load_uint4_model.txt \
--model_name_or_path meta-llama/Llama-2-7b-hf \
--use_hpu_graphs \
--use_kv_cache \
--trim_logits \
--batch_size 1 \
--bf16 \
--attn_softmax_bf16 \
--bucket_size=128 \
--bucket_internal \
--local_quantized_inc_model_path <local_model_path_from_inc>
```

### Using Habana Flash Attention
@@ -555,6 +587,37 @@ python ../gaudi_spawn.py --use_deepspeed --world_size 8 run_generation.py \

For more details see [documentation](https://docs.habana.ai/en/latest/PyTorch/Model_Optimization_PyTorch/Optimization_in_PyTorch_Models.html#using-fused-sdpa).

### Running with UINT4 weight quantization using AutoGPTQ


Llama2-7b in UINT4 weight-only quantization is enabled using the [AutoGPTQ Fork](https://github.com/HabanaAI/AutoGPTQ), which provides quantization capabilities in PyTorch.
Currently, only UINT4 inference of pre-quantized models is supported.

You can run a *UINT4 weight quantized* model using AutoGPTQ by setting the following environment variables:
`SRAM_SLICER_SHARED_MME_INPUT_EXPANSION_ENABLED=false ENABLE_EXPERIMENTAL_FLAGS=true` before running the command,
and by adding the argument `--load_quantized_model_with_autogptq`.

***Note:***
Setting the above environment variables improves performance. These variables will be removed in future releases.


Here is an example of running a quantized model `<quantized_gptq_model>`:
```bash
SRAM_SLICER_SHARED_MME_INPUT_EXPANSION_ENABLED=false \
ENABLE_EXPERIMENTAL_FLAGS=true python run_generation.py \
--attn_softmax_bf16 \
--model_name_or_path <quantized_gptq_model> \
--use_hpu_graphs \
--limit_hpu_graphs \
--use_kv_cache \
--bucket_size 128 \
--bucket_internal \
--trim_logits \
--max_new_tokens 128 \
--batch_size 1 \
--bf16 \
--load_quantized_model_with_autogptq
```

## Language Model Evaluation Harness

@@ -574,6 +637,8 @@ First, you should install the requirements:
pip install -r requirements_lm_eval.txt
```

> [!NOTE]
> If custom models from the Hub are used, set the environment variable `HF_DATASETS_TRUST_REMOTE_CODE=true` instead of the `--trust_remote_code` argument, with the installed `lm_eval` version and its `datasets==2.21.0` dependency.
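
For instance, a hedged invocation against a custom Hub model (the model name and output file are placeholders; the remaining flags mirror the examples above):

```bash
# <custom_hub_model> is a placeholder for a model repo that requires remote code.
HF_DATASETS_TRUST_REMOTE_CODE=true python run_lm_eval.py \
    -o acc_custom_model.txt \
    --model_name_or_path <custom_hub_model> \
    --use_hpu_graphs \
    --use_kv_cache \
    --batch_size 1 \
    --bf16
```
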
### Examples

@@ -3,7 +3,5 @@
"mode": "QUANTIZE",
"observer": "maxabs",
"scale_method": "ACT_MAXABS_POW2_WEIGHTS_PCS_OPT_POW2",
"allowlist": {"types": [], "names": []},
"blocklist": {"types": [], "names": []},
"dump_stats_path": "./hqt_output/measure"
}
@@ -2,8 +2,5 @@
"method": "HOOKS",
"mode": "MEASURE",
"observer": "maxabs",
"allowlist": {"types": [], "names": []},
"blocklist": {"types": [], "names": []},
"dump_stats_path": "./hqt_output/measure",
"dump_stats_xlsx_path": "./hqt_output/measure/fp8stats.xlsx"
}
"dump_stats_path": "./hqt_output/measure"
}
@@ -3,7 +3,5 @@
"mode": "MEASURE",
"observer": "maxabs",
"measure_exclude": "NONE",
"allowlist": {"types": [], "names": []},
"blocklist": {"types": [], "names": []},
"dump_stats_path": "./hqt_output/measure"
}
@@ -3,7 +3,5 @@
"mode": "QUANTIZE",
"observer": "maxabs",
"scale_method": "maxabs_hw",
"allowlist": {"types": [], "names": []},
"blocklist": {"types": [], "names": []},
"dump_stats_path": "./hqt_output/measure"
}
@@ -3,7 +3,6 @@
"mode": "QUANTIZE",
"observer": "maxabs",
"scale_method": "maxabs_hw",
"allowlist": {"types": [], "names": []},
"blocklist": {"types": [], "names": [
"matmul_qk",
"matmul_av",
@@ -0,0 +1,8 @@
{
"method": "HOOKS",
"mode": "QUANTIZE",
"observer": "maxabs",
"scale_method": "maxabs_hw",
"dump_stats_path": "./hqt_output/measure",
"scale_format": "scalar"
}
@@ -3,7 +3,5 @@
"mode": "QUANTIZE",
"observer": "maxabs",
"scale_method": "unit_scale",
"allowlist": {"types": [], "names": []},
"blocklist": {"types": [], "names": []},
"dump_stats_path": "./hqt_output/measure"
}
34 changes: 30 additions & 4 deletions examples/text-generation/quantization_tools/unify_measurements.py
@@ -79,9 +79,24 @@ def unify_measurements(
for i in range(0, len(max_inputs)):
max_inputs[i] = max(measurement_json[node_name]["inputs"][i], max_inputs[i])
if max_outputs is not None:
max_outputs = max(measurement_json[node_name]["outputs"], max_outputs)
if isinstance(max_outputs[0], list):
for i in range(0, len(max_outputs)):
for j in range(0, len(max_outputs[i])):
max_outputs[i][j] = max(
measurement_json[node_name]["outputs"][i][j], max_outputs[i][j]
)
else:
for i in range(0, len(max_outputs)):
max_outputs[i] = max(measurement_json[node_name]["outputs"][i], max_outputs[i])
if max_weight is not None:
max_weight = max(measurement_json[node_name]["params"]["weight"], max_weight)
if isinstance(max_weight, dict):
for key, values in max_weight.items():
for i in range(0, len(values)):
max_weight[key][i] = max(
measurement_json[node_name]["params"]["weight"][key][i], max_weight[key][i]
)
else:
max_weight = max(measurement_json[node_name]["params"]["weight"], max_weight)
else:
for measurement_json in measurements_jsons:
for i in range(0, len(max_inputs)):
@@ -99,9 +114,20 @@
for i in range(0, len(max_inputs)):
unified_json["Nodes"][node_name]["inputs"][i] = max_inputs[i]
if max_outputs is not None:
unified_json["Nodes"][node_name]["outputs"] = max_outputs
if isinstance(max_outputs[0], list):
for i in range(0, len(max_outputs)):
for j in range(0, len(max_outputs[i])):
unified_json["Nodes"][node_name]["outputs"][i][j] = max_outputs[i][j]
else:
for i in range(0, len(max_outputs)):
unified_json["Nodes"][node_name]["outputs"][i] = max_outputs[i]
if max_weight is not None:
unified_json["Nodes"][node_name]["params"]["weight"] = max_weight
if isinstance(max_weight, dict):
for key, values in max_weight.items():
for i in range(0, len(values)):
unified_json["Nodes"][node_name]["params"]["weight"][key][i] = max_weight[key][i]
else:
unified_json["Nodes"][node_name]["params"]["weight"] = max_weight
else:
for i in range(0, len(max_inputs)):
for j in range(0, len(max_inputs[i])):
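
The change above generalizes the unification step from scalar maxima to element-wise maxima over nested lists (per-output-tensor ranges) and dicts of lists (per-channel weight statistics). A standalone sketch of the same idea, independent of the measurement JSON layout (the helper name and sample data are illustrative, not part of the script):

```python
# Illustrative helper: element-wise maximum over values that may be scalars,
# (nested) lists, or dicts of lists -- mirroring how unify_measurements.py now
# merges per-card "outputs" and "weight" statistics.
def elementwise_max(a, b):
    if isinstance(a, dict):
        return {key: elementwise_max(a[key], b[key]) for key in a}
    if isinstance(a, list):
        return [elementwise_max(x, y) for x, y in zip(a, b)]
    return max(a, b)


# Example: merging the measurements of one node taken on two different cards.
card0 = {"outputs": [[0.5, 1.0], [2.0]], "weight": {"scale": [0.1, 0.9]}}
card1 = {"outputs": [[0.7, 0.8], [1.5]], "weight": {"scale": [0.2, 0.4]}}
unified = {key: elementwise_max(card0[key], card1[key]) for key in card0}
print(unified)  # {'outputs': [[0.7, 1.0], [2.0]], 'weight': {'scale': [0.2, 0.9]}}
```
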
1 change: 1 addition & 0 deletions examples/text-generation/requirements_lm_eval.txt
@@ -1 +1,2 @@
https://github.com/EleutherAI/lm-evaluation-harness/archive/0bf683b4e6a9df359b3156ba9ba8d62bdd47e0c0.zip
datasets==2.21.0
