158 commits
94b127b
Add Dynamic Quant Method for DSv3-R1
hlin99 Feb 17, 2025
701f77b
Add Support for 2-node vLLM Serving
hlin99 Feb 17, 2025
8a60b23
Recover Graph Warm Up
Wei-Lin-Intel Feb 17, 2025
4cf51d0
Remove Quant Scale Padding Weights
Wei-Lin-Intel Feb 18, 2025
91d31ea
add mark step in deepseek v3 to break graph into small pieces
Wei-Lin-Intel Feb 18, 2025
10d999d
update the ip and gloo
yiliu30 Feb 18, 2025
99f9e4a
fix if name
Feb 18, 2025
608ceb2
fix unquantized method
yiliu30 Feb 18, 2025
931cbfb
patch for inc
Yi4Liu Feb 18, 2025
23ca46c
Merge branch 'yang/deepseek_r1_g2_dynamic_quant' into yi-2nodes
Yi4Liu Feb 18, 2025
e882bd4
remove hpu_fused_moe
yiliu30 Feb 18, 2025
4b70608
disable log
yiliu30 Feb 18, 2025
b89268c
add debug
yiliu30 Feb 18, 2025
3946218
revert debug info
yiliu30 Feb 18, 2025
ca62b4c
uncomment debug
yiliu30 Feb 18, 2025
3b8edd8
add debug info
yiliu30 Feb 18, 2025
6358d2b
disbale init dynamic moe
yiliu30 Feb 18, 2025
c2ed14c
remove clone
yiliu30 Feb 18, 2025
fd9fa96
refine log
yiliu30 Feb 18, 2025
ebc8660
add inc quant config
Yi4Liu Feb 19, 2025
f33414e
add more quant method
Yi4Liu Feb 19, 2025
a4693bb
add more inc to hpu
Yi4Liu Feb 19, 2025
6332d93
add qconfig
Yi4Liu Feb 19, 2025
a4cdd48
add more log
Yi4Liu Feb 19, 2025
4542c1a
fix inc check
Yi4Liu Feb 19, 2025
e77173c
add rank
Yi4Liu Feb 19, 2025
12e1db3
add barr
Yi4Liu Feb 19, 2025
ea0e86f
sleep for dump
Yi4Liu Feb 19, 2025
2110a81
print model
Yi4Liu Feb 19, 2025
f32f724
add rank debug
Yi4Liu Feb 19, 2025
c3bc8ea
print inc model with rank
Yi4Liu Feb 19, 2025
294d0b2
debug more
Yi4Liu Feb 19, 2025
445c833
fixed rank debug
Yi4Liu Feb 19, 2025
c1c226f
fix quant check
Yi4Liu Feb 19, 2025
de72bc8
fix inc check
Yi4Liu Feb 19, 2025
e7f0968
fix the num_expert_group
Yi4Liu Feb 19, 2025
95ececd
fix hidden shape
Yi4Liu Feb 19, 2025
d66d0cb
fix num_expert_group
Yi4Liu Feb 19, 2025
98c1970
fix hidden shape
Yi4Liu Feb 19, 2025
f2c4964
update the num_expert_group
Yi4Liu Feb 19, 2025
ca1d161
revert weight set
Yi4Liu Feb 19, 2025
2df9dc8
disable fused moe init
Yi4Liu Feb 19, 2025
9172b49
add ep rank back
Yi4Liu Feb 19, 2025
62f9abf
fix ep_rank and ep_shift
Yi4Liu Feb 19, 2025
aee1628
clean debug info
Yi4Liu Feb 19, 2025
6d146eb
add envs info
yiliu30 Feb 19, 2025
e10e30f
add example
yiliu30 Feb 19, 2025
583f576
add g5 envs info
yiliu30 Feb 19, 2025
eb40336
add real datasets
Yi4Liu Feb 19, 2025
5f1310e
512 samples
Yi4Liu Feb 19, 2025
1ad1669
not get weight from layer
Yi4Liu Feb 20, 2025
c94fbdf
revert RAY_DEDUP_LOGS
Yi4Liu Feb 20, 2025
0571a4e
add cpu weight for n2_quant
Yi4Liu Feb 20, 2025
671b49e
add long p
Yi4Liu Feb 20, 2025
eded950
add dataset
Yi4Liu Feb 20, 2025
4bd82d5
add prompts for prep and quant
Yi4Liu Feb 20, 2025
9560531
use token directly
Yi4Liu Feb 20, 2025
7bc5e67
fix args
Yi4Liu Feb 20, 2025
0ae617a
update all examples
Yi4Liu Feb 20, 2025
d4337de
add utils
Yi4Liu Feb 20, 2025
a534805
fix
Yi4Liu Feb 20, 2025
2bcdb66
update the prompt to prompt_token_ids
Yi4Liu Feb 20, 2025
b9707ec
upadte the gen
Yi4Liu Feb 20, 2025
f9d9bff
gen pile
Yi4Liu Feb 20, 2025
4763d8e
Correct Accuracy Issue for grouped_topk and Merge pull/13474
Wei-Lin-Intel Feb 20, 2025
c05f401
use pile
Yi4Liu Feb 20, 2025
f27f3b1
refine print
Yi4Liu Feb 20, 2025
285426e
fix print
Yi4Liu Feb 20, 2025
62d7e3c
add smoke test
Yi4Liu Feb 20, 2025
ee252b9
add check nan
Yi4Liu Feb 20, 2025
2b32260
fix check nan
Yi4Liu Feb 20, 2025
bc3a26c
use p for smoke
Yi4Liu Feb 20, 2025
6a2f693
add measurement results on g4
Feb 21, 2025
1f26a84
add measurement results on g5
Feb 21, 2025
1d22c0f
add preapre smoke
Yi4Liu Feb 21, 2025
d91e1f2
use same prompt
Yi4Liu Feb 21, 2025
0a48ade
refine log
Yi4Liu Feb 21, 2025
8c24a4a
correct prepare
Yi4Liu Feb 21, 2025
b5fe4ac
remove low cpu mem in prepare
Yi4Liu Feb 21, 2025
298d4ed
get pile only
Yi4Liu Feb 21, 2025
bd9dc3b
add 4layer ep16 tp16
Feb 21, 2025
289921f
add 4 layers preapre g5
wenchao987 Feb 21, 2025
672b327
move 4 layers json info one folder
Yi4Liu Feb 21, 2025
99ae83c
move 4 layers json info one folder
Yi4Liu Feb 21, 2025
034f5cc
add ep8 tp8 example
Yi4Liu Feb 21, 2025
e92173d
add calibaration result on ep 8 tp 8
Feb 21, 2025
5811e50
add unified results
yiliu30 Feb 21, 2025
1622dd2
add npz
yiliu30 Feb 21, 2025
1dc0f42
add results
yiliu30 Feb 21, 2025
fb6eabf
add res
yiliu30 Feb 21, 2025
138f3f1
replace
yiliu30 Feb 21, 2025
ad8e2f7
update
yiliu30 Feb 21, 2025
688b332
update
yiliu30 Feb 21, 2025
743d56f
update
yiliu30 Feb 21, 2025
f9d49c1
512
yiliu30 Feb 21, 2025
1c8ac47
low cpu mem
yiliu30 Feb 21, 2025
4328b87
use 2024 for quick test
Yi4Liu Feb 22, 2025
00085d6
add docs
Yi4Liu Feb 22, 2025
1ce4308
remove measurments results
Yi4Liu Feb 22, 2025
e3e6abc
update docs and remove measure results
Yi4Liu Feb 22, 2025
82c8d66
eval bf16 model
Yi4Liu Feb 22, 2025
827bc2c
use bs 1
Yi4Liu Feb 22, 2025
a254af8
Merge branch 'yang/deepseek_r1_g2' into p22
Yi4Liu Feb 22, 2025
e8ec061
use g2 model
Yi4Liu Feb 22, 2025
1342352
disbale profile_run
Yi4Liu Feb 22, 2025
af14601
eval qmodel
Yi4Liu Feb 22, 2025
6c3ca68
run lm-eval bf16
Yi4Liu Feb 22, 2025
599ab79
print result as table
Yi4Liu Feb 22, 2025
d15a560
change max len of bf16 to 2048
Yi4Liu Feb 22, 2025
ef2e454
test 128 samples
Yi4Liu Feb 22, 2025
ded5e65
enable mla
Yi4Liu Feb 22, 2025
d67fc7a
Merge branch 'p22' into p22-rebase
Yi4Liu Feb 22, 2025
322b75d
decrease the max_model_len to 2048
Yi4Liu Feb 22, 2025
ce28d86
revert max_model_len
Yi4Liu Feb 22, 2025
4b4e196
update params
Yi4Liu Feb 23, 2025
13830cb
show mem
Yi4Liu Feb 23, 2025
f7c5324
add debug info
Yi4Liu Feb 23, 2025
574fea4
fix
Yi4Liu Feb 23, 2025
9f27adc
add more debug info
Yi4Liu Feb 23, 2025
ed45a38
fix
Yi4Liu Feb 23, 2025
31acb66
fetch one prompt once
Yi4Liu Feb 23, 2025
0aea6f8
fix print
Yi4Liu Feb 23, 2025
52777f0
use bs 1
Yi4Liu Feb 23, 2025
4440ef0
refine log
Yi4Liu Feb 23, 2025
2792f9c
revert gen
Yi4Liu Feb 23, 2025
8b4da84
use bs 1 for eval
Yi4Liu Feb 23, 2025
9dcc21b
fix lm eval
Yi4Liu Feb 23, 2025
77355ff
format code
Yi4Liu Feb 23, 2025
ed40cbd
refine eval
Yi4Liu Feb 23, 2025
110ea6f
test ray
Yi4Liu Feb 23, 2025
d2ae76f
add drop
Yi4Liu Feb 23, 2025
1cfad40
disbale TOKENIZERS_PARALLELISM
Yi4Liu Feb 23, 2025
527744f
test all
Yi4Liu Feb 23, 2025
f4f7b82
add more docs
Yi4Liu Feb 24, 2025
36fe420
run lm-eval one node
Yi4Liu Feb 24, 2025
8481ea6
cp mengni fix
Feb 24, 2025
e76c504
add inc quant smoke demo
Yi4Liu Feb 25, 2025
0f6c44d
del some attrs from self_attn
Yi4Liu Feb 26, 2025
cf7c90e
use fp kv cache
Yi4Liu Feb 26, 2025
ca37f77
update quant example
Yi4Liu Feb 26, 2025
4cc5c75
update example
Yi4Liu Feb 26, 2025
85c524b
debug shape
Yi4Liu Feb 27, 2025
1b3df4f
debug more
Yi4Liu Feb 27, 2025
5c2227a
add markstep
Yi4Liu Feb 27, 2025
e099d36
use bs 1
Yi4Liu Feb 27, 2025
0e1aebb
add more debug info
Yi4Liu Feb 27, 2025
46c5ca9
update log
Yi4Liu Feb 27, 2025
f5276a5
update log
Yi4Liu Feb 27, 2025
43e075e
debug
Yi4Liu Feb 27, 2025
caadc7b
add prefix
Yi4Liu Feb 27, 2025
3c3eae2
debug
Yi4Liu Feb 27, 2025
95b5e02
debug lm-head
Yi4Liu Feb 27, 2025
0ea2f5c
debug logist
Yi4Liu Feb 27, 2025
27bb439
update log
Yi4Liu Feb 27, 2025
473411c
fix
Yi4Liu Feb 27, 2025
7af0fb0
update
Yi4Liu Feb 27, 2025
1f64664
update gpu mem utilization to 0.8
Yi4Liu Feb 27, 2025
ae0e2bb
refine debug
Yi4Liu Feb 27, 2025
53 changes: 53 additions & 0 deletions scripts/Quantize_BF16_R1_on_Single_Note.md
@@ -0,0 +1,53 @@
# Notes on quantizing vLLM DeepSeek V3/R1 using INC

## Prerequisites

- Hardware: ~~2xG3~~ ~~2x8XG3 or 2x8XG2~~ 8XG2 or 8XG3
- Docker: 1.20.0-521

- INC https://github.com/intel/neural-compressor/tree/dev/yi/quant_vllm-patch-19

```bash
git clone https://github.com/intel/neural-compressor.git inc
cd inc
git checkout dev/yi/quant_vllm-patch-19
pip install -r requirements.txt
pip install -r requirements_pt.txt
python setup.py pt develop
```
- vLLM https://github.com/yiliu30/vllm-fork/pull/13

```bash
cd vllm
pip install -r requirements-hpu.txt
VLLM_TARGET_DEVICE=hpu pip install -e . --no-build-isolation
```
- Model
  - ~~Reduced DeepSeek V3 model (4 layers with random weights)~~
  - ~~Reduced DeepSeek V3 model (4 layers with real weights)~~
  - DeepSeek R1 (BF16)

## Example
- Quantize the BF16 model using the unified measurement results collected on 2x8XG2.


```bash
# From the vLLM repo root
cd vllm
cd scripts
# Download the unified measurement results.
# Make sure that `nc_workspace_tmp` ends up under the `scripts` folder.
git clone https://huggingface.co/Yi30/nc_workspace_tmp
# Run example
python n2_ep8_tp8.py --mode q
```

> [!CAUTION]
> - `QUANT_CONFIG` is hard-coded in [1](https://github.com/yiliu30/vllm-fork/blob/bc3a26c3d6143b6405ef9af7e06f6eddcbcbdad0/scripts/g4_multi_nodes_source.sh#L34C8-L34C20) and [2](https://github.com/yiliu30/vllm-fork/blob/bc3a26c3d6143b6405ef9af7e06f6eddcbcbdad0/scripts/g5_multi_nodes_source.sh#L38).
> - `VLLMKVCache`, `KVCache`, and `lm-head` are currently skipped during quantization; they will be added back later.
> - ~~FAKE `EP` was hard-coded as 16. Please check `TEMP_EP` in vllm and `DEEPSEEK_EP` in INC.~~
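
If your checkout lives somewhere else, you can override the hard-coded value by exporting `QUANT_CONFIG` yourself before running the example. A minimal sketch with a placeholder path (adjust it to your own clone):

```bash
# Placeholder path: point it at the same location the launch scripts use.
export QUANT_CONFIG=/path/to/vllm-fork/scripts
python n2_ep8_tp8.py --mode q
```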


## Others
1. Measured on 2x8G2 w/ 513 samples: https://huggingface.co/Yi30/nc_workspace_tmp_pile_512_backup
2. 4-layer smoke test on 8G2: https://huggingface.co/Yi30/nc_workspace_tmp_4l_ep8_tp8
3. Merged result of 1): https://huggingface.co/Yi30/nc_workspace_tmp
4. 4 layers on 2x8G2: https://huggingface.co/Yi30/nc_workspace_tmp_4l_smoke
41 changes: 41 additions & 0 deletions scripts/check_nan.py
@@ -0,0 +1,41 @@
import os
import json
import math


def check_values(obj, key_path="", filename=""):
    """Recursively checks if innermost values are valid numbers, prints issues."""
    if isinstance(obj, dict):
        for key, value in obj.items():
            new_key_path = f"{key_path}.{key}" if key_path else key
            check_values(value, new_key_path, filename)
    elif isinstance(obj, list):
        for idx, item in enumerate(obj):
            check_values(item, f"{key_path}[{idx}]", filename)
    else:
        if (
            not isinstance(obj, (int, float))
            or math.isnan(obj)
            or math.isinf(obj)
        ):
            print(f"Invalid number in {filename} at '{key_path}': {obj}")


def check_json_files(directory):
    """Iterates through all JSON files in a directory and checks their values."""
    for filename in os.listdir(directory):
        if "mod_list" in filename:
            continue
        if filename.endswith(".json"):
            filepath = os.path.join(directory, filename)
            try:
                with open(filepath, "r", encoding="utf-8") as file:
                    data = json.load(file)
                check_values(data, filename=filename)
            except (json.JSONDecodeError, IOError) as e:
                print(f"Error reading {filename}: {e}")


# Set your directory containing JSON files
json_directory = "./nc_workspace_tmp/" # Change this to your actual directory
check_json_files(json_directory)
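# Usage note: point `json_directory` above at the INC measurement dump (default
# ./nc_workspace_tmp/) and run `python check_nan.py`; only NaN/Inf/non-numeric
# entries and unreadable files are reported.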
176 changes: 176 additions & 0 deletions scripts/convert_bf16_to_fp8_dyn_quant.py
@@ -0,0 +1,176 @@
import os
import torch
import tqdm
from loguru import logger
import logging
import safetensors
from safetensors import safe_open
from safetensors.torch import save_file
import json

logging.basicConfig(level=logging.DEBUG)
torch.set_grad_enabled(False)

# CONSTANTS
SAFETENSORS = "safetensors"
WEIGHT_SCALE_NAME = "weight_scale_inv" #"scale_weight"
INPUT_SCALE_NAME = "scale_input"
SCALE_DTYPE = torch.bfloat16
SCALE_FILE_NAME = f"scales.{SAFETENSORS}"
FULL_RANGE = torch.finfo(torch.float8_e4m3fn).max
WEIGHT_BACKOFF = 0.5
QUANT_MODULE_TYPES = (torch.nn.Linear,)
SKIP_WEIGHT_LST = {
    "enorm.weight",
    "hnorm.weight",
    "eh_proj.weight",
    "shared_head.norm.weight",
    "shared_head.head.weight",
    "model.norm",
    "layernorm",
    "e_score_correction_bias",
    "lm_head.weight",
    "embed_tokens",
    "mlp.gate.weight",  # mlp.gate is not linear
}
"""
# https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Quantization/Inference_Using_FP8.html?highlight=backoff#supported-json-config-file-options
Similarly, the maxabs value of a weight is scaled to weight_backoff*FP8_143_FULLSCALE. The default values are input_backoff=0.25 and weight_backoff=0.5.
"""
MODEL_STATE_DICT_MAPPING_FILENAME = "model.safetensors.index.json"


def skip_weight(weight_name):
    return any(skip_name in weight_name for skip_name in SKIP_WEIGHT_LST)


def get_cpu_mem_size_in_gb():
    import psutil

    mem = psutil.virtual_memory()
    # psutil reports bytes; convert to GiB to match the function name.
    return mem.available / (1024**3)


def get_all_weight_filename(model_path):
    all_files = os.listdir(model_path)
    all_weight_filename = []
    for file in all_files:
        if file.endswith(f".{SAFETENSORS}"):
            all_weight_filename.append(file)
    return all_weight_filename


# from _fp8_quant/_core/fp_utils.py
def calc_maxabs_scale(xmaxabs, fullscale, backoff=1):
    scale = xmaxabs / (fullscale * backoff)
    return scale


def quant_tensor(tensor):
    # Note:
    # 1. Check the scale dtype
    # 2. Check the scale shape
    amax = tensor.abs().max(dim=1).values + 1e-8
    scale = calc_maxabs_scale(amax, FULL_RANGE, WEIGHT_BACKOFF)
    scale = scale.to(SCALE_DTYPE)
    qtensor = tensor / scale.unsqueeze(1)
    clipped_qtensor = torch.clamp(qtensor, -FULL_RANGE, FULL_RANGE)
    clipped_qtensor_fp8 = clipped_qtensor.to(torch.float8_e4m3fn)
    return scale.float(), clipped_qtensor_fp8
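# Illustrative note: the returned FP8 tensor can be dequantized back with
# `qtensor.float() * scale.unsqueeze(1)`, which should match the original BF16
# weight up to FP8 rounding/clipping error.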


def _maybe_create_dir(qmodel_path):
    if not os.path.exists(qmodel_path):
        os.makedirs(qmodel_path)


def quant_model_weight_with_low_cpu_usage(model_path, qmodel_path):
    _maybe_create_dir(qmodel_path)
    all_weight_filename = get_all_weight_filename(model_path)
    files_cnt = len(all_weight_filename)
    logger.info(f"Got {len(all_weight_filename)} weight files")
    qtensor_mapping = {}
    for i, filename in enumerate(all_weight_filename):
        logger.info(f"Processing {i + 1}/{len(all_weight_filename)}: {filename}")
        file_path = os.path.join(model_path, filename)
        qmodel_file_name = filename
        qmodel_file_path = os.path.join(qmodel_path, qmodel_file_name)
        qtensors = {}
        with safe_open(file_path, framework="pt", device="cpu") as f:
            for weight_name in f.keys():
                weight = f.get_tensor(weight_name)
                if skip_weight(weight_name):
                    logger.debug(f"Skipping quantize {weight_name}")
                    qtensors[weight_name] = weight
                    qtensor_mapping[weight_name] = qmodel_file_name
                    continue
                logger.debug(f"[{i + 1}/{files_cnt}] Processing {weight_name}")
                scale, qtensor = quant_tensor(weight)
                prefix_name = weight_name[: -len(".weight")]
                scale_name = f"{prefix_name}.{WEIGHT_SCALE_NAME}"
                qtensors[scale_name] = scale
                qtensors[weight_name] = qtensor
                qtensor_mapping[scale_name] = qmodel_file_name
                qtensor_mapping[weight_name] = qmodel_file_name
        logger.debug(f"[{i + 1}/{files_cnt}] Saving {len(qtensors)} tensors to {qmodel_file_path}")
        # qmodel_file_path already includes qmodel_path, so no extra join is needed.
        save_file(qtensors, qmodel_file_path)
    # Dump tensor mapping into json file
    model_state_dict_mapping_file_path = os.path.join(qmodel_path, MODEL_STATE_DICT_MAPPING_FILENAME)
    logger.info(f"Saving tensor mapping to {model_state_dict_mapping_file_path}")
    state_dict_mapping = {
        "metadata": {},
        "weight_map": qtensor_mapping,
    }
    with open(model_state_dict_mapping_file_path, "w") as f:
        json.dump(state_dict_mapping, f, indent=4)


def _import_oh():
    import transformers
    from optimum.habana.transformers.modeling_utils import adapt_transformers_to_gaudi

    orig_check_support_param_buffer_assignment = transformers.modeling_utils.check_support_param_buffer_assignment
    adapt_transformers_to_gaudi()
    transformers.modeling_utils.check_support_param_buffer_assignment = orig_check_support_param_buffer_assignment


@torch.no_grad()
def static_quant_model_tran(model_path, qmodel_path):
    # assert get_cpu_mem_size_in_gb() > 800, "Not enough memory, please use quant_model_weight_with_low_cpu_usage"
    import transformers
    from patch_for_ds import patch_transformers

    # _import_oh()
    patch_transformers()
    model = transformers.AutoModelForCausalLM.from_pretrained(
        model_path,
        torch_dtype="auto",
        low_cpu_mem_usage=True,
        trust_remote_code=True,
    )
    for name, module in model.named_modules():
        if not isinstance(module, QUANT_MODULE_TYPES) or skip_weight(name):
            logger.debug(f"Skipping quantize {name}")
            continue
        logger.debug(f"Processing {name}")
        weight = module.weight
        scale, qtensor = quant_tensor(weight)
        module.weight.data = qtensor
        setattr(module, "scale_weight", torch.nn.Parameter(scale, requires_grad=False))
    logger.info(f"Saving quantized model to {qmodel_path}")
    model.save_pretrained(qmodel_path)


if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--model_path", type=str, required=True)
    parser.add_argument("--qmodel_path", type=str, required=True)
    parser.add_argument("--low_cpu_mem", action="store_true", help="Load weight files one by one to reduce memory usage")
    args = parser.parse_args()
    if args.low_cpu_mem:
        quant_model_weight_with_low_cpu_usage(args.model_path, args.qmodel_path)
    else:
        static_quant_model_tran(args.model_path, args.qmodel_path)
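# Example invocation (paths are placeholders):
#   python convert_bf16_to_fp8_dyn_quant.py --model_path /path/to/DeepSeek-R1-BF16 \
#       --qmodel_path /path/to/DeepSeek-R1-FP8 --low_cpu_mem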

61 changes: 61 additions & 0 deletions scripts/g4_multi_nodes_source.sh
@@ -0,0 +1,61 @@
#! /bin/bash
# set -x
BASH_DIR=$(dirname "${BASH_SOURCE[0]}")
source "$BASH_DIR"/utils.sh
ray stop --force
# DO NOT change unless you fully understand its purpose
export PT_HPU_ENABLE_LAZY_COLLECTIVES=true
export PT_HPUGRAPH_DISABLE_TENSOR_CACHE=1
export HCCL_OVER_OFI=1
export HCCL_GAUDI_DIRECT=1
export HCCL_SOCKET_IFNAME=enx6c1ff7012f87
export LIBFABRIC_ROOT=/opt/habanalabs/libfabric-1.22.0
export LD_LIBRARY_PATH=/opt/amazon/openmpi/lib:/opt/habanalabs/libfabric-1.22.0/lib:/usr/lib/habanalabs
export GLOO_SOCKET_IFNAME=enx6c1ff7012f87
export VLLM_HOST_IP=10.239.128.244
export HABANA_VISIBLE_DEVICES="ALL"
export VLLM_MLA_DISABLE_REQUANTIZATION=1
export PT_HPU_ENABLE_LAZY_COLLECTIVES="true"
export VLLM_RAY_DISABLE_LOG_TO_DRIVER="1"
export RAY_IGNORE_UNHANDLED_ERRORS="1"
export PT_HPU_WEIGHT_SHARING=0
export HABANA_VISIBLE_MODULES="0,1,2,3,4,5,6,7"
export PT_HPUGRAPH_DISABLE_TENSOR_CACHE=1
export VLLM_MOE_N_SLICE=8
export VLLM_EP_SIZE=16
export VLLM_TP_SIZE=16
export PT_HPU_RECIPE_CACHE_CONFIG=/tmp/recipe_cache,True,16384
export VLLM_SKIP_WARMUP="true"
export VLLM_LOGGING_LEVEL="DEBUG"
block_size=128
# End of the DO NOT change section

# INC
export QUANT_CONFIG="/mnt/disk3/yiliu4/vllm-fork/scripts"

# memory footprint tuning params
export VLLM_GPU_MEMORY_UTILIZATION=0.98
export VLLM_GRAPH_RESERVED_MEM=0.35
export VLLM_GRAPH_PROMPT_RATIO=0
# params
# max_num_batched_tokens=2048
# max_num_seqs=1024
# input_min=1024
# input_max=4096
# output_max=1024

# For the prepare run
max_num_batched_tokens=2048
max_num_seqs=1024
input_min=1024
input_max=1024
output_max=32


unset VLLM_PROMPT_BS_BUCKET_MIN VLLM_PROMPT_BS_BUCKET_STEP VLLM_PROMPT_BS_BUCKET_MAX
unset VLLM_PROMPT_SEQ_BUCKET_MIN VLLM_PROMPT_SEQ_BUCKET_STEP VLLM_PROMPT_SEQ_BUCKET_MAX
unset VLLM_DECODE_BS_BUCKET_MIN VLLM_DECODE_BS_BUCKET_STEP VLLM_DECODE_BS_BUCKET_MAX
unset VLLM_DECODE_BLOCK_BUCKET_MIN VLLM_DECODE_BLOCK_BUCKET_STEP VLLM_DECODE_BLOCK_BUCKET_MAX
set_bucketing
echo " environments are reseted "
env | grep VLLM
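# Usage sketch (assumption): source this file on the G4 head node before
# launching Ray / vLLM serving, e.g. `source g4_multi_nodes_source.sh`, so the
# exports and bucketing settings above apply to the launcher shell.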
60 changes: 60 additions & 0 deletions scripts/g5_multi_nodes_source.sh
@@ -0,0 +1,60 @@
#! /bin/bash
# set -x
BASH_DIR=$(dirname "${BASH_SOURCE[0]}")
source "$BASH_DIR"/utils.sh
ray stop --force
# DO NOT change unless you fully understand its purpose
export PT_HPU_ENABLE_LAZY_COLLECTIVES=true
export PT_HPUGRAPH_DISABLE_TENSOR_CACHE=1
export HCCL_OVER_OFI=1
export HCCL_GAUDI_DIRECT=1
export HCCL_SOCKET_IFNAME=enx6c1ff7012f4d
export LIBFABRIC_ROOT=/opt/habanalabs/libfabric-1.22.0
export LD_LIBRARY_PATH=/opt/amazon/openmpi/lib:/opt/habanalabs/libfabric-1.22.0/lib:/usr/lib/habanalabs
export GLOO_SOCKET_IFNAME=enx6c1ff7012f4d
export VLLM_HOST_IP=10.239.129.40
export HABANA_VISIBLE_DEVICES="ALL"
export VLLM_MLA_DISABLE_REQUANTIZATION=1
export PT_HPU_ENABLE_LAZY_COLLECTIVES="true"
export VLLM_RAY_DISABLE_LOG_TO_DRIVER="1"
export RAY_IGNORE_UNHANDLED_ERRORS="1"
export PT_HPU_WEIGHT_SHARING=0
export HABANA_VISIBLE_MODULES="0,1,2,3,4,5,6,7"
export PT_HPUGRAPH_DISABLE_TENSOR_CACHE=1
export VLLM_MOE_N_SLICE=8
export VLLM_EP_SIZE=16
export VLLM_TP_SIZE=16
export PT_HPU_RECIPE_CACHE_CONFIG=/tmp/recipe_cache,True,16384
export VLLM_SKIP_WARMUP="true"
export VLLM_LOGGING_LEVEL="DEBUG"
block_size=128
# End of the DO NOT change section
# memory footprint tuning params
export VLLM_GPU_MEMORY_UTILIZATION=0.98
export VLLM_GRAPH_RESERVED_MEM=0.35
export VLLM_GRAPH_PROMPT_RATIO=0

# INC
export QUANT_CONFIG="/mnt/disk3/yiliu4/vllm-fork/scripts"

# params
# max_num_batched_tokens=2048
# max_num_seqs=1024
# input_min=1024
# input_max=4096
# output_max=1024

# For the prepare run
max_num_batched_tokens=2048
max_num_seqs=1024
input_min=1024
input_max=1024
output_max=32

unset VLLM_PROMPT_BS_BUCKET_MIN VLLM_PROMPT_BS_BUCKET_STEP VLLM_PROMPT_BS_BUCKET_MAX
unset VLLM_PROMPT_SEQ_BUCKET_MIN VLLM_PROMPT_SEQ_BUCKET_STEP VLLM_PROMPT_SEQ_BUCKET_MAX
unset VLLM_DECODE_BS_BUCKET_MIN VLLM_DECODE_BS_BUCKET_STEP VLLM_DECODE_BS_BUCKET_MAX
unset VLLM_DECODE_BLOCK_BUCKET_MIN VLLM_DECODE_BLOCK_BUCKET_STEP VLLM_DECODE_BLOCK_BUCKET_MAX
set_bucketing
echo " environments are reseted "
env | grep VLLM