158 commits
94b127b
Add Dynamic Quant Method for DSv3-R1
hlin99 Feb 17, 2025
701f77b
Add Support for 2-node vLLM Serving
hlin99 Feb 17, 2025
8a60b23
Recover Graph Warm Up
Wei-Lin-Intel Feb 17, 2025
4cf51d0
Remove Quant Scale Padding Weights
Wei-Lin-Intel Feb 18, 2025
91d31ea
add mark step in deepseek v3 to break graph into small pieces
Wei-Lin-Intel Feb 18, 2025
10d999d
update the ip and gloo
yiliu30 Feb 18, 2025
99f9e4a
fix if name
Feb 18, 2025
608ceb2
fix unquantized method
yiliu30 Feb 18, 2025
931cbfb
patch for inc
Yi4Liu Feb 18, 2025
23ca46c
Merge branch 'yang/deepseek_r1_g2_dynamic_quant' into yi-2nodes
Yi4Liu Feb 18, 2025
e882bd4
remove hpu_fused_moe
yiliu30 Feb 18, 2025
4b70608
disable log
yiliu30 Feb 18, 2025
b89268c
add debug
yiliu30 Feb 18, 2025
3946218
revert debug info
yiliu30 Feb 18, 2025
ca62b4c
uncomment debug
yiliu30 Feb 18, 2025
3b8edd8
add debug info
yiliu30 Feb 18, 2025
6358d2b
disbale init dynamic moe
yiliu30 Feb 18, 2025
c2ed14c
remove clone
yiliu30 Feb 18, 2025
fd9fa96
refine log
yiliu30 Feb 18, 2025
ebc8660
add inc quant config
Yi4Liu Feb 19, 2025
f33414e
add more quant method
Yi4Liu Feb 19, 2025
a4693bb
add more inc to hpu
Yi4Liu Feb 19, 2025
6332d93
add qconfig
Yi4Liu Feb 19, 2025
a4cdd48
add more log
Yi4Liu Feb 19, 2025
4542c1a
fix inc check
Yi4Liu Feb 19, 2025
e77173c
add rank
Yi4Liu Feb 19, 2025
12e1db3
add barr
Yi4Liu Feb 19, 2025
ea0e86f
sleep for dump
Yi4Liu Feb 19, 2025
2110a81
print model
Yi4Liu Feb 19, 2025
f32f724
add rank debug
Yi4Liu Feb 19, 2025
c3bc8ea
print inc model with rank
Yi4Liu Feb 19, 2025
294d0b2
debug more
Yi4Liu Feb 19, 2025
445c833
fixed rank debug
Yi4Liu Feb 19, 2025
c1c226f
fix quant check
Yi4Liu Feb 19, 2025
de72bc8
fix inc check
Yi4Liu Feb 19, 2025
e7f0968
fix the num_expert_group
Yi4Liu Feb 19, 2025
95ececd
fix hidden shape
Yi4Liu Feb 19, 2025
d66d0cb
fix num_expert_group
Yi4Liu Feb 19, 2025
98c1970
fix hidden shape
Yi4Liu Feb 19, 2025
f2c4964
update the num_expert_group
Yi4Liu Feb 19, 2025
ca1d161
revert weight set
Yi4Liu Feb 19, 2025
2df9dc8
disable fused moe init
Yi4Liu Feb 19, 2025
9172b49
add ep rank back
Yi4Liu Feb 19, 2025
62f9abf
fix ep_rank and ep_shift
Yi4Liu Feb 19, 2025
aee1628
clean debug info
Yi4Liu Feb 19, 2025
6d146eb
add envs info
yiliu30 Feb 19, 2025
e10e30f
add example
yiliu30 Feb 19, 2025
583f576
add g5 envs info
yiliu30 Feb 19, 2025
eb40336
add real datasets
Yi4Liu Feb 19, 2025
5f1310e
512 samples
Yi4Liu Feb 19, 2025
1ad1669
not get weight from layer
Yi4Liu Feb 20, 2025
c94fbdf
revert RAY_DEDUP_LOGS
Yi4Liu Feb 20, 2025
0571a4e
add cpu weight for n2_quant
Yi4Liu Feb 20, 2025
671b49e
add long p
Yi4Liu Feb 20, 2025
eded950
add dataset
Yi4Liu Feb 20, 2025
4bd82d5
add prompts for prep and quant
Yi4Liu Feb 20, 2025
9560531
use token directly
Yi4Liu Feb 20, 2025
7bc5e67
fix args
Yi4Liu Feb 20, 2025
0ae617a
update all examples
Yi4Liu Feb 20, 2025
d4337de
add utils
Yi4Liu Feb 20, 2025
a534805
fix
Yi4Liu Feb 20, 2025
2bcdb66
update the prompt to prompt_token_ids
Yi4Liu Feb 20, 2025
b9707ec
upadte the gen
Yi4Liu Feb 20, 2025
f9d9bff
gen pile
Yi4Liu Feb 20, 2025
4763d8e
Correct Accuracy Issue for grouped_topk and Merge pull/13474
Wei-Lin-Intel Feb 20, 2025
c05f401
use pile
Yi4Liu Feb 20, 2025
f27f3b1
refine print
Yi4Liu Feb 20, 2025
285426e
fix print
Yi4Liu Feb 20, 2025
62d7e3c
add smoke test
Yi4Liu Feb 20, 2025
ee252b9
add check nan
Yi4Liu Feb 20, 2025
2b32260
fix check nan
Yi4Liu Feb 20, 2025
bc3a26c
use p for smoke
Yi4Liu Feb 20, 2025
6a2f693
add measurement results on g4
Feb 21, 2025
1f26a84
add measurement results on g5
Feb 21, 2025
1d22c0f
add preapre smoke
Yi4Liu Feb 21, 2025
d91e1f2
use same prompt
Yi4Liu Feb 21, 2025
0a48ade
refine log
Yi4Liu Feb 21, 2025
8c24a4a
correct prepare
Yi4Liu Feb 21, 2025
b5fe4ac
remove low cpu mem in prepare
Yi4Liu Feb 21, 2025
298d4ed
get pile only
Yi4Liu Feb 21, 2025
bd9dc3b
add 4layer ep16 tp16
Feb 21, 2025
289921f
add 4 layers preapre g5
wenchao987 Feb 21, 2025
672b327
move 4 layers json info one folder
Yi4Liu Feb 21, 2025
99ae83c
move 4 layers json info one folder
Yi4Liu Feb 21, 2025
034f5cc
add ep8 tp8 example
Yi4Liu Feb 21, 2025
e92173d
add calibaration result on ep 8 tp 8
Feb 21, 2025
5811e50
add unified results
yiliu30 Feb 21, 2025
1622dd2
add npz
yiliu30 Feb 21, 2025
1dc0f42
add results
yiliu30 Feb 21, 2025
fb6eabf
add res
yiliu30 Feb 21, 2025
138f3f1
replace
yiliu30 Feb 21, 2025
ad8e2f7
update
yiliu30 Feb 21, 2025
688b332
update
yiliu30 Feb 21, 2025
743d56f
update
yiliu30 Feb 21, 2025
f9d49c1
512
yiliu30 Feb 21, 2025
1c8ac47
low cpu mem
yiliu30 Feb 21, 2025
4328b87
use 2024 for quick test
Yi4Liu Feb 22, 2025
00085d6
add docs
Yi4Liu Feb 22, 2025
1ce4308
remove measurments results
Yi4Liu Feb 22, 2025
e3e6abc
update docs and remove measure results
Yi4Liu Feb 22, 2025
82c8d66
eval bf16 model
Yi4Liu Feb 22, 2025
827bc2c
use bs 1
Yi4Liu Feb 22, 2025
a254af8
Merge branch 'yang/deepseek_r1_g2' into p22
Yi4Liu Feb 22, 2025
e8ec061
use g2 model
Yi4Liu Feb 22, 2025
1342352
disbale profile_run
Yi4Liu Feb 22, 2025
af14601
eval qmodel
Yi4Liu Feb 22, 2025
6c3ca68
run lm-eval bf16
Yi4Liu Feb 22, 2025
599ab79
print result as table
Yi4Liu Feb 22, 2025
d15a560
change max len of bf16 to 2048
Yi4Liu Feb 22, 2025
ef2e454
test 128 samples
Yi4Liu Feb 22, 2025
ded5e65
enable mla
Yi4Liu Feb 22, 2025
d67fc7a
Merge branch 'p22' into p22-rebase
Yi4Liu Feb 22, 2025
322b75d
decrease the max_model_len to 2048
Yi4Liu Feb 22, 2025
ce28d86
revert max_model_len
Yi4Liu Feb 22, 2025
4b4e196
update params
Yi4Liu Feb 23, 2025
13830cb
show mem
Yi4Liu Feb 23, 2025
f7c5324
add debug info
Yi4Liu Feb 23, 2025
574fea4
fix
Yi4Liu Feb 23, 2025
9f27adc
add more debug info
Yi4Liu Feb 23, 2025
ed45a38
fix
Yi4Liu Feb 23, 2025
31acb66
fetch one prompt once
Yi4Liu Feb 23, 2025
0aea6f8
fix print
Yi4Liu Feb 23, 2025
52777f0
use bs 1
Yi4Liu Feb 23, 2025
4440ef0
refine log
Yi4Liu Feb 23, 2025
2792f9c
revert gen
Yi4Liu Feb 23, 2025
8b4da84
use bs 1 for eval
Yi4Liu Feb 23, 2025
9dcc21b
fix lm eval
Yi4Liu Feb 23, 2025
77355ff
format code
Yi4Liu Feb 23, 2025
ed40cbd
refine eval
Yi4Liu Feb 23, 2025
110ea6f
test ray
Yi4Liu Feb 23, 2025
d2ae76f
add drop
Yi4Liu Feb 23, 2025
1cfad40
disbale TOKENIZERS_PARALLELISM
Yi4Liu Feb 23, 2025
527744f
test all
Yi4Liu Feb 23, 2025
f4f7b82
add more docs
Yi4Liu Feb 24, 2025
36fe420
run lm-eval one node
Yi4Liu Feb 24, 2025
8481ea6
cp mengni fix
Feb 24, 2025
e76c504
add inc quant smoke demo
Yi4Liu Feb 25, 2025
0f6c44d
del some attrs from self_attn
Yi4Liu Feb 26, 2025
cf7c90e
use fp kv cache
Yi4Liu Feb 26, 2025
ca37f77
update quant example
Yi4Liu Feb 26, 2025
4cc5c75
update example
Yi4Liu Feb 26, 2025
85c524b
debug shape
Yi4Liu Feb 27, 2025
1b3df4f
debug more
Yi4Liu Feb 27, 2025
5c2227a
add markstep
Yi4Liu Feb 27, 2025
e099d36
use bs 1
Yi4Liu Feb 27, 2025
0e1aebb
add more debug info
Yi4Liu Feb 27, 2025
46c5ca9
update log
Yi4Liu Feb 27, 2025
f5276a5
update log
Yi4Liu Feb 27, 2025
43e075e
debug
Yi4Liu Feb 27, 2025
caadc7b
add prefix
Yi4Liu Feb 27, 2025
3c3eae2
debug
Yi4Liu Feb 27, 2025
95b5e02
debug lm-head
Yi4Liu Feb 27, 2025
0ea2f5c
debug logist
Yi4Liu Feb 27, 2025
27bb439
update log
Yi4Liu Feb 27, 2025
473411c
fix
Yi4Liu Feb 27, 2025
7af0fb0
update
Yi4Liu Feb 27, 2025
1f64664
update gpu mem utilization to 0.8
Yi4Liu Feb 27, 2025
ae0e2bb
refine debug
Yi4Liu Feb 27, 2025
53 changes: 53 additions & 0 deletions scripts/Quantize_BF16_R1_on_Single_Note.md
@@ -0,0 +1,53 @@
# Notes on quantizing vLLM DeepSeek V3/R1 using INC

## Prerequisites

- Hardware: ~~2xG3~~ ~~2x8XG3 or 2x8XG2~~ 8XG2 or 8XG3
- Docker: 1.20.0-521

- INC https://github.com/intel/neural-compressor/tree/dev/yi/quant_vllm-patch-19

```bash
git clone https://github.com/intel/neural-compressor.git inc
cd inc
git checkout dev/yi/quant_vllm-patch-19
pip install -r requirements.txt
pip install -r requirements_pt.txt
python setup.py pt develop
```
- vLLM https://github.com/yiliu30/vllm-fork/pull/13

```bash
cd vllm
pip install -r requirements-hpu.txt
VLLM_TARGET_DEVICE=hpu pip install -e . --no-build-isolation
```
- Model
  - ~~Reduced DeepSeek V3 model (4 layers with random weights)~~
  - ~~Reduced DeepSeek V3 model (4 layers with real weights)~~
  - DeepSeek R1 (BF16)

## Example
- Quantize the BF16 model using the unified measurement results collected on 2x8XG2.


```bash
# From the vLLM repo root
cd vllm
cd scripts
# Download the unified measurement results.
# Make sure that `nc_workspace_tmp` ends up under the `scripts` folder.
git clone https://huggingface.co/Yi30/nc_workspace_tmp
# Run example
python n2_ep8_tp8.py --mode q
```

> [!CAUTION]
> - `QUANT_CONFIG` is hard-coded in [1](https://github.com/yiliu30/vllm-fork/blob/bc3a26c3d6143b6405ef9af7e06f6eddcbcbdad0/scripts/g4_multi_nodes_source.sh#L34C8-L34C20) and [2](https://github.com/yiliu30/vllm-fork/blob/bc3a26c3d6143b6405ef9af7e06f6eddcbcbdad0/scripts/g5_multi_nodes_source.sh#L38).
> - `VLLMKVCache`, `KVCache`, and `lm-head` are currently skipped during quantization; they will be added back later.
> - ~~FAKE `EP` was hard-coded as 16. Please check `TEMP_EP` in vllm and `DEEPSEEK_EP` in INC.~~
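
If your checkout lives somewhere else, you can override the hard-coded value by exporting `QUANT_CONFIG` yourself before running the example. A minimal sketch with a placeholder path (adjust it to your own clone):

```bash
# Placeholder path: point it at the same location the launch scripts use.
export QUANT_CONFIG=/path/to/vllm-fork/scripts
python n2_ep8_tp8.py --mode q
```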


## Others
1. Measured on 2x8G2 w/ 513 samples: https://huggingface.co/Yi30/nc_workspace_tmp_pile_512_backup
2. 4-layer smoke test on 8G2: https://huggingface.co/Yi30/nc_workspace_tmp_4l_ep8_tp8
3. Merged result of 1): https://huggingface.co/Yi30/nc_workspace_tmp
4. 4 layers on 2x8G2: https://huggingface.co/Yi30/nc_workspace_tmp_4l_smoke
41 changes: 41 additions & 0 deletions scripts/check_nan.py
@@ -0,0 +1,41 @@
import os
import json
import math


def check_values(obj, key_path="", filename=""):
    """Recursively checks if innermost values are valid numbers, prints issues."""
    if isinstance(obj, dict):
        for key, value in obj.items():
            new_key_path = f"{key_path}.{key}" if key_path else key
            check_values(value, new_key_path, filename)
    elif isinstance(obj, list):
        for idx, item in enumerate(obj):
            check_values(item, f"{key_path}[{idx}]", filename)
    else:
        if (
            not isinstance(obj, (int, float))
            or math.isnan(obj)
            or math.isinf(obj)
        ):
            print(f"Invalid number in {filename} at '{key_path}': {obj}")


def check_json_files(directory):
    """Iterates through all JSON files in a directory and checks their values."""
    for filename in os.listdir(directory):
        if "mod_list" in filename:
            continue
        if filename.endswith(".json"):
            filepath = os.path.join(directory, filename)
            try:
                with open(filepath, "r", encoding="utf-8") as file:
                    data = json.load(file)
                check_values(data, filename=filename)
            except (json.JSONDecodeError, IOError) as e:
                print(f"Error reading {filename}: {e}")


# Set your directory containing JSON files
json_directory = "./nc_workspace_tmp/" # Change this to your actual directory
check_json_files(json_directory)
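# Usage note: point `json_directory` above at the INC measurement dump (default
# ./nc_workspace_tmp/) and run `python check_nan.py`; only NaN/Inf/non-numeric
# entries and unreadable files are reported.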
176 changes: 176 additions & 0 deletions scripts/convert_bf16_to_fp8_dyn_quant.py
@@ -0,0 +1,176 @@
import os
import torch
import tqdm
from loguru import logger
import logging
import safetensors
from safetensors import safe_open
from safetensors.torch import save_file
import json

logging.basicConfig(level=logging.DEBUG)
torch.set_grad_enabled(False)

# CONSTANTS
SAFETENSORS = "safetensors"
WEIGHT_SCALE_NAME = "weight_scale_inv" #"scale_weight"
INPUT_SCALE_NAME = "scale_input"
SCALE_DTYPE = torch.bfloat16
SCALE_FILE_NAME = f"scales.{SAFETENSORS}"
FULL_RANGE = torch.finfo(torch.float8_e4m3fn).max
WEIGHT_BACKOFF = 0.5
QUANT_MODULE_TYPES = (torch.nn.Linear,)
SKIP_WEIGHT_LST = {
    "enorm.weight",
    "hnorm.weight",
    "eh_proj.weight",
    "shared_head.norm.weight",
    "shared_head.head.weight",
    "model.norm",
    "layernorm",
    "e_score_correction_bias",
    "lm_head.weight",
    "embed_tokens",
    "mlp.gate.weight",  # mlp.gate is not linear
}
"""
# https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Quantization/Inference_Using_FP8.html?highlight=backoff#supported-json-config-file-options
Similarly, the maxabs value of a weight is scaled to weight_backoff*FP8_143_FULLSCALE. The default values are input_backoff=0.25 and weight_backoff=0.5.
"""
MODEL_STATE_DICT_MAPPING_FILENAME = "model.safetensors.index.json"


def skip_weight(weight_name):
    return any(skip_name in weight_name for skip_name in SKIP_WEIGHT_LST)


def get_cpu_mem_size_in_gb():
    import psutil

    mem = psutil.virtual_memory()
    # psutil reports bytes; convert to GiB to match the function name.
    return mem.available / (1024**3)


def get_all_weight_filename(model_path):
    all_files = os.listdir(model_path)
    all_weight_filename = []
    for file in all_files:
        if file.endswith(f".{SAFETENSORS}"):
            all_weight_filename.append(file)
    return all_weight_filename


# from _fp8_quant/_core/fp_utils.py
def calc_maxabs_scale(xmaxabs, fullscale, backoff=1):
    scale = xmaxabs / (fullscale * backoff)
    return scale


def quant_tensor(tensor):
    # Note:
    # 1. Check the scale dtype
    # 2. Check the scale shape
    amax = tensor.abs().max(dim=1).values + 1e-8
    scale = calc_maxabs_scale(amax, FULL_RANGE, WEIGHT_BACKOFF)
    scale = scale.to(SCALE_DTYPE)
    qtensor = tensor / scale.unsqueeze(1)
    clipped_qtensor = torch.clamp(qtensor, -FULL_RANGE, FULL_RANGE)
    clipped_qtensor_fp8 = clipped_qtensor.to(torch.float8_e4m3fn)
    return scale.float(), clipped_qtensor_fp8
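# Illustrative note: the returned FP8 tensor can be dequantized back with
# `qtensor.float() * scale.unsqueeze(1)`, which should match the original BF16
# weight up to FP8 rounding/clipping error.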


def _maybe_create_dir(qmodel_path):
    if not os.path.exists(qmodel_path):
        os.makedirs(qmodel_path)


def quant_model_weight_with_low_cpu_usage(model_path, qmodel_path):
    _maybe_create_dir(qmodel_path)
    all_weight_filename = get_all_weight_filename(model_path)
    files_cnt = len(all_weight_filename)
    logger.info(f"Got {len(all_weight_filename)} weight files")
    qtensor_mapping = {}
    for i, filename in enumerate(all_weight_filename):
        logger.info(f"Processing {i + 1}/{len(all_weight_filename)}: {filename}")
        file_path = os.path.join(model_path, filename)
        qmodel_file_name = filename
        qmodel_file_path = os.path.join(qmodel_path, qmodel_file_name)
        qtensors = {}
        with safe_open(file_path, framework="pt", device="cpu") as f:
            for weight_name in f.keys():
                weight = f.get_tensor(weight_name)
                if skip_weight(weight_name):
                    logger.debug(f"Skipping quantize {weight_name}")
                    qtensors[weight_name] = weight
                    qtensor_mapping[weight_name] = qmodel_file_name
                    continue
                logger.debug(f"[{i + 1}/{files_cnt}] Processing {weight_name}")
                scale, qtensor = quant_tensor(weight)
                prefix_name = weight_name[: -len(".weight")]
                scale_name = f"{prefix_name}.{WEIGHT_SCALE_NAME}"
                qtensors[scale_name] = scale
                qtensors[weight_name] = qtensor
                qtensor_mapping[scale_name] = qmodel_file_name
                qtensor_mapping[weight_name] = qmodel_file_name
        logger.debug(f"[{i + 1}/{files_cnt}] Saving {len(qtensors)} tensors to {qmodel_file_path}")
        # qmodel_file_path already includes qmodel_path, so no extra join is needed.
        save_file(qtensors, qmodel_file_path)
    # Dump tensor mapping into json file
    model_state_dict_mapping_file_path = os.path.join(qmodel_path, MODEL_STATE_DICT_MAPPING_FILENAME)
    logger.info(f"Saving tensor mapping to {model_state_dict_mapping_file_path}")
    state_dict_mapping = {
        "metadata": {},
        "weight_map": qtensor_mapping,
    }
    with open(model_state_dict_mapping_file_path, "w") as f:
        json.dump(state_dict_mapping, f, indent=4)


def _import_oh():
    import transformers
    from optimum.habana.transformers.modeling_utils import adapt_transformers_to_gaudi

    orig_check_support_param_buffer_assignment = transformers.modeling_utils.check_support_param_buffer_assignment
    adapt_transformers_to_gaudi()
    transformers.modeling_utils.check_support_param_buffer_assignment = orig_check_support_param_buffer_assignment


@torch.no_grad()
def static_quant_model_tran(model_path, qmodel_path):
    # assert get_cpu_mem_size_in_gb() > 800, "Not enough memory, please use quant_model_weight_with_low_cpu_usage"
    import transformers
    from patch_for_ds import patch_transformers

    # _import_oh()
    patch_transformers()
    model = transformers.AutoModelForCausalLM.from_pretrained(
        model_path,
        torch_dtype="auto",
        low_cpu_mem_usage=True,
        trust_remote_code=True,
    )
    for name, module in model.named_modules():
        if not isinstance(module, QUANT_MODULE_TYPES) or skip_weight(name):
            logger.debug(f"Skipping quantize {name}")
            continue
        logger.debug(f"Processing {name}")
        weight = module.weight
        scale, qtensor = quant_tensor(weight)
        module.weight.data = qtensor
        setattr(module, "scale_weight", torch.nn.Parameter(scale, requires_grad=False))
    logger.info(f"Saving quantized model to {qmodel_path}")
    model.save_pretrained(qmodel_path)


if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--model_path", type=str, required=True)
    parser.add_argument("--qmodel_path", type=str, required=True)
    parser.add_argument("--low_cpu_mem", action="store_true", help="Load weight files one by one to reduce memory usage")
    args = parser.parse_args()
    if args.low_cpu_mem:
        quant_model_weight_with_low_cpu_usage(args.model_path, args.qmodel_path)
    else:
        static_quant_model_tran(args.model_path, args.qmodel_path)
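# Example invocation (paths are placeholders):
#   python convert_bf16_to_fp8_dyn_quant.py --model_path /path/to/DeepSeek-R1-BF16 \
#       --qmodel_path /path/to/DeepSeek-R1-FP8 --low_cpu_mem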

61 changes: 61 additions & 0 deletions scripts/g4_multi_nodes_source.sh
@@ -0,0 +1,61 @@
#! /bin/bash
# set -x
BASH_DIR=$(dirname "${BASH_SOURCE[0]}")
source "$BASH_DIR"/utils.sh
ray stop --force
# DO NOT change unless you fully understand its purpose
export PT_HPU_ENABLE_LAZY_COLLECTIVES=true
export PT_HPUGRAPH_DISABLE_TENSOR_CACHE=1
export HCCL_OVER_OFI=1
export HCCL_GAUDI_DIRECT=1
export HCCL_SOCKET_IFNAME=enx6c1ff7012f87
export LIBFABRIC_ROOT=/opt/habanalabs/libfabric-1.22.0
export LD_LIBRARY_PATH=/opt/amazon/openmpi/lib:/opt/habanalabs/libfabric-1.22.0/lib:/usr/lib/habanalabs
export GLOO_SOCKET_IFNAME=enx6c1ff7012f87
export VLLM_HOST_IP=10.239.128.244
export HABANA_VISIBLE_DEVICES="ALL"
export VLLM_MLA_DISABLE_REQUANTIZATION=1
export PT_HPU_ENABLE_LAZY_COLLECTIVES="true"
export VLLM_RAY_DISABLE_LOG_TO_DRIVER="1"
export RAY_IGNORE_UNHANDLED_ERRORS="1"
export PT_HPU_WEIGHT_SHARING=0
export HABANA_VISIBLE_MODULES="0,1,2,3,4,5,6,7"
export PT_HPUGRAPH_DISABLE_TENSOR_CACHE=1
export VLLM_MOE_N_SLICE=8
export VLLM_EP_SIZE=16
export VLLM_TP_SIZE=16
export PT_HPU_RECIPE_CACHE_CONFIG=/tmp/recipe_cache,True,16384
export VLLM_SKIP_WARMUP="true"
export VLLM_LOGGING_LEVEL="DEBUG"
block_size=128
# End of the DO NOT change section

# INC
export QUANT_CONFIG="/mnt/disk3/yiliu4/vllm-fork/scripts"

# memory footprint tuning params
export VLLM_GPU_MEMORY_UTILIZATION=0.98
export VLLM_GRAPH_RESERVED_MEM=0.35
export VLLM_GRAPH_PROMPT_RATIO=0
# params
# max_num_batched_tokens=2048
# max_num_seqs=1024
# input_min=1024
# input_max=4096
# output_max=1024

# For the prepare run
max_num_batched_tokens=2048
max_num_seqs=1024
input_min=1024
input_max=1024
output_max=32


unset VLLM_PROMPT_BS_BUCKET_MIN VLLM_PROMPT_BS_BUCKET_STEP VLLM_PROMPT_BS_BUCKET_MAX
unset VLLM_PROMPT_SEQ_BUCKET_MIN VLLM_PROMPT_SEQ_BUCKET_STEP VLLM_PROMPT_SEQ_BUCKET_MAX
unset VLLM_DECODE_BS_BUCKET_MIN VLLM_DECODE_BS_BUCKET_STEP VLLM_DECODE_BS_BUCKET_MAX
unset VLLM_DECODE_BLOCK_BUCKET_MIN VLLM_DECODE_BLOCK_BUCKET_STEP VLLM_DECODE_BLOCK_BUCKET_MAX
set_bucketing
echo " environments are reseted "
env | grep VLLM
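# Usage sketch (assumption): source this file on the G4 head node before
# launching Ray / vLLM serving, e.g. `source g4_multi_nodes_source.sh`, so the
# exports and bucketing settings above apply to the launcher shell.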
60 changes: 60 additions & 0 deletions scripts/g5_multi_nodes_source.sh
@@ -0,0 +1,60 @@
#! /bin/bash
# set -x
BASH_DIR=$(dirname "${BASH_SOURCE[0]}")
source "$BASH_DIR"/utils.sh
ray stop --force
# DO NOT change unless you fully understand its purpose
export PT_HPU_ENABLE_LAZY_COLLECTIVES=true
export PT_HPUGRAPH_DISABLE_TENSOR_CACHE=1
export HCCL_OVER_OFI=1
export HCCL_GAUDI_DIRECT=1
export HCCL_SOCKET_IFNAME=enx6c1ff7012f4d
export LIBFABRIC_ROOT=/opt/habanalabs/libfabric-1.22.0
export LD_LIBRARY_PATH=/opt/amazon/openmpi/lib:/opt/habanalabs/libfabric-1.22.0/lib:/usr/lib/habanalabs
export GLOO_SOCKET_IFNAME=enx6c1ff7012f4d
export VLLM_HOST_IP=10.239.129.40
export HABANA_VISIBLE_DEVICES="ALL"
export VLLM_MLA_DISABLE_REQUANTIZATION=1
export PT_HPU_ENABLE_LAZY_COLLECTIVES="true"
export VLLM_RAY_DISABLE_LOG_TO_DRIVER="1"
export RAY_IGNORE_UNHANDLED_ERRORS="1"
export PT_HPU_WEIGHT_SHARING=0
export HABANA_VISIBLE_MODULES="0,1,2,3,4,5,6,7"
export PT_HPUGRAPH_DISABLE_TENSOR_CACHE=1
export VLLM_MOE_N_SLICE=8
export VLLM_EP_SIZE=16
export VLLM_TP_SIZE=16
export PT_HPU_RECIPE_CACHE_CONFIG=/tmp/recipe_cache,True,16384
export VLLM_SKIP_WARMUP="true"
export VLLM_LOGGING_LEVEL="DEBUG"
block_size=128
# End of the DO NOT change section
# memory footprint tuning params
export VLLM_GPU_MEMORY_UTILIZATION=0.98
export VLLM_GRAPH_RESERVED_MEM=0.35
export VLLM_GRAPH_PROMPT_RATIO=0

# INC
export QUANT_CONFIG="/mnt/disk3/yiliu4/vllm-fork/scripts"

# params
# max_num_batched_tokens=2048
# max_num_seqs=1024
# input_min=1024
# input_max=4096
# output_max=1024

# For the prepare run
max_num_batched_tokens=2048
max_num_seqs=1024
input_min=1024
input_max=1024
output_max=32

unset VLLM_PROMPT_BS_BUCKET_MIN VLLM_PROMPT_BS_BUCKET_STEP VLLM_PROMPT_BS_BUCKET_MAX
unset VLLM_PROMPT_SEQ_BUCKET_MIN VLLM_PROMPT_SEQ_BUCKET_STEP VLLM_PROMPT_SEQ_BUCKET_MAX
unset VLLM_DECODE_BS_BUCKET_MIN VLLM_DECODE_BS_BUCKET_STEP VLLM_DECODE_BS_BUCKET_MAX
unset VLLM_DECODE_BLOCK_BUCKET_MIN VLLM_DECODE_BLOCK_BUCKET_STEP VLLM_DECODE_BLOCK_BUCKET_MAX
set_bucketing
echo " environments are reseted "
env | grep VLLM