
Commit 42e8199

shihaobai, baishihao, and hiworldwzj authored
add check for rope and tuning qwen3 on H200 (#880)
Co-authored-by: baishihao <baishihao@sensetime.com>
Co-authored-by: wangzaijun <wzjhelloworld@qq.com>
1 parent b60ac4f commit 42e8199

5 files changed: 24 additions and 25 deletions
@@ -0,0 +1 @@
+{"1": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 16, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 2, "num_stages": 4}, "8": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 2, "num_stages": 5}, "64": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 2, "num_stages": 4}, "128": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 2, "num_stages": 4}, "256": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 2, "num_stages": 5}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 4, "num_warps": 4, "num_stages": 4}, "1024": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}, "4096": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}, "8192": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}}
@@ -0,0 +1 @@
+{"1": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}, "8": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 2, "num_stages": 4}, "64": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 2, "num_stages": 4}, "128": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 2, "num_stages": 4}, "256": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_warps": 4, "num_stages": 4}}
@@ -0,0 +1 @@
+{"1": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 16, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}, "8": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 1, "num_warps": 2, "num_stages": 1}, "64": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 1, "num_warps": 2, "num_stages": 1}, "128": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 1, "num_warps": 2, "num_stages": 1}, "256": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 16, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 16, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}, "1024": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 16, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}, "4096": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 4, "num_warps": 4, "num_stages": 4}, "8192": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 4, "num_warps": 8, "num_stages": 4}}

lightllm/models/llama/model.py

Lines changed: 20 additions & 24 deletions
@@ -26,8 +26,9 @@ class LlamaFlashInferStateExtraInfo:
     def __init__(self, model):
         tp_world_size = get_dp_world_size()
         self.tp_q_head_num = model.config["num_attention_heads"] // tp_world_size
-        self.tp_kv_head_num = model.config["num_key_value_heads"] // tp_world_size
-        self.head_dim = model.config["hidden_size"] // model.config["num_attention_heads"]
+        self.tp_kv_head_num = max(model.config["num_key_value_heads"] // tp_world_size, 1)
+        head_dim = model.config["hidden_size"] // model.config["num_attention_heads"]
+        self.head_dim = model.config.get("head_dim", head_dim)
         self.workspace_buffer = torch.empty(256 * 1024 * 1024, dtype=torch.int8).to(get_current_device_id())
         self.max_seq_length = model.max_seq_length
         self.kv_indices_buffer = [
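Two behavioral fixes sit in this hunk: head_dim now honors an explicit "head_dim" config entry instead of always deriving it, and tp_kv_head_num is clamped to at least 1. A small illustration of why this matters for Qwen3, using values quoted from memory for the Qwen3-0.6B config (treat the exact numbers as an assumption):

# Approximate Qwen3-0.6B config values (hedged illustration).
config = {
    "hidden_size": 1024,
    "num_attention_heads": 16,
    "num_key_value_heads": 8,
    "head_dim": 128,
}

derived = config["hidden_size"] // config["num_attention_heads"]  # 64 -- wrong for Qwen3
head_dim = config.get("head_dim", derived)                        # 128 -- matches the checkpoint

# The max(..., 1) clamp keeps tp_kv_head_num valid when the tensor-parallel
# world size exceeds the number of KV heads (KV heads are then replicated).
tp_world_size = 16
tp_kv_head_num = max(config["num_key_value_heads"] // tp_world_size, 1)  # 1, not 0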
@@ -104,33 +105,29 @@ def _init_custom(self):
         """
         Model-specific initialization.
         """
-        if self.config.get("use_rope_yarn", False) or (
-            self.config.get("rope_scaling", None) is not None
-            and self.config.get("rope_scaling", {}).get("type", "base") == "yarn"
-        ):
+        rope_scaling = self.config.get("rope_scaling", None)
+        if rope_scaling is None:
+            self._init_to_get_rotary()
+            return
+
+        if "rope_type" in rope_scaling:
+            scaling_type = rope_scaling["rope_type"]
+        elif "type" in rope_scaling:
+            scaling_type = rope_scaling["type"]
+        else:
+            raise ValueError(f"Unknown RoPE scaling format {rope_scaling}")
+        if scaling_type == "yarn":
             self._init_to_get_yarn_rotary()
-        elif self.config.get("use_dynamic_ntk", False) or (
-            self.config.get("rope_scaling", None) is not None
-            and self.config.get("rope_scaling", {}).get("type", "base") == "dynamic"
-        ):
+        elif scaling_type == "dynamic":
             self._init_to_get_dynamic_ntk_rotary()
-        elif (
-            self.config.get("rope_scaling", None) is not None
-            and self.config.get("rope_scaling", {}).get("type", "base") == "su"
-        ):
+        elif scaling_type == "su":
             self._init_to_su_rotary()
-        elif (
-            self.config.get("rope_scaling", None) is not None
-            and self.config.get("rope_scaling", {}).get("rope_type", "base") == "llama3"
-        ):
+        elif scaling_type == "llama3":
             self._init_to_get_llama3_rotary()
-        elif (
-            self.config.get("rope_scaling", None) is not None
-            and self.config.get("rope_scaling", {}).get("type", "base") == "mrope"
-        ):
+        elif scaling_type == "mrope":
             self._init_to_get_mrope_rotary()
         else:
-            self._init_to_get_rotary()
+            raise ValueError(f"Unknown RoPE scaling type {scaling_type}")
         return
 
     def _init_weights(self):
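The refactor normalizes the two config spellings (the newer Hugging Face "rope_type" key and the legacy "type") and, notably, turns an unrecognized scaling type into an error instead of silently falling back to plain rotary embeddings; that fallback now applies only when rope_scaling is absent. A standalone sketch of the same dispatch, restated as a pure function for testing (the dict-based handler table is this sketch's device, not lightllm's structure):

def select_rope_init(config: dict) -> str:
    rope_scaling = config.get("rope_scaling", None)
    if rope_scaling is None:
        # Plain RoPE remains the only silent default.
        return "_init_to_get_rotary"

    # Newer HF configs spell the key "rope_type"; legacy ones use "type".
    if "rope_type" in rope_scaling:
        scaling_type = rope_scaling["rope_type"]
    elif "type" in rope_scaling:
        scaling_type = rope_scaling["type"]
    else:
        raise ValueError(f"Unknown RoPE scaling format {rope_scaling}")

    handlers = {
        "yarn": "_init_to_get_yarn_rotary",
        "dynamic": "_init_to_get_dynamic_ntk_rotary",
        "su": "_init_to_su_rotary",
        "llama3": "_init_to_get_llama3_rotary",
        "mrope": "_init_to_get_mrope_rotary",
    }
    if scaling_type not in handlers:
        # Previously this fell through to plain rotary; now it fails loudly.
        raise ValueError(f"Unknown RoPE scaling type {scaling_type}")
    return handlers[scaling_type]


assert select_rope_init({}) == "_init_to_get_rotary"
assert select_rope_init({"rope_scaling": {"rope_type": "yarn"}}) == "_init_to_get_yarn_rotary"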
@@ -269,7 +266,6 @@ def _init_to_get_yarn_rotary(self):
         pos_freqs = base ** (torch.arange(0, dim, 2).float().cuda() / dim)
         inv_freq_extrapolation = 1.0 / pos_freqs
         inv_freq_interpolation = 1.0 / (scale * pos_freqs)
-
         low, high = find_correction_range(beta_fast, beta_slow, dim, base, original_max_position_embeddings)
         inv_freq_mask = (
             1 - linear_ramp_mask(low, high, dim // 2).float().cuda()
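The hunk above only drops a blank line, but the surrounding code is YaRN's per-frequency blend between interpolated and extrapolated rotary frequencies. A self-contained sketch of that blend, with find_correction_range and linear_ramp_mask reimplemented from the YaRN paper (arXiv:2309.00071) as an assumption about what lightllm's helpers compute:

import math

import torch


def find_correction_range(beta_fast, beta_slow, dim, base, max_pos):
    # Dimension index below which a frequency completes >= num_rotations
    # turns over max_pos positions (YaRN paper, eq. for correction dims).
    def correction_dim(num_rotations):
        return (dim * math.log(max_pos / (num_rotations * 2 * math.pi))) / (2 * math.log(base))

    low = math.floor(correction_dim(beta_fast))
    high = math.ceil(correction_dim(beta_slow))
    return max(low, 0), min(high, dim - 1)


def linear_ramp_mask(low, high, n):
    # 0 below low, 1 above high, linear in between.
    ramp = (torch.arange(n, dtype=torch.float32) - low) / (high - low)
    return torch.clamp(ramp, 0, 1)


dim, base, scale, max_pos = 128, 10000.0, 4.0, 2048
pos_freqs = base ** (torch.arange(0, dim, 2).float() / dim)
inv_freq_extrapolation = 1.0 / pos_freqs            # original RoPE frequencies
inv_freq_interpolation = 1.0 / (scale * pos_freqs)  # position-interpolated ones

low, high = find_correction_range(32, 1, dim, base, max_pos)
inv_freq_mask = 1 - linear_ramp_mask(low, high, dim // 2)
# High-frequency dims keep the extrapolated (original) frequencies;
# low-frequency dims are interpolated to stretch the context.
inv_freq = inv_freq_interpolation * (1 - inv_freq_mask) + inv_freq_extrapolation * inv_freq_mask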

lightllm/server/api_openai.py

Lines changed: 1 addition & 1 deletion
@@ -111,7 +111,7 @@ async def chat_completions_impl(request: ChatCompletionRequest, raw_request: Req
 
     prompt = await build_prompt(request, tools)
     sampling_params_dict = {
-        "do_sample": request.do_sample,
+        "do_sample": True,
         "presence_penalty": request.presence_penalty,
         "frequency_penalty": request.frequency_penalty,
         "temperature": request.temperature,
