Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 12 additions & 16 deletions fastdeploy/model_executor/models/glm4_moe.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,6 @@
from paddleformers.utils.log import logger

from fastdeploy.config import FDConfig
from fastdeploy.distributed.communication import tensor_model_parallel_all_reduce
from fastdeploy.model_executor.forward_meta import ForwardMeta
from fastdeploy.model_executor.graph_optimization.decorator import (
support_graph_optimization,
Expand Down Expand Up @@ -160,7 +159,6 @@ def __init__(

self.experts = FusedMoE(
fd_config,
reduce_results=False,
renormalize=self.norm_topk_prob,
moe_intermediate_size=fd_config.model_config.moe_intermediate_size,
num_experts=fd_config.model_config.n_routed_experts,
Expand All @@ -174,23 +172,21 @@ def __init__(
weight_key_map=weight_key_map,
)

shared_experts_intermediate_size = self.n_shared_experts * fd_config.model_config.moe_intermediate_size

self.shared_experts = Glm4MoeMLP(
fd_config=fd_config,
intermediate_size=shared_experts_intermediate_size,
layer_id=layer_id,
prefix=f"{prefix}.shared_experts",
reduce_results=False,
)
if self.n_shared_experts > 0:
shared_experts_intermediate_size = self.n_shared_experts * fd_config.model_config.moe_intermediate_size
self.shared_experts = Glm4MoeMLP(
fd_config=fd_config,
intermediate_size=shared_experts_intermediate_size,
layer_id=layer_id,
prefix=f"{prefix}.shared_experts",
)

def forward(self, x, forward_meta: ForwardMeta = None):
shared_experts_out = self.shared_experts(x)
out = self.experts(x, self.gate, forward_meta)
out = out + shared_experts_out
# We do the TP all-reduce after summing the routed and shared expert outputs.
if self.tensor_parallel_size > 1:
out = tensor_model_parallel_all_reduce(out, self.tp_group)
if self.n_shared_experts > 0:
shared_experts_out = self.shared_experts(x)
out = out + shared_experts_out

return out


Expand Down
2 changes: 1 addition & 1 deletion tests/e2e/4cards_cases/test_GLM_45_AIR_tp4.py
Original file line number Diff line number Diff line change
Expand Up @@ -181,7 +181,7 @@ def test_lm_head_fp32(api_url, headers, consistent_payload):
# 校验返回内容与概率信息
assert (
resp_json["choices"][0]["message"]["content"]
== "\n<think>这个问题是关于牛顿的三大运动定律的。牛顿的三大运动定律是经典"
== "\n<think>我需要回答牛顿的三大运动定律是什么。牛顿的三大运动定律是经典"
), f"The response content is not as expected {resp_json['choices'][0]['message']['content']}."


Expand Down
6 changes: 4 additions & 2 deletions tests/e2e/utils/rollout_routing_replay_test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -154,9 +154,11 @@ def check_routing_replay_chat_completion(openai_client, moe_layer_num: int, mode
cur_save_routing_path = f"./R3_tmp/routing_replay_output_{model_name}/"
model_path = os.getenv("MODEL_PATH")
if model_path:
baseline_path = os.path.join(model_path, f"R3_BaseLine_dev_uint8/routing_replay_output_baseline_{model_name}")
baseline_path = os.path.join(
model_path, f"R3_BaseLine_dev_uint8_0205/routing_replay_output_baseline_{model_name}"
)
else:
baseline_path = f"./R3_BaseLine_dev_uint8/routing_replay_output_baseline_{model_name}"
baseline_path = f"./R3_BaseLine_dev_uint8_0205/routing_replay_output_baseline_{model_name}"
stream_baseline_path = os.path.join(baseline_path, "r3_chat_completion_stream")

nonstream_baseline_path = os.path.join(baseline_path, "r3_chat_completion_nonstream")
Expand Down
Loading