Commit 769718c

rename config to align paddlefleet (#3022)
1 parent 7126879 commit 769718c

1 file changed: 12 additions, 12 deletions

examples/experiments/paddlefleet/glm45_provider.py

@@ -46,19 +46,19 @@ class GLMMoEModelProvider(GPTModelProvider):
     add_qkv_bias: bool = True
     seq_length: int = 131072
     init_method_std: int = 0.02
-    hidden_dropout: float = 0.0
+    hidden_dropout_prob: float = 0.0
     vocab_size: int = 151552
     share_embeddings_and_output_weights: Optional[bool] = False
-    layernorm_epsilon: float = 1e-5
+    rms_norm_eps: float = 1e-5
     autocast_dtype: paddle.dtype = paddle.bfloat16
     params_dtype: paddle.dtype = paddle.bfloat16
     bf16: bool = True
 
     # Attention
-    num_query_groups: int = 8
+    num_key_value_heads: int = 8
     num_attention_heads: int = 96
     attention_dropout: float = 0.0
-    kv_channels: int = 128
+    head_dim: int = 128
 
     # RoPE
     position_embedding_type: str = "rope"
@@ -85,7 +85,7 @@ class GLMMoEModelProvider(GPTModelProvider):
     bias_dropout_fusion: bool = True
 
     # MTP
-    mtp_num_layers: Optional[int] = 1
+    num_nextn_predict_layers: Optional[int] = 1
     mtp_loss_scaling_factor: Optional[
         float
     ] = 0.3  # https://arxiv.org/pdf/2508.06471 0.3 for the first 15T tokens, 0.1 for the remaining tokens.
@@ -97,10 +97,10 @@ class GLM45ModelProvider355B(GLMMoEModelProvider):
     Provider for GLM 4.5 355B-A32B: https://huggingface.co/zai-org/GLM-4.5
     """
 
-    num_layers: int = 92
+    num_hidden_layers: int = 92
     moe_num_experts: int = 160
     hidden_size: int = 5120
-    ffn_hidden_size: int = 12288
+    intermediate_size: int = 12288
     moe_layer_freq: Union[int, List[int]] = field(
         default_factory=lambda: [0] * 3 + [1] * 89
     )  # first three layers are dense
@@ -116,10 +116,10 @@ class GLM45AirModelProvider106B(GLMMoEModelProvider):
     Provider for GLM 4.5 Air 106B-A12B: https://huggingface.co/zai-org/GLM-4.5-Air
     """
 
-    num_layers: int = 46
+    num_hidden_layers: int = 46
     moe_num_experts: int = 128
     hidden_size: int = 4096
-    ffn_hidden_size: int = 10944
+    intermediate_size: int = 10944
     moe_layer_freq: Union[int, List[int]] = field(
         default_factory=lambda: [0] * 1 + [1] * 45
     )  # first one layer is dense
@@ -135,11 +135,11 @@ class GLM45AirModelDebugProvider(GLM45AirModelProvider106B):
     Provider for GLM 4.5 Air 106B-A12B: https://huggingface.co/zai-org/GLM-4.5-Air
     """
 
-    num_layers: int = 10
+    num_hidden_layers: int = 10
     moe_num_shared_experts: int = 1
     hidden_size: int = 128
-    ffn_hidden_size: int = 128
+    intermediate_size: int = 128
     moe_intermediate_size: int = 1408
-    mtp_num_layers: Optional[int] = 0
+    num_nextn_predict_layers: Optional[int] = 0
     use_bias: bool = False
     vocab_size: int = 37888
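
For context, below is a minimal self-contained sketch (not the repository's actual provider classes) of how the renamed fields read after this commit. The class name GLMMoEConfigSketch and the assertions are purely illustrative; the pre-rename names are noted in comments.

from dataclasses import dataclass, field
from typing import List, Optional, Union


@dataclass
class GLMMoEConfigSketch:
    # New paddlefleet-aligned names; pre-rename names shown in comments.
    hidden_dropout_prob: float = 0.0             # was: hidden_dropout
    rms_norm_eps: float = 1e-5                   # was: layernorm_epsilon
    num_key_value_heads: int = 8                 # was: num_query_groups
    head_dim: int = 128                          # was: kv_channels
    num_nextn_predict_layers: Optional[int] = 1  # was: mtp_num_layers
    num_hidden_layers: int = 92                  # was: num_layers
    intermediate_size: int = 12288               # was: ffn_hidden_size
    moe_layer_freq: Union[int, List[int]] = field(
        default_factory=lambda: [0] * 3 + [1] * 89  # first three layers are dense
    )


cfg = GLMMoEConfigSketch()
assert cfg.num_key_value_heads == 8                      # previously cfg.num_query_groups
assert len(cfg.moe_layer_freq) == cfg.num_hidden_layers  # 3 dense + 89 MoE = 92 layers

Any call sites or configs that still read the old attribute names (e.g. provider.num_query_groups) would presumably need the same rename after this change.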
