@@ -46,19 +46,19 @@ class GLMMoEModelProvider(GPTModelProvider):
     add_qkv_bias: bool = True
     seq_length: int = 131072
     init_method_std: float = 0.02
-    hidden_dropout: float = 0.0
+    hidden_dropout_prob: float = 0.0
     vocab_size: int = 151552
     share_embeddings_and_output_weights: Optional[bool] = False
-    layernorm_epsilon: float = 1e-5
+    rms_norm_eps: float = 1e-5
     autocast_dtype: paddle.dtype = paddle.bfloat16
     params_dtype: paddle.dtype = paddle.bfloat16
     bf16: bool = True

     # Attention
-    num_query_groups: int = 8
+    num_key_value_heads: int = 8
     num_attention_heads: int = 96
     attention_dropout: float = 0.0
-    kv_channels: int = 128
+    head_dim: int = 128

     # RoPE
     position_embedding_type: str = "rope"
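The attention renames above swap Megatron-style names (num_query_groups, kv_channels) for the HuggingFace-style names used by the GLM-4.5 config (num_key_value_heads, head_dim). A minimal sketch of what these fields imply for grouped-query attention, using the defaults from this hunk; the variables below are illustrative, not part of the provider:

# Illustrative only: how the renamed GQA fields determine projection shapes.
num_attention_heads = 96     # query heads
num_key_value_heads = 8      # KV groups (was num_query_groups)
head_dim = 128               # per-head dimension (was kv_channels)

assert num_attention_heads % num_key_value_heads == 0  # 12 query heads share each KV group
q_proj_out = num_attention_heads * head_dim   # 12288
kv_proj_out = num_key_value_heads * head_dim  # 1024 each for K and V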
@@ -85,7 +85,7 @@ class GLMMoEModelProvider(GPTModelProvider):
     bias_dropout_fusion: bool = True

     # MTP
-    mtp_num_layers: Optional[int] = 1
+    num_nextn_predict_layers: Optional[int] = 1
     mtp_loss_scaling_factor: Optional[
         float
     ] = 0.3  # https://arxiv.org/pdf/2508.06471: 0.3 for the first 15T tokens, 0.1 for the remaining tokens.
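The comment pins mtp_loss_scaling_factor to the schedule reported in the cited GLM-4.5 paper. A hypothetical helper sketching that schedule as a simple token-count switch; nothing below exists in the source:

# Hypothetical helper (not in the source) sketching the schedule cited from
# https://arxiv.org/pdf/2508.06471: scale 0.3 for the first 15T training
# tokens, 0.1 for the rest.
def mtp_loss_scale(tokens_seen: int, switch_at: int = 15 * 10**12) -> float:
    return 0.3 if tokens_seen < switch_at else 0.1

assert mtp_loss_scale(10**12) == 0.3
assert mtp_loss_scale(20 * 10**12) == 0.1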
@@ -97,10 +97,10 @@ class GLM45ModelProvider355B(GLMMoEModelProvider):
     Provider for GLM 4.5 355B-A32B: https://huggingface.co/zai-org/GLM-4.5
     """

-    num_layers: int = 92
+    num_hidden_layers: int = 92
     moe_num_experts: int = 160
     hidden_size: int = 5120
-    ffn_hidden_size: int = 12288
+    intermediate_size: int = 12288
     moe_layer_freq: Union[int, List[int]] = field(
         default_factory=lambda: [0] * 3 + [1] * 89
     )  # first three layers are dense
@@ -116,10 +116,10 @@ class GLM45AirModelProvider106B(GLMMoEModelProvider):
     Provider for GLM 4.5 Air 106B-A12B: https://huggingface.co/zai-org/GLM-4.5-Air
     """

-    num_layers: int = 46
+    num_hidden_layers: int = 46
     moe_num_experts: int = 128
     hidden_size: int = 4096
-    ffn_hidden_size: int = 10944
+    intermediate_size: int = 10944
     moe_layer_freq: Union[int, List[int]] = field(
         default_factory=lambda: [0] * 1 + [1] * 45
     )  # first layer is dense
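In both providers the moe_layer_freq mask is a per-layer list where 0 marks a dense layer and 1 an MoE layer, so its length must equal num_hidden_layers. An illustrative check of the two patterns above (the variable names are mine, not from the source):

# Illustrative check: one mask entry per transformer layer, 0 = dense,
# 1 = MoE, so each mask's length must match num_hidden_layers.
glm45_355b_mask = [0] * 3 + [1] * 89  # first three layers dense
glm45_air_mask = [0] * 1 + [1] * 45   # first layer dense

assert len(glm45_355b_mask) == 92  # num_hidden_layers, GLM-4.5 355B
assert len(glm45_air_mask) == 46   # num_hidden_layers, GLM-4.5-Air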
@@ -135,11 +135,11 @@ class GLM45AirModelDebugProvider(GLM45AirModelProvider106B):
     Debug provider for GLM 4.5 Air 106B-A12B: https://huggingface.co/zai-org/GLM-4.5-Air
     """

-    num_layers: int = 10
+    num_hidden_layers: int = 10
     moe_num_shared_experts: int = 1
     hidden_size: int = 128
-    ffn_hidden_size: int = 128
+    intermediate_size: int = 128
     moe_intermediate_size: int = 1408
-    mtp_num_layers: Optional[int] = 0
+    num_nextn_predict_layers: Optional[int] = 0
     use_bias: bool = False
     vocab_size: int = 37888
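The debug provider shrinks the Air config for quick smoke tests. A hedged usage sketch, assuming the class is importable from wherever this module lives (the import path is not shown in the diff):

# Hedged usage sketch: the import path is not shown in the diff.
# from <module> import GLM45AirModelDebugProvider

provider = GLM45AirModelDebugProvider()
assert provider.num_hidden_layers == 10
assert provider.intermediate_size == 128
assert provider.num_nextn_predict_layers == 0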