@@ -3191,7 +3191,8 @@ def set_gguf_parameters(self):
31913191 def modify_tensors (self , data_torch : Tensor , name : str , bid : int | None ) -> Iterable [tuple [str , Tensor ]]:
31923192 num_heads = self .hparams ["num_attention_heads" ]
31933193 num_kv_heads = self .hparams ["num_key_value_heads" ]
3194- head_dim = self .hparams ["head_dim" ]
3194+ if (head_dim := self .hparams .get ("head_dim" )) is None :
3195+ head_dim = self .hparams ["hidden_size" ] // num_heads
31953196
31963197 if "ernie." in name :
31973198 name = name .replace ("ernie." , "model." )
@@ -3224,6 +3225,92 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
32243225 return [(self .map_tensor_name (name ), data_torch )]
32253226
32263227
@ModelBase.register("Ernie4_5_MoeForCausalLM")
class Ernie4_5MoeModel(Ernie4_5Model):
    """Converter for Ernie 4.5 MoE checkpoints.

    Extends the dense Ernie 4.5 converter with MoE-specific GGUF metadata,
    skips Multi-Token Prediction (MTP) tensors, and merges the per-expert
    FFN weights of each layer into single stacked 3D tensors.
    """
    model_arch = gguf.MODEL_ARCH.ERNIE4_5_MOE

    # Per-layer staging area: expert tensors are buffered here until every
    # expert weight for a layer has been seen, then stacked and emitted.
    _experts: list[dict[str, Tensor]] | None = None

    # MTP tensors are not converted. The original code matched these with
    # regexes whose unescaped "." matched any character; literal prefix
    # checks are stricter and equivalent for real checkpoint names.
    _MTP_PREFIXES = (
        "model.mtp_block.",
        "model.mtp_emb_norm.",
        "model.mtp_hidden_norm.",
        "model.mtp_linear_proj.",
    )

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self._experts = [{} for _ in range(self.block_count)]

    def set_gguf_parameters(self):
        """Write MoE hyperparameters on top of the base model parameters."""
        super().set_gguf_parameters()
        self.gguf_writer.add_expert_count(self.hparams["moe_num_experts"])
        self.gguf_writer.add_expert_used_count(self.hparams["moe_k"])
        self.gguf_writer.add_interleave_moe_layer_step(self.hparams["moe_layer_interval"])
        self.gguf_writer.add_leading_dense_block_count(self.hparams["moe_layer_start_index"])
        self.gguf_writer.add_rope_freq_base(self.hparams["rope_theta"])
        if (moe_intermediate_size := self.hparams.get("moe_intermediate_size")) is not None:
            self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size)
        # NOTE(review): shared-expert FFN length is derived as
        # intermediate_size // num_key_value_heads — confirm this matches the
        # checkpoint's shared-expert layout.
        if (shared_expert_intermediate_size := self.hparams.get("intermediate_size")) is not None and (num_key_value_heads := self.hparams.get("num_key_value_heads")) is not None:
            self.gguf_writer.add_expert_shared_feed_forward_length(shared_expert_intermediate_size // num_key_value_heads)

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        """Map one HF tensor to zero or more GGUF tensors.

        Expert tensors are buffered per layer and emitted only once all
        ``moe_num_experts * 3`` weights for that layer have arrived, merged
        into one 3D tensor per projection kind.
        """
        # Modify correction bias name as in DeepseekV2
        if name.endswith("e_score_correction_bias"):
            name = name.replace("e_score_correction_bias", "e_score_correction.bias")

        # skip all Multi-Token Prediction (MTP) tensors (same as DeepseekV2)
        if name.startswith(self._MTP_PREFIXES):
            return []

        # process the experts separately
        if "mlp.experts" in name:
            n_experts = self.hparams["moe_num_experts"]
            assert bid is not None

            # defensive re-init in case __init__ was bypassed
            if self._experts is None:
                self._experts = [{} for _ in range(self.block_count)]

            self._experts[bid][name] = data_torch

            # emit only when every expert weight for this layer is buffered
            if len(self._experts[bid]) >= n_experts * 3:
                tensors: list[tuple[str, Tensor]] = []

                # merge the experts into a single 3d tensor per projection
                for w_name in ("gate_proj", "up_proj", "down_proj"):
                    datas: list[Tensor] = []

                    for xid in range(n_experts):
                        ename_to_retrieve = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
                        # pop so prepare_tensors can detect leftovers
                        datas.append(self._experts[bid].pop(ename_to_retrieve))

                    data_torch = torch.stack(datas, dim=0)
                    merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
                    new_name = self.map_tensor_name(merged_name)
                    tensors.append((new_name, data_torch))

                return tensors
            return []

        return [(self.map_tensor_name(name), data_torch)]

    def prepare_tensors(self):
        """Finalize conversion; fail loudly if any expert tensor was left unmerged."""
        super().prepare_tensors()

        if self._experts is not None:
            # flatten `list[dict[str, Tensor]]` into `list[str]`
            experts = [k for d in self._experts for k in d.keys()]
            if len(experts) > 0:
                raise ValueError(f"Unprocessed experts: {experts}")
3312+
3313+
32273314@ModelBase .register (
32283315 "Qwen2VLModel" ,
32293316 "Qwen2VLForConditionalGeneration" ,
0 commit comments