You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
yield from super().modify_tensors(gate, mapped_gate, bid)
4128
+
yield from super().modify_tensors(up, mapped_up, bid)
4138
4129
return
4139
4130
4140
4131
if name.startswith("mlp") or name.startswith("vision_model") or name.startswith("model.vision_tower") or name.startswith("model.multi_modal_projector") or name.startswith("model.visual"):
4141
4132
# skip visual tensors
4142
4133
return
4134
+
4143
4135
if name.find("experts") != -1:
4144
4136
n_experts = self.hparams["num_experts"]
4145
4137
assert bid is not None
@@ -4535,6 +4527,35 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
4535
4527
if name.startswith("model.visual."):
4536
4528
return
4537
4529
4530
+
# Qwen3VL has transposed packed tensors, so we treat it differently from general Qwen2MoE packed tensors
4531
+
if name.endswith("mlp.experts.down_proj") or name.endswith("mlp.experts.down_proj.weight"):
4532
+
name = name.replace("language_model.", "")
4533
+
mapped = f"{name}.weight" if not name.endswith(".weight") else name
0 commit comments