@@ -902,10 +902,12 @@ class TensorNameMap:
 
         MODEL_TENSOR.V_MMPROJ_FC: (
             "model.connector.modality_projection.proj", # SmolVLM
+            "multi_modal_projector.linear_1", # llama 4
         ),
 
         MODEL_TENSOR.V_MMPROJ_MLP: (
             "model.mm_projector.mlp.mlp.{bid}",
+            "vision_model.vision_adapter.mlp.fc{bid}", # llama 4
             "mlp1.{bid}", # InternVL
         ),
 
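The "{bid}" placeholder in entries like "vision_model.vision_adapter.mlp.fc{bid}" is expanded once per block index before any lookup happens. A minimal sketch of that expansion, assuming a consumer along the lines of TensorNameMap.__init__ (the enum and table here are cut-down stand-ins, not the real definitions):

from enum import Enum, auto

class MODEL_TENSOR(Enum):  # cut-down stand-in for gguf's MODEL_TENSOR
    V_MMPROJ_MLP = auto()

block_mappings_cfg = {
    MODEL_TENSOR.V_MMPROJ_MLP: (
        "model.mm_projector.mlp.mlp.{bid}",
        "vision_model.vision_adapter.mlp.fc{bid}",  # llama 4
        "mlp1.{bid}",                               # InternVL
    ),
}

def build_mapping(n_blocks: int) -> dict[str, MODEL_TENSOR]:
    # Expand each "{bid}" template once per block so raw checkpoint
    # names can be resolved with a plain dict lookup.
    mapping: dict[str, MODEL_TENSOR] = {}
    for tensor, names in block_mappings_cfg.items():
        for name in names:
            for bid in range(n_blocks):
                mapping[name.format(bid=bid)] = tensor
    return mapping

assert build_mapping(2)["vision_model.vision_adapter.mlp.fc1"] is MODEL_TENSOR.V_MMPROJ_MLP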
@@ -915,26 +917,30 @@ class TensorNameMap:
 
         MODEL_TENSOR.V_ENC_EMBD_CLS: (
             "vision_tower.vision_model.embeddings.class_embedding",
+            "vision_model.class_embedding", # llama 4
         ),
 
         MODEL_TENSOR.V_ENC_EMBD_PATCH: (
             "vision_tower.vision_model.embeddings.patch_embedding",
             "vpm.embeddings.patch_embedding",
             "model.vision_model.embeddings.patch_embedding", # SmolVLM
             "vision_tower.patch_conv", # pixtral
+            "vision_model.patch_embedding.linear", # llama 4
             "visual.patch_embed.proj", # qwen2vl
         ),
 
         MODEL_TENSOR.V_ENC_EMBD_POS: (
             "vision_tower.vision_model.embeddings.position_embedding",
             "vpm.embeddings.position_embedding",
             "model.vision_model.embeddings.position_embedding", # SmolVLM
+            "vision_model.positional_embedding_vlm", # llama 4
         ),
 
         MODEL_TENSOR.V_ENC_ATTN_Q: (
             "vision_tower.vision_model.encoder.layers.{bid}.self_attn.q_proj",
             "vpm.encoder.layers.{bid}.self_attn.q_proj",
             "model.vision_model.encoder.layers.{bid}.self_attn.q_proj", # SmolVLM
+            "vision_model.model.layers.{bid}.self_attn.q_proj", # llama4
             "vision_tower.transformer.layers.{bid}.attention.q_proj", # pixtral
             "visual.blocks.{bid}.attn.q", # qwen2vl, generated
         ),
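Note that llama 4 exposes its vision tensors directly under a "vision_model." prefix rather than the "vision_tower.vision_model." nesting of CLIP-style checkpoints, so these are genuinely new keys, not variants of existing ones. A hedged usage sketch of resolving one of them (the GGUF-side name is a placeholder and "lookup" is a hypothetical helper, not the library API); checkpoint tensors carry a ".weight"/".bias" suffix that the table omits, so it is stripped before the lookup:

# Illustrative only: the GGUF-side name is a placeholder.
table = {"vision_model.class_embedding": "v.class_embd"}

def lookup(raw_name: str) -> str | None:
    # The table stores base names; checkpoints append ".weight"/".bias".
    base = raw_name.removesuffix(".weight").removesuffix(".bias")
    return table.get(base)

assert lookup("vision_model.class_embedding.weight") == "v.class_embd"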
@@ -947,6 +953,7 @@ class TensorNameMap:
             "vision_tower.vision_model.encoder.layers.{bid}.self_attn.k_proj",
             "vpm.encoder.layers.{bid}.self_attn.k_proj",
             "model.vision_model.encoder.layers.{bid}.self_attn.k_proj", # SmolVLM
+            "vision_model.model.layers.{bid}.self_attn.k_proj", # llama4
             "vision_tower.transformer.layers.{bid}.attention.k_proj", # pixtral
             "visual.blocks.{bid}.attn.k", # qwen2vl, generated
         ),
@@ -959,6 +966,7 @@ class TensorNameMap:
             "vision_tower.vision_model.encoder.layers.{bid}.self_attn.v_proj",
             "vpm.encoder.layers.{bid}.self_attn.v_proj",
             "model.vision_model.encoder.layers.{bid}.self_attn.v_proj", # SmolVLM
+            "vision_model.model.layers.{bid}.self_attn.v_proj", # llama4
             "vision_tower.transformer.layers.{bid}.attention.v_proj", # pixtral
             "visual.blocks.{bid}.attn.v", # qwen2vl, generated
         ),
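The q/k/v hunks above (and the o_proj hunk below) all follow one pattern: llama 4 reuses the text-model layer naming, "...model.layers.{bid}.self_attn.<x>_proj", inside its vision tower. A tiny sketch of that regularity (the template is for illustration only; the real table spells out each entry):

LLAMA4_ATTN = "vision_model.model.layers.{bid}.self_attn.{proj}_proj"
# str.replace leaves the "{bid}" placeholder intact for later expansion.
entries = {p: LLAMA4_ATTN.replace("{proj}", p) for p in ("q", "k", "v", "o")}
assert entries["k"] == "vision_model.model.layers.{bid}.self_attn.k_proj"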
@@ -969,23 +977,26 @@ class TensorNameMap:
             "vpm.encoder.layers.{bid}.layer_norm1",
             "model.vision_model.encoder.layers.{bid}.layer_norm1", # SmolVLM
             "vision_tower.transformer.layers.{bid}.attention_norm", # pixtral
+            "vision_model.model.layers.{bid}.input_layernorm", # llama4
             "visual.blocks.{bid}.norm1", # qwen2vl
         ),
 
-        MODEL_TENSOR.V_ENC_OUTPUT: (
+        MODEL_TENSOR.V_ENC_ATTN_O: (
             "vision_tower.vision_model.encoder.layers.{bid}.self_attn.out_proj",
             "vision_tower.vision_model.encoder.layers.{bid}.attn.proj", # InternVL
             "vpm.encoder.layers.{bid}.self_attn.out_proj",
             "model.vision_model.encoder.layers.{bid}.self_attn.out_proj", # SmolVLM
+            "vision_model.model.layers.{bid}.self_attn.o_proj", # llama4
             "vision_tower.transformer.layers.{bid}.attention.o_proj", # pixtral
             "visual.blocks.{bid}.attn.proj", # qwen2vl
         ),
 
-        MODEL_TENSOR.V_ENC_OUTPUT_NORM: (
+        MODEL_TENSOR.V_ENC_POST_ATTN_NORM: (
             "vision_tower.vision_model.encoder.layers.{bid}.layer_norm2",
             "vision_tower.vision_model.encoder.layers.{bid}.norm2", # InternVL
             "vpm.encoder.layers.{bid}.layer_norm2",
             "model.vision_model.encoder.layers.{bid}.layer_norm2", # SmolVLM
+            "vision_model.model.layers.{bid}.post_attention_layernorm", # llama4
             "vision_tower.transformer.layers.{bid}.ffn_norm", # pixtral
             "visual.blocks.{bid}.norm2", # qwen2vl
         ),
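The two renames above, V_ENC_OUTPUT to V_ENC_ATTN_O and V_ENC_OUTPUT_NORM to V_ENC_POST_ATTN_NORM, make the keys say what the tensors actually are: the attention output projection and the post-attention norm, rather than an encoder-level "output". A rename like this has to be mirrored wherever the members are declared and used; a sketch of the declaration side, assuming the members sit alongside the other V_ENC_* keys in the constants module:

from enum import Enum, auto

class MODEL_TENSOR(Enum):
    # ... other V_ENC_* members elided ...
    V_ENC_ATTN_O = auto()           # was: V_ENC_OUTPUT
    V_ENC_POST_ATTN_NORM = auto()   # was: V_ENC_OUTPUT_NORM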
@@ -995,6 +1006,7 @@ class TensorNameMap:
             "vpm.encoder.layers.{bid}.mlp.fc1",
             "model.vision_model.encoder.layers.{bid}.mlp.fc1", # SmolVLM, gemma3
             "vision_tower.transformer.layers.{bid}.feed_forward.up_proj", # pixtral
+            "vision_model.model.layers.{bid}.mlp.fc1", # llama4
             "visual.blocks.{bid}.mlp.fc1", # qwen2vl
             "visual.blocks.{bid}.mlp.up_proj", # qwen2.5vl
         ),
@@ -1009,6 +1021,7 @@ class TensorNameMap:
             "vpm.encoder.layers.{bid}.mlp.fc2",
             "model.vision_model.encoder.layers.{bid}.mlp.fc2", # SmolVLM, gemma3
             "vision_tower.transformer.layers.{bid}.feed_forward.down_proj", # pixtral
+            "vision_model.model.layers.{bid}.mlp.fc2", # llama4
             "visual.blocks.{bid}.mlp.fc2", # qwen2vl
             "visual.blocks.{bid}.mlp.down_proj", # qwen2.5vl
         ),
@@ -1024,11 +1037,13 @@ class TensorNameMap:
         MODEL_TENSOR.V_PRE_NORM: (
             "vision_tower.vision_model.pre_layrnorm",
             "vision_tower.ln_pre", # pixtral
+            "vision_model.layernorm_pre", # llama4
         ),
 
         MODEL_TENSOR.V_POST_NORM: (
             "vision_tower.vision_model.post_layernorm",
             "model.vision_model.post_layernorm", # SmolVLM
+            "vision_model.layernorm_post", # llama4
             "visual.merger.ln_q", # qwen2vl
         ),
 
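Because llama 4's "vision_model."-prefixed names share no prefix with the existing "vision_tower." families, a missed table entry would fail silently rather than collide with another model's mapping. A hypothetical post-hoc check (not part of the repo) that every vision tensor in a checkpoint resolves through the expanded table:

def unmapped(names: list[str], table: dict[str, object]) -> list[str]:
    # Report raw checkpoint names whose base form is absent from the table.
    def base(n: str) -> str:
        return n.removesuffix(".weight").removesuffix(".bias")
    return [n for n in names if base(n) not in table]

# llama 4 keeps vision tensors under a bare "vision_model." prefix:
names = ["vision_model.layernorm_pre.weight", "vision_model.layernorm_post.weight"]
table = {"vision_model.layernorm_pre": ..., "vision_model.layernorm_post": ...}
assert unmapped(names, table) == []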