@@ -1514,12 +1514,7 @@ struct clip_graph {
15141514
15151515 ggml_tensor * inp = build_inp ();
15161516
1517- const int n_pos = n_patches;
1518- ggml_tensor * positions = ggml_new_tensor_1d (ctx0, GGML_TYPE_I32, n_pos);
1519- ggml_set_name (positions, " positions" );
1520- ggml_set_input (positions);
1521-
1522- ggml_tensor * learned_pos_embd = ggml_get_rows (ctx0, model.position_embeddings , positions);
1517+ ggml_tensor * learned_pos_embd = model.position_embeddings ;
15231518
15241519 ggml_tensor * cur = build_vit (
15251520 inp, n_patches,
@@ -1528,18 +1523,12 @@ struct clip_graph {
15281523 learned_pos_embd,
15291524 nullptr );
15301525
1531- cur = ggml_mul_mat (ctx0, model.mm_0_w , cur);
1532- if (model.mm_0_b ) {
1533- cur = ggml_add (ctx0, cur, model.mm_0_b );
1534- }
1535- cb (cur, " aligner_0" , -1 );
1536-
1537- cur = ggml_gelu (ctx0, cur);
1538-
1539- cur = ggml_mul_mat (ctx0, model.mm_1_w , cur);
1540- if (model.mm_1_b ) {
1541- cur = ggml_add (ctx0, cur, model.mm_1_b );
1542- }
1526+ cur = build_ffn (cur,
1527+ model.mm_0_w , model.mm_0_b ,
1528+ nullptr , nullptr ,
1529+ model.mm_1_w , model.mm_1_b ,
1530+ hparams.ffn_op ,
1531+ -1 );
15431532 cb (cur, " aligner_1" , -1 );
15441533
15451534 // build the graph
@@ -2485,14 +2474,6 @@ struct clip_model_loader {
24852474 hparams.ffn_op = FFN_GELU_ERF;
24862475 log_ffn_op = " gelu_erf" ; // temporary solution for logging
24872476 } break ;
2488- case PROJECTOR_TYPE_JANUS_PRO:
2489- {
2490- // Janus Pro uses mean = std = [0.5, 0.5, 0.5]
2491- // ref: https://huggingface.co/deepseek-community/Janus-Pro-1B/blob/main/preprocessor_config.json
2492- // ref: https://huggingface.co/deepseek-community/Janus-Pro-7B/blob/main/preprocessor_config.json
2493- hparams.image_mean [0 ] = hparams.image_mean [1 ] = hparams.image_mean [2 ] = 0 .5f ;
2494- hparams.image_std [0 ] = hparams.image_std [1 ] = hparams.image_std [2 ] = 0 .5f ;
2495- } break ;
24962477 default :
24972478 break ;
24982479 }
@@ -4356,7 +4337,6 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
43564337 set_input_i32 (" pos_w" , pos_data);
43574338 } break ;
43584339 case PROJECTOR_TYPE_GLM_EDGE:
4359- case PROJECTOR_TYPE_JANUS_PRO:
43604340 {
43614341 // llava and other models
43624342 std::vector<int32_t > positions (n_pos);
@@ -4394,6 +4374,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
43944374 case PROJECTOR_TYPE_ULTRAVOX:
43954375 case PROJECTOR_TYPE_LFM2:
43964376 case PROJECTOR_TYPE_VOXTRAL:
4377+ case PROJECTOR_TYPE_JANUS_PRO:
43974378 {
43984379 // do nothing
43994380 } break ;
@@ -4481,6 +4462,7 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
44814462 return ctx->model .mm_model_mlp_3_w ->ne [1 ];
44824463 case PROJECTOR_TYPE_QWEN2VL:
44834464 case PROJECTOR_TYPE_QWEN25VL:
4465+ case PROJECTOR_TYPE_JANUS_PRO:
44844466 return ctx->model .mm_1_b ->ne [0 ];
44854467 case PROJECTOR_TYPE_GEMMA3:
44864468 return ctx->model .mm_input_proj_w ->ne [0 ];
@@ -4498,8 +4480,6 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
44984480 case PROJECTOR_TYPE_LFM2:
44994481 case PROJECTOR_TYPE_KIMIVL:
45004482 return ctx->model .mm_2_w ->ne [1 ];
4501- case PROJECTOR_TYPE_JANUS_PRO:
4502- return ctx->model .mm_1_w ->ne [1 ];
45034483 default :
45044484 GGML_ABORT (" Unknown projector type" );
45054485 }
0 commit comments