Skip to content

Commit 5794785

Browse files
ravenouse and ngxson committed
Update clip model handling
Co-authored-by: Xuan-Son Nguyen <son@huggingface.co>
1 parent e260b0e commit 5794785

File tree

1 file changed

+9
-29
lines changed

1 file changed

+9
-29
lines changed

tools/mtmd/clip.cpp

Lines changed: 9 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -1514,12 +1514,7 @@ struct clip_graph {
15141514

15151515
ggml_tensor * inp = build_inp();
15161516

1517-
const int n_pos = n_patches;
1518-
ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos);
1519-
ggml_set_name(positions, "positions");
1520-
ggml_set_input(positions);
1521-
1522-
ggml_tensor * learned_pos_embd = ggml_get_rows(ctx0, model.position_embeddings, positions);
1517+
ggml_tensor * learned_pos_embd = model.position_embeddings;
15231518

15241519
ggml_tensor * cur = build_vit(
15251520
inp, n_patches,
@@ -1528,18 +1523,12 @@ struct clip_graph {
15281523
learned_pos_embd,
15291524
nullptr);
15301525

1531-
cur = ggml_mul_mat(ctx0, model.mm_0_w, cur);
1532-
if (model.mm_0_b) {
1533-
cur = ggml_add(ctx0, cur, model.mm_0_b);
1534-
}
1535-
cb(cur, "aligner_0", -1);
1536-
1537-
cur = ggml_gelu(ctx0, cur);
1538-
1539-
cur = ggml_mul_mat(ctx0, model.mm_1_w, cur);
1540-
if (model.mm_1_b) {
1541-
cur = ggml_add(ctx0, cur, model.mm_1_b);
1542-
}
1526+
cur = build_ffn(cur,
1527+
model.mm_0_w, model.mm_0_b,
1528+
nullptr, nullptr,
1529+
model.mm_1_w, model.mm_1_b,
1530+
hparams.ffn_op,
1531+
-1);
15431532
cb(cur, "aligner_1", -1);
15441533

15451534
// build the graph
@@ -2485,14 +2474,6 @@ struct clip_model_loader {
24852474
hparams.ffn_op = FFN_GELU_ERF;
24862475
log_ffn_op = "gelu_erf"; // temporary solution for logging
24872476
} break;
2488-
case PROJECTOR_TYPE_JANUS_PRO:
2489-
{
2490-
// Janus Pro uses mean = std = [0.5, 0.5, 0.5]
2491-
// ref: https://huggingface.co/deepseek-community/Janus-Pro-1B/blob/main/preprocessor_config.json
2492-
// ref: https://huggingface.co/deepseek-community/Janus-Pro-7B/blob/main/preprocessor_config.json
2493-
hparams.image_mean[0] = hparams.image_mean[1] = hparams.image_mean[2] = 0.5f;
2494-
hparams.image_std[0] = hparams.image_std[1] = hparams.image_std[2] = 0.5f;
2495-
} break;
24962477
default:
24972478
break;
24982479
}
@@ -4356,7 +4337,6 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
43564337
set_input_i32("pos_w", pos_data);
43574338
} break;
43584339
case PROJECTOR_TYPE_GLM_EDGE:
4359-
case PROJECTOR_TYPE_JANUS_PRO:
43604340
{
43614341
// llava and other models
43624342
std::vector<int32_t> positions(n_pos);
@@ -4394,6 +4374,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
43944374
case PROJECTOR_TYPE_ULTRAVOX:
43954375
case PROJECTOR_TYPE_LFM2:
43964376
case PROJECTOR_TYPE_VOXTRAL:
4377+
case PROJECTOR_TYPE_JANUS_PRO:
43974378
{
43984379
// do nothing
43994380
} break;
@@ -4481,6 +4462,7 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
44814462
return ctx->model.mm_model_mlp_3_w->ne[1];
44824463
case PROJECTOR_TYPE_QWEN2VL:
44834464
case PROJECTOR_TYPE_QWEN25VL:
4465+
case PROJECTOR_TYPE_JANUS_PRO:
44844466
return ctx->model.mm_1_b->ne[0];
44854467
case PROJECTOR_TYPE_GEMMA3:
44864468
return ctx->model.mm_input_proj_w->ne[0];
@@ -4498,8 +4480,6 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
44984480
case PROJECTOR_TYPE_LFM2:
44994481
case PROJECTOR_TYPE_KIMIVL:
45004482
return ctx->model.mm_2_w->ne[1];
4501-
case PROJECTOR_TYPE_JANUS_PRO:
4502-
return ctx->model.mm_1_w->ne[1];
45034483
default:
45044484
GGML_ABORT("Unknown projector type");
45054485
}

0 commit comments

Comments
 (0)