
Commit 13be08d

clip : remove boi/eoi embeddings for GLM-edge model (#13081)
Parent: 226251e

3 files changed, +6 −18 lines

examples/llava/clip-impl.h

Lines changed: 0 additions & 2 deletions

@@ -90,8 +90,6 @@
 #define TN_GLM_ADAPTER_D_H_2_4H "adapter.linear.dense_h_to_4h.%s"
 #define TN_GLM_ADAPTER_GATE "adapter.linear.gate.%s"
 #define TN_GLM_ADAPTER_D_4H_2_H "adapter.linear.dense_4h_to_h.%s"
-#define TN_GLM_BOI_W "adapter.boi"
-#define TN_GLM_EOI_W "adapter.eoi"

 enum projector_type {
     PROJECTOR_TYPE_MLP,
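
For context, the surviving TN_GLM_* macros are printf-style name patterns that the loader expands with llama.cpp's string_format helper (see the clip.cpp hunk below), whereas the removed TN_GLM_BOI_W/TN_GLM_EOI_W named standalone tensors. A minimal sketch of that expansion, using a local snprintf-based stand-in for string_format (assumed semantics):

```cpp
#include <cstdio>
#include <string>

// Stand-in for llama.cpp's string_format (assumed: printf-style formatting
// into a std::string); only the single-%s case is needed here.
static std::string format_name(const char * fmt, const char * suffix) {
    char buf[256];
    std::snprintf(buf, sizeof(buf), fmt, suffix);
    return std::string(buf);
}

#define TN_GLM_ADAPTER_GATE "adapter.linear.gate.%s"

int main() {
    // Prints "adapter.linear.gate.weight" -- the key used to look the tensor
    // up in the GGUF file. The deleted TN_GLM_BOI_W ("adapter.boi") and
    // TN_GLM_EOI_W ("adapter.eoi") had no "%s": boi/eoi were single tensors,
    // not weight/bias pairs.
    std::printf("%s\n", format_name(TN_GLM_ADAPTER_GATE, "weight").c_str());
    return 0;
}
```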

examples/llava/clip.cpp

Lines changed: 1 addition & 16 deletions

@@ -244,8 +244,6 @@ struct clip_vision_model {
     //GLMV-Edge projection
     struct ggml_tensor * mm_model_adapter_conv_w = nullptr;
     struct ggml_tensor * mm_model_adapter_conv_b = nullptr;
-    struct ggml_tensor * boi_w = nullptr;
-    struct ggml_tensor * eoi_w = nullptr;

     // MobileVLM projection
     struct ggml_tensor * mm_model_mlp_1_w = nullptr;
@@ -1697,8 +1695,6 @@ struct clip_model_loader {
                 vision_model.mm_model_mlp_1_w = get_tensor(string_format(TN_GLM_ADAPTER_D_H_2_4H,"weight"));
                 vision_model.mm_model_mlp_2_w = get_tensor(string_format(TN_GLM_ADAPTER_GATE,"weight"));
                 vision_model.mm_model_mlp_3_w = get_tensor(string_format(TN_GLM_ADAPTER_D_4H_2_H,"weight"));
-                vision_model.boi_w = get_tensor(TN_GLM_BOI_W);
-                vision_model.eoi_w = get_tensor(TN_GLM_EOI_W);
             } break;
         case PROJECTOR_TYPE_MERGER:
             {
@@ -2593,8 +2589,7 @@ void clip_free(clip_ctx * ctx) {
 }

 size_t clip_embd_nbytes(const struct clip_ctx * ctx) {
-    int extra_tokens = ctx->has_glm_projector ? 2 : 0;
-    return (clip_n_patches(ctx) + extra_tokens) * clip_n_mmproj_embd(ctx) * sizeof(float);
+    return clip_n_patches(ctx) * clip_n_mmproj_embd(ctx) * sizeof(float);
 }

 size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int img_h, int img_w) {
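
The effect of the one-line change above is easiest to see with concrete numbers (hypothetical, for illustration only): the old formula reserved two extra embedding rows for boi/eoi on GLM projectors, the new one sizes the buffer for the patch embeddings alone.

```cpp
#include <cstddef>
#include <cstdio>

int main() {
    // Hypothetical dimensions; real values depend on the model and image.
    const size_t n_patches     = 576;  // stand-in for clip_n_patches(ctx)
    const size_t n_mmproj_embd = 4096; // stand-in for clip_n_mmproj_embd(ctx)

    // before: (n_patches + extra_tokens) rows, extra_tokens == 2 for GLM
    const size_t before = (n_patches + 2) * n_mmproj_embd * sizeof(float);
    // after: exactly one row per patch
    const size_t after  = n_patches * n_mmproj_embd * sizeof(float);

    // prints: before: 9469952 bytes, after: 9437184 bytes
    std::printf("before: %zu bytes, after: %zu bytes\n", before, after);
    return 0;
}
```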
@@ -2790,9 +2785,6 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
     }
     if (ctx->has_glm_projector) {
         GGML_ASSERT(batch_size == 1);
-        ggml_tensor * boi = ctx->vision_model.boi_w;
-        ggml_backend_tensor_get(boi,vec,0,ggml_nbytes(boi));
-        vec = (float*)(vec+ggml_nelements(boi)); //offset for boi
     }

     // build the inference graph
@@ -3001,13 +2993,6 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
     // copy the embeddings to the location passed by the user
     ggml_backend_tensor_get(embeddings, vec, 0, ggml_nbytes(embeddings));

-    if (ctx->has_glm_projector) {
-        //eoi
-        ggml_tensor * eoi = ctx->vision_model.eoi_w;
-        int offset = ggml_nelements(embeddings);
-        ggml_backend_tensor_get(eoi, vec+offset, 0, ggml_nbytes(eoi));
-    }
-
     return true;
 }
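
Taken together, the two deleted blocks in clip_image_batch_encode implemented a prepend/append pattern: advance the output pointer past a boi row before the graph writes the patch embeddings, then copy an eoi row after them. A simplified sketch of that old buffer layout, with plain arrays standing in for ggml tensors (all names and sizes hypothetical):

```cpp
#include <cstring>
#include <vector>

int main() {
    // Hypothetical sizes for illustration.
    const size_t n_patches = 4;
    const size_t n_embd    = 8;

    std::vector<float> boi(n_embd, 1.0f);                 // vision_model.boi_w
    std::vector<float> eoi(n_embd, 2.0f);                 // vision_model.eoi_w
    std::vector<float> patches(n_patches * n_embd, 0.5f); // graph output

    // old layout: [boi][patch 0 .. patch n-1][eoi]
    std::vector<float> out((n_patches + 2) * n_embd);
    float * vec = out.data();

    std::memcpy(vec, boi.data(), n_embd * sizeof(float)); // boi first
    vec += n_embd;                                        // "offset for boi"
    std::memcpy(vec, patches.data(), patches.size() * sizeof(float));
    std::memcpy(vec + patches.size(), eoi.data(), n_embd * sizeof(float)); // eoi last

    // After this commit, out holds just the patch embeddings and the markers
    // are emitted as text tokens instead (see the mtmd.cpp hunk below).
    return 0;
}
```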

examples/llava/mtmd.cpp

Lines changed: 5 additions & 0 deletions

@@ -186,6 +186,11 @@ int32_t mtmd_tokenize(mtmd_context * ctx,
         marker_modified = "<start_of_image>" + ctx->image_marker + "<end_of_image>";
         string_replace_all(prompt_modified, ctx->image_marker, marker_modified);

+    } else if (proj_type == PROJECTOR_TYPE_GLM_EDGE) {
+        // <|begin_of_image|> ... (image embeddings) ... <|end_of_image|>
+        marker_modified = "<|begin_of_image|>" + ctx->image_marker + "<|end_of_image|>";
+        string_replace_all(prompt_modified, ctx->image_marker, marker_modified);
+
     } else if (proj_type == PROJECTOR_TYPE_IDEFICS3) {
         // https://github.com/huggingface/transformers/blob/a42ba80fa520c784c8f11a973ca9034e5f859b79/src/transformers/models/idefics3/processing_idefics3.py#L192-L215
         marker_modified = "<fake_token_around_image><global-img>" + ctx->image_marker + "<fake_token_around_image>";
