Skip to content

Commit ecf74a8

Browse files
authored
mtmd: add mtmd_context_params::warmup option (#17652)
* mtmd: add mtmd_context_params::warmup option
* reuse the common_params::warmup
1 parent 00c361f commit ecf74a8

File tree

6 files changed

+12
-2
lines changed

6 files changed

+12
-2
lines changed

tools/mtmd/clip.cpp

Lines changed: 6 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -3526,14 +3526,18 @@ struct clip_init_result clip_init(const char * fname, struct clip_context_params
35263526
ctx_vision = new clip_ctx(ctx_params);
35273527
loader.load_hparams(ctx_vision->model, CLIP_MODALITY_VISION);
35283528
loader.load_tensors(*ctx_vision);
3529-
loader.warmup(*ctx_vision);
3529+
if (ctx_params.warmup) {
3530+
loader.warmup(*ctx_vision);
3531+
}
35303532
}
35313533

35323534
if (loader.has_audio) {
35333535
ctx_audio = new clip_ctx(ctx_params);
35343536
loader.load_hparams(ctx_audio->model, CLIP_MODALITY_AUDIO);
35353537
loader.load_tensors(*ctx_audio);
3536-
loader.warmup(*ctx_audio);
3538+
if (ctx_params.warmup) {
3539+
loader.warmup(*ctx_audio);
3540+
}
35373541
}
35383542

35393543
} catch (const std::exception & e) {

tools/mtmd/clip.h

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -34,6 +34,7 @@ struct clip_context_params {
3434
enum clip_flash_attn_type flash_attn_type;
3535
int image_min_tokens;
3636
int image_max_tokens;
37+
bool warmup;
3738
};
3839

3940
struct clip_init_result {

tools/mtmd/mtmd-cli.cpp

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -136,6 +136,7 @@ struct mtmd_cli_context {
136136
mparams.print_timings = true;
137137
mparams.n_threads = params.cpuparams.n_threads;
138138
mparams.flash_attn_type = params.flash_attn_type;
139+
mparams.warmup = params.warmup;
139140
mparams.image_min_tokens = params.image_min_tokens;
140141
mparams.image_max_tokens = params.image_max_tokens;
141142
ctx_vision.reset(mtmd_init_from_file(clip_path, model, mparams));

tools/mtmd/mtmd.cpp

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -108,6 +108,7 @@ mtmd_context_params mtmd_context_params_default() {
108108
/* image_marker */ MTMD_DEFAULT_IMAGE_MARKER,
109109
/* media_marker */ mtmd_default_marker(),
110110
/* flash_attn_type */ LLAMA_FLASH_ATTN_TYPE_AUTO,
111+
/* warmup */ true,
111112
/* image_min_tokens */ -1,
112113
/* image_max_tokens */ -1,
113114
};
@@ -177,6 +178,7 @@ struct mtmd_context {
177178
/* flash_attn_type */ CLIP_FLASH_ATTN_TYPE_AUTO,
178179
/* image_min_tokens */ ctx_params.image_min_tokens,
179180
/* image_max_tokens */ ctx_params.image_max_tokens,
181+
/* warmup */ ctx_params.warmup,
180182
};
181183

182184
auto res = clip_init(mmproj_fname, ctx_clip_params);

tools/mtmd/mtmd.h

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -82,6 +82,7 @@ struct mtmd_context_params {
8282
const char * image_marker; // deprecated, use media_marker instead
8383
const char * media_marker;
8484
enum llama_flash_attn_type flash_attn_type;
85+
bool warmup; // whether to run a warmup encode pass after initialization
8586

8687
// limit number of image tokens, only for vision models with dynamic resolution
8788
int image_min_tokens; // minimum number of tokens for image input (default: read from metadata)

tools/server/server-context.cpp

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -621,6 +621,7 @@ struct server_context_impl {
621621
mparams.print_timings = false;
622622
mparams.n_threads = params_base.cpuparams.n_threads;
623623
mparams.flash_attn_type = params_base.flash_attn_type;
624+
mparams.warmup = params_base.warmup;
624625
mparams.image_min_tokens = params_base.image_min_tokens;
625626
mparams.image_max_tokens = params_base.image_max_tokens;
626627
mctx = mtmd_init_from_file(mmproj_path.c_str(), model, mparams);

0 commit comments

Comments
 (0)