Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 6 additions & 2 deletions tools/mtmd/clip.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3526,14 +3526,18 @@ struct clip_init_result clip_init(const char * fname, struct clip_context_params
ctx_vision = new clip_ctx(ctx_params);
loader.load_hparams(ctx_vision->model, CLIP_MODALITY_VISION);
loader.load_tensors(*ctx_vision);
loader.warmup(*ctx_vision);
if (ctx_params.warmup) {
loader.warmup(*ctx_vision);
}
}

if (loader.has_audio) {
ctx_audio = new clip_ctx(ctx_params);
loader.load_hparams(ctx_audio->model, CLIP_MODALITY_AUDIO);
loader.load_tensors(*ctx_audio);
loader.warmup(*ctx_audio);
if (ctx_params.warmup) {
loader.warmup(*ctx_audio);
}
}

} catch (const std::exception & e) {
Expand Down
1 change: 1 addition & 0 deletions tools/mtmd/clip.h
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ struct clip_context_params {
enum clip_flash_attn_type flash_attn_type;
int image_min_tokens;
int image_max_tokens;
bool warmup;
};

struct clip_init_result {
Expand Down
1 change: 1 addition & 0 deletions tools/mtmd/mtmd-cli.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,7 @@ struct mtmd_cli_context {
mparams.print_timings = true;
mparams.n_threads = params.cpuparams.n_threads;
mparams.flash_attn_type = params.flash_attn_type;
mparams.warmup = params.warmup;
mparams.image_min_tokens = params.image_min_tokens;
mparams.image_max_tokens = params.image_max_tokens;
ctx_vision.reset(mtmd_init_from_file(clip_path, model, mparams));
Expand Down
2 changes: 2 additions & 0 deletions tools/mtmd/mtmd.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,7 @@ mtmd_context_params mtmd_context_params_default() {
/* image_marker */ MTMD_DEFAULT_IMAGE_MARKER,
/* media_marker */ mtmd_default_marker(),
/* flash_attn_type */ LLAMA_FLASH_ATTN_TYPE_AUTO,
/* warmup */ true,
/* image_min_tokens */ -1,
/* image_max_tokens */ -1,
};
Expand Down Expand Up @@ -177,6 +178,7 @@ struct mtmd_context {
/* flash_attn_type */ CLIP_FLASH_ATTN_TYPE_AUTO,
/* image_min_tokens */ ctx_params.image_min_tokens,
/* image_max_tokens */ ctx_params.image_max_tokens,
/* warmup */ ctx_params.warmup,
};

auto res = clip_init(mmproj_fname, ctx_clip_params);
Expand Down
1 change: 1 addition & 0 deletions tools/mtmd/mtmd.h
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,7 @@ struct mtmd_context_params {
const char * image_marker; // deprecated, use media_marker instead
const char * media_marker;
enum llama_flash_attn_type flash_attn_type;
bool warmup; // whether to run a warmup encode pass after initialization

// limit number of image tokens, only for vision models with dynamic resolution
int image_min_tokens; // minimum number of tokens for image input (default: read from metadata)
Expand Down
1 change: 1 addition & 0 deletions tools/server/server-context.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -621,6 +621,7 @@ struct server_context_impl {
mparams.print_timings = false;
mparams.n_threads = params_base.cpuparams.n_threads;
mparams.flash_attn_type = params_base.flash_attn_type;
mparams.warmup = params_base.warmup;
mparams.image_min_tokens = params_base.image_min_tokens;
mparams.image_max_tokens = params_base.image_max_tokens;
mctx = mtmd_init_from_file(mmproj_path.c_str(), model, mparams);
Expand Down
Loading