
Commit a2d0199

ggerganov authored and pwilkin committed
common : update presets (ggml-org#16504)
* presets : add --embd-gemma-default and remove old embedding presets
* presets : add gpt-oss presets
* presets : add vision presets
* cont : remove reasoning overrides [no ci]
* cont : fix batch size for embedding gemma [no ci]
1 parent 4a25717 commit a2d0199

File tree

2 files changed: +163 -135 lines changed

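Every preset in this commit follows the same mechanism: a CLI flag whose handler overwrites a batch of fields on the shared common_params struct before parsing continues. The standalone sketch below illustrates that pattern with simplified stand-in types (my_params and my_arg are hypothetical; the real common_arg and common_params live in the files below and carry many more fields):

// Standalone sketch of the preset pattern used in common/arg.cpp, with
// simplified stand-in types rather than the real llama.cpp definitions.
#include <cstdio>
#include <functional>
#include <string>
#include <vector>

struct my_params {
    std::string hf_repo;
    int         port      = 8080;
    bool        embedding = false;
};

struct my_arg {
    std::string                      flag;
    std::function<void(my_params &)> handler;
};

int main() {
    std::vector<my_arg> opts = {
        {"--embd-gemma-default", [](my_params & p) {
            // a preset handler mutates several fields at once
            p.hf_repo   = "ggml-org/embeddinggemma-300M-qat-q4_0-GGUF";
            p.port      = 8011;
            p.embedding = true;
        }},
    };

    my_params params;
    for (const auto & o : opts) {
        if (o.flag == "--embd-gemma-default") {
            o.handler(params);
        }
    }
    std::printf("repo=%s port=%d embedding=%d\n",
                params.hf_repo.c_str(), params.port, (int) params.embedding);
    return 0;
}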

common/arg.cpp

Lines changed: 162 additions & 134 deletions
@@ -3365,7 +3365,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     add_opt(common_arg(
         {"--chat-template-kwargs"}, "STRING",
         string_format("sets additional params for the json template parser"),
-        [](common_params & params, const std::string & value) {
+        [](common_params & params, const std::string & value) {
             auto parsed = json::parse(value);
             for (const auto & item : parsed.items()) {
                 params.default_template_kwargs[item.key()] = item.value().dump();
@@ -3577,21 +3577,23 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             common_log_set_file(common_log_main(), value.c_str());
         }
     ));
-    add_opt(common_arg({ "--log-colors" }, "[on|off|auto]",
-        "Set colored logging ('on', 'off', or 'auto', default: 'auto')\n"
-        "'auto' enables colors when output is to a terminal",
-        [](common_params &, const std::string & value) {
-            if (is_truthy(value)) {
-                common_log_set_colors(common_log_main(), LOG_COLORS_ENABLED);
-            } else if (is_falsey(value)) {
-                common_log_set_colors(common_log_main(), LOG_COLORS_DISABLED);
-            } else if (is_autoy(value)) {
-                common_log_set_colors(common_log_main(), LOG_COLORS_AUTO);
-            } else {
-                throw std::invalid_argument(
-                    string_format("error: unkown value for --log-colors: '%s'\n", value.c_str()));
-            }
-        }).set_env("LLAMA_LOG_COLORS"));
+    add_opt(common_arg(
+        {"--log-colors"}, "[on|off|auto]",
+        "Set colored logging ('on', 'off', or 'auto', default: 'auto')\n"
+        "'auto' enables colors when output is to a terminal",
+        [](common_params &, const std::string & value) {
+            if (is_truthy(value)) {
+                common_log_set_colors(common_log_main(), LOG_COLORS_ENABLED);
+            } else if (is_falsey(value)) {
+                common_log_set_colors(common_log_main(), LOG_COLORS_DISABLED);
+            } else if (is_autoy(value)) {
+                common_log_set_colors(common_log_main(), LOG_COLORS_AUTO);
+            } else {
+                throw std::invalid_argument(
+                    string_format("error: unkown value for --log-colors: '%s'\n", value.c_str()));
+            }
+        }
+    ).set_env("LLAMA_LOG_COLORS"));
     add_opt(common_arg(
         {"-v", "--verbose", "--log-verbose"},
         "Set verbosity level to infinity (i.e. log all messages, useful for debugging)",
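The rewritten --log-colors handler dispatches on three predicates whose definitions are not part of this diff. The sketch below is a guess at minimal implementations; the real is_truthy/is_falsey/is_autoy elsewhere in common/ may accept more spellings:

// Hypothetical minimal versions of the predicates used by --log-colors;
// illustration only, not the actual llama.cpp helpers.
#include <cstdio>
#include <string>

static bool is_truthy(const std::string & v) { return v == "on"  || v == "true"  || v == "1"; }
static bool is_falsey(const std::string & v) { return v == "off" || v == "false" || v == "0"; }
static bool is_autoy (const std::string & v) { return v == "auto"; }

int main() {
    std::printf("%d %d %d\n", is_truthy("on"), is_falsey("off"), is_autoy("auto")); // 1 1 1
    return 0;
}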
@@ -3857,7 +3859,87 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         }
     ).set_examples({LLAMA_EXAMPLE_TTS}));
 
-    // model-specific
+    add_opt(common_arg(
+        {"--diffusion-steps"}, "N",
+        string_format("number of diffusion steps (default: %d)", params.diffusion.steps),
+        [](common_params & params, int value) { params.diffusion.steps = value; }
+    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+    add_opt(common_arg(
+        {"--diffusion-visual"},
+        string_format("enable visual diffusion mode (show progressive generation) (default: %s)", params.diffusion.visual_mode ? "true" : "false"),
+        [](common_params & params) { params.diffusion.visual_mode = true; }
+    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+    add_opt(common_arg(
+        {"--diffusion-eps"}, "F",
+        string_format("epsilon for timesteps (default: %.6f)", (double) params.diffusion.eps),
+        [](common_params & params, const std::string & value) { params.diffusion.eps = std::stof(value); }
+    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+    add_opt(common_arg(
+        {"--diffusion-algorithm"}, "N",
+        string_format("diffusion algorithm: 0=ORIGIN, 1=ENTROPY_BASED, 2=MARGIN_BASED, 3=RANDOM, 4=LOW_CONFIDENCE (default: %d)", params.diffusion.algorithm),
+        [](common_params & params, int value) { params.diffusion.algorithm = value; }
+    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+    add_opt(common_arg(
+        {"--diffusion-alg-temp"}, "F",
+        string_format("dream algorithm temperature (default: %.3f)", (double) params.diffusion.alg_temp),
+        [](common_params & params, const std::string & value) { params.diffusion.alg_temp = std::stof(value); }
+    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+    add_opt(common_arg(
+        {"--diffusion-block-length"}, "N",
+        string_format("llada block length for generation (default: %d)", params.diffusion.block_length),
+        [](common_params & params, int value) { params.diffusion.block_length = value; }
+    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+    add_opt(common_arg(
+        {"--diffusion-cfg-scale"}, "F",
+        string_format("llada classifier-free guidance scale (default: %.3f)", (double) params.diffusion.cfg_scale),
+        [](common_params & params, const std::string & value) { params.diffusion.cfg_scale = std::stof(value); }
+    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+    add_opt(common_arg(
+        {"--diffusion-add-gumbel-noise"}, "F",
+        string_format("add gumbel noise to the logits if temp > 0.0 (default: %s)", params.diffusion.add_gumbel_noise ? "true" : "false"),
+        [](common_params & params, const std::string & value) { params.diffusion.add_gumbel_noise = std::stof(value); }
+    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+    add_opt(common_arg(
+        { "-lr", "--learning-rate" }, "ALPHA",
+        string_format("adamw or sgd optimizer alpha (default: %.2g); note: sgd alpha recommended ~10x (no momentum)", (double) params.lr.lr0),
+        [](common_params & params, const std::string & value) { params.lr.lr0 = std::stof(value); }
+    ).set_examples({ LLAMA_EXAMPLE_FINETUNE }));
+    add_opt(common_arg({ "-lr-min", "--learning-rate-min" }, "ALPHA",
+        string_format("(if >0) final learning rate after decay (if -decay-epochs is set, default=%.2g)",
+            (double) params.lr.lr_min),
+        [](common_params & params, const std::string & value) { params.lr.lr_min = std::stof(value); }
+    ).set_examples({ LLAMA_EXAMPLE_FINETUNE }));
+    add_opt(common_arg(
+        {"-decay-epochs", "--learning-rate-decay-epochs"}, "ALPHA",
+        string_format("(if >0) decay learning rate to -lr-min after this many epochs (exponential decay, default=%.2g)", (double) params.lr.decay_epochs),
+        [](common_params & params, const std::string & value) { params.lr.decay_epochs = std::stof(value); }
+    ).set_examples({ LLAMA_EXAMPLE_FINETUNE }));
+    add_opt(common_arg(
+        {"-wd", "--weight-decay"}, "WD",
+        string_format("adamw or sgd optimizer weight decay (0 is off; recommend very small e.g. 1e-9) (default: %.2g).", (double) params.lr.wd),
+        [](common_params & params, const std::string & value) { params.lr.wd = std::stof(value); }
+    ).set_examples({ LLAMA_EXAMPLE_FINETUNE }));
+    add_opt(common_arg(
+        {"-val-split", "--val-split"}, "FRACTION",
+        string_format("fraction of data to use as validation set for training (default: %.2g).", (double) params.val_split),
+        [](common_params & params, const std::string & value) { params.val_split = std::stof(value); }
+    ).set_examples({ LLAMA_EXAMPLE_FINETUNE }));
+    add_opt(common_arg(
+        {"-epochs", "--epochs"}, "N",
+        string_format("optimizer max # of epochs (default: %d)", params.lr.epochs),
+        [](common_params & params, int epochs) { params.lr.epochs = epochs; }
+    ).set_examples({ LLAMA_EXAMPLE_FINETUNE }));
+    add_opt(common_arg(
+        {"-opt", "--optimizer"}, "sgd|adamw", "adamw or sgd",
+        [](common_params & params, const std::string & name) {
+            params.optimizer = common_opt_get_optimizer(name.c_str());
+            if (params.optimizer == GGML_OPT_OPTIMIZER_TYPE_COUNT) {
+                throw std::invalid_argument("invalid --optimizer, valid options: adamw, sgd");
+            }
+        }
+    ).set_examples({ LLAMA_EXAMPLE_FINETUNE }));
+
+    // presets
     add_opt(common_arg(
         {"--tts-oute-default"},
         string_format("use default OuteTTS models (note: can download weights from the internet)"),
@@ -3870,39 +3952,16 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_examples({LLAMA_EXAMPLE_TTS}));
 
     add_opt(common_arg(
-        {"--embd-bge-small-en-default"},
-        string_format("use default bge-small-en-v1.5 model (note: can download weights from the internet)"),
-        [](common_params & params) {
-            params.model.hf_repo = "ggml-org/bge-small-en-v1.5-Q8_0-GGUF";
-            params.model.hf_file = "bge-small-en-v1.5-q8_0.gguf";
-            params.embd_normalize = 2;
-            params.n_ctx = 512;
-            params.verbose_prompt = true;
-            params.embedding = true;
-        }
-    ).set_examples({LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_SERVER}));
-
-    add_opt(common_arg(
-        {"--embd-e5-small-en-default"},
-        string_format("use default e5-small-v2 model (note: can download weights from the internet)"),
-        [](common_params & params) {
-            params.model.hf_repo = "ggml-org/e5-small-v2-Q8_0-GGUF";
-            params.model.hf_file = "e5-small-v2-q8_0.gguf";
-            params.embd_normalize = 2;
-            params.n_ctx = 512;
-            params.verbose_prompt = true;
-            params.embedding = true;
-        }
-    ).set_examples({LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_SERVER}));
-
-    add_opt(common_arg(
-        {"--embd-gte-small-default"},
-        string_format("use default gte-small model (note: can download weights from the internet)"),
+        {"--embd-gemma-default"},
+        string_format("use default EmbeddingGemma model (note: can download weights from the internet)"),
         [](common_params & params) {
-            params.model.hf_repo = "ggml-org/gte-small-Q8_0-GGUF";
-            params.model.hf_file = "gte-small-q8_0.gguf";
-            params.embd_normalize = 2;
-            params.n_ctx = 512;
+            params.model.hf_repo = "ggml-org/embeddinggemma-300M-qat-q4_0-GGUF";
+            params.model.hf_file = "embeddinggemma-300M-qat-Q4_0.gguf";
+            params.port = 8011;
+            params.n_ubatch = 2048;
+            params.n_batch = 2048;
+            params.n_parallel = 32;
+            params.n_ctx = 2048*params.n_parallel;
             params.verbose_prompt = true;
             params.embedding = true;
         }
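Note the context arithmetic in the new preset: n_ctx is derived from n_parallel, and llama-server splits the total context evenly across parallel slots, so each of the 32 slots gets a 2048-token window matching the 2048-token batch sizes set just above. A tiny check of that arithmetic:

// Context budget of --embd-gemma-default: 32 slots, 2048 tokens each.
#include <cstdio>

int main() {
    const int n_parallel = 32;
    const int n_ctx      = 2048 * n_parallel; // as set by the preset
    std::printf("total ctx = %d, per-slot ctx = %d\n", n_ctx, n_ctx / n_parallel);
    return 0; // prints: total ctx = 65536, per-slot ctx = 2048
}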
@@ -3997,96 +4056,65 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_examples({LLAMA_EXAMPLE_SERVER}));
 
     add_opt(common_arg(
-        { "--diffusion-steps" }, "N",
-        string_format("number of diffusion steps (default: %d)", params.diffusion.steps),
-        [](common_params & params, int value) { params.diffusion.steps = value; }
-    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
-    add_opt(common_arg(
-        { "--diffusion-visual" },
-        string_format("enable visual diffusion mode (show progressive generation) (default: %s)",
-            params.diffusion.visual_mode ? "true" : "false"),
-        [](common_params & params) { params.diffusion.visual_mode = true; }
-    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+        {"--gpt-oss-20b-default"},
+        string_format("use gpt-oss-20b (note: can download weights from the internet)"),
+        [](common_params & params) {
+            params.model.hf_repo = "ggml-org/gpt-oss-20b-GGUF";
+            params.model.hf_file = "gpt-oss-20b-mxfp4.gguf";
+            params.port = 8013;
+            params.n_ubatch = 2048;
+            params.n_batch = 32768;
+            params.n_parallel = 2;
+            params.n_ctx = 131072*params.n_parallel;
+            params.sampling.temp = 1.0f;
+            params.sampling.top_p = 1.0f;
+            params.sampling.top_k = 0;
+            params.sampling.min_p = 0.01f;
+            params.use_jinja = true;
+            //params.default_template_kwargs["reasoning_effort"] = "\"high\"";
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}));
 
     add_opt(common_arg(
-        { "--diffusion-eps" }, "F",
-        string_format("epsilon for timesteps (default: %.6f)", (double) params.diffusion.eps),
-        [](common_params & params, const std::string & value) { params.diffusion.eps = std::stof(value); }
-    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
-    add_opt(common_arg(
-        { "--diffusion-algorithm" }, "N",
-        string_format("diffusion algorithm: 0=ORIGIN, 1=ENTROPY_BASED, 2=MARGIN_BASED, 3=RANDOM, 4=LOW_CONFIDENCE (default: %d)",
-            params.diffusion.algorithm),
-        [](common_params & params, int value) { params.diffusion.algorithm = value; }
-    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
-    add_opt(common_arg(
-        { "--diffusion-alg-temp" }, "F",
-        string_format("dream algorithm temperature (default: %.3f)", (double) params.diffusion.alg_temp),
-        [](common_params & params, const std::string & value) { params.diffusion.alg_temp = std::stof(value); }
-    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+        {"--gpt-oss-120b-default"},
+        string_format("use gpt-oss-120b (note: can download weights from the internet)"),
+        [](common_params & params) {
+            params.model.hf_repo = "ggml-org/gpt-oss-120b-GGUF";
+            params.port = 8013;
+            params.n_ubatch = 2048;
+            params.n_batch = 32768;
+            params.n_parallel = 2;
+            params.n_ctx = 131072*params.n_parallel;
+            params.sampling.temp = 1.0f;
+            params.sampling.top_p = 1.0f;
+            params.sampling.top_k = 0;
+            params.sampling.min_p = 0.01f;
+            params.use_jinja = true;
+            //params.default_template_kwargs["reasoning_effort"] = "\"high\"";
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}));
 
     add_opt(common_arg(
-        { "--diffusion-block-length" }, "N",
-        string_format("llada block length for generation (default: %d)", params.diffusion.block_length),
-        [](common_params & params, int value) { params.diffusion.block_length = value; }
-    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
-    add_opt(common_arg(
-        { "--diffusion-cfg-scale" }, "F",
-        string_format("llada classifier-free guidance scale (default: %.3f)", (double) params.diffusion.cfg_scale),
-        [](common_params & params, const std::string & value) { params.diffusion.cfg_scale = std::stof(value); }
-    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
-    add_opt(common_arg(
-        { "--diffusion-add-gumbel-noise" }, "F",
-        string_format("add gumbel noise to the logits if temp > 0.0 (default: %s)", params.diffusion.add_gumbel_noise ? "true" : "false"),
-        [](common_params & params, const std::string & value) { params.diffusion.add_gumbel_noise = std::stof(value); }
-    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
-
+        {"--vision-gemma-4b-default"},
+        string_format("use Gemma 3 4B QAT (note: can download weights from the internet)"),
+        [](common_params & params) {
+            params.model.hf_repo = "ggml-org/gemma-3-4b-it-qat-GGUF";
+            params.port = 8014;
+            params.n_ctx = 0;
+            params.use_jinja = true;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}));
 
-    add_opt(
-        common_arg({ "-lr", "--learning-rate" }, "ALPHA",
-            string_format(
-                "adamw or sgd optimizer alpha (default: %.2g); note: sgd alpha recommended ~10x (no momentum)",
-                (double) params.lr.lr0),
-            [](common_params & params, const std::string & value) { params.lr.lr0 = std::stof(value); })
-        .set_examples({ LLAMA_EXAMPLE_FINETUNE }));
-    add_opt(
-        common_arg({ "-lr-min", "--learning-rate-min" }, "ALPHA",
-            string_format(
-                "(if >0) final learning rate after decay (if -decay-epochs is set, default=%.2g)",
-                (double) params.lr.lr_min),
-            [](common_params & params, const std::string & value) { params.lr.lr_min = std::stof(value); })
-        .set_examples({ LLAMA_EXAMPLE_FINETUNE }));
-    add_opt(
-        common_arg({ "-decay-epochs", "--learning-rate-decay-epochs" }, "ALPHA",
-            string_format(
-                "(if >0) decay learning rate to -lr-min after this many epochs (exponential decay, default=%.2g)",
-                (double) params.lr.decay_epochs),
-            [](common_params & params, const std::string & value) { params.lr.decay_epochs = std::stof(value); })
-        .set_examples({ LLAMA_EXAMPLE_FINETUNE }));
-    add_opt(common_arg(
-        { "-wd", "--weight-decay" }, "WD",
-        string_format(
-            "adamw or sgd optimizer weight decay (0 is off; recommend very small e.g. 1e-9) (default: %.2g).",
-            (double) params.lr.wd),
-        [](common_params & params, const std::string & value) { params.lr.wd = std::stof(value); })
-        .set_examples({ LLAMA_EXAMPLE_FINETUNE }));
-    add_opt(common_arg({ "-val-split", "--val-split" }, "FRACTION",
-        string_format("fraction of data to use as validation set for training (default: %.2g).",
-            (double) params.val_split),
-        [](common_params & params, const std::string & value) { params.val_split = std::stof(value); })
-        .set_examples({ LLAMA_EXAMPLE_FINETUNE }));
-    add_opt(common_arg({ "-epochs", "--epochs" }, "N",
-        string_format("optimizer max # of epochs (default: %d)", params.lr.epochs),
-        [](common_params & params, int epochs) { params.lr.epochs = epochs; })
-        .set_examples({ LLAMA_EXAMPLE_FINETUNE }));
-    add_opt(common_arg({ "-opt", "--optimizer" }, "sgd|adamw", "adamw or sgd",
-        [](common_params & params, const std::string & name) {
-            params.optimizer = common_opt_get_optimizer(name.c_str());
-            if (params.optimizer == GGML_OPT_OPTIMIZER_TYPE_COUNT) {
-                throw std::invalid_argument("invalid --optimizer, valid options: adamw, sgd");
-            }
-        })
-        .set_examples({ LLAMA_EXAMPLE_FINETUNE }));
+    add_opt(common_arg(
+        {"--vision-gemma-12b-default"},
+        string_format("use Gemma 3 12B QAT (note: can download weights from the internet)"),
+        [](common_params & params) {
+            params.model.hf_repo = "ggml-org/gemma-3-12b-it-qat-GGUF";
+            params.port = 8014;
+            params.n_ctx = 0;
+            params.use_jinja = true;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}));
 
     return ctx_arg;
 }
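The gpt-oss presets neutralize every sampler except min-p (temp = 1.0, top_p = 1.0 and top_k = 0 are all pass-through values), leaving min_p = 0.01 as the only filter. Below is a minimal sketch of min-p filtering as commonly defined, dropping tokens whose probability falls below min_p times the top probability; it mirrors the general technique, not llama.cpp's actual sampler code:

// Minimal min-p filter: zero out tokens below min_p * max_prob, then
// renormalize the survivors. General technique, not llama.cpp's sampler.
#include <algorithm>
#include <cstdio>
#include <vector>

static std::vector<float> min_p_filter(std::vector<float> probs, float min_p) {
    const float max_p  = *std::max_element(probs.begin(), probs.end());
    const float cutoff = min_p * max_p;
    float kept = 0.0f;
    for (float & p : probs) {
        if (p < cutoff) {
            p = 0.0f;
        }
        kept += p;
    }
    for (float & p : probs) {
        p /= kept; // renormalize what survived
    }
    return probs;
}

int main() {
    const auto out = min_p_filter({0.70f, 0.20f, 0.06f, 0.04f}, 0.1f);
    for (float p : out) {
        std::printf("%.3f ", p); // 0.778 0.222 0.000 0.000
    }
    std::printf("\n");
    return 0;
}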

common/common.h

Lines changed: 1 addition & 1 deletion
@@ -428,7 +428,7 @@ struct common_params {
     int32_t n_threads_http = -1; // number of threads to process HTTP requests (TODO: support threadpool)
     int32_t n_cache_reuse = 0; // min chunk size to reuse from the cache via KV shifting
     int32_t n_ctx_checkpoints = 8; // max number of context checkpoints per slot
-    int32_t cache_ram_mib = 8192; // 0 = no limit, 1 = 1 MiB, etc.
+    int32_t cache_ram_mib = 8192; // -1 = no limit, 0 - disable, 1 = 1 MiB, etc.
 
     std::string hostname = "127.0.0.1";
     std::string public_path = ""; // NOLINT
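The comment change records new semantics for cache_ram_mib: -1 now means unlimited and 0 disables the cache, where 0 previously meant no limit. A hedged sketch of how a consumer of this field might turn it into a byte budget (cache_budget_bytes is an assumed helper, not code from this commit):

// Hypothetical reading of cache_ram_mib under the new convention:
// -1 = no limit, 0 = cache disabled, N > 0 = budget of N MiB.
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <limits>

static size_t cache_budget_bytes(int32_t cache_ram_mib) {
    if (cache_ram_mib < 0) {
        return std::numeric_limits<size_t>::max(); // unlimited
    }
    return (size_t) cache_ram_mib * 1024u * 1024u; // 0 disables the cache
}

int main() {
    std::printf("%zu bytes\n", cache_budget_bytes(8192)); // default: 8 GiB
    return 0;
}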
