
Commit 797990c

mtmd : add ultravox audio input (#13623)

* convert ok, load ok
* warmup ok
* test
* still does not work?
* fix padding
* temporary give up
* fix merge conflict
* build_ultravox()
* rm test
* fix merge conflict
* add necessary mtmd APIs
* first working version (only 4s of audio)
* will this monster compile?
* fix compile
* please compile
* fPIC
* fix windows
* various fixes
* clean up audio_helpers
* fix conversion
* add some debug stuff
* long audio input ok
* adapt the api
* add --audio arg
* final touch UX
* add miniaudio to readme
* fix typo
* refactor kv metadata
* mtmd_default_marker()
1 parent ab86335 · commit 797990c

21 files changed (+95402 additions, -260 deletions)

.editorconfig

Lines changed: 4 additions & 0 deletions

```diff
@@ -48,3 +48,7 @@ end_of_line = unset
 charset = unset
 trim_trailing_whitespace = unset
 insert_final_newline = unset
+
+[tools/mtmd/miniaudio.h]
+trim_trailing_whitespace = unset
+insert_final_newline = unset
```

README.md

Lines changed: 1 addition & 0 deletions

```diff
@@ -580,3 +580,4 @@ $ echo "source ~/.llama-completion.bash" >> ~/.bashrc
 - [minja](https://github.com/google/minja) - Minimal Jinja parser in C++, used by various tools/examples - MIT License
 - [linenoise.cpp](./tools/run/linenoise.cpp/linenoise.cpp) - C++ library that provides readline-like line editing capabilities, used by `llama-run` - BSD 2-Clause License
 - [curl](https://curl.se/) - Client-side URL transfer library, used by various tools/examples - [CURL License](https://curl.se/docs/copyright.html)
+- [miniaudio.h](https://github.com/mackron/miniaudio) - Single-header audio format decoder, used by multimodal subsystem - Public domain
```

common/arg.cpp

Lines changed: 5 additions & 5 deletions

```diff
@@ -39,7 +39,7 @@
 using json = nlohmann::ordered_json;
 
 std::initializer_list<enum llama_example> mmproj_examples = {
-    LLAMA_EXAMPLE_LLAVA,
+    LLAMA_EXAMPLE_MTMD,
     LLAMA_EXAMPLE_SERVER,
 };
 
@@ -2233,12 +2233,12 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         }
     ).set_examples(mmproj_examples).set_env("LLAMA_ARG_NO_MMPROJ_OFFLOAD"));
     add_opt(common_arg(
-        {"--image"}, "FILE",
-        "path to an image file. use with multimodal models. Specify multiple times for batching",
+        {"--image", "--audio"}, "FILE",
+        "path to an image or audio file. use with multimodal models, can be repeated if you have multiple files\n",
         [](common_params & params, const std::string & value) {
             params.image.emplace_back(value);
         }
-    ).set_examples({LLAMA_EXAMPLE_LLAVA}));
+    ).set_examples({LLAMA_EXAMPLE_MTMD}));
     if (llama_supports_rpc()) {
         add_opt(common_arg(
             {"--rpc"}, "SERVERS",
@@ -2868,7 +2868,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params, const std::string & value) {
             params.chat_template = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_LLAVA}).set_env("LLAMA_ARG_CHAT_TEMPLATE"));
+    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MTMD}).set_env("LLAMA_ARG_CHAT_TEMPLATE"));
     add_opt(common_arg(
         {"--chat-template-file"}, "JINJA_TEMPLATE_FILE",
         string_format(
```

common/common.h

Lines changed: 1 addition & 1 deletion

```diff
@@ -76,7 +76,7 @@ enum llama_example {
     LLAMA_EXAMPLE_SERVER,
     LLAMA_EXAMPLE_CVECTOR_GENERATOR,
     LLAMA_EXAMPLE_EXPORT_LORA,
-    LLAMA_EXAMPLE_LLAVA,
+    LLAMA_EXAMPLE_MTMD,
     LLAMA_EXAMPLE_LOOKUP,
     LLAMA_EXAMPLE_PARALLEL,
     LLAMA_EXAMPLE_TTS,
```

convert_hf_to_gguf.py

Lines changed: 118 additions & 42 deletions
Large diffs are not rendered by default.

docs/multimodal.md

Lines changed: 13 additions & 1 deletion

````diff
@@ -4,7 +4,9 @@ llama.cpp supports multimodal input via `libmtmd`. Currently, there are 2 tools
 - [llama-mtmd-cli](../tools/mtmd/README.md)
 - [llama-server](../tools/server/README.md) via OpenAI-compatible `/chat/completions` API
 
-To enable it, can use use one of the 2 methods below:
+Currently, we support **image** and **audio** input. Audio is highly experimental and may have reduced quality.
+
+To enable it, you can use one of the 2 methods below:
 
 - Use `-hf` option with a supported model (see a list of pre-quantized model below)
 - To load a model using `-hf` while disabling multimodal, use `--no-mmproj`
@@ -37,6 +39,8 @@ Replaces the `(tool_name)` with the name of binary you want to use. For example,
 
 NOTE: some models may require large context window, for example: `-c 8192`
 
+**Vision models**:
+
 ```sh
 # Gemma 3
 (tool_name) -hf ggml-org/gemma-3-4b-it-GGUF
@@ -78,3 +82,11 @@ NOTE: some models may require large context window, for example: `-c 8192`
 # Llama 4 Scout
 (tool_name) -hf ggml-org/Llama-4-Scout-17B-16E-Instruct-GGUF
 ```
+
+**Audio models**:
+
+```sh
+# Ultravox 0.5
+(tool_name) -hf ggml-org/ultravox-v0_5-llama-3_2-1b-GGUF
+(tool_name) -hf ggml-org/ultravox-v0_5-llama-3_1-8b-GGUF
+```
````
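
With the new `--audio` flag, `llama-mtmd-cli` accepts audio files alongside images (decoding is handled by the bundled `miniaudio.h`). A minimal sketch of driving it from Python; it assumes the binary is built and on `PATH`, and `sample.wav` is a placeholder input file:

```python
# Minimal sketch: invoking llama-mtmd-cli with the new --audio flag.
# Assumes the binary is on PATH; "sample.wav" is a placeholder input.
import subprocess

subprocess.run([
    "llama-mtmd-cli",
    "-hf", "ggml-org/ultravox-v0_5-llama-3_2-1b-GGUF",
    "--audio", "sample.wav",            # decoded via the bundled miniaudio.h
    "-p", "Transcribe this audio clip.",
], check=True)
```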

gguf-py/gguf/constants.py

Lines changed: 76 additions & 7 deletions

```diff
@@ -219,10 +219,13 @@ class Adapter:
         TYPE       = "adapter.type"
         LORA_ALPHA = "adapter.lora.alpha"
 
-    class ClipVision:
+    class Clip:
         PROJECTOR_TYPE      = "clip.projector_type"
         HAS_VISION_ENCODER  = "clip.has_vision_encoder"
+        HAS_AUDIO_ENCODER   = "clip.has_audio_encoder"
         HAS_LLAVA_PROJECTOR = "clip.has_llava_projector"
+
+    class ClipVision:
         IMAGE_SIZE       = "clip.vision.image_size"
         PATCH_SIZE       = "clip.vision.patch_size"
         EMBEDDING_LENGTH = "clip.vision.embedding_length"
@@ -243,19 +246,33 @@ class Attention:
         class Projector:
             SCALE_FACTOR = "clip.vision.projector.scale_factor"
 
+    class ClipAudio:
+        NUM_MEL_BINS        = "clip.audio.num_mel_bins"
+        EMBEDDING_LENGTH    = "clip.audio.embedding_length"
+        FEED_FORWARD_LENGTH = "clip.audio.feed_forward_length"
+        PROJECTION_DIM      = "clip.audio.projection_dim"
+        BLOCK_COUNT         = "clip.audio.block_count"
+
+        class Attention:
+            HEAD_COUNT    = "clip.audio.attention.head_count"
+            LAYERNORM_EPS = "clip.audio.attention.layer_norm_epsilon"
+
+        class Projector:
+            STACK_FACTOR = "clip.audio.projector.stack_factor"
+
 #
 # recommended mapping of model tensor names for storage in gguf
 #
 
 
 class GGUFType:
-    MODEL       = "model"
-    ADAPTER     = "adapter"
-    CLIP_VISION = "clip-vision"
+    MODEL   = "model"
+    ADAPTER = "adapter"
+    MMPROJ  = "mmproj" # dummy, unused for now
 
 
 class MODEL_ARCH(IntEnum):
-    CLIP_VISION = auto() # dummy arch for clip.cpp
+    MMPROJ = auto() # dummy arch for clip.cpp
     LLAMA  = auto()
     LLAMA4 = auto()
     DECI   = auto()
@@ -514,10 +531,27 @@ class MODEL_TENSOR(IntEnum):
     V_RESMPL_QUERY        = auto() # minicpmv
     V_TOK_EMBD_IMG_BREAK  = auto() # pixtral
     V_MM_PATCH_MERGER     = auto() # mistral small 3.1
+    # audio (mtmd)
+    A_ENC_EMBD_POS        = auto()
+    A_ENC_CONV1D          = auto()
+    A_PRE_NORM            = auto()
+    A_POST_NORM           = auto()
+    A_ENC_ATTN_Q          = auto()
+    A_ENC_ATTN_K          = auto()
+    A_ENC_ATTN_V          = auto()
+    A_ENC_INPUT_NORM      = auto()
+    A_ENC_OUTPUT          = auto()
+    A_ENC_OUTPUT_NORM     = auto()
+    A_ENC_FFN_UP          = auto()
+    A_ENC_FFN_GATE        = auto()
+    A_ENC_FFN_DOWN        = auto()
+    A_MMPROJ              = auto()
+    A_MM_NORM_PRE         = auto()
+    A_MM_NORM_MID         = auto()
 
 
 MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
-    MODEL_ARCH.CLIP_VISION: "clip", # dummy arch for clip.cpp
+    MODEL_ARCH.MMPROJ:     "clip", # dummy arch for clip.cpp
     MODEL_ARCH.LLAMA:      "llama",
     MODEL_ARCH.LLAMA4:     "llama4",
     MODEL_ARCH.DECI:       "deci",
@@ -776,10 +810,27 @@ class MODEL_TENSOR(IntEnum):
     MODEL_TENSOR.V_RESMPL_QUERY:        "resampler.query",
     MODEL_TENSOR.V_TOK_EMBD_IMG_BREAK:  "v.token_embd.img_break", # pixtral
     MODEL_TENSOR.V_MM_PATCH_MERGER:     "mm.patch_merger", # mistral small 3.1
+    # audio (mtmd)
+    MODEL_TENSOR.A_ENC_EMBD_POS:        "a.position_embd",
+    MODEL_TENSOR.A_ENC_CONV1D:          "a.conv1d.{bid}",
+    MODEL_TENSOR.A_PRE_NORM:            "a.pre_ln",
+    MODEL_TENSOR.A_POST_NORM:           "a.post_ln",
+    MODEL_TENSOR.A_ENC_ATTN_Q:          "a.blk.{bid}.attn_q",
+    MODEL_TENSOR.A_ENC_ATTN_K:          "a.blk.{bid}.attn_k",
+    MODEL_TENSOR.A_ENC_ATTN_V:          "a.blk.{bid}.attn_v",
+    MODEL_TENSOR.A_ENC_INPUT_NORM:      "a.blk.{bid}.ln1",
+    MODEL_TENSOR.A_ENC_OUTPUT:          "a.blk.{bid}.attn_out",
+    MODEL_TENSOR.A_ENC_OUTPUT_NORM:     "a.blk.{bid}.ln2",
+    MODEL_TENSOR.A_ENC_FFN_UP:          "a.blk.{bid}.ffn_up",
+    MODEL_TENSOR.A_ENC_FFN_GATE:        "a.blk.{bid}.ffn_gate",
+    MODEL_TENSOR.A_ENC_FFN_DOWN:        "a.blk.{bid}.ffn_down",
+    MODEL_TENSOR.A_MMPROJ:              "mm.a.mlp.{bid}",
+    MODEL_TENSOR.A_MM_NORM_PRE:         "mm.a.norm_pre",
+    MODEL_TENSOR.A_MM_NORM_MID:         "mm.a.norm_mid",
 }
 
 MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
-    MODEL_ARCH.CLIP_VISION: [
+    MODEL_ARCH.MMPROJ: [
         MODEL_TENSOR.V_MMPROJ,
         MODEL_TENSOR.V_MMPROJ_FC,
         MODEL_TENSOR.V_MMPROJ_MLP,
@@ -819,6 +870,23 @@ class MODEL_TENSOR(IntEnum):
         MODEL_TENSOR.V_RESMPL_QUERY,
         MODEL_TENSOR.V_TOK_EMBD_IMG_BREAK,
         MODEL_TENSOR.V_MM_PATCH_MERGER,
+        # audio
+        MODEL_TENSOR.A_ENC_EMBD_POS,
+        MODEL_TENSOR.A_ENC_CONV1D,
+        MODEL_TENSOR.A_PRE_NORM,
+        MODEL_TENSOR.A_POST_NORM,
+        MODEL_TENSOR.A_ENC_ATTN_Q,
+        MODEL_TENSOR.A_ENC_ATTN_K,
+        MODEL_TENSOR.A_ENC_ATTN_V,
+        MODEL_TENSOR.A_ENC_INPUT_NORM,
+        MODEL_TENSOR.A_ENC_OUTPUT,
+        MODEL_TENSOR.A_ENC_OUTPUT_NORM,
+        MODEL_TENSOR.A_ENC_FFN_UP,
+        MODEL_TENSOR.A_ENC_FFN_GATE,
+        MODEL_TENSOR.A_ENC_FFN_DOWN,
+        MODEL_TENSOR.A_MMPROJ,
+        MODEL_TENSOR.A_MM_NORM_PRE,
+        MODEL_TENSOR.A_MM_NORM_MID,
     ],
     MODEL_ARCH.LLAMA: [
         MODEL_TENSOR.TOKEN_EMBD,
@@ -2186,6 +2254,7 @@ class VisionProjectorType:
     LLAMA4   = "llama4"
    QWEN2VL  = "qwen2vl_merger"
     QWEN25VL = "qwen2.5vl_merger"
+    ULTRAVOX = "ultravox"
     INTERNVL = "internvl"
 
 
```
gguf-py/gguf/gguf_writer.py

Lines changed: 35 additions & 6 deletions

```diff
@@ -936,12 +936,18 @@ def add_eom_token_id(self, id: int) -> None:
 
     # for vision models
 
+    def add_clip_has_vision_encoder(self, value: bool) -> None:
+        self.add_bool(Keys.Clip.HAS_VISION_ENCODER, value)
+
+    def add_clip_has_audio_encoder(self, value: bool) -> None:
+        self.add_bool(Keys.Clip.HAS_AUDIO_ENCODER, value)
+
+    def add_clip_projector_type(self, value: str) -> None:
+        self.add_string(Keys.Clip.PROJECTOR_TYPE, value)
+
     def add_vision_projection_dim(self, value: int) -> None:
         self.add_uint32(Keys.ClipVision.PROJECTION_DIM, value)
 
-    def add_vision_has_vision_encoder(self, value: bool) -> None:
-        self.add_bool(Keys.ClipVision.HAS_VISION_ENCODER, value)
-
     def add_vision_patch_size(self, value: int) -> None:
         self.add_uint32(Keys.ClipVision.PATCH_SIZE, value)
 
@@ -957,9 +963,6 @@ def add_vision_block_count(self, value: int) -> None:
     def add_vision_head_count(self, value: int) -> None:
         self.add_uint32(Keys.ClipVision.Attention.HEAD_COUNT, value)
 
-    def add_vision_projector_type(self, value: str) -> None:
-        self.add_string(Keys.ClipVision.PROJECTOR_TYPE, value)
-
     def add_vision_attention_layernorm_eps(self, value: float) -> None:
         self.add_float32(Keys.ClipVision.Attention.LAYERNORM_EPS, value)
 
@@ -987,6 +990,32 @@ def add_vision_projector_scale_factor(self, value: int) -> None:
     def add_vision_n_wa_pattern(self, value: int) -> None:
         self.add_uint32(Keys.ClipVision.N_WA_PATTERN, value)
 
+    # audio models
+
+    def add_audio_projection_dim(self, value: int) -> None:
+        self.add_uint32(Keys.ClipAudio.PROJECTION_DIM, value)
+
+    def add_audio_embedding_length(self, value: int) -> None:
+        self.add_uint32(Keys.ClipAudio.EMBEDDING_LENGTH, value)
+
+    def add_audio_feed_forward_length(self, value: int) -> None:
+        self.add_uint32(Keys.ClipAudio.FEED_FORWARD_LENGTH, value)
+
+    def add_audio_block_count(self, value: int) -> None:
+        self.add_uint32(Keys.ClipAudio.BLOCK_COUNT, value)
+
+    def add_audio_head_count(self, value: int) -> None:
+        self.add_uint32(Keys.ClipAudio.Attention.HEAD_COUNT, value)
+
+    def add_audio_attention_layernorm_eps(self, value: float) -> None:
+        self.add_float32(Keys.ClipAudio.Attention.LAYERNORM_EPS, value)
+
+    def add_audio_num_mel_bins(self, value: int) -> None:
+        self.add_uint32(Keys.ClipAudio.NUM_MEL_BINS, value)
+
+    def add_audio_stack_factor(self, value: int) -> None:
+        self.add_uint32(Keys.ClipAudio.Projector.STACK_FACTOR, value)
+
     def _pack(self, fmt: str, value: Any, skip_pack_prefix: bool = False) -> bytes:
         pack_prefix = ''
         if not skip_pack_prefix:
```
gguf-py/gguf/tensor_mapping.py

Lines changed: 62 additions & 0 deletions

```diff
@@ -1110,6 +1110,68 @@ class TensorNameMap:
         MODEL_TENSOR.V_MM_PATCH_MERGER: (
             "multi_modal_projector.patch_merger.merging_layer", # mistral small 3.1
         ),
+
+        # audio (mtmd)
+
+        MODEL_TENSOR.A_ENC_EMBD_POS: (
+            "audio_tower.embed_positions", # ultravox
+        ),
+
+        MODEL_TENSOR.A_ENC_CONV1D: (
+            "audio_tower.conv{bid}", # ultravox
+        ),
+
+        MODEL_TENSOR.A_PRE_NORM: (),
+
+        MODEL_TENSOR.A_POST_NORM: (
+            "audio_tower.layer_norm", # ultravox
+        ),
+
+        MODEL_TENSOR.A_ENC_ATTN_Q: (
+            "audio_tower.layers.{bid}.self_attn.q_proj", # ultravox
+        ),
+
+        MODEL_TENSOR.A_ENC_ATTN_K: (
+            "audio_tower.layers.{bid}.self_attn.k_proj", # ultravox
+        ),
+
+        MODEL_TENSOR.A_ENC_ATTN_V: (
+            "audio_tower.layers.{bid}.self_attn.v_proj", # ultravox
+        ),
+
+        MODEL_TENSOR.A_ENC_INPUT_NORM: (
+            "audio_tower.layers.{bid}.self_attn_layer_norm", # ultravox
+        ),
+
+        MODEL_TENSOR.A_ENC_OUTPUT: (
+            "audio_tower.layers.{bid}.self_attn.out_proj", # ultravox
+        ),
+
+        MODEL_TENSOR.A_ENC_OUTPUT_NORM: (
+            "audio_tower.layers.{bid}.final_layer_norm", # ultravox
+        ),
+
+        MODEL_TENSOR.A_ENC_FFN_UP: (
+            "audio_tower.layers.{bid}.fc1", # ultravox
+        ),
+
+        MODEL_TENSOR.A_ENC_FFN_GATE: (),
+
+        MODEL_TENSOR.A_ENC_FFN_DOWN: (
+            "audio_tower.layers.{bid}.fc2", # ultravox
+        ),
+
+        MODEL_TENSOR.A_MMPROJ: (
+            "audio.multi_modal_projector.linear_{bid}", # ultravox
+        ),
+
+        MODEL_TENSOR.A_MM_NORM_PRE: (
+            "audio.multi_modal_projector.ln_pre", # ultravox
+        ),
+
+        MODEL_TENSOR.A_MM_NORM_MID: (
+            "audio.multi_modal_projector.ln_mid", # ultravox
+        ),
     }
 
     # architecture-specific block mappings
```
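
During conversion these tuples let `TensorNameMap` translate Hugging Face checkpoint names into the `a.*`/`mm.a.*` GGUF names defined in constants.py. A small sketch; the block count of 32 is an arbitrary illustration, not the model's actual depth:

```python
# Sketch: mapping an HF audio-tower tensor name to its GGUF name.
from gguf.constants import MODEL_ARCH
from gguf.tensor_mapping import get_tensor_name_map

tmap = get_tensor_name_map(MODEL_ARCH.MMPROJ, 32)  # 32 blocks: illustrative only
name = tmap.get_name("audio_tower.layers.0.self_attn.q_proj.weight",
                     try_suffixes=(".weight", ".bias"))
assert name == "a.blk.0.attn_q.weight"  # the matched suffix is re-appended
```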
