@@ -193,6 +193,15 @@ enum e_model {
193
193
MODEL_LARGE,
194
194
};
195
195
196
+ static const std::map<e_model, std::string> g_model_name = {
197
+ { MODEL_UNKNOWN, " unknown" },
198
+ { MODEL_TINY, " tiny" },
199
+ { MODEL_BASE, " base" },
200
+ { MODEL_SMALL, " small" },
201
+ { MODEL_MEDIUM, " medium" },
202
+ { MODEL_LARGE, " large" },
203
+ };
204
+
196
205
static const std::map<std::string, std::pair<int , std::string>> g_lang = {
197
206
{ " en" , { 0 , " english" , } },
198
207
{ " zh" , { 1 , " chinese" , } },
@@ -293,6 +302,7 @@ static const std::map<std::string, std::pair<int, std::string>> g_lang = {
293
302
{ " ba" , { 96 , " bashkir" , } },
294
303
{ " jw" , { 97 , " javanese" , } },
295
304
{ " su" , { 98 , " sundanese" , } },
305
+ { " yue" , { 99 , " cantonese" , } },
296
306
};
297
307
298
308
// One mebibyte expressed in bytes (2^20 == 1024 * 1024).
static const size_t MB = 1ull << 20;
@@ -402,7 +412,11 @@ struct whisper_vocab {
402
412
id token_beg = 50363 ; // begin timestamps
403
413
404
414
bool is_multilingual () const {
405
- return n_vocab == 51865 ;
415
+ return n_vocab >= 51865 ;
416
+ }
417
+
418
+ int num_languages () const {
419
+ return n_vocab - 51765 - (is_multilingual () ? 1 : 0 );
406
420
}
407
421
};
408
422
@@ -922,6 +936,8 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
922
936
923
937
assert (hparams.n_text_state == hparams.n_audio_state );
924
938
939
+ std::string mver = " " ;
940
+
925
941
if (hparams.n_audio_layer == 4 ) {
926
942
model.type = e_model::MODEL_TINY;
927
943
}
@@ -940,6 +956,10 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
940
956
941
957
if (hparams.n_audio_layer == 32 ) {
942
958
model.type = e_model::MODEL_LARGE;
959
+
960
+ if (hparams.n_vocab == 51866 ) {
961
+ mver = " v3" ;
962
+ }
943
963
}
944
964
945
965
const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR;
@@ -968,7 +988,7 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
968
988
log (" %s: n_mels = %d\n " , __func__, hparams.n_mels );
969
989
log (" %s: ftype = %d\n " , __func__, model.hparams .ftype );
970
990
log (" %s: qntvr = %d\n " , __func__, qntvr);
971
- log (" %s: type = %d\n " , __func__, model.type );
991
+ log (" %s: type = %d (%s%s) \n " , __func__, model.type , g_model_name. at (model. type ). c_str (), mver. c_str () );
972
992
973
993
// print memory requirements
974
994
{
@@ -1039,13 +1059,17 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
1039
1059
if (vocab.is_multilingual ()) {
1040
1060
vocab.token_eot ++;
1041
1061
vocab.token_sot ++;
1042
- vocab.token_translate ++;
1043
- vocab.token_transcribe ++;
1044
- vocab.token_solm ++;
1045
- vocab.token_prev ++;
1046
- vocab.token_nosp ++;
1047
- vocab.token_not ++;
1048
- vocab.token_beg ++;
1062
+
1063
+ // account for variable number of language tokens
1064
+ const int dt = vocab.num_languages () - 98 ;
1065
+
1066
+ vocab.token_translate += dt;
1067
+ vocab.token_transcribe += dt;
1068
+ vocab.token_solm += dt;
1069
+ vocab.token_prev += dt;
1070
+ vocab.token_nosp += dt;
1071
+ vocab.token_not += dt;
1072
+ vocab.token_beg += dt;
1049
1073
}
1050
1074
1051
1075
if (n_vocab < model.hparams .n_vocab ) {
@@ -1074,6 +1098,8 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
1074
1098
vocab.id_to_token [i] = word;
1075
1099
}
1076
1100
}
1101
+
1102
+ log (" %s: n_langs = %d\n " , __func__, vocab.num_languages ());
1077
1103
}
1078
1104
1079
1105
size_t ctx_size = 0 ;
@@ -3281,7 +3307,7 @@ void whisper_free_params(struct whisper_full_params * params) {
3281
3307
}
3282
3308
3283
3309
int whisper_pcm_to_mel_with_state (struct whisper_context * ctx, struct whisper_state * state, const float * samples, int n_samples, int n_threads) {
3284
- if (!log_mel_spectrogram (*state, samples, n_samples, WHISPER_SAMPLE_RATE, WHISPER_N_FFT, WHISPER_HOP_LENGTH, WHISPER_N_MEL , n_threads, ctx->model .filters , false , state->mel )) {
3310
+ if (!log_mel_spectrogram (*state, samples, n_samples, WHISPER_SAMPLE_RATE, WHISPER_N_FFT, WHISPER_HOP_LENGTH, ctx-> model . filters . n_mel , n_threads, ctx->model .filters , false , state->mel )) {
3285
3311
log (" %s: failed to compute mel spectrogram\n " , __func__);
3286
3312
return -1 ;
3287
3313
}
@@ -3295,7 +3321,7 @@ int whisper_pcm_to_mel(struct whisper_context * ctx, const float * samples, int
3295
3321
3296
3322
// same as whisper_pcm_to_mel, but applies a Phase Vocoder to speed up the audio x2 (PV without phase lock is not good)
3297
3323
int whisper_pcm_to_mel_phase_vocoder_with_state (struct whisper_context * ctx, struct whisper_state * state, const float * samples, int n_samples, int n_threads) {
3298
- if (!log_mel_spectrogram (*state, samples, n_samples, WHISPER_SAMPLE_RATE, 2 * WHISPER_N_FFT, 2 * WHISPER_HOP_LENGTH, WHISPER_N_MEL , n_threads, ctx->model .filters , false , state->mel )) {
3324
+ if (!log_mel_spectrogram (*state, samples, n_samples, WHISPER_SAMPLE_RATE, 2 * WHISPER_N_FFT, 2 * WHISPER_HOP_LENGTH, ctx-> model . filters . n_mel , n_threads, ctx->model .filters , false , state->mel )) {
3299
3325
log (" %s: failed to compute mel spectrogram\n " , __func__);
3300
3326
return -1 ;
3301
3327
}
@@ -3318,13 +3344,13 @@ int whisper_pcm_to_mel_phase_vocoder(struct whisper_context * ctx, const float *
3318
3344
// TODO
3319
3345
3320
3346
int whisper_set_mel_with_state (
3321
- struct whisper_context * /* ctx*/ ,
3347
+ struct whisper_context * ctx,
3322
3348
struct whisper_state * state,
3323
3349
const float * data,
3324
3350
int n_len,
3325
3351
int n_mel) {
3326
- if (n_mel != WHISPER_N_MEL ) {
3327
- log (" %s: invalid number of mel bands: %d (expected %d)\n " , __func__, n_mel, WHISPER_N_MEL );
3352
+ if (n_mel != ctx-> model . filters . n_mel ) {
3353
+ log (" %s: invalid number of mel bands: %d (expected %d)\n " , __func__, n_mel, ctx-> model . filters . n_mel );
3328
3354
return -1 ;
3329
3355
}
3330
3356
0 commit comments