Skip to content

Commit 8fcd1a3

Browse files
tuxpol and ggerganov authored
main : provide option for creating JSON output (ggml-org#615)
* examples : provide option for exporting also as JSON file (ggml-org#614)
* main : remove leftovers

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
1 parent 992aa2c commit 8fcd1a3

File tree

4 files changed

+214
-1
lines changed

4 files changed

+214
-1
lines changed

examples/main/README.md

+1
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ options:
3131
-osrt, --output-srt [false ] output result in a srt file
3232
-owts, --output-words [false ] output script for generating karaoke video
3333
-ocsv, --output-csv [false ] output result in a CSV file
34+
-oj, --output-json [false ] output result in a JSON file
3435
-of FNAME, --output-file FNAME [ ] output file path (without file extension)
3536
-ps, --print-special [false ] print special tokens
3637
-pc, --print-colors [false ] print colors

examples/main/main.cpp

+132
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,7 @@ struct whisper_params {
7373
bool output_srt = false;
7474
bool output_wts = false;
7575
bool output_csv = false;
76+
bool output_jsn = false;
7677
bool print_special = false;
7778
bool print_colors = false;
7879
bool print_progress = false;
@@ -130,6 +131,7 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
130131
else if (arg == "-owts" || arg == "--output-words") { params.output_wts = true; }
131132
else if (arg == "-fp" || arg == "--font-path") { params.font_path = argv[++i]; }
132133
else if (arg == "-ocsv" || arg == "--output-csv") { params.output_csv = true; }
134+
else if (arg == "-oj" || arg == "--output-json") { params.output_jsn = true; }
133135
else if (arg == "-of" || arg == "--output-file") { params.fname_out.emplace_back(argv[++i]); }
134136
else if (arg == "-ps" || arg == "--print-special") { params.print_special = true; }
135137
else if (arg == "-pc" || arg == "--print-colors") { params.print_colors = true; }
@@ -178,6 +180,7 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
178180
fprintf(stderr, " -owts, --output-words [%-7s] output script for generating karaoke video\n", params.output_wts ? "true" : "false");
179181
fprintf(stderr, " -fp, --font-path [%-7s] path to a monospace font for karaoke video\n", params.font_path.c_str());
180182
fprintf(stderr, " -ocsv, --output-csv [%-7s] output result in a CSV file\n", params.output_csv ? "true" : "false");
183+
fprintf(stderr, " -oj, --output-json [%-7s] output result in a JSON file\n", params.output_jsn ? "true" : "false");
181184
fprintf(stderr, " -of FNAME, --output-file FNAME [%-7s] output file path (without file extension)\n", "");
182185
fprintf(stderr, " -ps, --print-special [%-7s] print special tokens\n", params.print_special ? "true" : "false");
183186
fprintf(stderr, " -pc, --print-colors [%-7s] print colors\n", params.print_colors ? "true" : "false");
@@ -368,6 +371,129 @@ bool output_csv(struct whisper_context * ctx, const char * fname) {
368371
return true;
369372
}
370373

374+
bool output_json(struct whisper_context * ctx, const char * fname, const whisper_params & params) {
375+
std::ofstream fout(fname);
376+
int indent = 0;
377+
378+
auto doindent = [&]() {
379+
for (int i = 0; i < indent; i++) fout << "\t";
380+
};
381+
382+
auto start_arr = [&](const char *name) {
383+
doindent();
384+
fout << "\"" << name << "\": [\n";
385+
indent++;
386+
};
387+
388+
auto end_arr = [&](bool end = false) {
389+
indent--;
390+
doindent();
391+
fout << (end ? "]\n" : "},\n");
392+
};
393+
394+
auto start_obj = [&](const char *name = nullptr) {
395+
doindent();
396+
if (name) {
397+
fout << "\"" << name << "\": {\n";
398+
} else {
399+
fout << "{\n";
400+
}
401+
indent++;
402+
};
403+
404+
auto end_obj = [&](bool end = false) {
405+
indent--;
406+
doindent();
407+
fout << (end ? "}\n" : "},\n");
408+
};
409+
410+
auto start_value = [&](const char *name) {
411+
doindent();
412+
fout << "\"" << name << "\": ";
413+
};
414+
415+
auto value_s = [&](const char *name, const char *val, bool end = false) {
416+
start_value(name);
417+
fout << "\"" << val << (end ? "\"\n" : "\",\n");
418+
};
419+
420+
auto end_value = [&](bool end = false) {
421+
fout << (end ? "\n" : ",\n");
422+
};
423+
424+
auto value_i = [&](const char *name, const int64_t val, bool end = false) {
425+
start_value(name);
426+
fout << val;
427+
end_value(end);
428+
};
429+
430+
auto value_b = [&](const char *name, const bool val, bool end = false) {
431+
start_value(name);
432+
fout << (val ? "true" : "false");
433+
end_value(end);
434+
};
435+
436+
if (!fout.is_open()) {
437+
fprintf(stderr, "%s: failed to open '%s' for writing\n", __func__, fname);
438+
return false;
439+
}
440+
441+
fprintf(stderr, "%s: saving output to '%s'\n", __func__, fname);
442+
start_obj();
443+
value_s("systeminfo", whisper_print_system_info());
444+
start_obj("model");
445+
value_s("type", whisper_model_type_readable(ctx));
446+
value_b("multilingual", whisper_is_multilingual(ctx));
447+
value_i("vocab", whisper_model_n_vocab(ctx));
448+
start_obj("audio");
449+
value_i("ctx", whisper_model_n_audio_ctx(ctx));
450+
value_i("state", whisper_model_n_audio_state(ctx));
451+
value_i("head", whisper_model_n_audio_head(ctx));
452+
value_i("layer", whisper_model_n_audio_layer(ctx), true);
453+
end_obj();
454+
start_obj("text");
455+
value_i("ctx", whisper_model_n_text_ctx(ctx));
456+
value_i("state", whisper_model_n_text_state(ctx));
457+
value_i("head", whisper_model_n_text_head(ctx));
458+
value_i("leyer", whisper_model_n_text_layer(ctx), true);
459+
end_obj();
460+
value_i("mels", whisper_model_n_mels(ctx));
461+
value_i("f16", whisper_model_f16(ctx), true);
462+
end_obj();
463+
start_obj("params");
464+
value_s("model", params.model.c_str());
465+
value_s("language", params.language.c_str());
466+
value_b("translate", params.translate, true);
467+
end_obj();
468+
start_obj("result");
469+
value_s("language", whisper_lang_str(whisper_full_lang_id(ctx)), true);
470+
end_obj();
471+
start_arr("transcription");
472+
473+
const int n_segments = whisper_full_n_segments(ctx);
474+
for (int i = 0; i < n_segments; ++i) {
475+
const char * text = whisper_full_get_segment_text(ctx, i);
476+
const int64_t t0 = whisper_full_get_segment_t0(ctx, i);
477+
const int64_t t1 = whisper_full_get_segment_t1(ctx, i);
478+
479+
start_obj();
480+
start_obj("timestanps");
481+
value_s("from", to_timestamp(t0, true).c_str());
482+
value_s("to", to_timestamp(t1, true).c_str(), true);
483+
end_obj();
484+
start_obj("offsets");
485+
value_i("from", t0 * 10);
486+
value_i("to", t1 * 10, true);
487+
end_obj();
488+
value_s("text", text, true);
489+
end_obj(i == (n_segments - 1));
490+
}
491+
492+
end_arr(true);
493+
end_obj(true);
494+
return true;
495+
}
496+
371497
// karaoke video generation
372498
// outputs a bash script that uses ffmpeg to generate a video with the subtitles
373499
// TODO: font parameter adjustments
@@ -662,6 +788,12 @@ int main(int argc, char ** argv) {
662788
const auto fname_csv = fname_out + ".csv";
663789
output_csv(ctx, fname_csv.c_str());
664790
}
791+
792+
// output to JSON file
793+
if (params.output_jsn) {
794+
const auto fname_jsn = fname_out + ".json";
795+
output_json(ctx, fname_jsn.c_str(), params);
796+
}
665797
}
666798
}
667799

whisper.cpp

+66-1
Original file line numberDiff line numberDiff line change
@@ -1408,7 +1408,7 @@ static bool whisper_encode_internal(
14081408
//}
14091409

14101410
static int iter = 0;
1411-
1411+
14121412
const size_t e_pe_stride = model.e_pe->ne[0]*ggml_element_size(model.e_pe);
14131413
const size_t e_pe_offset = model.e_pe->ne[0]*ggml_element_size(model.e_pe)*n_ctx*iter;
14141414

@@ -2919,6 +2919,71 @@ int whisper_lang_auto_detect(
29192919
return whisper_lang_auto_detect_with_state(ctx, ctx->state, offset_ms, n_threads, lang_probs);
29202920
}
29212921

2922+
int whisper_model_n_vocab(struct whisper_context * ctx) {
2923+
return ctx->model.hparams.n_vocab;
2924+
}
2925+
2926+
int whisper_model_n_audio_ctx(struct whisper_context * ctx) {
2927+
return ctx->model.hparams.n_audio_ctx;
2928+
}
2929+
2930+
int whisper_model_n_audio_state(struct whisper_context * ctx) {
2931+
return ctx->model.hparams.n_audio_state;
2932+
}
2933+
2934+
int whisper_model_n_audio_head(struct whisper_context * ctx) {
2935+
return ctx->model.hparams.n_audio_head;
2936+
}
2937+
2938+
int whisper_model_n_audio_layer(struct whisper_context * ctx) {
2939+
return ctx->model.hparams.n_audio_layer;
2940+
}
2941+
2942+
int whisper_model_n_text_ctx(struct whisper_context * ctx) {
2943+
return ctx->model.hparams.n_text_ctx;
2944+
}
2945+
2946+
int whisper_model_n_text_state(struct whisper_context * ctx) {
2947+
return ctx->model.hparams.n_text_state;
2948+
}
2949+
2950+
int whisper_model_n_text_head(struct whisper_context * ctx) {
2951+
return ctx->model.hparams.n_text_head;
2952+
}
2953+
2954+
int whisper_model_n_text_layer(struct whisper_context * ctx) {
2955+
return ctx->model.hparams.n_text_layer;
2956+
}
2957+
2958+
int whisper_model_n_mels(struct whisper_context * ctx) {
2959+
return ctx->model.hparams.n_mels;
2960+
}
2961+
2962+
int whisper_model_f16(struct whisper_context * ctx) {
2963+
return ctx->model.hparams.f16;
2964+
}
2965+
2966+
int whisper_model_type(struct whisper_context * ctx) {
2967+
return ctx->model.type;
2968+
}
2969+
2970+
const char *whisper_model_type_readable(struct whisper_context * ctx) {
2971+
switch (ctx->model.type) {
2972+
case e_model::MODEL_TINY:
2973+
return "tiny";
2974+
case e_model::MODEL_BASE:
2975+
return "base";
2976+
case e_model::MODEL_SMALL:
2977+
return "small";
2978+
case e_model::MODEL_MEDIUM:
2979+
return "medium";
2980+
case e_model::MODEL_LARGE:
2981+
return "large";
2982+
default:
2983+
return "unknown";
2984+
}
2985+
}
2986+
29222987
int whisper_n_len_from_state(struct whisper_state * state) {
29232988
return state->mel.n_len;
29242989
}

whisper.h

+15
Original file line numberDiff line numberDiff line change
@@ -248,6 +248,19 @@ extern "C" {
248248
WHISPER_API int whisper_n_audio_ctx (struct whisper_context * ctx);
249249
WHISPER_API int whisper_is_multilingual (struct whisper_context * ctx);
250250

251+
WHISPER_API int whisper_model_n_vocab (struct whisper_context * ctx);
252+
WHISPER_API int whisper_model_n_audio_ctx (struct whisper_context * ctx);
253+
WHISPER_API int whisper_model_n_audio_state(struct whisper_context * ctx);
254+
WHISPER_API int whisper_model_n_audio_head (struct whisper_context * ctx);
255+
WHISPER_API int whisper_model_n_audio_layer(struct whisper_context * ctx);
256+
WHISPER_API int whisper_model_n_text_ctx (struct whisper_context * ctx);
257+
WHISPER_API int whisper_model_n_text_state (struct whisper_context * ctx);
258+
WHISPER_API int whisper_model_n_text_head (struct whisper_context * ctx);
259+
WHISPER_API int whisper_model_n_text_layer (struct whisper_context * ctx);
260+
WHISPER_API int whisper_model_n_mels (struct whisper_context * ctx);
261+
WHISPER_API int whisper_model_f16 (struct whisper_context * ctx);
262+
WHISPER_API int whisper_model_type (struct whisper_context * ctx);
263+
251264
// Token logits obtained from the last call to whisper_decode()
252265
// The logits for the last token are stored in the last row
253266
// Rows: n_tokens
@@ -257,6 +270,8 @@ extern "C" {
257270

258271
// Token Id -> String. Uses the vocabulary in the provided context
259272
WHISPER_API const char * whisper_token_to_str(struct whisper_context * ctx, whisper_token token);
273+
WHISPER_API const char * whisper_model_type_readable(struct whisper_context * ctx);
274+
260275

261276
// Special tokens
262277
WHISPER_API whisper_token whisper_token_eot (struct whisper_context * ctx);

0 commit comments

Comments
 (0)