@@ -73,6 +73,7 @@ struct whisper_params {
73
73
bool output_srt = false ;
74
74
bool output_wts = false ;
75
75
bool output_csv = false ;
76
+ bool output_jsn = false ;
76
77
bool print_special = false ;
77
78
bool print_colors = false ;
78
79
bool print_progress = false ;
@@ -130,6 +131,7 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
130
131
else if (arg == " -owts" || arg == " --output-words" ) { params.output_wts = true ; }
131
132
else if (arg == " -fp" || arg == " --font-path" ) { params.font_path = argv[++i]; }
132
133
else if (arg == " -ocsv" || arg == " --output-csv" ) { params.output_csv = true ; }
134
+ else if (arg == " -oj" || arg == " --output-json" ) { params.output_jsn = true ; }
133
135
else if (arg == " -of" || arg == " --output-file" ) { params.fname_out .emplace_back (argv[++i]); }
134
136
else if (arg == " -ps" || arg == " --print-special" ) { params.print_special = true ; }
135
137
else if (arg == " -pc" || arg == " --print-colors" ) { params.print_colors = true ; }
@@ -178,6 +180,7 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
178
180
fprintf (stderr, " -owts, --output-words [%-7s] output script for generating karaoke video\n " , params.output_wts ? " true" : " false" );
179
181
fprintf (stderr, " -fp, --font-path [%-7s] path to a monospace font for karaoke video\n " , params.font_path .c_str ());
180
182
fprintf (stderr, " -ocsv, --output-csv [%-7s] output result in a CSV file\n " , params.output_csv ? " true" : " false" );
183
+ fprintf (stderr, " -oj, --output-json [%-7s] output result in a JSON file\n " , params.output_jsn ? " true" : " false" );
181
184
fprintf (stderr, " -of FNAME, --output-file FNAME [%-7s] output file path (without file extension)\n " , " " );
182
185
fprintf (stderr, " -ps, --print-special [%-7s] print special tokens\n " , params.print_special ? " true" : " false" );
183
186
fprintf (stderr, " -pc, --print-colors [%-7s] print colors\n " , params.print_colors ? " true" : " false" );
@@ -368,6 +371,129 @@ bool output_csv(struct whisper_context * ctx, const char * fname) {
368
371
return true ;
369
372
}
370
373
374
+ bool output_json (struct whisper_context * ctx, const char * fname, const whisper_params & params) {
375
+ std::ofstream fout (fname);
376
+ int indent = 0 ;
377
+
378
+ auto doindent = [&]() {
379
+ for (int i = 0 ; i < indent; i++) fout << " \t " ;
380
+ };
381
+
382
+ auto start_arr = [&](const char *name) {
383
+ doindent ();
384
+ fout << " \" " << name << " \" : [\n " ;
385
+ indent++;
386
+ };
387
+
388
+ auto end_arr = [&](bool end = false ) {
389
+ indent--;
390
+ doindent ();
391
+ fout << (end ? " ]\n " : " },\n " );
392
+ };
393
+
394
+ auto start_obj = [&](const char *name = nullptr ) {
395
+ doindent ();
396
+ if (name) {
397
+ fout << " \" " << name << " \" : {\n " ;
398
+ } else {
399
+ fout << " {\n " ;
400
+ }
401
+ indent++;
402
+ };
403
+
404
+ auto end_obj = [&](bool end = false ) {
405
+ indent--;
406
+ doindent ();
407
+ fout << (end ? " }\n " : " },\n " );
408
+ };
409
+
410
+ auto start_value = [&](const char *name) {
411
+ doindent ();
412
+ fout << " \" " << name << " \" : " ;
413
+ };
414
+
415
+ auto value_s = [&](const char *name, const char *val, bool end = false ) {
416
+ start_value (name);
417
+ fout << " \" " << val << (end ? " \"\n " : " \" ,\n " );
418
+ };
419
+
420
+ auto end_value = [&](bool end = false ) {
421
+ fout << (end ? " \n " : " ,\n " );
422
+ };
423
+
424
+ auto value_i = [&](const char *name, const int64_t val, bool end = false ) {
425
+ start_value (name);
426
+ fout << val;
427
+ end_value (end);
428
+ };
429
+
430
+ auto value_b = [&](const char *name, const bool val, bool end = false ) {
431
+ start_value (name);
432
+ fout << (val ? " true" : " false" );
433
+ end_value (end);
434
+ };
435
+
436
+ if (!fout.is_open ()) {
437
+ fprintf (stderr, " %s: failed to open '%s' for writing\n " , __func__, fname);
438
+ return false ;
439
+ }
440
+
441
+ fprintf (stderr, " %s: saving output to '%s'\n " , __func__, fname);
442
+ start_obj ();
443
+ value_s (" systeminfo" , whisper_print_system_info ());
444
+ start_obj (" model" );
445
+ value_s (" type" , whisper_model_type_readable (ctx));
446
+ value_b (" multilingual" , whisper_is_multilingual (ctx));
447
+ value_i (" vocab" , whisper_model_n_vocab (ctx));
448
+ start_obj (" audio" );
449
+ value_i (" ctx" , whisper_model_n_audio_ctx (ctx));
450
+ value_i (" state" , whisper_model_n_audio_state (ctx));
451
+ value_i (" head" , whisper_model_n_audio_head (ctx));
452
+ value_i (" layer" , whisper_model_n_audio_layer (ctx), true );
453
+ end_obj ();
454
+ start_obj (" text" );
455
+ value_i (" ctx" , whisper_model_n_text_ctx (ctx));
456
+ value_i (" state" , whisper_model_n_text_state (ctx));
457
+ value_i (" head" , whisper_model_n_text_head (ctx));
458
+ value_i (" leyer" , whisper_model_n_text_layer (ctx), true );
459
+ end_obj ();
460
+ value_i (" mels" , whisper_model_n_mels (ctx));
461
+ value_i (" f16" , whisper_model_f16 (ctx), true );
462
+ end_obj ();
463
+ start_obj (" params" );
464
+ value_s (" model" , params.model .c_str ());
465
+ value_s (" language" , params.language .c_str ());
466
+ value_b (" translate" , params.translate , true );
467
+ end_obj ();
468
+ start_obj (" result" );
469
+ value_s (" language" , whisper_lang_str (whisper_full_lang_id (ctx)), true );
470
+ end_obj ();
471
+ start_arr (" transcription" );
472
+
473
+ const int n_segments = whisper_full_n_segments (ctx);
474
+ for (int i = 0 ; i < n_segments; ++i) {
475
+ const char * text = whisper_full_get_segment_text (ctx, i);
476
+ const int64_t t0 = whisper_full_get_segment_t0 (ctx, i);
477
+ const int64_t t1 = whisper_full_get_segment_t1 (ctx, i);
478
+
479
+ start_obj ();
480
+ start_obj (" timestanps" );
481
+ value_s (" from" , to_timestamp (t0, true ).c_str ());
482
+ value_s (" to" , to_timestamp (t1, true ).c_str (), true );
483
+ end_obj ();
484
+ start_obj (" offsets" );
485
+ value_i (" from" , t0 * 10 );
486
+ value_i (" to" , t1 * 10 , true );
487
+ end_obj ();
488
+ value_s (" text" , text, true );
489
+ end_obj (i == (n_segments - 1 ));
490
+ }
491
+
492
+ end_arr (true );
493
+ end_obj (true );
494
+ return true ;
495
+ }
496
+
371
497
// karaoke video generation
372
498
// outputs a bash script that uses ffmpeg to generate a video with the subtitles
373
499
// TODO: font parameter adjustments
@@ -662,6 +788,12 @@ int main(int argc, char ** argv) {
662
788
const auto fname_csv = fname_out + " .csv" ;
663
789
output_csv (ctx, fname_csv.c_str ());
664
790
}
791
+
792
+ // output to JSON file
793
+ if (params.output_jsn ) {
794
+ const auto fname_jsn = fname_out + " .json" ;
795
+ output_json (ctx, fname_jsn.c_str (), params);
796
+ }
665
797
}
666
798
}
667
799
0 commit comments