forked from litongjava/whisper-cpp-server
-
Notifications
You must be signed in to change notification settings - Fork 0
/
stream_components_service.cpp
128 lines (111 loc) · 4.03 KB
/
stream_components_service.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
#include "stream_components_service.h"
using namespace stream_components;
// -- WhisperService --
/**
 * Construct the service: copy the service/audio parameters into the object,
 * load the whisper model named by `sparams.model`, and log the effective
 * configuration to stderr.
 *
 * If the model is not multilingual, the stored language is forced to "en" and
 * translation is switched off (a warning is logged).
 *
 * If the model cannot be loaded (the whisper.cpp loader returns a null
 * context), an error is logged and construction finishes with a null `ctx`
 * instead of dereferencing it.
 *
 * @param sparams service-level settings (model path, threads, language, ...)
 * @param aparams audio settings, stored for later use by process()
 * @param cparams whisper context parameters forwarded to the model loader
 */
WhisperService::WhisperService(const struct service_params &sparams,
                               const struct audio_params &aparams,
                               const struct whisper_context_params &cparams)
    : sparams(sparams),
      aparams(aparams),
      ctx(whisper_init_from_file_with_params(sparams.model.c_str(), cparams))
{
    // The constructor parameter `sparams` shadows the member of the same name.
    // Everything below must read the *member* copy, because that is the one
    // that gets adjusted for non-multilingual models.
    auto &cfg = this->sparams;

    // print system information
    {
        fprintf(stderr, "\n");
        fprintf(stderr, "system_info: n_threads = %d / %d | %s\n",
                cfg.n_threads * cfg.n_processors,
                // hardware_concurrency() is unsigned; cast to match the %d specifier
                static_cast<int>(std::thread::hardware_concurrency()),
                whisper_print_system_info());
    }

    // Model loading failed: report it and stop before touching the context.
    if (ctx == nullptr)
    {
        fprintf(stderr, "%s: ERROR: failed to initialize whisper context from model '%s'\n",
                __func__, cfg.model.c_str());
        return;
    }

    {
        fprintf(stderr, "\n");
        if (!whisper_is_multilingual(ctx))
        {
            if (cfg.language != "en" || cfg.translate)
            {
                cfg.language = "en";
                cfg.translate = false;
                fprintf(stderr, "%s: WARNING: model is not multilingual, ignoring language and translation options\n",
                        __func__);
            }
        }

        // Log the settings that will actually be used (after the override above).
        fprintf(stderr, "%s: serving with %d threads, %d processors, lang = %s, task = %s, timestamps = %d ...\n",
                __func__,
                cfg.n_threads,
                cfg.n_processors,
                cfg.language.c_str(),
                cfg.translate ? "translate" : "transcribe",
                cfg.no_timestamps ? 0 : 1);

        // if (!audio_params.use_vad) {
        //     fprintf(stderr, "%s: n_new_line = %d, no_context = %d\n", __func__, n_new_line, sparams.no_context);
        // }
        fprintf(stderr, "\n");
    }
}
/**
 * Print the accumulated whisper timing statistics and release the context.
 *
 * Does nothing when no context was ever created (model loading failed), so
 * the timing printer is never handed a null pointer.
 */
WhisperService::~WhisperService()
{
    if (ctx == nullptr)
    {
        return;
    }
    whisper_print_timings(ctx);
    whisper_free(ctx);
}
/**
 * Run whisper inference over one buffer of audio samples.
 *
 * Builds greedy-sampling parameters from the stored service/audio settings
 * and calls whisper_full_parallel() with `sparams.n_processors`.
 *
 * @param samples      pointer to the audio samples to transcribe
 * @param sample_count number of samples in `samples`
 * @return true when inference reported success; false when the context is
 *         missing, the input buffer is invalid, or inference failed
 */
bool WhisperService::process(const float *samples, int sample_count)
{
    // No model loaded: nothing can be transcribed.
    if (ctx == nullptr)
    {
        return false;
    }
    // Reject a negative length, or a null buffer that claims to hold samples.
    if (sample_count < 0 || (samples == nullptr && sample_count > 0))
    {
        return false;
    }

    whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);

    // Console output switches.
    wparams.print_progress = false;
    wparams.print_realtime = false;
    wparams.print_timestamps = false;
    wparams.print_special = true;

    // Decoding options taken from the stored configuration.
    wparams.max_tokens = 0;
    wparams.token_timestamps = true;
    wparams.translate = sparams.translate;
    wparams.single_segment = !aparams.use_vad;
    wparams.language = sparams.language.c_str();
    wparams.n_threads = sparams.n_threads;
    wparams.audio_ctx = aparams.audio_ctx;
    wparams.speed_up = sparams.speed_up;
    wparams.tdrz_enable = sparams.tinydiarize; // [TDRZ]

    // disable temperature fallback
    // wparams.temperature_inc = -1.0f;
    wparams.temperature_inc = sparams.no_fallback ? 0.0f : wparams.temperature_inc;

    // Prompt tokens are only passed along when context is enabled.
    wparams.prompt_tokens = sparams.no_context ? nullptr : prompt_tokens.data();
    wparams.prompt_n_tokens = sparams.no_context ? 0 : static_cast<int>(prompt_tokens.size());

    // *** Run the actual inference!!! ***
    // (single-processor alternative: whisper_full(ctx, wparams, samples, sample_count))
    if (whisper_full_parallel(ctx, wparams, samples, sample_count, sparams.n_processors) != 0)
    {
        // error:ggml_metal_get_buffer: error: buffer is nil
        return false;
    }

    // NOTE: the block below is disabled. It used to refresh `prompt_tokens`
    // with the tokens of the segments just decoded so they could seed the
    // next call. It was originally gated on `!use_vad && (n_iter % n_new_line) == 0`,
    // and it is unclear whether that gating (or keeping a tail of the previous
    // audio, `pcmf32_old`) is still needed. This function does not update
    // `prompt_tokens`.
    //
    // if (!sparams.no_context) {
    //     prompt_tokens.clear();
    //     const int n_segments = whisper_full_n_segments(ctx);
    //     for (int i = 0; i < n_segments; ++i) {
    //         const int token_count = whisper_full_n_tokens(ctx, i);
    //         for (int j = 0; j < token_count; ++j) {
    //             prompt_tokens.push_back(whisper_full_get_token_id(ctx, i, j));
    //         }
    //     }
    // }

    return true;
}