forked from Kitt-AI/snowboy
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsnowboy-detect.h
226 lines (196 loc) · 9.34 KB
/
snowboy-detect.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
// include/snowboy-detect.h
// Copyright 2016 KITT.AI (author: Guoguo Chen)
#ifndef SNOWBOY_INCLUDE_SNOWBOY_DETECT_H_
#define SNOWBOY_INCLUDE_SNOWBOY_DETECT_H_
#include <cstdint>
#include <memory>
#include <string>
namespace snowboy {
// Forward declarations. These types are not defined in this header; they are
// only referenced through the std::unique_ptr members of the classes below.
struct WaveHeader;
class PipelineDetect;
class PipelineVad;
////////////////////////////////////////////////////////////////////////////////
//
// SnowboyDetect class interface.
//
////////////////////////////////////////////////////////////////////////////////
class SnowboyDetect {
 public:
  // Constructor that takes a resource file, and a list of hotword models which
  // are separated by comma. In the case that more than one hotword exists in
  // the provided models, RunDetection() will return the index of the hotword,
  // if the corresponding hotword is triggered.
  //
  // CAVEAT: a personal model contains only one hotword, but a universal model
  //         may contain multiple hotwords. It is your responsibility to figure
  //         out the index of the hotword. For example, if your model string is
  //         "foo.pmdl,bar.umdl", where foo.pmdl contains hotword x, bar.umdl
  //         has two hotwords y and z, the indices of different hotwords are as
  //         follows:
  //         x 1
  //         y 2
  //         z 3
  //
  // @param [in]  resource_filename   Filename of resource file.
  // @param [in]  model_str           A string of multiple hotword models,
  //                                  separated by comma.
  SnowboyDetect(const std::string& resource_filename,
                const std::string& model_str);

  // Resets the detection. This class handles voice activity detection (VAD)
  // internally. But if you have an external VAD, you should call Reset()
  // whenever you see segment end from your VAD.
  // NOTE(review): the meaning of the returned bool is not documented in this
  // header -- presumably true on success; confirm against the implementation.
  bool Reset();

  // Runs hotword detection. Supported audio format is WAVE (with linear PCM,
  // 8-bit unsigned integer, 16-bit signed integer or 32-bit signed integer).
  // See SampleRate(), NumChannels() and BitsPerSample() for the required
  // sampling rate, number of channels and bits per sample values. You are
  // supposed to provide a small chunk of data (e.g., 0.1 second) each time you
  // call RunDetection(). A larger chunk usually leads to longer delay, but
  // less CPU usage.
  //
  // Definition of return values:
  // -2: Silence.
  // -1: Error.
  //  0: No event.
  //  1: Hotword 1 triggered.
  //  2: Hotword 2 triggered.
  //  ...
  //
  // @param [in]  data               Small chunk of data to be detected. See
  //                                 above for the supported data format.
  // @param [in]  is_end             Set it to true if it is the end of an
  //                                 utterance or file.
  int RunDetection(const std::string& data, bool is_end = false);

  // Various versions of RunDetection() that take different formats of audio.
  // If NumChannels() > 1, e.g., NumChannels() == 2, then the array is laid out
  // as follows:
  //
  //   d1c1, d1c2, d2c1, d2c2, d3c1, d3c2, ..., dNc1, dNc2
  //
  // where d1c1 means data point 1 of channel 1.
  //
  // @param [in]  data               Small chunk of data to be detected. See
  //                                 above for the supported data format.
  // @param [in]  array_length       Length of the data array.
  // @param [in]  is_end             Set it to true if it is the end of an
  //                                 utterance or file.
  int RunDetection(const float* const data,
                   const int array_length, bool is_end = false);
  int RunDetection(const int16_t* const data,
                   const int array_length, bool is_end = false);
  int RunDetection(const int32_t* const data,
                   const int array_length, bool is_end = false);

  // Sets the sensitivity string for the loaded hotwords. A <sensitivity_str>
  // is a list of floating-point numbers between 0 and 1, separated by comma.
  // For example, if there are 3 loaded hotwords, your string should look
  // something like this:
  //   0.4,0.5,0.8
  // Make sure you properly align the sensitivity value to the corresponding
  // hotword.
  void SetSensitivity(const std::string& sensitivity_str);

  // Similar to the sensitivity setting above. When set higher than the above
  // sensitivity, the algorithm automatically chooses between the normal
  // sensitivity set above and the higher sensitivity set here, to maximize the
  // performance. By default, it is not set, which means the algorithm will
  // stick with the sensitivity set above.
  void SetHighSensitivity(const std::string& high_sensitivity_str);

  // Returns the sensitivity string for the current hotwords.
  std::string GetSensitivity() const;

  // Applies a fixed gain to the input audio. In case you have a very weak
  // microphone, you can use this function to boost input audio level.
  void SetAudioGain(const float audio_gain);

  // Writes the models to the model filenames specified in <model_str> in the
  // constructor. This overwrites the original model with the latest parameter
  // setting. You are supposed to call this function if you have updated the
  // hotword sensitivities through SetSensitivity(), and you would like to store
  // those values in the model as the default value.
  void UpdateModel() const;

  // Returns the number of the loaded hotwords. This helps you to figure out
  // the index of the hotwords.
  int NumHotwords() const;

  // If <apply_frontend> is true, then apply frontend audio processing;
  // otherwise turns the audio processing off. Frontend audio processing
  // includes algorithms such as automatic gain control (AGC), noise suppression
  // (NS) and so on. Generally adding frontend audio processing helps the
  // performance, but if the model is not trained with frontend audio
  // processing, it may decrease the performance. The general rule of thumb is:
  //   1. For personal models, set it to false.
  //   2. For universal models, follow the instruction of each published model.
  void ApplyFrontend(const bool apply_frontend);

  // Returns the required sampling rate, number of channels and bits per sample
  // values for the audio data. You should use this information to set up your
  // audio capturing interface.
  int SampleRate() const;
  int NumChannels() const;
  int BitsPerSample() const;

  ~SnowboyDetect();

  // Not copyable. The std::unique_ptr members already make copying
  // ill-formed; deleting the operations here states that in the interface and
  // yields a clearer diagnostic.
  SnowboyDetect(const SnowboyDetect&) = delete;
  SnowboyDetect& operator=(const SnowboyDetect&) = delete;

 private:
  std::unique_ptr<WaveHeader> wave_header_;
  std::unique_ptr<PipelineDetect> detect_pipeline_;
};
////////////////////////////////////////////////////////////////////////////////
//
// SnowboyVad class interface.
//
////////////////////////////////////////////////////////////////////////////////
class SnowboyVad {
 public:
  // Constructor that takes a resource file. It shares the same resource file
  // with SnowboyDetect.
  //
  // Marked explicit so that a std::string is never silently converted into a
  // SnowboyVad object.
  //
  // @param [in]  resource_filename   Filename of resource file.
  explicit SnowboyVad(const std::string& resource_filename);

  // Resets the VAD.
  // NOTE(review): the meaning of the returned bool is not documented in this
  // header -- presumably true on success; confirm against the implementation.
  bool Reset();

  // Runs the VAD algorithm. Supported audio format is WAVE (with linear PCM,
  // 8-bit unsigned integer, 16-bit signed integer or 32-bit signed integer).
  // See SampleRate(), NumChannels() and BitsPerSample() for the required
  // sampling rate, number of channels and bits per sample values. You are
  // supposed to provide a small chunk of data (e.g., 0.1 second) each time you
  // call RunVad(). A larger chunk usually leads to longer delay, but less CPU
  // usage.
  //
  // Definition of return values:
  // -2: Silence.
  // -1: Error.
  //  0: Non-silence.
  //
  // @param [in]  data               Small chunk of data to be detected. See
  //                                 above for the supported data format.
  // @param [in]  is_end             Set it to true if it is the end of an
  //                                 utterance or file.
  int RunVad(const std::string& data, bool is_end = false);

  // Various versions of RunVad() that take different formats of audio. If
  // NumChannels() > 1, e.g., NumChannels() == 2, then the array is laid out as
  // follows:
  //
  //   d1c1, d1c2, d2c1, d2c2, d3c1, d3c2, ..., dNc1, dNc2
  //
  // where d1c1 means data point 1 of channel 1.
  //
  // @param [in]  data               Small chunk of data to be detected. See
  //                                 above for the supported data format.
  // @param [in]  array_length       Length of the data array.
  // @param [in]  is_end             Set it to true if it is the end of an
  //                                 utterance or file.
  int RunVad(const float* const data,
             const int array_length, bool is_end = false);
  int RunVad(const int16_t* const data,
             const int array_length, bool is_end = false);
  int RunVad(const int32_t* const data,
             const int array_length, bool is_end = false);

  // Applies a fixed gain to the input audio. In case you have a very weak
  // microphone, you can use this function to boost input audio level.
  void SetAudioGain(const float audio_gain);

  // If <apply_frontend> is true, then apply frontend audio processing;
  // otherwise turns the audio processing off.
  void ApplyFrontend(const bool apply_frontend);

  // Returns the required sampling rate, number of channels and bits per sample
  // values for the audio data. You should use this information to set up your
  // audio capturing interface.
  int SampleRate() const;
  int NumChannels() const;
  int BitsPerSample() const;

  ~SnowboyVad();

  // Not copyable. The std::unique_ptr members already make copying
  // ill-formed; deleting the operations here states that in the interface and
  // yields a clearer diagnostic.
  SnowboyVad(const SnowboyVad&) = delete;
  SnowboyVad& operator=(const SnowboyVad&) = delete;

 private:
  std::unique_ptr<WaveHeader> wave_header_;
  std::unique_ptr<PipelineVad> vad_pipeline_;
};
} // namespace snowboy
#endif // SNOWBOY_INCLUDE_SNOWBOY_DETECT_H_