-
Notifications
You must be signed in to change notification settings - Fork 423
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Vad patch #1369
base: master
Are you sure you want to change the base?
Vad patch #1369
Changes from 5 commits
e1106cf
c6f332e
5a7f0a7
6ddbdee
b68424d
a7f1a7b
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -31,8 +31,7 @@ void SileroVadModelConfig::Register(ParseOptions *po) { | |
po->Register( | ||
"silero-vad-max-speech-duration", &max_speech_duration, | ||
"In seconds. If a speech segment is longer than this value, then we " | ||
"increase the threshold to 0.9. After finishing detecting the segment, " | ||
"the threshold value is reset to its original value."); | ||
"cut a segment."); | ||
|
||
po->Register( | ||
"silero-vad-window-size", &window_size, | ||
|
@@ -102,12 +101,12 @@ bool SileroVadModelConfig::Validate() const { | |
std::string SileroVadModelConfig::ToString() const { | ||
std::ostringstream os; | ||
|
||
os << "SileroVadModelConfig("; | ||
os << "SilerVadModelConfig("; | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Please don't change it. |
||
os << "model=\"" << model << "\", "; | ||
os << "threshold=" << threshold << ", "; | ||
os << "min_silence_duration=" << min_silence_duration << ", "; | ||
os << "min_speech_duration=" << min_speech_duration << ", "; | ||
os << "max_speech_duration=" << max_speech_duration << ", "; | ||
os << "max_speech_duration=" << max_speech_duration << ", "; | ||
os << "window_size=" << window_size << ")"; | ||
|
||
return os.str(); | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -27,10 +27,7 @@ struct SileroVadModelConfig { | |
// 256, 512, 768 samples for 800 Hz | ||
int32_t window_size = 512; // in samples | ||
|
||
// If a speech segment is longer than this value, then we increase | ||
// the threshold to 0.9. After finishing detecting the segment, | ||
// the threshold value is reset to its original value. | ||
float max_speech_duration = 20; // in seconds | ||
float max_speech_duration = 20; // in seconds | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Please don't remove the comments. |
||
|
||
SileroVadModelConfig() = default; | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -11,6 +11,7 @@ | |
#include "sherpa-onnx/csrc/macros.h" | ||
#include "sherpa-onnx/csrc/onnx-utils.h" | ||
#include "sherpa-onnx/csrc/session.h" | ||
#include "silero-vad-model.h" | ||
|
||
namespace sherpa_onnx { | ||
|
||
|
@@ -32,9 +33,13 @@ class SileroVadModel::Impl { | |
} | ||
|
||
min_silence_samples_ = | ||
sample_rate_ * config_.silero_vad.min_silence_duration; | ||
(int32_t)(sample_rate_ * config_.silero_vad.min_silence_duration); | ||
|
||
min_speech_samples_ = sample_rate_ * config_.silero_vad.min_speech_duration; | ||
min_speech_samples_ = | ||
(int32_t)(sample_rate_ * config_.silero_vad.min_speech_duration); | ||
|
||
max_speech_samples_ = | ||
(int32_t)(sample_rate_ * config_.silero_vad.max_speech_duration); | ||
Comment on lines 35 to +42
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Is there a reason to make such changes? |
||
} | ||
|
||
#if __ANDROID_API__ >= 9 | ||
|
@@ -54,9 +59,13 @@ class SileroVadModel::Impl { | |
} | ||
|
||
min_silence_samples_ = | ||
sample_rate_ * config_.silero_vad.min_silence_duration; | ||
(int32_t)(sample_rate_ * config_.silero_vad.min_silence_duration); | ||
|
||
min_speech_samples_ = | ||
(int32_t)(sample_rate_ * config_.silero_vad.min_speech_duration); | ||
|
||
min_speech_samples_ = sample_rate_ * config_.silero_vad.min_speech_duration; | ||
max_speech_samples_ = | ||
(int32_t)(sample_rate_ * config_.silero_vad.max_speech_duration); | ||
} | ||
#endif | ||
|
||
|
@@ -155,14 +164,34 @@ class SileroVadModel::Impl { | |
|
||
int32_t MinSpeechDurationSamples() const { return min_speech_samples_; } | ||
|
||
int32_t MaxSpeechDurationSamples() const { return max_speech_samples_; } | ||
|
||
float Threshold() { return config_.silero_vad.threshold; } | ||
|
||
void SetMinSilenceDuration(float s) { | ||
min_silence_samples_ = sample_rate_ * s; | ||
min_silence_samples_ = (int32_t)(sample_rate_ * s); | ||
} | ||
|
||
void SetMinSpeechDuration(float s) { | ||
min_speech_samples_ = (int32_t)(sample_rate_ * s); | ||
} | ||
|
||
void SetMaxSpeechDuration(float s) { | ||
max_speech_samples_ = (int32_t)(sample_rate_ * s); | ||
} | ||
|
||
void SetThreshold(float threshold) { | ||
config_.silero_vad.threshold = threshold; | ||
} | ||
|
||
float Run(const float *samples, int32_t n) { | ||
if (is_v5_) { | ||
return RunV5(samples, n); | ||
} else { | ||
return RunV4(samples, n); | ||
} | ||
} | ||
|
||
private: | ||
void Init(void *model_data, size_t model_data_length) { | ||
sess_ = std::make_unique<Ort::Session>(env_, model_data, model_data_length, | ||
|
@@ -335,14 +364,6 @@ class SileroVadModel::Impl { | |
} | ||
} | ||
|
||
float Run(const float *samples, int32_t n) { | ||
if (is_v5_) { | ||
return RunV5(samples, n); | ||
} else { | ||
return RunV4(samples, n); | ||
} | ||
} | ||
|
||
float RunV5(const float *samples, int32_t n) { | ||
auto memory_info = | ||
Ort::MemoryInfo::CreateCpu(OrtDeviceAllocator, OrtMemTypeDefault); | ||
|
@@ -418,6 +439,7 @@ class SileroVadModel::Impl { | |
int64_t sample_rate_; | ||
int32_t min_silence_samples_; | ||
int32_t min_speech_samples_; | ||
int32_t max_speech_samples_; | ||
|
||
bool triggered_ = false; | ||
int32_t current_sample_ = 0; | ||
|
@@ -457,12 +479,30 @@ int32_t SileroVadModel::MinSpeechDurationSamples() const { | |
return impl_->MinSpeechDurationSamples(); | ||
} | ||
|
||
int32_t SileroVadModel::MaxSpeechDurationSamples() { | ||
return impl_->MaxSpeechDurationSamples(); | ||
} | ||
|
||
float SileroVadModel::Threshold() { return impl_->Threshold(); } | ||
|
||
void SileroVadModel::SetMinSilenceDuration(float s) { | ||
impl_->SetMinSilenceDuration(s); | ||
} | ||
|
||
void SileroVadModel::SetMinSpeechDuration(float s) { | ||
impl_->SetMinSpeechDuration(s); | ||
} | ||
|
||
void SileroVadModel::SetThreshold(float threshold) { | ||
impl_->SetThreshold(threshold); | ||
} | ||
|
||
void SileroVadModel::SetMaxSpeechDuration(float s) { | ||
impl_->SetMaxSpeechDuration(s); | ||
} | ||
|
||
float SileroVadModel::Run(const float *samples, int32_t n) { | ||
return impl_->Run(samples, n); | ||
} | ||
|
||
} // namespace sherpa_onnx |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Please don't remove it.