k2-fsa · laochen · Sep 22, 2024 · Sep 22, 2024 · Sep 22, 2024 · Sep 26, 2024
diff --git a/sherpa-onnx/csrc/silero-vad-model-config.cc b/sherpa-onnx/csrc/silero-vad-model-config.cc
@@ -31,8 +31,7 @@ void SileroVadModelConfig::Register(ParseOptions *po) {
   po->Register(
       "silero-vad-max-speech-duration", &max_speech_duration,
       "In seconds. If a speech segment is longer than this value, then we "
-      "increase the threshold to 0.9. After finishing detecting the segment, "
-      "the threshold value is reset to its original value.");
+      "cut a segment.");
 
   po->Register(
       "silero-vad-window-size", &window_size,
@@ -102,12 +101,12 @@ bool SileroVadModelConfig::Validate() const {
 std::string SileroVadModelConfig::ToString() const {
   std::ostringstream os;
 
-  os << "SileroVadModelConfig(";
+  os << "SileroVadModelConfig(";
   os << "model=\"" << model << "\", ";
   os << "threshold=" << threshold << ", ";
   os << "min_silence_duration=" << min_silence_duration << ", ";
   os << "min_speech_duration=" << min_speech_duration << ", ";
   os << "max_speech_duration=" << max_speech_duration << ", ";
   os << "window_size=" << window_size << ")";
 
   return os.str();

diff --git a/sherpa-onnx/csrc/silero-vad-model-config.h b/sherpa-onnx/csrc/silero-vad-model-config.h
@@ -27,10 +27,8 @@ struct SileroVadModelConfig {
   // 256, 512, 768 samples for 800 Hz
   int32_t window_size = 512;  // in samples
 
-  // If a speech segment is longer than this value, then we increase
-  // the threshold to 0.9. After finishing detecting the segment,
-  // the threshold value is reset to its original value.
-  float max_speech_duration = 20;  // in seconds
+  // If a speech segment is longer than this value, then we cut a segment.
+  float max_speech_duration = 20;  // in seconds
 
   SileroVadModelConfig() = default;
 

diff --git a/sherpa-onnx/csrc/silero-vad-model.cc b/sherpa-onnx/csrc/silero-vad-model.cc
@@ -11,6 +11,7 @@
 #include "sherpa-onnx/csrc/macros.h"
 #include "sherpa-onnx/csrc/onnx-utils.h"
 #include "sherpa-onnx/csrc/session.h"
+#include "sherpa-onnx/csrc/silero-vad-model.h"
 
 namespace sherpa_onnx {
 
@@ -32,9 +33,13 @@ class SileroVadModel::Impl {
     }
 
-    min_silence_samples_ =
-        sample_rate_ * config_.silero_vad.min_silence_duration;
+    min_silence_samples_ = static_cast<int32_t>(
+        sample_rate_ * config_.silero_vad.min_silence_duration);
 
-    min_speech_samples_ = sample_rate_ * config_.silero_vad.min_speech_duration;
+    min_speech_samples_ = static_cast<int32_t>(
+        sample_rate_ * config_.silero_vad.min_speech_duration);
+
+    max_speech_samples_ = static_cast<int32_t>(
+        sample_rate_ * config_.silero_vad.max_speech_duration);
   }
 
 #if __ANDROID_API__ >= 9
@@ -54,9 +59,13 @@ class SileroVadModel::Impl {
     }
 
-    min_silence_samples_ =
-        sample_rate_ * config_.silero_vad.min_silence_duration;
+    min_silence_samples_ = static_cast<int32_t>(
+        sample_rate_ * config_.silero_vad.min_silence_duration);
 
-    min_speech_samples_ = sample_rate_ * config_.silero_vad.min_speech_duration;
+    min_speech_samples_ = static_cast<int32_t>(
+        sample_rate_ * config_.silero_vad.min_speech_duration);
+
+    max_speech_samples_ = static_cast<int32_t>(
+        sample_rate_ * config_.silero_vad.max_speech_duration);
   }
 #endif
 
@@ -155,14 +164,34 @@ class SileroVadModel::Impl {
 
   int32_t MinSpeechDurationSamples() const { return min_speech_samples_; }
 
+  int32_t MaxSpeechDurationSamples() const { return max_speech_samples_; }
+
+  float Threshold() const { return config_.silero_vad.threshold; }
+
   void SetMinSilenceDuration(float s) {
-    min_silence_samples_ = sample_rate_ * s;
+    min_silence_samples_ = static_cast<int32_t>(sample_rate_ * s);
+  }
+
+  void SetMinSpeechDuration(float s) {
+    min_speech_samples_ = static_cast<int32_t>(sample_rate_ * s);
+  }
+
+  void SetMaxSpeechDuration(float s) {
+    max_speech_samples_ = static_cast<int32_t>(sample_rate_ * s);
   }
 
   void SetThreshold(float threshold) {
     config_.silero_vad.threshold = threshold;
   }
 
+  float Run(const float *samples, int32_t n) {
+    if (is_v5_) {
+      return RunV5(samples, n);
+    } else {
+      return RunV4(samples, n);
+    }
+  }
+
  private:
   void Init(void *model_data, size_t model_data_length) {
     sess_ = std::make_unique<Ort::Session>(env_, model_data, model_data_length,
@@ -335,14 +364,6 @@ class SileroVadModel::Impl {
     }
   }
 
-  float Run(const float *samples, int32_t n) {
-    if (is_v5_) {
-      return RunV5(samples, n);
-    } else {
-      return RunV4(samples, n);
-    }
-  }
-
   float RunV5(const float *samples, int32_t n) {
     auto memory_info =
         Ort::MemoryInfo::CreateCpu(OrtDeviceAllocator, OrtMemTypeDefault);
@@ -418,6 +439,7 @@ class SileroVadModel::Impl {
   int64_t sample_rate_;
   int32_t min_silence_samples_;
   int32_t min_speech_samples_;
+  int32_t max_speech_samples_;
 
   bool triggered_ = false;
   int32_t current_sample_ = 0;
@@ -457,12 +479,30 @@ int32_t SileroVadModel::MinSpeechDurationSamples() const {
   return impl_->MinSpeechDurationSamples();
 }
 
+int32_t SileroVadModel::MaxSpeechDurationSamples() const {
+  return impl_->MaxSpeechDurationSamples();
+}
+
+float SileroVadModel::Threshold() const { return impl_->Threshold(); }
+
 void SileroVadModel::SetMinSilenceDuration(float s) {
   impl_->SetMinSilenceDuration(s);
 }
 
+void SileroVadModel::SetMinSpeechDuration(float s) {
+  impl_->SetMinSpeechDuration(s);
+}
+
 void SileroVadModel::SetThreshold(float threshold) {
   impl_->SetThreshold(threshold);
 }
 
+void SileroVadModel::SetMaxSpeechDuration(float s) {
+  impl_->SetMaxSpeechDuration(s);
+}
+
+float SileroVadModel::Run(const float *samples, int32_t n) {
+  return impl_->Run(samples, n);
+}
+
 }  // namespace sherpa_onnx
diff --git a/sherpa-onnx/csrc/silero-vad-model.h b/sherpa-onnx/csrc/silero-vad-model.h
@@ -37,6 +37,8 @@ class SileroVadModel : public VadModel {
    */
   bool IsSpeech(const float *samples, int32_t n) override;
 
+  float Run(const float *samples, int32_t n);
+
   // For silero vad V4, it is WindowShift().
   // For silero vad V5, it is WindowShift()+64 for 16kHz and
   //                          WindowShift()+32 for 8kHz
@@ -47,9 +49,13 @@ class SileroVadModel : public VadModel {
 
   int32_t MinSilenceDurationSamples() const override;
   int32_t MinSpeechDurationSamples() const override;
+  int32_t MaxSpeechDurationSamples() const;
+  float Threshold() const;
 
   void SetMinSilenceDuration(float s) override;
+  void SetMinSpeechDuration(float s);
+  void SetMaxSpeechDuration(float s);
   void SetThreshold(float threshold) override;
 
  private:
   class Impl;