first working version
csukuangfj committed Jul 18, 2024
1 parent df8f986 commit 44433a3
Showing 14 changed files with 434 additions and 38 deletions.
25 changes: 24 additions & 1 deletion .github/scripts/test-offline-ctc.sh
@@ -15,7 +15,30 @@ echo "PATH: $PATH"

which $EXE

if false; then
log "------------------------------------------------------------"
log "Run SenseVoice models"
log "------------------------------------------------------------"
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2
tar xvf sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2
rm sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2
repo=sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17

for m in model.onnx model.int8.onnx; do
  for w in zh en yue ja ko; do
    for use_itn in 0 1; do
      echo "$m $w $use_itn"
      time $EXE \
        --tokens=$repo/tokens.txt \
        --sense-voice-model=$repo/$m \
        --sense-voice-use-itn=$use_itn \
        $repo/test_wavs/$w.wav
    done
  done
done

rm -rf $repo

if true; then
# It has problems with onnxruntime 1.18
log "------------------------------------------------------------"
log "Run Wenet models"
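For reference, the SenseVoice test matrix added above (two model files, five test wavs, ITN on and off) can also be driven from Python. The following is only a sketch: it assumes the model directory has already been extracted as in the script, and that EXE points at an offline decoder binary accepting the same flags; the binary path shown is hypothetical.

import subprocess
from itertools import product

# Assumptions: EXE is an offline decoder binary that accepts the flags used
# in the shell script above; repo is the extracted model directory.
EXE = "./build/bin/sherpa-onnx-offline"  # hypothetical path
repo = "sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17"

models = ["model.onnx", "model.int8.onnx"]
waves = ["zh", "en", "yue", "ja", "ko"]

for m, w, use_itn in product(models, waves, (0, 1)):
    print(m, w, use_itn)
    subprocess.run(
        [
            EXE,
            f"--tokens={repo}/tokens.txt",
            f"--sense-voice-model={repo}/{m}",
            f"--sense-voice-use-itn={use_itn}",
            f"{repo}/test_wavs/{w}.wav",
        ],
        check=True,  # stop as soon as one decoding run exits non-zero
    )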
3 changes: 3 additions & 0 deletions .github/workflows/export-sense-voice-to-onnx.yaml
@@ -1,6 +1,9 @@
name: export-sense-voice-to-onnx

on:
  push:
    branches:
      - cpp-sense-voice
  workflow_dispatch:

concurrency:
9 changes: 7 additions & 2 deletions scripts/sense-voice/export-onnx.py
@@ -162,7 +162,9 @@ def main():
"neg_mean": neg_mean,
"inv_stddev": inv_stddev,
"model_type": "sense_voice_ctc",
"version": "1",
# version 1: Use QInt8
# version 2: Use QUInt8
"version": "2",
"model_author": "iic",
"maintainer": "k2-fsa",
"vocab_size": vocab_size,
@@ -185,7 +187,10 @@
model_input=filename,
model_output=filename_int8,
op_types_to_quantize=["MatMul"],
weight_type=QuantType.QInt8,
# Note that we have to use QUInt8 here.
#
# When QInt8 is used, C++ onnxruntime produces incorrect results
weight_type=QuantType.QUInt8,
)


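The two changes to export-onnx.py go together: the int8 export now writes QUInt8 weights, and the metadata version is bumped to "2" so consumers can tell the two formats apart. A minimal, self-contained sketch of that pattern follows; the file names and the separate add-metadata step are illustrative, not copied from the script.

import onnx
from onnxruntime.quantization import QuantType, quantize_dynamic

filename = "model.onnx"            # hypothetical path to the exported float32 model
filename_int8 = "model.int8.onnx"  # hypothetical output path

# Record the metadata version so downstream code can distinguish
# version 1 (QInt8 weights) from version 2 (QUInt8 weights).
model = onnx.load(filename)
meta = model.metadata_props.add()
meta.key = "version"
meta.value = "2"
onnx.save(model, filename)

# Weight-only dynamic quantization of MatMul nodes.
# QUInt8 is chosen because QInt8 weights produced incorrect results
# with the C++ onnxruntime API (see the comment in the diff above).
quantize_dynamic(
    model_input=filename,
    model_output=filename_int8,
    op_types_to_quantize=["MatMul"],
    weight_type=QuantType.QUInt8,
)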
2 changes: 2 additions & 0 deletions sherpa-onnx/csrc/offline-ctc-model.cc
@@ -93,6 +93,7 @@ static ModelType GetModelType(char *model_data, size_t model_data_length,

std::unique_ptr<OfflineCtcModel> OfflineCtcModel::Create(
    const OfflineModelConfig &config) {
  // TODO(fangjun): Refactor it. We don't need to use model_type here
  ModelType model_type = ModelType::kUnknown;

  std::string filename;
@@ -148,6 +149,7 @@ std::unique_ptr<OfflineCtcModel> OfflineCtcModel::Create(

std::unique_ptr<OfflineCtcModel> OfflineCtcModel::Create(
    AAssetManager *mgr, const OfflineModelConfig &config) {
  // TODO(fangjun): Refactor it. We don't need to use model_type here
  ModelType model_type = ModelType::kUnknown;

  std::string filename;
1 change: 1 addition & 0 deletions sherpa-onnx/csrc/offline-model-config.cc
@@ -119,6 +119,7 @@ std::string OfflineModelConfig::ToString() const {
os << "tdnn=" << tdnn.ToString() << ", ";
os << "zipformer_ctc=" << zipformer_ctc.ToString() << ", ";
os << "wenet_ctc=" << wenet_ctc.ToString() << ", ";
os << "sense_voice=" << sense_voice.ToString() << ", ";
os << "telespeech_ctc=\"" << telespeech_ctc << "\", ";
os << "tokens=\"" << tokens << "\", ";
os << "num_threads=" << num_threads << ", ";
13 changes: 5 additions & 8 deletions sherpa-onnx/csrc/offline-recognizer-ctc-impl.h
@@ -27,10 +27,10 @@

namespace sherpa_onnx {

static OfflineRecognitionResult Convert(const OfflineCtcDecoderResult &src,
                                        const SymbolTable &sym_table,
                                        int32_t frame_shift_ms,
                                        int32_t subsampling_factor) {
OfflineRecognitionResult Convert(const OfflineCtcDecoderResult &src,
                                 const SymbolTable &sym_table,
                                 int32_t frame_shift_ms,
                                 int32_t subsampling_factor) {
  OfflineRecognitionResult r;
  r.tokens.reserve(src.tokens.size());
  r.timestamps.reserve(src.timestamps.size());
@@ -212,10 +212,7 @@ class OfflineRecognizerCtcImpl : public OfflineRecognizerImpl {
}
}

  OfflineRecognizerConfig GetConfig() const override {
    return config_;
  }

  OfflineRecognizerConfig GetConfig() const override { return config_; }

 private:
  // Decode a single stream.
45 changes: 45 additions & 0 deletions sherpa-onnx/csrc/offline-recognizer-impl.cc
@@ -21,6 +21,7 @@
#include "sherpa-onnx/csrc/macros.h"
#include "sherpa-onnx/csrc/offline-recognizer-ctc-impl.h"
#include "sherpa-onnx/csrc/offline-recognizer-paraformer-impl.h"
#include "sherpa-onnx/csrc/offline-recognizer-sense-voice-impl.h"
#include "sherpa-onnx/csrc/offline-recognizer-transducer-impl.h"
#include "sherpa-onnx/csrc/offline-recognizer-transducer-nemo-impl.h"
#include "sherpa-onnx/csrc/offline-recognizer-whisper-impl.h"
@@ -31,6 +32,28 @@ namespace sherpa_onnx {

std::unique_ptr<OfflineRecognizerImpl> OfflineRecognizerImpl::Create(
    const OfflineRecognizerConfig &config) {
  if (!config.model_config.sense_voice.model.empty()) {
    return std::make_unique<OfflineRecognizerSenseVoiceImpl>(config);
  }

  if (!config.model_config.paraformer.model.empty()) {
    return std::make_unique<OfflineRecognizerParaformerImpl>(config);
  }

  if (!config.model_config.nemo_ctc.model.empty() ||
      !config.model_config.zipformer_ctc.model.empty() ||
      !config.model_config.tdnn.model.empty() ||
      !config.model_config.wenet_ctc.model.empty()) {
    return std::make_unique<OfflineRecognizerCtcImpl>(config);
  }

  if (!config.model_config.whisper.encoder.empty()) {
    return std::make_unique<OfflineRecognizerWhisperImpl>(config);
  }

  // TODO(fangjun): Refactor it. We only need to use model type for the
  // following models:
  // 1. transducer and nemo_transducer
  if (!config.model_config.model_type.empty()) {
    const auto &model_type = config.model_config.model_type;
    if (model_type == "transducer") {
@@ -180,6 +203,28 @@ std::unique_ptr<OfflineRecognizerImpl> OfflineRecognizerImpl::Create(
#if __ANDROID_API__ >= 9
std::unique_ptr<OfflineRecognizerImpl> OfflineRecognizerImpl::Create(
    AAssetManager *mgr, const OfflineRecognizerConfig &config) {
  if (!config.model_config.sense_voice.model.empty()) {
    return std::make_unique<OfflineRecognizerSenseVoiceImpl>(mgr, config);
  }

  if (!config.model_config.paraformer.model.empty()) {
    return std::make_unique<OfflineRecognizerParaformerImpl>(mgr, config);
  }

  if (!config.model_config.nemo_ctc.model.empty() ||
      !config.model_config.zipformer_ctc.model.empty() ||
      !config.model_config.tdnn.model.empty() ||
      !config.model_config.wenet_ctc.model.empty()) {
    return std::make_unique<OfflineRecognizerCtcImpl>(mgr, config);
  }

  if (!config.model_config.whisper.encoder.empty()) {
    return std::make_unique<OfflineRecognizerWhisperImpl>(mgr, config);
  }

  // TODO(fangjun): Refactor it. We only need to use model type for the
  // following models:
  // 1. transducer and nemo_transducer
  if (!config.model_config.model_type.empty()) {
    const auto &model_type = config.model_config.model_type;
    if (model_type == "transducer") {
21 changes: 12 additions & 9 deletions sherpa-onnx/csrc/offline-recognizer-paraformer-impl.h
@@ -102,9 +102,7 @@ class OfflineRecognizerParaformerImpl : public OfflineRecognizerImpl {
exit(-1);
}

// Paraformer models assume input samples are in the range
// [-32768, 32767], so we set normalize_samples to false
config_.feat_config.normalize_samples = false;
InitFeatConfig();
}

#if __ANDROID_API__ >= 9
Expand All @@ -124,9 +122,7 @@ class OfflineRecognizerParaformerImpl : public OfflineRecognizerImpl {
exit(-1);
}

// Paraformer models assume input samples are in the range
// [-32768, 32767], so we set normalize_samples to false
config_.feat_config.normalize_samples = false;
InitFeatConfig();
}
#endif

@@ -211,11 +207,18 @@
}
}

  OfflineRecognizerConfig GetConfig() const override {
    return config_;
  }
  OfflineRecognizerConfig GetConfig() const override { return config_; }

 private:
  void InitFeatConfig() {
    // Paraformer models assume input samples are in the range
    // [-32768, 32767], so we set normalize_samples to false
    config_.feat_config.normalize_samples = false;
    config_.feat_config.window_type = "hamming";
    config_.feat_config.high_freq = 0;
    config_.feat_config.snip_edges = true;
  }

  std::vector<float> ApplyLFR(const std::vector<float> &in) const {
    int32_t lfr_window_size = model_->LfrWindowSize();
    int32_t lfr_window_shift = model_->LfrWindowShift();