first working version
csukuangfj committed Jul 18, 2024
1 parent df8f986 commit 44433a3
Showing 14 changed files with 434 additions and 38 deletions.
25 changes: 24 additions & 1 deletion .github/scripts/test-offline-ctc.sh
@@ -15,7 +15,30 @@ echo "PATH: $PATH"

which $EXE

if false; then
log "------------------------------------------------------------"
log "Run SenseVoice models"
log "------------------------------------------------------------"
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2
tar xvf sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2
rm sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2
repo=sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17

for m in model.onnx model.int8.onnx; do
  for w in zh en yue ja ko; do
    for use_itn in 0 1; do
      echo "$m $w $use_itn"
      time $EXE \
        --tokens=$repo/tokens.txt \
        --sense-voice-model=$repo/$m \
        --sense-voice-use-itn=$use_itn \
        $repo/test_wavs/$w.wav
    done
  done
done

rm -rf $repo

if true; then
# It has problems with onnxruntime 1.18
log "------------------------------------------------------------"
log "Run Wenet models"
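For reference, the SenseVoice test matrix added above (two model files, five test wavs, ITN on and off) can also be driven from Python. The following is only a sketch: it assumes the model directory has already been extracted as in the script, and that EXE points at an offline decoder binary accepting the same flags; the binary path shown is hypothetical.

import subprocess
from itertools import product

# Assumptions: EXE is an offline decoder binary that accepts the flags used
# in the shell script above; repo is the extracted model directory.
EXE = "./build/bin/sherpa-onnx-offline"  # hypothetical path
repo = "sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17"

models = ["model.onnx", "model.int8.onnx"]
waves = ["zh", "en", "yue", "ja", "ko"]

for m, w, use_itn in product(models, waves, (0, 1)):
    print(m, w, use_itn)
    subprocess.run(
        [
            EXE,
            f"--tokens={repo}/tokens.txt",
            f"--sense-voice-model={repo}/{m}",
            f"--sense-voice-use-itn={use_itn}",
            f"{repo}/test_wavs/{w}.wav",
        ],
        check=True,  # stop as soon as one decoding run exits non-zero
    )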
3 changes: 3 additions & 0 deletions .github/workflows/export-sense-voice-to-onnx.yaml
@@ -1,6 +1,9 @@
name: export-sense-voice-to-onnx

on:
  push:
    branches:
      - cpp-sense-voice
  workflow_dispatch:

concurrency:
9 changes: 7 additions & 2 deletions scripts/sense-voice/export-onnx.py
@@ -162,7 +162,9 @@ def main():
"neg_mean": neg_mean,
"inv_stddev": inv_stddev,
"model_type": "sense_voice_ctc",
"version": "1",
# version 1: Use QInt8
# version 2: Use QUInt8
"version": "2",
"model_author": "iic",
"maintainer": "k2-fsa",
"vocab_size": vocab_size,
@@ -185,7 +187,10 @@
model_input=filename,
model_output=filename_int8,
op_types_to_quantize=["MatMul"],
weight_type=QuantType.QInt8,
# Note that we have to use QUInt8 here.
#
# When QInt8 is used, C++ onnxruntime produces incorrect results
weight_type=QuantType.QUInt8,
)


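The two changes to export-onnx.py go together: the int8 export now writes QUInt8 weights, and the metadata version is bumped to "2" so consumers can tell the two formats apart. A minimal, self-contained sketch of that pattern follows; the file names and the separate add-metadata step are illustrative, not copied from the script.

import onnx
from onnxruntime.quantization import QuantType, quantize_dynamic

filename = "model.onnx"            # hypothetical path to the exported float32 model
filename_int8 = "model.int8.onnx"  # hypothetical output path

# Record the metadata version so downstream code can distinguish
# version 1 (QInt8 weights) from version 2 (QUInt8 weights).
model = onnx.load(filename)
meta = model.metadata_props.add()
meta.key = "version"
meta.value = "2"
onnx.save(model, filename)

# Weight-only dynamic quantization of MatMul nodes.
# QUInt8 is chosen because QInt8 weights produced incorrect results
# with the C++ onnxruntime API (see the comment in the diff above).
quantize_dynamic(
    model_input=filename,
    model_output=filename_int8,
    op_types_to_quantize=["MatMul"],
    weight_type=QuantType.QUInt8,
)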
2 changes: 2 additions & 0 deletions sherpa-onnx/csrc/offline-ctc-model.cc
@@ -93,6 +93,7 @@ static ModelType GetModelType(char *model_data, size_t model_data_length,

std::unique_ptr<OfflineCtcModel> OfflineCtcModel::Create(
    const OfflineModelConfig &config) {
  // TODO(fangjun): Refactor it. We don't need to use model_type here
  ModelType model_type = ModelType::kUnknown;

  std::string filename;
@@ -148,6 +149,7 @@ std::unique_ptr<OfflineCtcModel> OfflineCtcModel::Create(

std::unique_ptr<OfflineCtcModel> OfflineCtcModel::Create(
    AAssetManager *mgr, const OfflineModelConfig &config) {
  // TODO(fangjun): Refactor it. We don't need to use model_type here
  ModelType model_type = ModelType::kUnknown;

  std::string filename;
1 change: 1 addition & 0 deletions sherpa-onnx/csrc/offline-model-config.cc
@@ -119,6 +119,7 @@ std::string OfflineModelConfig::ToString() const {
os << "tdnn=" << tdnn.ToString() << ", ";
os << "zipformer_ctc=" << zipformer_ctc.ToString() << ", ";
os << "wenet_ctc=" << wenet_ctc.ToString() << ", ";
os << "sense_voice=" << sense_voice.ToString() << ", ";
os << "telespeech_ctc=\"" << telespeech_ctc << "\", ";
os << "tokens=\"" << tokens << "\", ";
os << "num_threads=" << num_threads << ", ";
13 changes: 5 additions & 8 deletions sherpa-onnx/csrc/offline-recognizer-ctc-impl.h
@@ -27,10 +27,10 @@

namespace sherpa_onnx {

static OfflineRecognitionResult Convert(const OfflineCtcDecoderResult &src,
                                        const SymbolTable &sym_table,
                                        int32_t frame_shift_ms,
                                        int32_t subsampling_factor) {
OfflineRecognitionResult Convert(const OfflineCtcDecoderResult &src,
                                 const SymbolTable &sym_table,
                                 int32_t frame_shift_ms,
                                 int32_t subsampling_factor) {
  OfflineRecognitionResult r;
  r.tokens.reserve(src.tokens.size());
  r.timestamps.reserve(src.timestamps.size());
@@ -212,10 +212,7 @@ class OfflineRecognizerCtcImpl : public OfflineRecognizerImpl {
}
}

  OfflineRecognizerConfig GetConfig() const override {
    return config_;
  }

  OfflineRecognizerConfig GetConfig() const override { return config_; }

 private:
  // Decode a single stream.
45 changes: 45 additions & 0 deletions sherpa-onnx/csrc/offline-recognizer-impl.cc
@@ -21,6 +21,7 @@
#include "sherpa-onnx/csrc/macros.h"
#include "sherpa-onnx/csrc/offline-recognizer-ctc-impl.h"
#include "sherpa-onnx/csrc/offline-recognizer-paraformer-impl.h"
#include "sherpa-onnx/csrc/offline-recognizer-sense-voice-impl.h"
#include "sherpa-onnx/csrc/offline-recognizer-transducer-impl.h"
#include "sherpa-onnx/csrc/offline-recognizer-transducer-nemo-impl.h"
#include "sherpa-onnx/csrc/offline-recognizer-whisper-impl.h"
@@ -31,6 +32,28 @@ namespace sherpa_onnx {

std::unique_ptr<OfflineRecognizerImpl> OfflineRecognizerImpl::Create(
    const OfflineRecognizerConfig &config) {
  if (!config.model_config.sense_voice.model.empty()) {
    return std::make_unique<OfflineRecognizerSenseVoiceImpl>(config);
  }

  if (!config.model_config.paraformer.model.empty()) {
    return std::make_unique<OfflineRecognizerParaformerImpl>(config);
  }

  if (!config.model_config.nemo_ctc.model.empty() ||
      !config.model_config.zipformer_ctc.model.empty() ||
      !config.model_config.tdnn.model.empty() ||
      !config.model_config.wenet_ctc.model.empty()) {
    return std::make_unique<OfflineRecognizerCtcImpl>(config);
  }

  if (!config.model_config.whisper.encoder.empty()) {
    return std::make_unique<OfflineRecognizerWhisperImpl>(config);
  }

  // TODO(fangjun): Refactor it. We only need to use model type for the
  // following models:
  // 1. transducer and nemo_transducer
  if (!config.model_config.model_type.empty()) {
    const auto &model_type = config.model_config.model_type;
    if (model_type == "transducer") {
@@ -180,6 +203,28 @@ std::unique_ptr<OfflineRecognizerImpl> OfflineRecognizerImpl::Create(
#if __ANDROID_API__ >= 9
std::unique_ptr<OfflineRecognizerImpl> OfflineRecognizerImpl::Create(
    AAssetManager *mgr, const OfflineRecognizerConfig &config) {
  if (!config.model_config.sense_voice.model.empty()) {
    return std::make_unique<OfflineRecognizerSenseVoiceImpl>(mgr, config);
  }

  if (!config.model_config.paraformer.model.empty()) {
    return std::make_unique<OfflineRecognizerParaformerImpl>(mgr, config);
  }

  if (!config.model_config.nemo_ctc.model.empty() ||
      !config.model_config.zipformer_ctc.model.empty() ||
      !config.model_config.tdnn.model.empty() ||
      !config.model_config.wenet_ctc.model.empty()) {
    return std::make_unique<OfflineRecognizerCtcImpl>(mgr, config);
  }

  if (!config.model_config.whisper.encoder.empty()) {
    return std::make_unique<OfflineRecognizerWhisperImpl>(mgr, config);
  }

  // TODO(fangjun): Refactor it. We only need to use model type for the
  // following models:
  // 1. transducer and nemo_transducer
  if (!config.model_config.model_type.empty()) {
    const auto &model_type = config.model_config.model_type;
    if (model_type == "transducer") {
21 changes: 12 additions & 9 deletions sherpa-onnx/csrc/offline-recognizer-paraformer-impl.h
@@ -102,9 +102,7 @@ class OfflineRecognizerParaformerImpl : public OfflineRecognizerImpl {
exit(-1);
}

// Paraformer models assume input samples are in the range
// [-32768, 32767], so we set normalize_samples to false
config_.feat_config.normalize_samples = false;
InitFeatConfig();
}

#if __ANDROID_API__ >= 9
Expand All @@ -124,9 +122,7 @@ class OfflineRecognizerParaformerImpl : public OfflineRecognizerImpl {
exit(-1);
}

// Paraformer models assume input samples are in the range
// [-32768, 32767], so we set normalize_samples to false
config_.feat_config.normalize_samples = false;
InitFeatConfig();
}
#endif

@@ -211,11 +207,18 @@
}
}

  OfflineRecognizerConfig GetConfig() const override {
    return config_;
  }
  OfflineRecognizerConfig GetConfig() const override { return config_; }

 private:
  void InitFeatConfig() {
    // Paraformer models assume input samples are in the range
    // [-32768, 32767], so we set normalize_samples to false
    config_.feat_config.normalize_samples = false;
    config_.feat_config.window_type = "hamming";
    config_.feat_config.high_freq = 0;
    config_.feat_config.snip_edges = true;
  }

  std::vector<float> ApplyLFR(const std::vector<float> &in) const {
    int32_t lfr_window_size = model_->LfrWindowSize();
    int32_t lfr_window_shift = model_->LfrWindowShift();