diff --git a/.flake8 b/.flake8 index 0f049afbc..d3bbe6b83 100644 --- a/.flake8 +++ b/.flake8 @@ -15,4 +15,5 @@ exclude = ./scripts, ./triton, ./sherpa/python/sherpa/__init__.py, + ./sherpa/csrc/test-data/test-offline-conformer-transducer-model.py, ./sherpa/python/sherpa/decode.py, diff --git a/.github/scripts/run-offline-ctc.sh b/.github/scripts/run-offline-ctc.sh new file mode 100755 index 000000000..6dbc61a86 --- /dev/null +++ b/.github/scripts/run-offline-ctc.sh @@ -0,0 +1,392 @@ +#!/usr/bin/env bash + +set -ex + +log() { + # This function is from espnet + local fname=${BASH_SOURCE[1]##*/} + echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*" +} + +log "==========================================================================" + +repo_url=https://huggingface.co/csukuangfj/icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09 +log "Start testing ${repo_url}" +repo=$(basename $repo_url) +log "Download pretrained model and test-data from $repo_url" + +GIT_LFS_SKIP_SMUDGE=1 git clone $repo_url +pushd $repo +git lfs pull --include "exp/cpu_jit.pt" +git lfs pull --include "data/lang_bpe_500/tokens.txt" +git lfs pull --include "data/lang_bpe_500/HLG.pt" +git lfs pull --include "data/lang_bpe_500/HLG_modified.pt" +popd + +log "Decoding with H" + +./build/bin/sherpa-offline \ + --nn-model=$repo/exp/cpu_jit.pt \ + --tokens=$repo/data/lang_bpe_500/tokens.txt \ + --use-gpu=false \ + $repo/test_wavs/1089-134686-0001.wav \ + $repo/test_wavs/1221-135766-0001.wav \ + $repo/test_wavs/1221-135766-0002.wav + +log "Decoding with HLG" + +./build/bin/sherpa-offline \ + --nn-model=$repo/exp/cpu_jit.pt \ + --tokens=$repo/data/lang_bpe_500/tokens.txt \ + --hlg=$repo/data/lang_bpe_500/HLG.pt \ + --use-gpu=false \ + $repo/test_wavs/1089-134686-0001.wav \ + $repo/test_wavs/1221-135766-0001.wav \ + $repo/test_wavs/1221-135766-0002.wav + +log "Decoding with HLG (modified H)" + +./build/bin/sherpa-offline \ + --nn-model=$repo/exp/cpu_jit.pt \ + --tokens=$repo/data/lang_bpe_500/tokens.txt \ + --hlg=$repo/data/lang_bpe_500/HLG_modified.pt \ + --use-gpu=false \ + $repo/test_wavs/1089-134686-0001.wav \ + $repo/test_wavs/1221-135766-0001.wav \ + $repo/test_wavs/1221-135766-0002.wav + +rm -rf $repo +log "End of testing ${repo_url}" +log "==========================================================================" + +repo_url=https://huggingface.co/csukuangfj/wenet-english-model +log "Start testing ${repo_url}" +repo=$(basename $repo_url) +log "Download pretrained model and test-data from $repo_url" + +GIT_LFS_SKIP_SMUDGE=1 git clone $repo_url +pushd $repo +git lfs pull --include "final.zip" +popd + +log "Decoding with H" + +./build/bin/sherpa-offline \ + --normalize-samples=false \ + --modified=true \ + --nn-model=$repo/final.zip \ + --tokens=$repo/units.txt \ + --use-gpu=false \ + $repo/test_wavs/1089-134686-0001.wav \ + $repo/test_wavs/1221-135766-0001.wav \ + $repo/test_wavs/1221-135766-0002.wav + +rm -rf $repo +log "End of testing ${repo_url}" +log "==========================================================================" + + +repo_url=https://huggingface.co/csukuangfj/wenet-chinese-model +log "Start testing ${repo_url}" +repo=$(basename $repo_url) +log "Download pretrained model and test-data from $repo_url" + +GIT_LFS_SKIP_SMUDGE=1 git clone $repo_url +pushd $repo +git lfs pull --include "final.zip" +popd + +log "Decoding with H" + +./build/bin/sherpa-offline \ + --normalize-samples=false \ + --modified=true \ + --nn-model=$repo/final.zip \ + --tokens=$repo/units.txt \ + 
--use-gpu=false \ + $repo/test_wavs/BAC009S0764W0121.wav \ + $repo/test_wavs/BAC009S0764W0122.wav \ + $repo/test_wavs/BAC009S0764W0123.wav \ + $repo/test_wavs/DEV_T0000000000.wav \ + $repo/test_wavs/DEV_T0000000001.wav \ + $repo/test_wavs/DEV_T0000000002.wav + +rm -rf $repo +log "End of testing ${repo_url}" +log "==========================================================================" + +repo_url=https://huggingface.co/csukuangfj/wav2vec2.0-torchaudio +log "Start testing ${repo_url}" +repo=$(basename $repo_url) +log "Download pretrained model and test-data from $repo_url" + +GIT_LFS_SKIP_SMUDGE=1 git clone $repo_url +pushd $repo +git lfs pull --include "wav2vec2_asr_base_10m.pt" +git lfs pull --include "voxpopuli_asr_base_10k_de.pt" +popd + +log "Decoding with H" + +./build/bin/sherpa-offline \ + --nn-model=$repo/wav2vec2_asr_base_10m.pt \ + --tokens=$repo/tokens.txt \ + --use-gpu=false \ + $repo/test_wavs/1089-134686-0001.wav \ + $repo/test_wavs/1221-135766-0001.wav \ + $repo/test_wavs/1221-135766-0002.wav + +log "Decoding with H (voxpopuli_asr_base_10k_de)" + +./build/bin/sherpa-offline \ + --nn-model=$repo/voxpopuli_asr_base_10k_de.pt \ + --tokens=$repo/tokens-de.txt \ + --use-gpu=false \ + $repo/test_wavs/20120315-0900-PLENARY-14-de_20120315.wav \ + $repo/test_wavs/20170517-0900-PLENARY-16-de_20170517.wav + +rm -rf $repo +log "End of testing ${repo_url}" +log "==========================================================================" + +repo_url=https://huggingface.co/wgb14/icefall-asr-gigaspeech-conformer-ctc +log "Start testing ${repo_url}" +repo=$(basename $repo_url) +log "Download pretrained model and test-data from $repo_url" + +GIT_LFS_SKIP_SMUDGE=1 git clone $repo_url +pushd $repo +git lfs pull --include "exp/cpu_jit.pt" +git lfs pull --include "data/lang_bpe_500/HLG.pt" +git lfs pull --include "data/lang_bpe_500/tokens.txt" + +mkdir test_wavs +cd test_wavs +wget https://huggingface.co/csukuangfj/wav2vec2.0-torchaudio/resolve/main/test_wavs/1089-134686-0001.wav +wget https://huggingface.co/csukuangfj/wav2vec2.0-torchaudio/resolve/main/test_wavs/1221-135766-0001.wav +wget https://huggingface.co/csukuangfj/wav2vec2.0-torchaudio/resolve/main/test_wavs/1221-135766-0002.wav + +popd + +log "Decoding with H" + +./build/bin/sherpa-offline \ + --nn-model=$repo/exp/cpu_jit.pt \ + --tokens=$repo/data/lang_bpe_500/tokens.txt \ + --use-gpu=false \ + $repo/test_wavs/1089-134686-0001.wav \ + $repo/test_wavs/1221-135766-0001.wav \ + $repo/test_wavs/1221-135766-0002.wav + +log "Decoding with HLG" + +./build/bin/sherpa-offline \ + --nn-model=$repo/exp/cpu_jit.pt \ + --tokens=$repo/data/lang_bpe_500/tokens.txt \ + --hlg=$repo/data/lang_bpe_500/HLG.pt \ + --use-gpu=false \ + $repo/test_wavs/1089-134686-0001.wav \ + $repo/test_wavs/1221-135766-0001.wav \ + $repo/test_wavs/1221-135766-0002.wav + +rm -rf $repo +log "End of testing ${repo_url}" +log "==========================================================================" + +repo_url=https://huggingface.co/pkufool/icefall_asr_aishell_conformer_ctc +log "Start testing ${repo_url}" +repo=$(basename $repo_url) +log "Download pretrained model and test-data from $repo_url" + +GIT_LFS_SKIP_SMUDGE=1 git clone $repo_url +pushd $repo +git lfs pull --include "exp/cpu_jit.pt" +git lfs pull --include "data/lang_char/HLG.pt" +popd + +log "Decoding with H" +./build/bin/sherpa-offline \ + --nn-model=$repo/exp/cpu_jit.pt \ + --tokens=$repo/data/lang_char/tokens.txt \ + --use-gpu=false \ + $repo/test_waves/BAC009S0764W0121.wav \ + 
$repo/test_waves/BAC009S0764W0122.wav \ + $repo/test_waves/BAC009S0764W0123.wav + +log "Decoding with HLG" + +./build/bin/sherpa-offline \ + --nn-model=$repo/exp/cpu_jit.pt \ + --hlg=$repo/data/lang_char/HLG.pt \ + --tokens=$repo/data/lang_char/tokens.txt \ + --use-gpu=false \ + $repo/test_waves/BAC009S0764W0121.wav \ + $repo/test_waves/BAC009S0764W0122.wav \ + $repo/test_waves/BAC009S0764W0123.wav + +rm -rf $repo +log "End of testing ${repo_url}" +log "==========================================================================" + +repo_url=https://huggingface.co/AmirHussein/icefall-asr-mgb2-conformer_ctc-2022-27-06 +log "Start testing ${repo_url}" +repo=$(basename $repo_url) +log "Download pretrained model and test-data from $repo_url" + +GIT_LFS_SKIP_SMUDGE=1 git clone $repo_url +pushd $repo +git lfs pull --include "exp/cpu_jit.pt" +git lfs pull --include "data/lang_bpe_5000/HLG.pt" +git lfs pull --include "data/lang_bpe_5000/tokens.txt" +popd + +log "Decoding with H" + +./build/bin/sherpa-offline \ + --nn-model=$repo/exp/cpu_jit.pt \ + --tokens=$repo/data/lang_bpe_5000/tokens.txt \ + --use-gpu=false \ + $repo/test_wavs/94D37D38-B203-4FC0-9F3A-538F5C174920_spk-0001_seg-0053813:0054281.wav \ + $repo/test_wavs/94D37D38-B203-4FC0-9F3A-538F5C174920_spk-0001_seg-0051454:0052244.wav \ + $repo/test_wavs/94D37D38-B203-4FC0-9F3A-538F5C174920_spk-0001_seg-0052244:0053004.wav + +log "Decoding with HLG" + +./build/bin/sherpa-offline \ + --nn-model=$repo/exp/cpu_jit.pt \ + --hlg=$repo/data/lang_bpe_5000/HLG.pt \ + --tokens=$repo/data/lang_bpe_5000/tokens.txt \ + --use-gpu=false \ + $repo/test_wavs/94D37D38-B203-4FC0-9F3A-538F5C174920_spk-0001_seg-0053813:0054281.wav \ + $repo/test_wavs/94D37D38-B203-4FC0-9F3A-538F5C174920_spk-0001_seg-0051454:0052244.wav \ + $repo/test_wavs/94D37D38-B203-4FC0-9F3A-538F5C174920_spk-0001_seg-0052244:0053004.wav + +rm -rf $repo +log "End of testing ${repo_url}" +log "==========================================================================" + +repo_url=https://huggingface.co/videodanchik/icefall-asr-tedlium3-conformer-ctc2 +log "Start testing ${repo_url}" +repo=$(basename $repo_url) +log "Download pretrained model and test-data from $repo_url" + +GIT_LFS_SKIP_SMUDGE=1 git clone $repo_url +pushd $repo + +git lfs pull --include "exp/cpu_jit.pt" + +git lfs pull --include "data/lang_bpe/HLG.pt" +git lfs pull --include "data/lang_bpe/tokens.txt" + +git lfs pull --include "test_wavs/DanBarber_2010-219.wav" +git lfs pull --include "test_wavs/DanielKahneman_2010-157.wav" +git lfs pull --include "test_wavs/RobertGupta_2010U-15.wav" + +popd + +log "Decoding with H" +./build/bin/sherpa-offline \ + --nn-model=$repo/exp/cpu_jit.pt \ + --tokens=$repo/data/lang_bpe/tokens.txt \ + $repo/test_wavs/DanBarber_2010-219.wav \ + $repo/test_wavs/DanielKahneman_2010-157.wav \ + $repo/test_wavs/RobertGupta_2010U-15.wav + +log "Decoding with HLG" +./build/bin/sherpa-offline \ + --nn-model=$repo/exp/cpu_jit.pt \ + --hlg=$repo/data/lang_bpe/HLG.pt \ + --tokens=$repo/data/lang_bpe/tokens.txt \ + $repo/test_wavs/DanBarber_2010-219.wav \ + $repo/test_wavs/DanielKahneman_2010-157.wav \ + $repo/test_wavs/RobertGupta_2010U-15.wav + +rm -rf $repo +log "End of testing ${repo_url}" +log "==========================================================================" + +repo_url=https://huggingface.co/pkufool/icefall_asr_librispeech_conformer_ctc +log "Start testing ${repo_url}" +repo=$(basename $repo_url) +log "Download pretrained model and test-data from $repo_url" + +GIT_LFS_SKIP_SMUDGE=1 git clone $repo_url 
+pushd $repo +git lfs pull --include "exp/cpu_jit.pt" +git lfs pull --include "data/lang_bpe/HLG.pt" +popd + +log "Decoding with H" +./build/bin/sherpa-offline \ + --nn-model=$repo/exp/cpu_jit.pt \ + --tokens=$repo/data/lang_bpe/tokens.txt \ + $repo/test_wavs/1089-134686-0001.wav \ + $repo/test_wavs/1221-135766-0001.wav \ + $repo/test_wavs/1221-135766-0002.wav + +log "Decoding with HLG" +./build/bin/sherpa-offline \ + --nn-model=$repo/exp/cpu_jit.pt \ + --hlg=$repo/data/lang_bpe/HLG.pt \ + --tokens=$repo/data/lang_bpe/tokens.txt \ + $repo/test_wavs/1089-134686-0001.wav \ + $repo/test_wavs/1221-135766-0001.wav \ + $repo/test_wavs/1221-135766-0002.wav + +rm -rf $repo +log "End of testing ${repo_url}" +log "==========================================================================" + +repo_url=https://huggingface.co/csukuangfj/sherpa-nemo-ctc-en-citrinet-512 +log "Start testing ${repo_url}" +repo=$(basename $repo_url) +log "Download pretrained model and test-data from $repo_url" + +GIT_LFS_SKIP_SMUDGE=1 git clone $repo_url +pushd $repo +git lfs pull --include "model.pt" +popd + +log "Decoding with H" + +./build/bin/sherpa-offline \ + --nn-model=$repo/model.pt \ + --tokens=$repo/tokens.txt \ + --use-gpu=false \ + --modified=false \ + --nemo-normalize=per_feature \ + $repo/test_wavs/0.wav \ + $repo/test_wavs/1.wav \ + $repo/test_wavs/2.wav + +rm -rf $repo +log "End of testing ${repo_url}" +log "==========================================================================" + +repo_url=https://huggingface.co/csukuangfj/sherpa-nemo-ctc-zh-citrinet-512 +log "Start testing ${repo_url}" +repo=$(basename $repo_url) +log "Download pretrained model and test-data from $repo_url" + +GIT_LFS_SKIP_SMUDGE=1 git clone $repo_url +pushd $repo +git lfs pull --include "model.pt" +popd + +log "Decoding with H" + +# the vocab size is huge (e.g., >5000), so we use modified=true here +# to avoid OOM in CI +./build/bin/sherpa-offline \ + --nn-model=$repo/model.pt \ + --tokens=$repo/tokens.txt \ + --use-gpu=false \ + --modified=true \ + --nemo-normalize=per_feature \ + $repo/test_wavs/0.wav \ + $repo/test_wavs/1.wav \ + $repo/test_wavs/2.wav + +rm -rf $repo +log "End of testing ${repo_url}" +log "==========================================================================" diff --git a/.github/scripts/run-offline-sense-voice.sh b/.github/scripts/run-offline-sense-voice.sh new file mode 100755 index 000000000..89a00ea0d --- /dev/null +++ b/.github/scripts/run-offline-sense-voice.sh @@ -0,0 +1,27 @@ +#!/usr/bin/env bash + +set -ex + +log() { + # This function is from espnet + local fname=${BASH_SOURCE[1]##*/} + echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*" +} + +log "==========================================================================" +curl -SL -O https://github.com/k2-fsa/sherpa/releases/download/asr-models/sherpa-sense-voice-zh-en-ja-ko-yue-2025-01-06.tar.bz2 +tar xvf sherpa-sense-voice-zh-en-ja-ko-yue-2025-01-06.tar.bz2 +rm sherpa-sense-voice-zh-en-ja-ko-yue-2025-01-06.tar.bz2 +ls -lh sherpa-sense-voice-zh-en-ja-ko-yue-2025-01-06 + +./build/bin/sherpa-offline \ + --debug=1 \ + --sense-voice-model=./sherpa-sense-voice-zh-en-ja-ko-yue-2025-01-06/model.pt \ + --tokens=./sherpa-sense-voice-zh-en-ja-ko-yue-2025-01-06/tokens.txt \ + ./sherpa-sense-voice-zh-en-ja-ko-yue-2025-01-06/test_wavs/en.wav \ + ./sherpa-sense-voice-zh-en-ja-ko-yue-2025-01-06/test_wavs/ja.wav \ + ./sherpa-sense-voice-zh-en-ja-ko-yue-2025-01-06/test_wavs/ko.wav \ + 
./sherpa-sense-voice-zh-en-ja-ko-yue-2025-01-06/test_wavs/yue.wav \ + ./sherpa-sense-voice-zh-en-ja-ko-yue-2025-01-06/test_wavs/zh.wav + +rm -rf sherpa-sense-voice-zh-en-ja-ko-yue-2025-01-06 diff --git a/.github/scripts/run-offline-transducer.sh b/.github/scripts/run-offline-transducer.sh new file mode 100755 index 000000000..b7c6327f5 --- /dev/null +++ b/.github/scripts/run-offline-transducer.sh @@ -0,0 +1,483 @@ +#!/usr/bin/env bash + +# This file test ALL known offline transducer models + +set -ex + +log() { + # This function is from espnet + local fname=${BASH_SOURCE[1]##*/} + echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*" +} + +log "==========================================================================" + +repo_url=https://huggingface.co/csukuangfj/icefall-asr-gigaspeech-pruned-transducer-stateless2 + +log "Start testing ${repo_url}" +repo=$(basename $repo_url) +log "Download pretrained model and test-data from $repo_url" + +GIT_LFS_SKIP_SMUDGE=1 git clone $repo_url +pushd $repo +git lfs pull --include "exp/cpu_jit-iter-3488000-avg-15.pt" +git lfs pull --include "data/lang_bpe_500/bpe.model" + +mkdir test_wavs +cd test_wavs +wget https://huggingface.co/csukuangfj/wav2vec2.0-torchaudio/resolve/main/test_wavs/1089-134686-0001.wav +wget https://huggingface.co/csukuangfj/wav2vec2.0-torchaudio/resolve/main/test_wavs/1221-135766-0001.wav +wget https://huggingface.co/csukuangfj/wav2vec2.0-torchaudio/resolve/main/test_wavs/1221-135766-0002.wav + +cd ../exp +ln -s cpu_jit-iter-3488000-avg-15.pt cpu_jit.pt +popd + +./scripts/bpe_model_to_tokens.py $repo/data/lang_bpe_500/bpe.model > $repo/data/lang_bpe_500/tokens.txt + +for m in greedy_search modified_beam_search fast_beam_search; do + time ./build/bin/sherpa-offline \ + --decoding-method=$m \ + --nn-model=$repo/exp/cpu_jit.pt \ + --tokens=$repo/data/lang_bpe_500/tokens.txt \ + $repo/test_wavs/1089-134686-0001.wav \ + $repo/test_wavs/1221-135766-0001.wav \ + $repo/test_wavs/1221-135766-0002.wav +done + +rm -rf $repo +log "End of testing ${repo_url}" +log "==========================================================================" + +repo_url=https://huggingface.co/csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13 +log "Start testing ${repo_url}" +repo=$(basename $repo_url) +log "Download pretrained model and test-data from $repo_url" + +GIT_LFS_SKIP_SMUDGE=1 git clone $repo_url +pushd $repo +git lfs pull --include "exp/cpu_jit.pt" +git lfs pull --include "data/lang_bpe_500/LG.pt" +popd + +for m in greedy_search modified_beam_search fast_beam_search; do + time ./build/bin/sherpa-offline \ + --decoding-method=$m \ + --nn-model=$repo/exp/cpu_jit.pt \ + --tokens=$repo/data/lang_bpe_500/tokens.txt \ + $repo/test_wavs/1089-134686-0001.wav \ + $repo/test_wavs/1221-135766-0001.wav \ + $repo/test_wavs/1221-135766-0002.wav +done + +# For fast_beam_search with LG +time ./build/bin/sherpa-offline \ + --decoding-method=fast_beam_search \ + --nn-model=$repo/exp/cpu_jit.pt \ + --lg=$repo/data/lang_bpe_500/LG.pt \ + --tokens=$repo/data/lang_bpe_500/tokens.txt \ + $repo/test_wavs/1089-134686-0001.wav \ + $repo/test_wavs/1221-135766-0001.wav \ + $repo/test_wavs/1221-135766-0002.wav + +log "Test decoding wav.scp" + +.github/scripts/generate_wav_scp.sh + +for m in greedy_search modified_beam_search fast_beam_search; do + time ./build/bin/sherpa-offline \ + --decoding-method=$m \ + --nn-model=$repo/exp/cpu_jit.pt \ + --tokens=$repo/data/lang_bpe_500/tokens.txt \ + --use-wav-scp=true \ + scp:wav.scp 
\ + ark,scp,t:results-$m.ark,results-$m.scp + + head results-$m.scp results-$m.ark +done + +log "Test decoding feats.scp" + +export PYTHONPATH=$HOME/tmp/kaldifeat/build/lib:$HOME/tmp/kaldifeat/kaldifeat/python:$PYTHONPATH + +.github/scripts/generate_feats_scp.py scp:wav.scp ark,scp:feats.ark,feats.scp + +for m in greedy_search modified_beam_search fast_beam_search; do + time ./build/bin/sherpa-offline \ + --decoding-method=$m \ + --nn-model=$repo/exp/cpu_jit.pt \ + --tokens=$repo/data/lang_bpe_500/tokens.txt \ + --use-feats-scp=true \ + scp:feats.scp \ + ark,scp,t:results2-$m.ark,results2-$m.scp + + head results2-$m.scp results2-$m.ark +done + +rm -rf $repo +log "End of testing ${repo_url}" +log "==========================================================================" + + +repo_url=https://huggingface.co/csukuangfj/icefall-aishell-pruned-transducer-stateless3-2022-06-20 +log "Start testing ${repo_url}" +repo=$(basename $repo_url) +log "Download pretrained model and test-data (aishell) from $repo_url" + +GIT_LFS_SKIP_SMUDGE=1 git clone $repo_url +pushd $repo +git lfs pull --include "exp/cpu_jit-epoch-29-avg-5-torch-1.6.0.pt" +git lfs pull --include "data/lang_char/LG.pt" +cd exp +ln -sv cpu_jit-epoch-29-avg-5-torch-1.6.0.pt cpu_jit.pt +popd + +for m in greedy_search modified_beam_search fast_beam_search; do + time ./build/bin/sherpa-offline \ + --decoding-method=$m \ + --nn-model=$repo/exp/cpu_jit.pt \ + --tokens=$repo/data/lang_char/tokens.txt \ + $repo/test_wavs/BAC009S0764W0121.wav \ + $repo/test_wavs/BAC009S0764W0122.wav \ + $repo/test_wavs/BAC009S0764W0123.wav +done + +./build/bin/sherpa-offline \ + --decoding-method=fast_beam_search \ + --nn-model=$repo/exp/cpu_jit.pt \ + --lg=$repo/data/lang_char/LG.pt \ + --tokens=$repo/data/lang_char/tokens.txt \ + $repo/test_wavs/BAC009S0764W0121.wav \ + $repo/test_wavs/BAC009S0764W0122.wav \ + $repo/test_wavs/BAC009S0764W0123.wav + +.github/scripts/generate_wav_scp_aishell.sh + +for m in greedy_search modified_beam_search fast_beam_search; do + time ./build/bin/sherpa-offline \ + --decoding-method=$m \ + --nn-model=$repo/exp/cpu_jit.pt \ + --tokens=$repo/data/lang_char/tokens.txt \ + --use-wav-scp=true \ + scp:wav_aishell.scp \ + ark,scp,t:results-aishell-$m.ark,results-aishell-$m.scp + + head results-aishell-$m.scp results-aishell-$m.ark +done + +.github/scripts/generate_feats_scp.py scp:wav_aishell.scp ark,scp:feats_aishell.ark,feats_aishell.scp + +for m in greedy_search modified_beam_search fast_beam_search; do + time ./build/bin/sherpa-offline \ + --decoding-method=$m \ + --nn-model=$repo/exp/cpu_jit.pt \ + --tokens=$repo/data/lang_char/tokens.txt \ + --use-feats-scp=true \ + scp:feats_aishell.scp \ + ark,scp,t:results-aishell2-$m.ark,results-aishell2-$m.scp + + head results-aishell2-$m.scp results-aishell2-$m.ark +done + +rm -rf $repo +log "End of testing ${repo_url}" +log "==========================================================================" + +repo_url=https://huggingface.co/csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless7-2022-11-11 +log "Start testing ${repo_url}" +repo=$(basename $repo_url) +log "Download pretrained model and test-data from $repo_url" + +GIT_LFS_SKIP_SMUDGE=1 git clone $repo_url +pushd $repo +git lfs pull --include "exp/cpu_jit-torch-1.10.0.pt" +git lfs pull --include "data/lang_bpe_500/LG.pt" +cd exp +ln -s cpu_jit-torch-1.10.0.pt cpu_jit.pt +popd + +for m in greedy_search modified_beam_search fast_beam_search; do + time ./build/bin/sherpa-offline \ + --decoding-method=$m \ + 
--nn-model=$repo/exp/cpu_jit.pt \ + --tokens=$repo/data/lang_bpe_500/tokens.txt \ + $repo/test_wavs/1089-134686-0001.wav \ + $repo/test_wavs/1221-135766-0001.wav \ + $repo/test_wavs/1221-135766-0002.wav +done + +./build/bin/sherpa-offline \ + --decoding-method=fast_beam_search \ + --nn-model=$repo/exp/cpu_jit.pt \ + --lg=$repo/data/lang_bpe_500/LG.pt \ + --tokens=$repo/data/lang_bpe_500/tokens.txt \ + $repo/test_wavs/1089-134686-0001.wav \ + $repo/test_wavs/1221-135766-0001.wav \ + $repo/test_wavs/1221-135766-0002.wav + +rm -rf $repo +log "End of testing ${repo_url}" +log "==========================================================================" + +repo_url=https://huggingface.co/csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless8-2022-11-14 + +log "Start testing ${repo_url}" +repo=$(basename $repo_url) +log "Download pretrained model and test-data from $repo_url" + +GIT_LFS_SKIP_SMUDGE=1 git clone $repo_url +pushd $repo +git lfs pull --include "exp/cpu_jit.pt" +git lfs pull --include "data/lang_bpe_500/LG.pt" +popd + +for m in greedy_search modified_beam_search fast_beam_search; do + time ./build/bin/sherpa-offline \ + --decoding-method=$m \ + --nn-model=$repo/exp/cpu_jit.pt \ + --tokens=$repo/data/lang_bpe_500/tokens.txt \ + $repo/test_wavs/1089-134686-0001.wav \ + $repo/test_wavs/1221-135766-0001.wav \ + $repo/test_wavs/1221-135766-0002.wav +done + +./build/bin/sherpa-offline \ + --decoding-method=fast_beam_search \ + --nn-model=$repo/exp/cpu_jit.pt \ + --lg=$repo/data/lang_bpe_500/LG.pt \ + --tokens=$repo/data/lang_bpe_500/tokens.txt \ + $repo/test_wavs/1089-134686-0001.wav \ + $repo/test_wavs/1221-135766-0001.wav \ + $repo/test_wavs/1221-135766-0002.wav + +rm -rf $repo +log "End of testing ${repo_url}" +log "==========================================================================" + +repo_url=https://huggingface.co/WeijiZhuang/icefall-asr-librispeech-pruned-transducer-stateless8-2022-12-02 +log "Start testing ${repo_url}" +repo=$(basename $repo_url) +log "Download pretrained model and test-data from $repo_url" + +GIT_LFS_SKIP_SMUDGE=1 git clone $repo_url +pushd $repo +git lfs pull --include "exp/cpu_jit-torch-1.10.pt" +git lfs pull --include "data/lang_bpe_500/LG.pt" +cd exp +rm cpu_jit.pt +ln -sv cpu_jit-torch-1.10.pt cpu_jit.pt +popd + +for m in greedy_search modified_beam_search fast_beam_search; do + time ./build/bin/sherpa-offline \ + --decoding-method=$m \ + --nn-model=$repo/exp/cpu_jit.pt \ + --tokens=$repo/data/lang_bpe_500/tokens.txt \ + $repo/test_wavs/1089-134686-0001.wav \ + $repo/test_wavs/1221-135766-0001.wav \ + $repo/test_wavs/1221-135766-0002.wav +done + +time ./build/bin/sherpa-offline \ + --decoding-method=fast_beam_search \ + --nn-model=$repo/exp/cpu_jit.pt \ + --lg=$repo/data/lang_bpe_500/LG.pt \ + --tokens=$repo/data/lang_bpe_500/tokens.txt \ + $repo/test_wavs/1089-134686-0001.wav \ + $repo/test_wavs/1221-135766-0001.wav \ + $repo/test_wavs/1221-135766-0002.wav + +rm -rf $repo +log "End of testing ${repo_url}" +log "==========================================================================" + + +repo_url=https://huggingface.co/csukuangfj/icefall_asr_wenetspeech_pruned_transducer_stateless2 + +log "Start testing ${repo_url}" +repo=$(basename $repo_url) +log "Download pretrained model and test-data from $repo_url" + +GIT_LFS_SKIP_SMUDGE=1 git clone $repo_url +pushd $repo +git lfs pull --include "exp/cpu_jit_epoch_10_avg_2_torch_1.13.0.pt" +git lfs pull --include "data/lang_char/LG.pt" +cd exp +ln -s cpu_jit_epoch_10_avg_2_torch_1.13.0.pt cpu_jit.pt 
+popd + +for m in greedy_search modified_beam_search fast_beam_search; do + time ./build/bin/sherpa-offline \ + --decoding-method=$m \ + --nn-model=$repo/exp/cpu_jit.pt \ + --tokens=$repo/data/lang_char/tokens.txt \ + $repo/test_wavs/DEV_T0000000000.wav \ + $repo/test_wavs/DEV_T0000000001.wav \ + $repo/test_wavs/DEV_T0000000002.wav +done + +./build/bin/sherpa-offline \ + --decoding-method=$m \ + --nn-model=$repo/exp/cpu_jit.pt \ + --lg=$repo/data/lang_char/LG.pt \ + --tokens=$repo/data/lang_char/tokens.txt \ + $repo/test_wavs/DEV_T0000000000.wav \ + $repo/test_wavs/DEV_T0000000001.wav \ + $repo/test_wavs/DEV_T0000000002.wav + +rm -rf $repo +log "End of testing ${repo_url}" +log "==========================================================================" + +repo_url=https://huggingface.co/csukuangfj/icefall_asr_aidatatang-200zh_pruned_transducer_stateless2 +log "Start testing ${repo_url}" +repo=$(basename $repo_url) +log "Download pretrained model and test-data from $repo_url" + +GIT_LFS_SKIP_SMUDGE=1 git clone $repo_url +pushd $repo +git lfs pull --include "exp/cpu_jit_torch.1.13.0.pt" +cd exp +ln -sv cpu_jit_torch.1.13.0.pt cpu_jit.pt +popd + +for m in greedy_search modified_beam_search fast_beam_search; do + time ./build/bin/sherpa-offline \ + --decoding-method=$m \ + --nn-model=$repo/exp/cpu_jit.pt \ + --tokens=$repo/data/lang_char/tokens.txt \ + $repo/test_wavs/T0055G0036S0002.wav \ + $repo/test_wavs/T0055G0036S0003.wav \ + $repo/test_wavs/T0055G0036S0004.wav +done + +rm -rf $repo +log "End of testing ${repo_url}" +log "==========================================================================" + +repo_url=https://huggingface.co/csukuangfj/icefall_asr_tal-csasr_pruned_transducer_stateless5 +log "Start testing ${repo_url}" +repo=$(basename $repo_url) +log "Download pretrained model and test-data from $repo_url" + +GIT_LFS_SKIP_SMUDGE=1 git clone $repo_url +pushd $repo +git lfs pull --include "exp/cpu_jit.pt" +popd + +for m in greedy_search modified_beam_search fast_beam_search; do + time ./build/bin/sherpa-offline \ + --decoding-method=$m \ + --nn-model=$repo/exp/cpu_jit.pt \ + --tokens=$repo/data/lang_char/tokens.txt \ + $repo/test_wavs/210_36476_210_8341_1_1533271973_7057520_132.wav \ + $repo/test_wavs/210_36476_210_8341_1_1533271973_7057520_138.wav \ + $repo/test_wavs/210_36476_210_8341_1_1533271973_7057520_145.wav \ + $repo/test_wavs/210_36476_210_8341_1_1533271973_7057520_148.wav +done + +rm -rf $repo +log "End of testing ${repo_url}" +log "==========================================================================" + +repo_url=https://huggingface.co/syzym/icefall-asr-xbmu-amdo31-pruned-transducer-stateless7-2022-12-02 +log "Start testing ${repo_url}" +repo=$(basename $repo_url) +log "Download pretrained model and test-data from $repo_url" + +GIT_LFS_SKIP_SMUDGE=1 git clone $repo_url +pushd $repo +git lfs pull --include "exp/cpu_jit.pt" +git lfs pull --include "data/lang_bpe_500/LG.pt" +popd + +for m in greedy_search modified_beam_search fast_beam_search; do + time ./build/bin/sherpa-offline \ + --decoding-method=$m \ + --nn-model=$repo/exp/cpu_jit.pt \ + --tokens=$repo/data/lang_bpe_500/tokens.txt \ + $repo/test_wavs/a_0_cacm-A70_31116.wav \ + $repo/test_wavs/a_0_cacm-A70_31117.wav \ + $repo/test_wavs/a_0_cacm-A70_31118.wav +done + +# For fast_beam_search with LG +time ./build/bin/sherpa-offline \ + --decoding-method=fast_beam_search \ + --nn-model=$repo/exp/cpu_jit.pt \ + --lg=$repo/data/lang_bpe_500/LG.pt \ + --tokens=$repo/data/lang_bpe_500/tokens.txt \ + 
$repo/test_wavs/a_0_cacm-A70_31116.wav \ + $repo/test_wavs/a_0_cacm-A70_31117.wav \ + $repo/test_wavs/a_0_cacm-A70_31118.wav + +rm -rf $repo +log "End of testing ${repo_url}" +log "==========================================================================" + +repo_url=https://huggingface.co/syzym/icefall-asr-xbmu-amdo31-pruned-transducer-stateless5-2022-11-29 +log "Start testing ${repo_url}" +repo=$(basename $repo_url) +log "Download pretrained model and test-data from $repo_url" + +GIT_LFS_SKIP_SMUDGE=1 git clone $repo_url +pushd $repo +git lfs pull --include "data/lang_bpe_500/LG.pt" +git lfs pull --include "data/lang_bpe_500/tokens.txt" +git lfs pull --include "exp/cpu_jit-epoch-28-avg-23-torch-1.10.0.pt" + +git lfs pull --include "test_wavs/a_0_cacm-A70_31116.wav" +git lfs pull --include "test_wavs/a_0_cacm-A70_31117.wav" +git lfs pull --include "test_wavs/a_0_cacm-A70_31118.wav" + +cd exp +rm cpu_jit.pt +ln -sv cpu_jit-epoch-28-avg-23-torch-1.10.0.pt cpu_jit.pt +popd + +for m in greedy_search modified_beam_search fast_beam_search; do + time ./build/bin/sherpa-offline \ + --decoding-method=$m \ + --nn-model=$repo/exp/cpu_jit.pt \ + --tokens=$repo/data/lang_bpe_500/tokens.txt \ + $repo/test_wavs/a_0_cacm-A70_31116.wav \ + $repo/test_wavs/a_0_cacm-A70_31117.wav \ + $repo/test_wavs/a_0_cacm-A70_31118.wav +done + +time ./build/bin/sherpa-offline \ + --decoding-method=fast_beam_search \ + --nn-model=$repo/exp/cpu_jit.pt \ + --lg=$repo/data/lang_bpe_500/LG.pt \ + --tokens=$repo/data/lang_bpe_500/tokens.txt \ + $repo/test_wavs/a_0_cacm-A70_31116.wav \ + $repo/test_wavs/a_0_cacm-A70_31117.wav \ + $repo/test_wavs/a_0_cacm-A70_31118.wav + +rm -rf $repo +log "End of testing ${repo_url}" +log "==========================================================================" + +repo_url=https://huggingface.co/desh2608/icefall-asr-alimeeting-pruned-transducer-stateless7 +log "Start testing ${repo_url}" +repo=$(basename $repo_url) +log "Download pretrained model and test-data from $repo_url" + +GIT_LFS_SKIP_SMUDGE=1 git clone $repo_url +pushd $repo +git lfs pull --include "exp/cpu_jit.pt" +popd + +for m in greedy_search modified_beam_search fast_beam_search; do + time ./build/bin/sherpa-offline \ + --decoding-method=$m \ + --nn-model=$repo/exp/cpu_jit.pt \ + --tokens=$repo/data/lang_char/tokens.txt \ + $repo/test_wavs/165.wav \ + $repo/test_wavs/74.wav \ + $repo/test_wavs/209.wav +done + +rm -rf $repo +log "End of testing ${repo_url}" +log "==========================================================================" diff --git a/.github/scripts/run-offline-whisper.sh b/.github/scripts/run-offline-whisper.sh new file mode 100755 index 000000000..4e04358b1 --- /dev/null +++ b/.github/scripts/run-offline-whisper.sh @@ -0,0 +1,84 @@ +#!/usr/bin/env bash + +set -ex + +log() { + # This function is from espnet + local fname=${BASH_SOURCE[1]##*/} + echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*" +} + +log "==========================================================================" +model_list=( +base +base.en +distil-large-v2 +distil-medium.en +distil-small.en +medium +medium.en +small +small.en +tiny +tiny.en +turbo +) + +curl -SL -O https://hf-mirror.com/csukuangfj/sherpa-onnx-zh-wenet-aishell2/resolve/main/test_wavs/0.wav +mv 0.wav zh.wav + +for m in ${model_list[@]}; do + d=sherpa-whisper-$m + log "----------testing $d----------" + curl -SL -O https://github.com/k2-fsa/sherpa/releases/download/asr-models/$d.tar.bz2 + tar xvf $d.tar.bz2 + rm $d.tar.bz2 + ls -lh $d + + if [[ 
$d == *en ]]; then + log "decode a single file" + + ./build/bin/sherpa-offline \ + --debug=1 \ + --whisper-model=./$d/model.pt \ + --tokens=./$d/tokens.txt \ + ./$d/test_wavs/0.wav + + log "decode two files" + ./build/bin/sherpa-offline \ + --debug=1 \ + --whisper-model=./$d/model.pt \ + --tokens=./$d/tokens.txt \ + ./$d/test_wavs/0.wav \ + ./$d/test_wavs/1.wav + fi + + if [[ $d != *en ]]; then + + log "decode a single file" + + ./build/bin/sherpa-offline \ + --debug=1 \ + --whisper-model=./$d/model.pt \ + --tokens=./$d/tokens.txt \ + ./$d/test_wavs/0.wav + + log "decode two files" + ./build/bin/sherpa-offline \ + --debug=1 \ + --whisper-model=./$d/model.pt \ + --tokens=./$d/tokens.txt \ + ./$d/test_wavs/0.wav \ + ./$d/test_wavs/1.wav + + log "decode three files" + ./build/bin/sherpa-offline \ + --debug=1 \ + --whisper-model=./$d/model.pt \ + --tokens=./$d/tokens.txt \ + ./$d/test_wavs/0.wav \ + ./$d/test_wavs/1.wav \ + ./zh.wav + fi + rm -rf $d +done diff --git a/.github/scripts/run-online-transducer.sh b/.github/scripts/run-online-transducer.sh new file mode 100755 index 000000000..ff84f4b52 --- /dev/null +++ b/.github/scripts/run-online-transducer.sh @@ -0,0 +1,313 @@ +#!/usr/bin/env bash + +set -ex + +log() { + # This function is from espnet + local fname=${BASH_SOURCE[1]##*/} + echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*" +} + +log "==========================================================================" + +repo_url=https://huggingface.co/Zengwei/icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29 +log "Start testing ${repo_url}" +repo=$(basename $repo_url) +log "Download pretrained model and test-data from $repo_url" + +GIT_LFS_SKIP_SMUDGE=1 git clone $repo_url +pushd $repo +git lfs pull --include "exp/cpu_jit.pt" +git lfs pull --include "data/lang_bpe_500/LG.pt" +popd + +for m in greedy_search modified_beam_search fast_beam_search; do + time ./build/bin/sherpa-online \ + --decoding-method=$m \ + --nn-model=$repo/exp/cpu_jit.pt \ + --tokens=$repo/data/lang_bpe_500/tokens.txt \ + $repo/test_wavs/1089-134686-0001.wav \ + $repo/test_wavs/1221-135766-0001.wav \ + $repo/test_wavs/1221-135766-0002.wav +done + +# For fast_beam_search with LG +time ./build/bin/sherpa-online \ + --decoding-method=fast_beam_search \ + --nn-model=$repo/exp/cpu_jit.pt \ + --lg=$repo/data/lang_bpe_500/LG.pt \ + --tokens=$repo/data/lang_bpe_500/tokens.txt \ + $repo/test_wavs/1089-134686-0001.wav \ + $repo/test_wavs/1221-135766-0001.wav \ + $repo/test_wavs/1221-135766-0002.wav + +rm -rf $repo +log "End of testing ${repo_url}" + +log "==========================================================================" +repo_url=https://huggingface.co/csukuangfj/icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05 +log "Start testing ${repo_url}" +repo=$(basename $repo_url) +log "Download pretrained model and test-data from $repo_url" + +GIT_LFS_SKIP_SMUDGE=1 git clone $repo_url +pushd $repo +git lfs pull --include "exp/cpu-jit-epoch-30-avg-10-torch-1.13.0.pt" +git lfs pull --include "data/lang_bpe_500/LG.pt" +cd exp +ln -sv cpu-jit-epoch-30-avg-10-torch-1.13.0.pt cpu_jit.pt +popd + +for m in greedy_search modified_beam_search fast_beam_search; do + time ./build/bin/sherpa-online \ + --decoding-method=$m \ + --nn-model=$repo/exp/cpu_jit.pt \ + --tokens=$repo/data/lang_bpe_500/tokens.txt \ + $repo/test_wavs/1089-134686-0001.wav \ + $repo/test_wavs/1221-135766-0001.wav \ + $repo/test_wavs/1221-135766-0002.wav +done + +# For fast_beam_search with 
LG + +time ./build/bin/sherpa-online \ + --decoding-method=fast_beam_search \ + --nn-model=$repo/exp/cpu_jit.pt \ + --lg=$repo/data/lang_bpe_500/LG.pt \ + --tokens=$repo/data/lang_bpe_500/tokens.txt \ + $repo/test_wavs/1089-134686-0001.wav \ + $repo/test_wavs/1221-135766-0001.wav \ + $repo/test_wavs/1221-135766-0002.wav + +rm -rf $repo +log "End of testing ${repo_url}" +log "==========================================================================" + +repo_url=https://huggingface.co/csukuangfj/icefall-asr-librispeech-lstm-transducer-stateless2-2022-09-03 +log "Start testing ${repo_url}" +repo=$(basename $repo_url) +log "Download pretrained model and test-data from $repo_url" + +GIT_LFS_SKIP_SMUDGE=1 git clone $repo_url +pushd $repo +git lfs pull --include "exp/encoder_jit_trace-iter-468000-avg-16.pt" +git lfs pull --include "exp/decoder_jit_trace-iter-468000-avg-16.pt" +git lfs pull --include "exp/joiner_jit_trace-iter-468000-avg-16.pt" +git lfs pull --include "data/lang_bpe_500/LG.pt" + +cd exp +ln -sv encoder_jit_trace-iter-468000-avg-16.pt encoder_jit_trace.pt +ln -sv decoder_jit_trace-iter-468000-avg-16.pt decoder_jit_trace.pt +ln -sv joiner_jit_trace-iter-468000-avg-16.pt joiner_jit_trace.pt +popd + +for m in greedy_search modified_beam_search fast_beam_search; do + time ./build/bin/sherpa-online \ + --decoding-method=$m \ + --encoder-model=$repo/exp/encoder_jit_trace.pt \ + --decoder-model=$repo/exp/decoder_jit_trace.pt \ + --joiner-model=$repo/exp/joiner_jit_trace.pt \ + --tokens=$repo/data/lang_bpe_500/tokens.txt \ + $repo/test_wavs/1089-134686-0001.wav \ + $repo/test_wavs/1221-135766-0001.wav \ + $repo/test_wavs/1221-135766-0002.wav +done + +# For fast_beam_search with LG +time ./build/bin/sherpa-online \ + --decoding-method=fast_beam_search \ + --encoder-model=$repo/exp/encoder_jit_trace.pt \ + --decoder-model=$repo/exp/decoder_jit_trace.pt \ + --joiner-model=$repo/exp/joiner_jit_trace.pt \ + --lg=$repo/data/lang_bpe_500/LG.pt \ + --tokens=$repo/data/lang_bpe_500/tokens.txt \ + $repo/test_wavs/1089-134686-0001.wav \ + $repo/test_wavs/1221-135766-0001.wav \ + $repo/test_wavs/1221-135766-0002.wav + +rm -rf $repo +log "End of testing ${repo_url}" +log "==========================================================================" + +repo_url=https://huggingface.co/csukuangfj/icefall-asr-librispeech-pruned-stateless-emformer-rnnt2-2022-06-01 +log "Start testing ${repo_url}" +repo=$(basename $repo_url) +log "Download pretrained model and test-data from $repo_url" + +GIT_LFS_SKIP_SMUDGE=1 git clone $repo_url +pushd $repo +git lfs pull --include "exp/cpu_jit-epoch-39-avg-6-use-averaged-model-1.pt" +git lfs pull --include "data/lang_bpe_500/LG.pt" +cd exp +ln -sv cpu_jit-epoch-39-avg-6-use-averaged-model-1.pt cpu_jit.pt +popd + +for m in greedy_search modified_beam_search fast_beam_search; do + time ./build/bin/sherpa-online \ + --decoding-method=$m \ + --nn-model=$repo/exp/cpu_jit.pt \ + --tokens=$repo/data/lang_bpe_500/tokens.txt \ + $repo/test_wavs/1089-134686-0001.wav \ + $repo/test_wavs/1221-135766-0001.wav \ + $repo/test_wavs/1221-135766-0002.wav +done + +# For fast_beam_search with LG + +time ./build/bin/sherpa-online \ + --decoding-method=fast_beam_search \ + --nn-model=$repo/exp/cpu_jit.pt \ + --lg=$repo/data/lang_bpe_500/LG.pt \ + --tokens=$repo/data/lang_bpe_500/tokens.txt \ + $repo/test_wavs/1089-134686-0001.wav \ + $repo/test_wavs/1221-135766-0001.wav \ + $repo/test_wavs/1221-135766-0002.wav + +pushd $repo/test_wavs +sox 1089-134686-0001.wav 1.wav pad 5 5 +sox 
1221-135766-0001.wav 2.wav pad 5 5 +sox 1221-135766-0002.wav 3.wav pad 5 5 +sox 1.wav 2.wav 3.wav all-in-one.wav +soxi *.wav +ls -lh *.wav +popd + +# For Endpoint testing +for m in greedy_search modified_beam_search fast_beam_search; do + time ./build/bin/sherpa-online \ + --decoding-method=$m \ + --nn-model=$repo/exp/cpu_jit.pt \ + --tokens=$repo/data/lang_bpe_500/tokens.txt \ + --use-endpoint=true \ + $repo/test_wavs/all-in-one.wav +done + +rm -rf $repo +log "End of testing ${repo_url}" +log "==========================================================================" + +repo_url=https://huggingface.co/csukuangfj/icefall_librispeech_streaming_pruned_transducer_stateless4_20220625 +log "Start testing ${repo_url}" +repo=$(basename $repo_url) +log "Download pretrained model and test-data from $repo_url" + +GIT_LFS_SKIP_SMUDGE=1 git clone $repo_url +pushd $repo +git lfs pull --include "exp/cpu_jit-epoch-25-avg-3.pt" +git lfs pull --include "data/lang_bpe_500/LG.pt" +cd exp +ln -sv cpu_jit-epoch-25-avg-3.pt cpu_jit.pt +popd + +for m in greedy_search modified_beam_search fast_beam_search; do + time ./build/bin/sherpa-online \ + --decoding-method=$m \ + --nn-model=$repo/exp/cpu_jit.pt \ + --tokens=$repo/data/lang_bpe_500/tokens.txt \ + $repo/test_waves/1089-134686-0001.wav \ + $repo/test_waves/1221-135766-0001.wav \ + $repo/test_waves/1221-135766-0002.wav +done + +# For fast_beam_search with LG + +time ./build/bin/sherpa-online \ + --decoding-method=fast_beam_search \ + --nn-model=$repo/exp/cpu_jit.pt \ + --lg=$repo/data/lang_bpe_500/LG.pt \ + --tokens=$repo/data/lang_bpe_500/tokens.txt \ + $repo/test_waves/1089-134686-0001.wav \ + $repo/test_waves/1221-135766-0001.wav \ + $repo/test_waves/1221-135766-0002.wav + +rm -rf $repo +log "End of testing ${repo_url}" +log "==========================================================================" + +repo_url=https://huggingface.co/csukuangfj/icefall_asr_wenetspeech_pruned_transducer_stateless5_streaming +log "Start testing ${repo_url}" +repo=$(basename $repo_url) +log "Download pretrained model and test-data from $repo_url" + +GIT_LFS_SKIP_SMUDGE=1 git clone $repo_url +pushd $repo +git lfs pull --include "exp/cpu_jit_epoch_7_avg_1_torch.1.13.0.pt" +git lfs pull --include "data/lang_char/LG.pt" +cd exp +ln -sv cpu_jit_epoch_7_avg_1_torch.1.13.0.pt cpu_jit.pt +popd + +for m in greedy_search modified_beam_search fast_beam_search; do + time ./build/bin/sherpa-online \ + --decoding-method=$m \ + --nn-model=$repo/exp/cpu_jit.pt \ + --tokens=$repo/data/lang_char/tokens.txt \ + $repo/test_wavs/DEV_T0000000000.wav \ + $repo/test_wavs/DEV_T0000000001.wav \ + $repo/test_wavs/DEV_T0000000002.wav +done + +# For fast_beam_search with LG + +time ./build/bin/sherpa-online \ + --decoding-method=fast_beam_search \ + --nn-model=$repo/exp/cpu_jit.pt \ + --lg=$repo/data/lang_char/LG.pt \ + --tokens=$repo/data/lang_char/tokens.txt \ + $repo/test_wavs/DEV_T0000000000.wav \ + $repo/test_wavs/DEV_T0000000001.wav \ + $repo/test_wavs/DEV_T0000000002.wav + +rm -rf $repo +log "End of testing ${repo_url}" +log "==========================================================================" + +repo_url=https://huggingface.co/csukuangfj/icefall-asr-conv-emformer-transducer-stateless2-zh +log "Start testing ${repo_url}" +repo=$(basename $repo_url) +GIT_LFS_SKIP_SMUDGE=1 git clone $repo_url +pushd $repo +git lfs pull --include "exp/cpu_jit-epoch-11-avg-1.pt" +cd exp +ln -sv cpu_jit-epoch-11-avg-1.pt cpu_jit.pt +popd + +for m in greedy_search modified_beam_search fast_beam_search; do + time 
./build/bin/sherpa-online \ + --decoding-method=$m \ + --nn-model=$repo/exp/cpu_jit.pt \ + --tokens=$repo/data/lang_char_bpe/tokens.txt \ + $repo/test_wavs/0.wav \ + $repo/test_wavs/1.wav \ + $repo/test_wavs/2.wav \ + $repo/test_wavs/3.wav \ + $repo/test_wavs/4.wav +done + +rm -rf $repo +log "End of testing ${repo_url}" +log "==========================================================================" + +repo_url=https://huggingface.co/csukuangfj/k2fsa-zipformer-chinese-english-mixed +log "Start testing ${repo_url}" +repo=$(basename $repo_url) +GIT_LFS_SKIP_SMUDGE=1 git clone $repo_url +pushd $repo +git lfs pull --include "exp/cpu_jit.pt" +popd + +for m in greedy_search modified_beam_search fast_beam_search; do + time ./build/bin/sherpa-online \ + --decoding-method=$m \ + --nn-model=$repo/exp/cpu_jit.pt \ + --tokens=$repo/data/lang_char_bpe/tokens.txt \ + $repo/test_wavs/0.wav \ + $repo/test_wavs/1.wav \ + $repo/test_wavs/2.wav \ + $repo/test_wavs/3.wav \ + $repo/test_wavs/4.wav +done + +rm -rf $repo +log "End of testing ${repo_url}" +log "==========================================================================" diff --git a/.github/scripts/run-python-test.sh b/.github/scripts/run-python-test.sh new file mode 100755 index 000000000..7ae0738be --- /dev/null +++ b/.github/scripts/run-python-test.sh @@ -0,0 +1,61 @@ +#!/usr/bin/env bash + +set -ex + +log() { + # This function is from espnet + local fname=${BASH_SOURCE[1]##*/} + echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*" +} + +log "Download pretrained models" + +mkdir -p /tmp/icefall-models +pushd /tmp/icefall-models + +repo_url=https://huggingface.co/csukuangfj/icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09 +log "Start testing ${repo_url}" +repo=$(basename $repo_url) +log "Download pretrained model and test-data from $repo_url" +GIT_LFS_SKIP_SMUDGE=1 git clone $repo_url +pushd $repo +git lfs pull --include "exp/cpu_jit.pt" +git lfs pull --include "data/lang_bpe_500/tokens.txt" +git lfs pull --include "data/lang_bpe_500/HLG.pt" +popd + +repo_url=https://huggingface.co/csukuangfj/wenet-english-model +log "Start testing ${repo_url}" +repo=$(basename $repo_url) +log "Download pretrained model and test-data from $repo_url" +GIT_LFS_SKIP_SMUDGE=1 git clone $repo_url +pushd $repo +git lfs pull --include "final.zip" +popd + +repo_url=https://huggingface.co/csukuangfj/wav2vec2.0-torchaudio +log "Start testing ${repo_url}" +repo=$(basename $repo_url) +log "Download pretrained model and test-data from $repo_url" +GIT_LFS_SKIP_SMUDGE=1 git clone $repo_url +pushd $repo +git lfs pull --include "wav2vec2_asr_base_960h.pt" +popd + +repo_url=https://huggingface.co/csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless8-2022-11-14 +log "Start testing ${repo_url}" +repo=$(basename $repo_url) +log "Download pretrained model and test-data from $repo_url" +GIT_LFS_SKIP_SMUDGE=1 git clone $repo_url +pushd $repo +git lfs pull --include "exp/cpu_jit.pt" +popd + +# Go to /path/to/sherpa +popd + +cd sherpa/python/test + +pip install pytest + +pytest -s -v diff --git a/.github/scripts/test-offline-websocket-rtf-wer.sh b/.github/scripts/test-offline-websocket-rtf-wer.sh new file mode 100755 index 000000000..fb55354c1 --- /dev/null +++ b/.github/scripts/test-offline-websocket-rtf-wer.sh @@ -0,0 +1,97 @@ +#!/usr/bin/env bash + +set -e + +log() { + # This function is from espnet + local fname=${BASH_SOURCE[1]##*/} + echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*" +} + 
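+# Note: DECODING_METHOD and NUM_CONNECTIONS are not set in this script; they
+# are assumed to be provided by the environment (e.g., exported by the CI
+# workflow that invokes this script). Illustrative values only:
+#
+#   export DECODING_METHOD=greedy_search
+#   export NUM_CONNECTIONS=50
+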
+log "Install lhotse" + +python3 -m pip install lhotse websockets + +export KALDIFST_MAKE_ARGS="-j4" +log "Install icefall" +git clone http://github.com/k2-fsa/icefall +pushd icefall +pip install -r ./requirements.txt +popd + +export PYTHONPATH=$PWD/icefall:$PYTHONPATH + +log "Downloading pre-trained model from $repo_url" + +repo_url=https://huggingface.co/csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13 +repo=$(basename $repo_url) + +GIT_LFS_SKIP_SMUDGE=1 git clone $repo_url +pushd $repo + +git lfs pull --include "exp/cpu_jit.pt" +ls -lh ./exp/cpu_jit.pt +ls -lh ./data/lang_bpe_500/tokens.txt + +popd + +log "Downloading test-clean" + +wget -q --no-check-certificate https://www.openslr.org/resources/12/test-clean.tar.gz +tar xf test-clean.tar.gz +rm test-clean.tar.gz +ls -lh LibriSpeech + +mkdir -p data/manifests +lhotse prepare librispeech -j 2 -p test-clean $PWD/LibriSpeech data/manifests +ls -lh data/manifests + +lhotse cut simple \ + -r ./data/manifests/librispeech_recordings_test-clean.jsonl.gz \ + -s ./data/manifests/librispeech_supervisions_test-clean.jsonl.gz \ + test-clean.jsonl.gz + +ls -lh test-clean.jsonl.gz + +log "Build sherpa" + +mkdir build +cd build +cmake \ + -DCMAKE_BUILD_TYPE=Release \ + -DSHERPA_ENABLE_WEBSOCKET=ON \ + .. + +make -j4 sherpa-offline-websocket-server sherpa-offline-websocket-client + +ls -lh lib +ls -lh bin + +cd .. + +log "start the sever" + +./build/bin/sherpa-offline-websocket-server \ + --use-gpu=false \ + --port=6006 \ + --num-io-threads=2 \ + --num-work-threads=2 \ + --max-batch-size=5 \ + --nn-model=./$repo/exp/cpu_jit.pt \ + --tokens=./$repo/data/lang_bpe_500/tokens.txt \ + --decoding-method=$DECODING_METHOD \ + --doc-root=./sherpa/bin/web \ + --log-file=./log.txt & + +log "Sleep 10 seconds to wait for the server startup" +sleep 10 +cat ./log.txt + +log "start the client" + +# We create 50 concurrent connections here +time python3 ./sherpa/bin/decode_manifest.py \ + --server-addr 127.0.0.1 \ + --server-port 6006 \ + --manifest-filename ./test-clean.jsonl.gz \ + --num-tasks $NUM_CONNECTIONS diff --git a/.github/scripts/test-online-websocket-rtf-wer.sh b/.github/scripts/test-online-websocket-rtf-wer.sh new file mode 100755 index 000000000..a57bbfa57 --- /dev/null +++ b/.github/scripts/test-online-websocket-rtf-wer.sh @@ -0,0 +1,104 @@ +#!/usr/bin/env bash + +set -e + +log() { + # This function is from espnet + local fname=${BASH_SOURCE[1]##*/} + echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*" +} + +log "Install lhotse" + +python3 -m pip install lhotse websockets + +export KALDIFST_MAKE_ARGS="-j4" +log "Install icefall" +git clone http://github.com/k2-fsa/icefall +pushd icefall +pip install -r ./requirements.txt +popd + +export PYTHONPATH=$PWD/icefall:$PYTHONPATH + +repo_url=https://huggingface.co/pkufool/icefall_librispeech_streaming_pruned_transducer_stateless5_20220729 +repo=$(basename $repo_url) + +log "Downloading pre-trained model from $repo_url" + +GIT_LFS_SKIP_SMUDGE=1 git clone $repo_url +pushd $repo + +git lfs pull --include "exp/cpu_jit-epoch-25-avg-5.pt" +cd exp +ln -s cpu_jit-epoch-25-avg-5.pt cpu_jit.pt +cd .. 
+ +ln -s exp/cpu_jit-epoch-25-avg-5.pt +ls -lh ./exp/cpu_jit.pt +ls -lh ./data/lang_bpe_500/tokens.txt + +popd + +log "Downloading test-clean" + +wget -q --no-check-certificate https://www.openslr.org/resources/12/test-clean.tar.gz +tar xf test-clean.tar.gz +rm test-clean.tar.gz +ls -lh LibriSpeech + +mkdir -p data/manifests +lhotse prepare librispeech -j 2 -p test-clean $PWD/LibriSpeech data/manifests +ls -lh data/manifests + +lhotse cut simple \ + -r ./data/manifests/librispeech_recordings_test-clean.jsonl.gz \ + -s ./data/manifests/librispeech_supervisions_test-clean.jsonl.gz \ + test-clean.jsonl.gz + +ls -lh test-clean.jsonl.gz + +log "Build sherpa" + +mkdir build +cd build +cmake \ + -DCMAKE_BUILD_TYPE=Release \ + -DSHERPA_ENABLE_WEBSOCKET=ON \ + .. + +make -j4 sherpa-online-websocket-server sherpa-online-websocket-client + +ls -lh lib +ls -lh bin + +cd .. + +log "start the sever" + +./build/bin/sherpa-online-websocket-server \ + --use-gpu=false \ + --port=6006 \ + --num-io-threads=2 \ + --num-work-threads=2 \ + --max-batch-size=5 \ + --loop-interval-ms=5 \ + --nn-model=./$repo/exp/cpu_jit.pt \ + --tokens=./$repo/data/lang_bpe_500/tokens.txt \ + --decoding-method=$DECODING_METHOD \ + --doc-root=./sherpa/bin/web \ + --log-file=./log.txt & + + +log "Sleep 10 seconds to wait for the server startup" +sleep 10 +cat ./log.txt + +log "start the client" + +# We create 50 concurrent connections here +time python3 ./sherpa/bin/decode_manifest.py \ + --server-addr 127.0.0.1 \ + --server-port 6006 \ + --manifest-filename ./test-clean.jsonl.gz \ + --num-tasks $NUM_CONNECTIONS diff --git a/.github/workflows/build-conda-cpu.yaml b/.github/workflows/build-conda-cpu.yaml deleted file mode 100644 index a08329198..000000000 --- a/.github/workflows/build-conda-cpu.yaml +++ /dev/null @@ -1,133 +0,0 @@ -name: build_conda_cpu - -on: - push: - tags: - - '*' - -jobs: - generate_build_matrix: - # see https://github.com/pytorch/pytorch/pull/50633 - runs-on: ubuntu-latest - outputs: - matrix: ${{ steps.set-matrix.outputs.matrix }} - steps: - - uses: actions/checkout@v2 - with: - fetch-depth: 0 - - name: Generating build matrix - id: set-matrix - run: | - # outputting for debugging purposes - python scripts/github_actions/generate_build_matrix.py - MATRIX=$(python scripts/github_actions/generate_build_matrix.py) - echo "::set-output name=matrix::${MATRIX}" - - build_conda_cpu: - needs: generate_build_matrix - runs-on: ${{ matrix.os }} - strategy: - fail-fast: false - matrix: - ${{ fromJson(needs.generate_build_matrix.outputs.matrix) }} - - steps: - # refer to https://github.com/actions/checkout - - uses: actions/checkout@v2 - with: - fetch-depth: 0 - - - uses: conda-incubator/setup-miniconda@v2 - with: - auto-update-conda: true - python-version: ${{ matrix.python-version }} - activate-environment: sherpa - - - name: Install conda dependencies - shell: bash -l {0} - run: | - conda install -y -q anaconda-client - conda install -y -q conda-build - conda install -y -q -c k2-fsa -c kaldifeat -c kaldi_native_io -c pytorch k2 kaldifeat kaldi_native_io pytorch=${{ matrix.torch }} cpuonly - - - name: Display MKL - if: startsWith(matrix.os, 'macos') || startsWith(matrix.os, 'ubuntu') - shell: bash -l {0} - run: | - ls -lh $CONDA_PREFIX/lib/libmkl* - - - name: Display Python version - shell: bash -l {0} - run: | - python -c "import sys; print(sys.version)" - which python - - - name: Display conda info - shell: bash -l {0} - run: | - conda env list - conda info - which conda - python --version - which python - python -m 
torch.utils.collect_env - - - name: Build sherpa - if: startsWith(matrix.os, 'ubuntu') || startsWith(matrix.os, 'macos') - shell: bash -l {0} - env: - SHERPA_PYTHON_VERSION: ${{ matrix.python-version}} - SHERPA_TORCH_VERSION: ${{ matrix.torch }} - SHERPA_CONDA_TOKEN: ${{ secrets.SHERPA_CONDA_TOKEN}} - run: | - ./scripts/build_conda_cpu.sh - - - name: Build sherpa - if: startsWith(matrix.os, 'windows') - shell: bash -l {0} - env: - SHERPA_PYTHON_VERSION: ${{ matrix.python-version}} - SHERPA_TORCH_VERSION: ${{ matrix.torch }} - SHERPA_CONDA_TOKEN: ${{ secrets.SHERPA_CONDA_TOKEN}} - run: | - # ./scripts/build_conda_cpu_windows.sh - ./scripts/build_conda_cpu.sh - - - name: Display generated files - if: startsWith(matrix.os, 'ubuntu') - run: | - ls -lh /usr/share/miniconda/envs/sherpa/conda-bld/linux-64 - - - name: Upload generated files - if: startsWith(matrix.os, 'ubuntu') - uses: actions/upload-artifact@v2 - with: - name: cpu-torch-${{ matrix.torch }}-python-${{ matrix.python-version }}-${{ matrix.os }} - path: /usr/share/miniconda/envs/sherpa/conda-bld/linux-64/*.tar.bz2 - - - name: Display generated files - if: startsWith(matrix.os, 'windows') - shell: bash -l {0} - run: | - ls -lh /c/Miniconda/envs/sherpa/conda-bld - ls -lh /c/Miniconda/envs/sherpa/conda-bld/*/* - ls -lh /c/Miniconda/envs/sherpa/conda-bld/win-64/* - - - name: Upload generated files - if: startsWith(matrix.os, 'windows') - uses: actions/upload-artifact@v2 - with: - name: cpu-torch-${{ matrix.torch }}-python-${{ matrix.python-version }}-${{ matrix.os }} - path: c:/Miniconda/envs/sherpa/conda-bld/win-64/*.tar.bz2 - - - name: Display generated files - if: startsWith(matrix.os, 'macos') - run: | - ls -lh /usr/local/miniconda/envs/sherpa/conda-bld/osx-64 - - - name: Upload generated files - if: startsWith(matrix.os, 'macos') - uses: actions/upload-artifact@v2 - with: - name: cpu-torch-${{ matrix.torch }}-python-${{ matrix.python-version }}-${{ matrix.os }} - path: /usr/local/miniconda/envs/sherpa/conda-bld/osx-64/*.tar.bz2 diff --git a/.github/workflows/build-doc.yml b/.github/workflows/build-doc.yml index 6cd2a8645..855996448 100644 --- a/.github/workflows/build-doc.yml +++ b/.github/workflows/build-doc.yml @@ -21,8 +21,26 @@ name: Generate doc on: push: branches: - - master - - doc + - master + - doc + paths: + - '.github/workflows/build-doc.yml' + - 'docs/**' + + # schedule: + # # minute (0-59) + # # hour (0-23) + # # day of the month (1-31) + # # month (1-12) + # # day of the week (0-6) + # # nightly build at 23:50 UTC time every day + # - cron: "50 23 * * *" + + workflow_dispatch: + +concurrency: + group: build-doc-${{ github.ref }} + cancel-in-progress: true jobs: build-doc: @@ -30,76 +48,273 @@ jobs: strategy: fail-fast: false matrix: - os: [ubuntu-18.04] - torch: ["1.7.1"] - python-version: [3.8] + os: [ubuntu-latest] + python-version: ["3.10"] steps: # refer to https://github.com/actions/checkout - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 with: fetch-depth: 0 - - name: Install GCC 7 + - name: Copy wave files + shell: bash run: | - sudo apt-get install -y gcc-7 g++-7 - echo "CC=/usr/bin/gcc-7" >> $GITHUB_ENV - echo "CXX=/usr/bin/g++-7" >> $GITHUB_ENV + cd docs/source - - name: Setup Python ${{ matrix.python-version }} - uses: actions/setup-python@v2 + git lfs install + git clone https://www.modelscope.cn/csukuangfj/sherpa-doc-files.git ms + ls -lh _static/* + cp -av ms/source/_static/* ./_static/ + rm -rf ms + + - name: Run docker + uses: addnab/docker-run-action@v3 with: - python-version: ${{ 
matrix.python-version }} + image: reitzig/texlive-full:latest + # image: ghcr.io/xu-cheng/texlive-full:latest + options: | + --volume ${{ github.workspace }}/:/workspace + shell: bash + run: | + uname -a + cat /etc/*release + + id + pwd + ls -lh + + cd /workspace + which latexmk + + apk add --no-cache python3 py3-pip git make gcc zlib-dev libffi-dev openssl-dev musl-dev ghostscript curl + python3 --version + + + python3 -m venv abc + source ./abc/bin/activate + python3 -m ensurepip + + python3 -m pip install -r ./docs/requirements.txt + + cd docs + + # Download test wave files for SenseVoice + mkdir -p source/_static/sense-voice + pushd source/_static/sense-voice + rm .gitignore + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/lei-jun-test.wav + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/Obama.wav + curl -SL -O https://huggingface.co/csukuangfj/sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/resolve/main/test_wavs/zh.wav + curl -SL -O https://huggingface.co/csukuangfj/sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/resolve/main/test_wavs/en.wav + ls -lh + popd + + make html + + touch build/html/.nojekyll + + export GIT_LFS_SKIP_SMUDGE=1 + git clone https://huggingface.co/csukuangfj/sherpa-onnx-apk huggingface + cd huggingface + ./generate-asr.py + ./generate-asr-2pass.py + ./generate-tts.py + ./generate-tts-engine.py + ./generate-speaker-identification.py + ./generate-speaker-diarization.py + ./generate-audio-tagging.py + ./generate-audio-tagging-wearos.py + ./generate-slid.py + ./generate-kws.py + ./generate-vad.py + ./generate-vad-asr.py + + mv -v apk-asr.html ../build/html/onnx/android/apk.html + mv -v apk-asr-2pass.html ../build/html/onnx/android/apk-2pass.html + mv -v apk.html ../build/html/onnx/tts/ + mv -v apk-engine.html ../build/html/onnx/tts/ + mv -v apk-speaker-identification.html ../build/html/onnx/speaker-identification/apk.html + mv -v apk-speaker-diarization.html ../build/html/onnx/speaker-diarization/apk.html + mv -v apk-audio-tagging.html ../build/html/onnx/audio-tagging/apk.html + mv -v apk-audio-tagging-wearos.html ../build/html/onnx/audio-tagging/apk-wearos.html + mv -v apk-slid.html ../build/html/onnx/spoken-language-identification/apk.html + mv -v apk-kws.html ../build/html/onnx/kws/apk.html + mv -v apk-vad.html ../build/html/onnx/vad/apk.html + mv -v apk-vad-asr.html ../build/html/onnx/vad/apk-asr.html + + mv -v apk-asr-cn.html ../build/html/onnx/android/apk-cn.html + mv -v apk-asr-2pass-cn.html ../build/html/onnx/android/apk-2pass-cn.html + mv -v apk-cn.html ../build/html/onnx/tts/ + mv -v apk-engine-cn.html ../build/html/onnx/tts/ + mv -v apk-speaker-identification-cn.html ../build/html/onnx/speaker-identification/apk-cn.html + mv -v apk-speaker-diarization-cn.html ../build/html/onnx/speaker-diarization/apk-cn.html + mv -v apk-audio-tagging-cn.html ../build/html/onnx/audio-tagging/apk-cn.html + mv -v apk-audio-tagging-wearos-cn.html ../build/html/onnx/audio-tagging/apk-wearos-cn.html + mv -v apk-slid-cn.html ../build/html/onnx/spoken-language-identification/apk-cn.html + mv -v apk-kws-cn.html ../build/html/onnx/kws/apk-cn.html + mv -v apk-vad-cn.html ../build/html/onnx/vad/apk-cn.html + mv -v apk-vad-asr-cn.html ../build/html/onnx/vad/apk-asr-cn.html + + cd .. + rm -rf huggingface + + git clone https://huggingface.co/csukuangfj/sherpa huggingface + cd huggingface + ./run.sh + cp cpu.html ../build/html + cp cuda.html ../build/html + cd .. 
+ rm -rf huggingface + + git clone https://huggingface.co/csukuangfj/sherpa-onnx-flutter huggingface + cd huggingface + ./generate-asr.py + ./generate-tts.py + mkdir -p ../build/html/onnx/flutter/asr + mv -v app-asr.html ../build/html/onnx/flutter/asr/app.html + mv -v app-asr-cn.html ../build/html/onnx/flutter/asr/app-cn.html + mv -v tts*.html ../build/html/onnx/flutter/ + cd .. + rm -rf huggingface + + git clone https://huggingface.co/csukuangfj/sherpa-onnx-bin huggingface + cd huggingface + ./build-generate-subtitles.py + mv -v download-generated-subtitles.html ../build/html/onnx/lazarus/ + mv -v download-generated-subtitles-cn.html ../build/html/onnx/lazarus/ + cd .. + rm -rf huggingface + + git clone https://huggingface.co/csukuangfj/sherpa-onnx-wheels huggingface + cd huggingface + ./run.sh + mv -v *.html ../build/html/onnx/ + cd .. + rm -rf huggingface + + git clone https://huggingface.co/csukuangfj/sherpa-onnx-harmony-os huggingface + cd huggingface + ./generate-vad-asr.py + + mkdir -p ../build/html/onnx/harmony-os/hap + mv -v vad-asr.html ../build/html/onnx/harmony-os/hap/ + mv -v vad-asr-cn.html ../build/html/onnx/harmony-os/hap/ - - name: Display Python version - run: python -c "import sys; print(sys.version)" + cd .. + rm -rf huggingface - - name: Install PyTorch ${{ matrix.torch }} + pushd source/ncnn/tutorials + sed -i.bak /cn\.rst/d ./index.rst + popd + + pushd source/onnx/tutorials + sed -i.bak /cn\.rst/d ./index.rst + popd + + pushd source/onnx/pretrained_models/offline-transducer/ + sed -i.bak /sherpa-onnx-zipformer-thai-2024-06-20\.txt/d zipformer-transducer-models.rst + sed -i.bak /sherpa-onnx-zipformer-thai-2024-06-20-int8\.txt/d zipformer-transducer-models.rst + + sed -i.bak /sherpa-onnx-zipformer-ja-reazonspeech-2024-08-01\.txt/d zipformer-transducer-models.rst + sed -i.bak /sherpa-onnx-zipformer-ja-reazonspeech-2024-08-01-int8\.txt/d zipformer-transducer-models.rst + + sed -i.bak /sherpa-onnx-zipformer-ru-2024-09-18\.txt/d zipformer-transducer-models.rst + sed -i.bak /sherpa-onnx-small-zipformer-ru-2024-09-18\.txt/d zipformer-transducer-models.rst + + sed -i.bak /sherpa-onnx-zipformer-ru-2024-09-18\.int8\.txt/d zipformer-transducer-models.rst + sed -i.bak /sherpa-onnx-small-zipformer-ru-2024-09-18\.int8\.txt/d zipformer-transducer-models.rst + + popd + + pushd source/onnx/pretrained_models/offline-ctc/nemo/ + + sed -i.bak /sherpa-onnx-nemo-ctc-giga-am-russian-2024-10-24\.int8\.txt/d russian.rst + + popd + + pushd source/onnx/pretrained_models/offline-transducer + + sed -i.bak /sherpa-onnx-nemo-transducer-giga-am-russian-2024-10-24\.int8\.txt/d nemo-transducer-models.rst + + popd + + git diff + + make latex + mkdir -pv build/html/_static + cd build/latex + #latexmk -interaction=nonstopmode -f -pdf -dvi- -ps- sherpa.tex || true + latexmk -interaction=nonstopmode -f -pdf -dvi- -ps- sherpa.tex >/dev/null 2>&1 || true + if [ -f sherpa.pdf ]; then + ls -lh sherpa.pdf + cp -v sherpa.pdf /workspace/ + else + echo "skip copying pdf" + fi + + - name: View generated files shell: bash run: | - python3 -m pip install -qq --upgrade pip kaldi_native_io - python3 -m pip install -qq wheel twine typing_extensions websockets sentencepiece>=0.1.96 soundfile - python3 -m pip install -qq torch==${{ matrix.torch }}+cpu numpy -f https://download.pytorch.org/whl/torch_stable.html + cd docs/build/html + + ls -lh - python3 -m pip install k2==1.16.dev20220621+cpu.torch${{ matrix.torch }} -f https://k2-fsa.org/nightly/index.html + echo "-----_static-----" + ls -lh _static - python3 -c "import 
torch; print('torch version:', torch.__version__)" + ls -lha _static/kokoro-multi-*/ - python3 -m torch.utils.collect_env + rm -fv _static/kokoro-multi-*/.gitignore - - name: Cache kaldifeat - id: my-cache-2 - uses: actions/cache@v2 + echo "---" + ls -lha _static/kokoro-multi-*/ + + echo "-----_static/sense-voice-----" + ls -lh _static/sense-voice + + - name: Release sherpa.pdf + if: (github.repository_owner == 'csukuangfj' || github.repository_owner == 'k2-fsa') && (github.event_name == 'push' || github.event_name == 'workflow_dispatch') + uses: svenstaro/upload-release-action@v2 with: - path: | - ~/tmp/kaldifeat - key: cache-tmp-${{ matrix.python-version }}-${{ matrix.os }}-${{ matrix.torch }} + file_glob: true + overwrite: true + file: ./sherpa.pdf + tag: doc - - name: Install kaldifeat - if: steps.my-cache-2.outputs.cache-hit != 'true' - shell: bash - run: | - .github/scripts/install-kaldifeat.sh + - name: Publish to huggingface + env: + HF_TOKEN: ${{ secrets.HF_TOKEN }} + uses: nick-fields/retry@v3 + with: + max_attempts: 20 + timeout_seconds: 200 + shell: bash + command: | + git config --global user.email "csukuangfj@gmail.com" + git config --global user.name "Fangjun Kuang" - - name: Install sherpa - shell: bash - run: | - export KALDIFEAT_INSTALL_PREFIX=$HOME/tmp/kaldifeat/build - echo $KALDIFEAT_INSTALL_PREFIX - ls -lh $KALDIFEAT_INSTALL_PREFIX + rm -rf huggingface + export GIT_LFS_SKIP_SMUDGE=1 + export GIT_CLONE_PROTECTION_ACTIVE=false - python3 setup.py install + git clone https://huggingface.co/csukuangfj/sherpa-doc huggingface + cd huggingface - - name: Build doc - shell: bash - run: | - cd docs - python3 -m pip install -r ./requirements.txt - make html - touch build/html/.nojekyll + if [ -f ../sherpa.pdf ]; then + cp -v ../sherpa.pdf ./ + else + echo "skip copying pdf" + fi + git status + git lfs track "*.pdf" + git add . 
+ git commit -m "update doc" + git push https://csukuangfj:$HF_TOKEN@huggingface.co/csukuangfj/sherpa-doc main - name: Deploy uses: peaceiris/actions-gh-pages@v3 with: github_token: ${{ secrets.GITHUB_TOKEN }} publish_dir: ./docs/build/html + force_orphan: true publish_branch: gh-pages diff --git a/.github/workflows/export-3d-speaker.yml b/.github/workflows/export-3d-speaker.yml new file mode 100644 index 000000000..3e7276af8 --- /dev/null +++ b/.github/workflows/export-3d-speaker.yml @@ -0,0 +1,95 @@ +name: export-3d-speaker + +on: + push: + branches: + - export-3d-speaker + workflow_dispatch: + +concurrency: + group: export-3d-speaker-${{ github.ref }} + cancel-in-progress: true + +jobs: + export-3d-speaker: + if: github.repository_owner == 'k2-fsa' || github.repository_owner == 'csukuangfj' + name: export 3d speaker + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + os: [macos-latest] + python-version: ["3.10"] + + steps: + - uses: actions/checkout@v4 + + - name: Setup Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + + - name: Install dependencies + shell: bash + run: | + pip install torch==2.0.0 torchaudio==2.0.1 modelscope kaldi-native-fbank numpy==1.26.4 addict datasets librosa soundfile + + cd /tmp + + pushd /tmp + git clone https://github.com/alibaba-damo-academy/3D-Speaker + cd 3D-Speaker + pip install -r ./requirements.txt + popd + + - name: Export + shell: bash + run: | + pushd scripts/3d-speaker + export PYTHONPATH=/tmp/3D-Speaker:PYTHONPATH + ./run.sh + + - name: Collect results + shell: bash + run: | + mv -v scripts/3d-speaker/*.pt ./ + ls -lh *.pt + + - name: Release + uses: svenstaro/upload-release-action@v2 + with: + file_glob: true + file: ./*.pt + overwrite: true + repo_name: k2-fsa/sherpa + repo_token: ${{ secrets.UPLOAD_GH_SHERPA_TOKEN }} + tag: speaker-recognition-models + + - name: Publish ${{ matrix.model }} to huggingface + shell: bash + env: + HF_TOKEN: ${{ secrets.HF_TOKEN }} + run: | + git config --global user.email "csukuangfj@gmail.com" + git config --global user.name "Fangjun Kuang" + + export GIT_CLONE_PROTECTION_ACTIVE=false + + export GIT_LFS_SKIP_SMUDGE=1 + + rm -rf huggingface + git clone https://csukuangfj:$HF_TOKEN@huggingface.co/k2-fsa/sherpa-models huggingface + + mkdir -p ./huggingface/speaker-recognition-models + + cp -av ./*.pt ./huggingface/speaker-recognition-models + + cd huggingface + + git status + ls -lh + git lfs track "*.pt*" + + git add . 
+ git commit -m "add some models from 3d-speaker" || true + git push https://csukuangfj:$HF_TOKEN@huggingface.co/k2-fsa/sherpa-models main || true diff --git a/.github/workflows/export-pyannote.yaml b/.github/workflows/export-pyannote.yaml new file mode 100644 index 000000000..87795cf7d --- /dev/null +++ b/.github/workflows/export-pyannote.yaml @@ -0,0 +1,165 @@ +name: export-pyannote + +on: + push: + branches: + - export-pyannote + workflow_dispatch: + +concurrency: + group: export-pyannote-${{ github.ref }} + cancel-in-progress: true + +jobs: + export-pyannote: + if: github.repository_owner == 'k2-fsa' || github.repository_owner == 'csukuangfj' + name: export ${{ matrix.model }} + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + os: [macos-latest] + python-version: ["3.10"] + model: ['pyannote'] + + steps: + - uses: actions/checkout@v4 + + - name: Setup Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + + - name: Install pyannote + shell: bash + run: | + pip install torch==2.2.0 torchaudio==2.2.0 onnxruntime onnx kaldi-native-fbank funasr numpy==1.26.4 pyannote.audio==3.3.0 + + - name: Export ${{ matrix.model }} + shell: bash + run: | + pushd scripts/pyannote/segmentation + model=${{ matrix.model }} + if [[ $model == 'pyannote' ]]; then + curl -SL -O https://huggingface.co/csukuangfj/pyannote-models/resolve/main/segmentation-3.0/pytorch_model.bin + else + curl -SL -O https://huggingface.co/openspeech/revai-models/resolve/main/v1/pytorch_model.bin + fi + + python3 ./export.py + + ls -lh + + - name: Test ${{ matrix.model }} + shell: bash + run: | + pushd scripts/pyannote/segmentation + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/lei-jun-test.wav + + python3 ./vad.py --model ./model.pt --wav ./lei-jun-test.wav + + - name: Collect results + shell: bash + run: | + model=${{ matrix.model }} + if [[ $model == 'pyannote' ]]; then + d=sherpa-pyannote-segmentation-3-0 + else + d=sherpa-reverb-diarization-v1 + fi + mkdir $d + mv -v scripts/pyannote/segmentation/model.pt $d/ + mv -v scripts/pyannote/segmentation/README.md $d/ + mv -v scripts/pyannote/segmentation/LICENSE $d/ + + if [[ $model == revai ]]; then + echo "Models in this folder are converted from https://huggingface.co/Revai/reverb-diarization-v1" > $d/README.md + fi + + cat $d/README.md + + ls -lh $d + tar cjvf $d.tar.bz2 $d + + - name: Release + uses: svenstaro/upload-release-action@v2 + with: + file_glob: true + file: ./*.tar.bz2 + overwrite: true + repo_name: k2-fsa/sherpa + repo_token: ${{ secrets.UPLOAD_GH_SHERPA_TOKEN }} + tag: speaker-segmentation-models + + - name: Publish ${{ matrix.model }} to huggingface + shell: bash + env: + HF_TOKEN: ${{ secrets.HF_TOKEN }} + run: | + model=${{ matrix.model }} + if [[ $model == 'pyannote' ]]; then + src=sherpa-pyannote-segmentation-3-0 + else + src=sherpa-reverb-diarization-v1 + fi + + git config --global user.email "csukuangfj@gmail.com" + git config --global user.name "Fangjun Kuang" + + export GIT_CLONE_PROTECTION_ACTIVE=false + + export GIT_LFS_SKIP_SMUDGE=1 + + rm -rf huggingface + git clone https://csukuangfj:$HF_TOKEN@huggingface.co/csukuangfj/$src huggingface + + rm -rf huggingface/* + + cp -av $src/* ./huggingface/ + + cd huggingface + + git status + ls -lh + git lfs track "*.pt*" + + git add . 
+ git commit -m "upload $src" || true + git push https://csukuangfj:$HF_TOKEN@huggingface.co/csukuangfj/$src main || true + + - name: Publish ${{ matrix.model }} to huggingface + shell: bash + env: + HF_TOKEN: ${{ secrets.HF_TOKEN }} + run: | + model=${{ matrix.model }} + if [[ $model == 'pyannote' ]]; then + src=sherpa-pyannote-segmentation-3-0 + else + src=sherpa-reverb-diarization-v1 + fi + + git config --global user.email "csukuangfj@gmail.com" + git config --global user.name "Fangjun Kuang" + + export GIT_CLONE_PROTECTION_ACTIVE=false + + export GIT_LFS_SKIP_SMUDGE=1 + + rm -rf huggingface + git clone https://csukuangfj:$HF_TOKEN@huggingface.co/k2-fsa/sherpa-models huggingface + + mkdir -p ./huggingface/speaker-segmentation + + cp -av $src.tar.bz2 ./huggingface/speaker-segmentation + + cd huggingface + + git status + ls -lh + git lfs track "*.tar.bz2*" + + git add . + git commit -m "upload $src" || true + git push https://csukuangfj:$HF_TOKEN@huggingface.co/k2-fsa/sherpa-models main || true + diff --git a/.github/workflows/export-sense-voice.yml b/.github/workflows/export-sense-voice.yml new file mode 100644 index 000000000..fcc1014ca --- /dev/null +++ b/.github/workflows/export-sense-voice.yml @@ -0,0 +1,142 @@ +name: export-sense-voice + +on: + push: + branches: + - export-sense-voice + workflow_dispatch: + +concurrency: + group: export-sense-voice-${{ github.ref }} + cancel-in-progress: true + +jobs: + export-sense-voice: + if: github.repository_owner == 'k2-fsa' || github.repository_owner == 'csukuangfj' + name: export sense voice + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + os: [macos-latest] + python-version: ["3.10"] + + steps: + - uses: actions/checkout@v4 + + - name: Setup Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + + - name: Install dependencies + shell: bash + run: | + pip install torch==2.0.0 torchaudio==2.0.1 onnxruntime onnx kaldi-native-fbank funasr numpy==1.26.4 + + cd /tmp + + pushd /tmp + git clone --depth 1 https://github.com/modelscope/FunASR + popd + + - name: Export + shell: bash + run: | + pushd scripts/sense-voice + export PYTHONPATH=/tmp/FunASR/runtime/python/libtorch:PYTHONPATH + ./run.sh + + - name: Collect results + shell: bash + run: | + d=sherpa-sense-voice-zh-en-ja-ko-yue-2025-01-06 + mkdir $d + mv -v scripts/sense-voice/model.pt $d/ + mv -v scripts/sense-voice/tokens.txt $d/ + mv -v scripts/sense-voice/README.md $d/ + mv -v scripts/sense-voice/bpe.model $d/ + + pushd $d + mkdir test_wavs + cd test_wavs + wget https://huggingface.co/csukuangfj/sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/resolve/main/test_wavs/en.wav + wget https://huggingface.co/csukuangfj/sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/resolve/main/test_wavs/ja.wav + wget https://huggingface.co/csukuangfj/sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/resolve/main/test_wavs/ko.wav + wget https://huggingface.co/csukuangfj/sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/resolve/main/test_wavs/yue.wav + wget https://huggingface.co/csukuangfj/sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/resolve/main/test_wavs/zh.wav + popd + + ls -lh $d + tar cjvf $d.tar.bz2 $d + + - name: Release + uses: svenstaro/upload-release-action@v2 + with: + file_glob: true + file: ./*.tar.bz2 + overwrite: true + repo_name: k2-fsa/sherpa + repo_token: ${{ secrets.UPLOAD_GH_SHERPA_TOKEN }} + tag: asr-models + + - name: Publish ${{ matrix.model }} to huggingface + shell: bash + env: + 
HF_TOKEN: ${{ secrets.HF_TOKEN }} + run: | + src=sherpa-sense-voice-zh-en-ja-ko-yue-2025-01-06 + + git config --global user.email "csukuangfj@gmail.com" + git config --global user.name "Fangjun Kuang" + + export GIT_CLONE_PROTECTION_ACTIVE=false + + export GIT_LFS_SKIP_SMUDGE=1 + + rm -rf huggingface + git clone https://csukuangfj:$HF_TOKEN@huggingface.co/csukuangfj/$src huggingface + + rm -rf huggingface/* + + cp -av $src/* ./huggingface/ + + cd huggingface + + git status + ls -lh + git lfs track "*.pt*" + + git add . + git commit -m "upload $src" || true + git push https://csukuangfj:$HF_TOKEN@huggingface.co/csukuangfj/$src main || true + + - name: Publish ${{ matrix.model }} to huggingface + shell: bash + env: + HF_TOKEN: ${{ secrets.HF_TOKEN }} + run: | + src=sherpa-sense-voice-zh-en-ja-ko-yue-2025-01-06 + + git config --global user.email "csukuangfj@gmail.com" + git config --global user.name "Fangjun Kuang" + + export GIT_CLONE_PROTECTION_ACTIVE=false + + export GIT_LFS_SKIP_SMUDGE=1 + + rm -rf huggingface + git clone https://csukuangfj:$HF_TOKEN@huggingface.co/k2-fsa/sherpa-models huggingface + + cp -av $src.tar.bz2 ./huggingface/non-streaming-asr + + cd huggingface + + git status + ls -lh + git lfs track "*.tar.bz2*" + + git add . + git commit -m "upload $src" || true + git push https://csukuangfj:$HF_TOKEN@huggingface.co/k2-fsa/sherpa-models main || true + diff --git a/.github/workflows/export-silero-vad.yaml b/.github/workflows/export-silero-vad.yaml new file mode 100644 index 000000000..76dbc0346 --- /dev/null +++ b/.github/workflows/export-silero-vad.yaml @@ -0,0 +1,111 @@ +name: export-silero-vad + +on: + push: + branches: + - export-silero-vad + workflow_dispatch: + +concurrency: + group: export-silero-vad-${{ github.ref }} + cancel-in-progress: true + +jobs: + export-silero-vad: + if: github.repository_owner == 'k2-fsa' || github.repository_owner == 'csukuangfj' + name: export ${{ matrix.model }} + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + os: [macos-latest] + python-version: ["3.10"] + model: ['v4'] + + steps: + - uses: actions/checkout@v4 + + - name: Setup Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + + - name: Install pyannote + shell: bash + run: | + pip install torch==1.13.0 torchaudio==0.13.0 soundfile librosa numpy==1.26.4 + + - name: Export ${{ matrix.model }} + shell: bash + run: | + pushd scripts/silero-vad + model=${{ matrix.model }} + ./run-$model.sh + python3 ./export-$model.py + ls -lh + + + - name: Test ${{ matrix.model }} + shell: bash + run: | + pushd scripts/silero-vad + + model=${{ matrix.model }} + python3 ./test-$model.py + ls -lh + + - name: Test ${{ matrix.model }} batch + shell: bash + run: | + pushd scripts/silero-vad + + model=${{ matrix.model }} + python3 ./test-$model-batch.py + ls -lh + + - name: Collect results + shell: bash + run: | + cp scripts/silero-vad/*.pt ./ + + - name: Release + uses: svenstaro/upload-release-action@v2 + with: + file_glob: true + file: ./*.pt + overwrite: true + repo_name: k2-fsa/sherpa + repo_token: ${{ secrets.UPLOAD_GH_SHERPA_TOKEN }} + tag: vad-models + + - name: Publish ${{ matrix.model }} to huggingface + shell: bash + env: + HF_TOKEN: ${{ secrets.HF_TOKEN }} + run: | + model=${{ matrix.model }} + + git config --global user.email "csukuangfj@gmail.com" + git config --global user.name "Fangjun Kuang" + + export GIT_CLONE_PROTECTION_ACTIVE=false + + export GIT_LFS_SKIP_SMUDGE=1 + + rm -rf huggingface + git clone 
https://csukuangfj:$HF_TOKEN@huggingface.co/k2-fsa/sherpa-models huggingface + + mkdir -p ./huggingface/vad + + cp -av *.pt ./huggingface/vad + + cd huggingface + + git status + ls -lh + git lfs track "*.pt*" + + git add . + git commit -m "upload $src" || true + git push https://csukuangfj:$HF_TOKEN@huggingface.co/k2-fsa/sherpa-models main || true + diff --git a/.github/workflows/export-whisper.yml b/.github/workflows/export-whisper.yml new file mode 100644 index 000000000..224b98ed6 --- /dev/null +++ b/.github/workflows/export-whisper.yml @@ -0,0 +1,168 @@ +name: export-whisper + +on: + push: + branches: + - export-whisper + workflow_dispatch: + +concurrency: + group: export-whisper-${{ github.ref }} + cancel-in-progress: true + +jobs: + export-whisper: + if: github.repository_owner == 'k2-fsa' || github.repository_owner == 'csukuangfj' + name: export whisper ${{ matrix.model }} + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + os: [macos-latest] + model: ["turbo", "distil-medium.en", "distil-small.en", "tiny.en", "base.en", "small.en", "medium.en", "tiny", "base", "small", "medium", "medium-aishell", "large", "large-v1", "large-v2", "large-v3", "distil-large-v2"] + python-version: ["3.10"] + + steps: + - uses: actions/checkout@v4 + + - name: Setup Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + + - name: Install dependencies + shell: bash + run: | + python3 -m pip install torch==1.13.0 torchaudio==0.13.0 -f https://download.pytorch.org/whl/cpu/torch_stable.html numpy==1.26.4 kaldi-native-fbank + python3 -m pip install -U openai-whisper + python3 -m pip install onnx soundfile librosa + + - name: Download model files for ${{ matrix.model }} + shell: bash + run: | + pushd scripts/whisper + + model=${{ matrix.model }} + echo "model: $model" + if [[ $model == distil-medium.en ]]; then + wget -q -O distil-medium-en-original-model.bin https://huggingface.co/distil-whisper/distil-medium.en/resolve/main/original-model.bin + ls -lh + elif [[ $model == distil-large-v2 ]]; then + wget -q -O distil-large-v2-original-model.bin https://huggingface.co/distil-whisper/distil-large-v2/resolve/main/original-model.bin + ls -lh + elif [[ $model == distil-small.en ]]; then + wget -q -O distil-small-en-original-model.bin https://huggingface.co/distil-whisper/distil-small.en/resolve/main/original-model.bin + ls -lh + elif [[ $model == medium-aishell ]]; then + wget -q -O medium-aishell.pt https://huggingface.co/yuekai/icefall_asr_aishell_whisper/resolve/main/exp_medium/whisper-medium-aishell1-epoch-10-avg-4.pt + ls -lh + fi + + - name: Export ${{ matrix.model }} + shell: bash + run: | + pushd scripts/whisper + export name=${{ matrix.model }} + ./run.sh + + - name: Test ${{ matrix.model }} + shell: bash + run: | + pushd scripts/whisper + curl -SL -O https://huggingface.co/csukuangfj/sherpa-onnx-streaming-zipformer-en-2023-02-21/resolve/main/test_wavs/0.wav + ./test.py + + - name: Collect results + shell: bash + run: | + d=sherpa-whisper-${{ matrix.model }} + mkdir $d + mv -v scripts/whisper/model.pt $d/ + mv -v scripts/whisper/tokens.txt $d/ + mv -v scripts/whisper/README.md $d/ + + pushd $d + mkdir test_wavs + cd test_wavs + curl -SL -O https://huggingface.co/csukuangfj/sherpa-onnx-streaming-zipformer-en-2023-02-21/resolve/main/test_wavs/0.wav + curl -SL -O https://huggingface.co/csukuangfj/sherpa-onnx-streaming-zipformer-en-2023-02-21/resolve/main/test_wavs/1.wav + curl -SL -O 
https://huggingface.co/csukuangfj/sherpa-onnx-streaming-zipformer-en-2023-02-21/resolve/main/test_wavs/8k.wav + curl -SL -O https://huggingface.co/csukuangfj/sherpa-onnx-streaming-zipformer-en-2023-02-21/resolve/main/test_wavs/trans.txt + popd + + ls -lh $d + tar cjvf $d.tar.bz2 $d + echo "---" + ls -lh + + - name: Release + if: matrix.model != 'large' && matrix.model != 'large-v1' && matrix.model != 'large-v2' && matrix.model != 'large-v3' && matrix.model != 'medium-aishell' + uses: svenstaro/upload-release-action@v2 + with: + file_glob: true + file: ./*.tar.bz2 + overwrite: true + repo_name: k2-fsa/sherpa + repo_token: ${{ secrets.UPLOAD_GH_SHERPA_TOKEN }} + tag: asr-models + + - name: Publish ${{ matrix.model }} to huggingface + shell: bash + env: + HF_TOKEN: ${{ secrets.HF_TOKEN }} + run: | + src=sherpa-whisper-${{ matrix.model }} + + git config --global user.email "csukuangfj@gmail.com" + git config --global user.name "Fangjun Kuang" + + export GIT_CLONE_PROTECTION_ACTIVE=false + + export GIT_LFS_SKIP_SMUDGE=1 + + rm -rf huggingface + git clone https://csukuangfj:$HF_TOKEN@huggingface.co/csukuangfj/$src huggingface + + rm -rf huggingface/* + + cp -av $src/* ./huggingface/ + + cd huggingface + + git status + ls -lh + git lfs track "*.pt*" + + git add . + git commit -m "upload $src" || true + git push https://csukuangfj:$HF_TOKEN@huggingface.co/csukuangfj/$src main || true + + - name: Publish ${{ matrix.model }} to huggingface + shell: bash + env: + HF_TOKEN: ${{ secrets.HF_TOKEN }} + run: | + src=sherpa-whisper-${{ matrix.model }} + + git config --global user.email "csukuangfj@gmail.com" + git config --global user.name "Fangjun Kuang" + + export GIT_CLONE_PROTECTION_ACTIVE=false + + export GIT_LFS_SKIP_SMUDGE=1 + + rm -rf huggingface + git clone https://csukuangfj:$HF_TOKEN@huggingface.co/k2-fsa/sherpa-models huggingface + + cp -av $src.tar.bz2 ./huggingface/non-streaming-asr + + cd huggingface + + git status + ls -lh + git lfs track "*.tar.bz2*" + + git add . 
+ git commit -m "upload $src" || true + git push https://csukuangfj:$HF_TOKEN@huggingface.co/k2-fsa/sherpa-models main || true + diff --git a/.github/workflows/macos-cpu-wheels.yml b/.github/workflows/macos-cpu-wheels.yml new file mode 100644 index 000000000..ea5f549ed --- /dev/null +++ b/.github/workflows/macos-cpu-wheels.yml @@ -0,0 +1,108 @@ +name: build-wheels-cpu-macos + +on: + push: + tags: + - '*' + workflow_dispatch: + +concurrency: + group: build-wheels-cpu-macos-${{ github.ref }} + cancel-in-progress: true + +jobs: + generate_build_matrix: + # see https://github.com/pytorch/pytorch/pull/50633 + runs-on: macos-latest + outputs: + matrix: ${{ steps.set-matrix.outputs.matrix }} + steps: + - uses: actions/checkout@v4 + with: + fetch-depth: 0 + - name: Generating build matrix + id: set-matrix + run: | + # outputting for debugging purposes + python ./scripts/github_actions/generate_build_matrix.py --for-macos + MATRIX=$(python ./scripts/github_actions/generate_build_matrix.py --for-macos) + echo "::set-output name=matrix::${MATRIX}" + + build_wheels_macos_cpu: + needs: generate_build_matrix + name: ${{ matrix.torch }} ${{ matrix.python-version }} + runs-on: macos-latest + strategy: + fail-fast: false + matrix: + ${{ fromJson(needs.generate_build_matrix.outputs.matrix) }} + + steps: + - uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Setup Python + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + + # see https://cibuildwheel.readthedocs.io/en/stable/changelog/ + # for a list of versions + - name: Build wheels + shell: bash + run: + pip install -U pip + pip install cmake numpy setuptools wheel + + pip install torch==${{ matrix.torch}} -f https://download.pytorch.org/whl/cpu/torch_stable.html + + pip install k2==1.24.4.dev20240223+cpu.torch${{ matrix.torch }} -f https://k2-fsa.github.io/k2/cpu.html + + pip install kaldifeat==1.25.4.dev20240223+cpu.torch${{ matrix.torch }} -f https://csukuangfj.github.io/kaldifeat/cpu.html + + python3 setup.py bdist_wheel + + mkdir -p wheelhouse + + cp -v dist/* wheelhouse + + - name: Display wheels + shell: bash + run: | + ls -lh ./wheelhouse/ + + - name: Upload Wheel + uses: actions/upload-artifact@v4 + with: + name: torch-${{ matrix.torch }}-python-${{ matrix.python-version }}-macos-latest-cpu + path: wheelhouse/*.whl + + # https://huggingface.co/docs/hub/spaces-github-actions + - name: Publish to huggingface + if: github.repository_owner == 'csukuangfj' || github.repository_owner == 'k2-fsa' + env: + HF_TOKEN: ${{ secrets.HF_TOKEN }} + uses: nick-fields/retry@v2 + with: + max_attempts: 20 + timeout_seconds: 200 + shell: bash + command: | + git config --global user.email "csukuangfj@gmail.com" + git config --global user.name "Fangjun Kuang" + + rm -rf huggingface + export GIT_LFS_SKIP_SMUDGE=1 + + git clone https://huggingface.co/csukuangfj/sherpa huggingface + cd huggingface + git pull + + mkdir -p macos + cp -v ../wheelhouse/*.whl ./macos + git status + git lfs track "*.whl" + git add . 
+ git commit -m "upload macos wheel for torch ${{ matrix.torch }} python ${{ matrix.python-version }}" + git push https://csukuangfj:$HF_TOKEN@huggingface.co/csukuangfj/sherpa main diff --git a/.github/workflows/publish_to_pypi.yml b/.github/workflows/publish_to_pypi.yml index 5194680c9..1a845398b 100644 --- a/.github/workflows/publish_to_pypi.yml +++ b/.github/workflows/publish_to_pypi.yml @@ -4,19 +4,31 @@ on: push: tags: - '*' + paths: + - '.github/workflows/publish_to_pypi.yml' + - 'CMakeLists.txt' + - 'cmake/**' + - 'sherpa/csrc/**' + - 'sherpa/cpp_api/**' + - 'sherpa/python/**' + workflow_dispatch: + +concurrency: + group: pypi-${{ github.ref }} + cancel-in-progress: true jobs: pypi: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 with: fetch-depth: 0 - name: Setup Python - uses: actions/setup-python@v2 + uses: actions/setup-python@v5 with: - python-version: 3.8 + python-version: "3.10" - name: Install Python dependencies shell: bash diff --git a/.github/workflows/python_style_check.yml b/.github/workflows/python_style_check.yml index bd685110d..8aba03011 100644 --- a/.github/workflows/python_style_check.yml +++ b/.github/workflows/python_style_check.yml @@ -20,26 +20,40 @@ on: push: branches: - master + paths: + - '.github/workflows/python_style_check.yml' + - 'sherpa/python/**' + - 'sherpa/bin/**' pull_request: branches: - master + paths: + - '.github/workflows/python_style_check.yml' + - 'sherpa/python/**' + - 'sherpa/bin/**' + + workflow_dispatch: + +concurrency: + group: python_style_check-${{ github.ref }} + cancel-in-progress: true jobs: - style_check: + python_style_check: runs-on: ${{ matrix.os }} strategy: matrix: - os: [ubuntu-18.04, macos-10.15] - python-version: [3.7, 3.9] + os: [ubuntu-latest] + python-version: ["3.10"] fail-fast: false steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 with: fetch-depth: 0 - name: Setup Python ${{ matrix.python-version }} - uses: actions/setup-python@v1 + uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} diff --git a/.github/workflows/run-cpp-test.yaml b/.github/workflows/run-cpp-test.yaml index 9311aea3e..b02a43093 100644 --- a/.github/workflows/run-cpp-test.yaml +++ b/.github/workflows/run-cpp-test.yaml @@ -20,31 +20,56 @@ on: push: branches: - master + - cpp-sense-voice + paths: + - '.github/workflows/run-cpp-test.yaml' + - '.github/scripts/run-offline-transducer.sh' + - '.github/scripts/run-online-transducer.sh' + - '.github/scripts/run-offline-ctc.sh' + - '.github/scripts/run-offline-sense-voice.sh' + - 'cmake/**' + - 'sherpa/csrc/**' + - 'sherpa/cpp_api/**' pull_request: types: [labeled] + paths: + - '.github/workflows/run-cpp-test.yaml' + - '.github/scripts/run-offline-transducer.sh' + - '.github/scripts/run-online-transducer.sh' + - '.github/scripts/run-offline-ctc.sh' + - '.github/scripts/run-offline-sense-voice.sh' + - 'cmake/**' + - 'sherpa/csrc/**' + - 'sherpa/cpp_api/**' + workflow_dispatch: + +concurrency: + group: run_cpp_tests-${{ github.ref }} + cancel-in-progress: true jobs: run_cpp_tests: - if: github.event.label.name == 'ready' || github.event.label.name == 'cpp' || github.event_name == 'push' runs-on: ${{ matrix.os }} strategy: fail-fast: false matrix: - os: [ubuntu-latest, macos-latest] - torch: ["1.10.0", "1.6.0"] - python-version: ["3.7", "3.8", "3.9"] + os: [ubuntu-latest] + torch: ["2.1.2"] + python-version: ["3.10"] build_type: ["Release", "Debug"] - exclude: - - torch: "1.6.0" - python-version: "3.9" steps: - - uses: 
actions/checkout@v2 + - uses: actions/checkout@v4 with: fetch-depth: 0 + - name: ccache + uses: hendrikmuhs/ccache-action@v1.1 + with: + key: ${{ matrix.os }}-${{ matrix.torch }}-${{ matrix.python-version }}-${{ matrix.build_type }} + - name: Setup Python - uses: actions/setup-python@v2 + uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} @@ -60,57 +85,39 @@ jobs: - name: Install PyTorch ${{ matrix.torch }} shell: bash - if: startsWith(matrix.os, 'ubuntu') run: | - python3 -m pip install --upgrade pip kaldi_native_io - python3 -m pip install wheel twine typing_extensions - python3 -m pip install torch==${{ matrix.torch }} numpy -f https://download.pytorch.org/whl/cpu/torch_stable.html + sudo apt-get -qq install git-lfs tree sox + sox --version - python3 -m pip install k2==1.17.dev20220813+cpu.torch${{ matrix.torch }} -f https://k2-fsa.org/nightly/index.html + sudo apt-get -qq install -y libsnappy-dev libzzip-dev zlib1g-dev libboost-all-dev - python3 -m torch.utils.collect_env - - - name: Install PyTorch ${{ matrix.torch }} - shell: bash - if: startsWith(matrix.os, 'macos') - run: | - python3 -m pip install --upgrade pip kaldi_native_io + python3 -m pip install --upgrade pip kaldi_native_io sentencepiece>=0.1.96 python3 -m pip install wheel twine typing_extensions python3 -m pip install torch==${{ matrix.torch }} numpy -f https://download.pytorch.org/whl/cpu/torch_stable.html - python3 -m pip install k2==1.17.dev20220813+cpu.torch${{ matrix.torch }} -f https://k2-fsa.org/nightly/index.html + python3 -m pip install k2==1.24.4.dev20231220+cpu.torch${{ matrix.torch }} -f https://k2-fsa.github.io/k2/cpu.html + python3 -m pip install kaldifeat==1.25.3.dev20231221+cpu.torch${{ matrix.torch }} -f https://csukuangfj.github.io/kaldifeat/cpu.html python3 -m torch.utils.collect_env - - name: Cache kaldifeat - id: my-cache-2 - uses: actions/cache@v2 - with: - path: | - ~/tmp/kaldifeat - key: cache-tmp-${{ matrix.python-version }}-${{ matrix.os }}-${{ matrix.torch }} - - - name: Install kaldifeat - if: steps.my-cache-2.outputs.cache-hit != 'true' - shell: bash - run: | - .github/scripts/install-kaldifeat.sh - - name: Build sherpa shell: bash env: BUILD_TYPE: ${{ matrix.build_type }} run: | - export KALDIFEAT_INSTALL_PREFIX=$HOME/tmp/kaldifeat/build - echo $KALDIFEAT_INSTALL_PREFIX - ls -lh $KALDIFEAT_INSTALL_PREFIX - echo "Build type: $BUILD_TYPE" mkdir build cd build - cmake -DCMAKE_BUILD_TYPE=$BUILD_TYPE .. - make -j2 VERBOSE=1 + + cmake \ + -DCMAKE_CXX_STANDARD=17 \ + -DCMAKE_C_COMPILER_LAUNCHER=ccache \ + -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \ + -DCMAKE_BUILD_TYPE=$BUILD_TYPE \ + -DSHERPA_ENABLE_TESTS=ON .. 
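The ccache integration above is done by passing compiler-launcher flags to cmake; the same effect can also be obtained from the environment, which keeps the invocation shorter when several configurations are built in one job. A sketch, assuming CMake 3.17 or newer (the first release that honors these environment variables):

  # sketch only -- equivalent to the -DCMAKE_*_COMPILER_LAUNCHER=ccache flags used in this patch
  export CMAKE_C_COMPILER_LAUNCHER=ccache
  export CMAKE_CXX_COMPILER_LAUNCHER=ccache
  cmake -DCMAKE_CXX_STANDARD=17 -DCMAKE_BUILD_TYPE=$BUILD_TYPE -DSHERPA_ENABLE_TESTS=ON ..
  make -j4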
+ + make -j4 VERBOSE=1 ls -lh lib ls -lh bin @@ -123,10 +130,11 @@ jobs: ldd lib/libsherpa_core.so - ldd bin/sherpa + ldd bin/sherpa-offline readelf -d lib/libsherpa_core.so - readelf -d bin/sherpa + readelf -d bin/sherpa-offline + readelf -d bin/sherpa-online - name: Display dependencies if: startsWith(matrix.os, 'macos') @@ -137,19 +145,22 @@ jobs: otool -L lib/libsherpa_core.dylib otool -l lib/libsherpa_core.dylib - otool -L bin/sherpa - otool -l bin/sherpa + otool -L bin/sherpa-offline + otool -l bin/sherpa-offline + + otool -L bin/sherpa-online + otool -l bin/sherpa-online - name: Test sherpa shell: bash run: | cd build # test_log - SHERPA_LOG_LEVEL=TRACE ./bin/test_log - SHERPA_LOG_LEVEL=DEBUG ./bin/test_log - SHERPA_LOG_LEVEL=INFO ./bin/test_log - SHERPA_LOG_LEVEL=WARNING ./bin/test_log - SHERPA_LOG_LEVEL=ERROR ./bin/test_log + SHERPA_LOG_LEVEL=TRACE ./bin/test-log + SHERPA_LOG_LEVEL=DEBUG ./bin/test-log + SHERPA_LOG_LEVEL=INFO ./bin/test-log + SHERPA_LOG_LEVEL=WARNING ./bin/test-log + SHERPA_LOG_LEVEL=ERROR ./bin/test-log ctest --verbose --output-on-failure -E py # exclude Python tests @@ -160,298 +171,29 @@ jobs: ./bin/sherpa-version - ./bin/sherpa --help + ./bin/sherpa-offline --help - - - name: Download pretrained model and test-data + - name: Run offline whisper shell: bash run: | - git lfs install - git clone https://huggingface.co/csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13 + .github/scripts/run-offline-whisper.sh - - name: Test C++ API + - name: Run offline sense-voice shell: bash run: | - ./build/bin/test_decode_files \ - ./icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/exp/cpu_jit-torch-${{ matrix.torch }}.pt \ - ./icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/data/lang_bpe_500/tokens.txt \ - ./icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/test_wavs/1089-134686-0001.wav - - ./build/bin/test_decode_files \ - ./icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/exp/cpu_jit-torch-${{ matrix.torch }}.pt \ - ./icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/data/lang_bpe_500/tokens.txt \ - ./icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/test_wavs/1089-134686-0001.wav \ - ./icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/test_wavs/1221-135766-0001.wav \ - ./icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/test_wavs/1221-135766-0002.wav - - ./build/bin/test_decode_samples \ - ./icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/exp/cpu_jit-torch-${{ matrix.torch }}.pt \ - ./icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/data/lang_bpe_500/tokens.txt \ - ./icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/test_wavs/1089-134686-0001.wav - - ./build/bin/test_decode_samples \ - ./icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/exp/cpu_jit-torch-${{ matrix.torch }}.pt \ - ./icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/data/lang_bpe_500/tokens.txt \ - ./icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/test_wavs/1089-134686-0001.wav \ - ./icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/test_wavs/1221-135766-0001.wav \ - ./icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/test_wavs/1221-135766-0002.wav - - ./build/bin/test_decode_features \ - ./icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/exp/cpu_jit-torch-${{ matrix.torch }}.pt \ - 
./icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/data/lang_bpe_500/tokens.txt \ - ./icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/test_wavs/1089-134686-0001.wav - - ./build/bin/test_decode_features \ - ./icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/exp/cpu_jit-torch-${{ matrix.torch }}.pt \ - ./icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/data/lang_bpe_500/tokens.txt \ - ./icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/test_wavs/1089-134686-0001.wav \ - ./icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/test_wavs/1221-135766-0001.wav \ - ./icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/test_wavs/1221-135766-0002.wav - - - name: Test decoding a single wave - shell: bash - run: | - echo "Test greedy search" - - ./build/bin/sherpa \ - --decoding-method=greedy_search \ - --nn-model=./icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/exp/cpu_jit-torch-${{ matrix.torch }}.pt \ - --tokens=./icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/data/lang_bpe_500/tokens.txt \ - icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/test_wavs/1089-134686-0001.wav - - echo "Test modified_beam_search" - - ./build/bin/sherpa \ - --decoding-method=modified_beam_search \ - --num-active-paths=4 \ - --nn-model=./icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/exp/cpu_jit-torch-${{ matrix.torch }}.pt \ - --tokens=./icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/data/lang_bpe_500/tokens.txt \ - icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/test_wavs/1089-134686-0001.wav + .github/scripts/run-offline-sense-voice.sh - - name: Test decoding multiple waves + - name: Run online transducer shell: bash run: | - echo "Test greedy search" - - ./build/bin/sherpa \ - --decoding-method=greedy_search \ - --nn-model=./icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/exp/cpu_jit-torch-${{ matrix.torch }}.pt \ - --tokens=./icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/data/lang_bpe_500/tokens.txt \ - icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/test_wavs/1089-134686-0001.wav \ - icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/test_wavs/1221-135766-0001.wav \ - icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/test_wavs/1221-135766-0002.wav - - echo "Test modified_beam_search" - - ./build/bin/sherpa \ - --decoding-method=modified_beam_search \ - --num-active-paths=4 \ - --nn-model=./icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/exp/cpu_jit-torch-${{ matrix.torch }}.pt \ - --tokens=./icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/data/lang_bpe_500/tokens.txt \ - icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/test_wavs/1089-134686-0001.wav \ - icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/test_wavs/1221-135766-0001.wav \ - icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/test_wavs/1221-135766-0002.wav - - - name: Test decoding wav.scp - shell: bash - run: | - .github/scripts/generate_wav_scp.sh - - echo "Test greedy search" - ./build/bin/sherpa \ - --decoding-method=greedy_search \ - --nn-model=./icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/exp/cpu_jit-torch-${{ matrix.torch }}.pt \ - --tokens=./icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/data/lang_bpe_500/tokens.txt \ - --use-wav-scp=true \ - scp:wav.scp \ - 
ark,scp,t:results.ark,results.scp - - head results.scp results.ark + .github/scripts/run-online-transducer.sh - echo "Test modified_beam_search" - - ./build/bin/sherpa \ - --decoding-method=modified_beam_search \ - --nn-model=./icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/exp/cpu_jit-torch-${{ matrix.torch }}.pt \ - --tokens=./icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/data/lang_bpe_500/tokens.txt \ - --use-wav-scp=true \ - scp:wav.scp \ - ark,scp,t:results2.ark,results2.scp - - head results2.scp results2.ark - - - name: Test decoding feats.scp + - name: Run offline transducer shell: bash run: | - export PYTHONPATH=$HOME/tmp/kaldifeat/build/lib:$HOME/tmp/kaldifeat/kaldifeat/python:$PYTHONPATH - - .github/scripts/generate_feats_scp.py scp:wav.scp ark,scp:feats.ark,feats.scp + .github/scripts/run-offline-transducer.sh - echo "Test greedy search" - ./build/bin/sherpa \ - --decoding-method=greedy_search \ - --nn-model=./icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/exp/cpu_jit-torch-${{ matrix.torch }}.pt \ - --tokens=./icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/data/lang_bpe_500/tokens.txt \ - --use-feats-scp=true \ - scp:feats.scp \ - ark,scp,t:results3.ark,results3.scp - - head results3.scp results3.ark - - echo "Test modified_beam_search" - - ./build/bin/sherpa \ - --decoding-method=modified_beam_search \ - --nn-model=./icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/exp/cpu_jit-torch-${{ matrix.torch }}.pt \ - --tokens=./icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/data/lang_bpe_500/tokens.txt \ - --use-feats-scp=true \ - scp:feats.scp \ - ark,scp,t:results4.ark,results4.scp - - head results4.scp results4.ark - - - name: Download pretrained model and test-data (aishell) + - name: Run offline CTC shell: bash run: | - git lfs install - git clone https://huggingface.co/csukuangfj/icefall-aishell-pruned-transducer-stateless3-2022-06-20 - - - name: Test C++ API (aishell) - shell: bash - run: | - ./build/bin/test_decode_files \ - ./icefall-aishell-pruned-transducer-stateless3-2022-06-20/exp/cpu_jit-epoch-29-avg-5-torch-1.6.0.pt \ - ./icefall-aishell-pruned-transducer-stateless3-2022-06-20/data/lang_char/tokens.txt \ - ./icefall-aishell-pruned-transducer-stateless3-2022-06-20/test_wavs/BAC009S0764W0121.wav - - ./build/bin/test_decode_files \ - ./icefall-aishell-pruned-transducer-stateless3-2022-06-20/exp/cpu_jit-epoch-29-avg-5-torch-1.6.0.pt \ - ./icefall-aishell-pruned-transducer-stateless3-2022-06-20/data/lang_char/tokens.txt \ - ./icefall-aishell-pruned-transducer-stateless3-2022-06-20/test_wavs/BAC009S0764W0121.wav \ - ./icefall-aishell-pruned-transducer-stateless3-2022-06-20/test_wavs/BAC009S0764W0122.wav \ - ./icefall-aishell-pruned-transducer-stateless3-2022-06-20/test_wavs/BAC009S0764W0123.wav - - ./build/bin/test_decode_samples \ - ./icefall-aishell-pruned-transducer-stateless3-2022-06-20/exp/cpu_jit-epoch-29-avg-5-torch-1.6.0.pt \ - ./icefall-aishell-pruned-transducer-stateless3-2022-06-20/data/lang_char/tokens.txt \ - ./icefall-aishell-pruned-transducer-stateless3-2022-06-20/test_wavs/BAC009S0764W0121.wav - - ./build/bin/test_decode_samples \ - ./icefall-aishell-pruned-transducer-stateless3-2022-06-20/exp/cpu_jit-epoch-29-avg-5-torch-1.6.0.pt \ - ./icefall-aishell-pruned-transducer-stateless3-2022-06-20/data/lang_char/tokens.txt \ - ./icefall-aishell-pruned-transducer-stateless3-2022-06-20/test_wavs/BAC009S0764W0121.wav \ - 
./icefall-aishell-pruned-transducer-stateless3-2022-06-20/test_wavs/BAC009S0764W0122.wav \ - ./icefall-aishell-pruned-transducer-stateless3-2022-06-20/test_wavs/BAC009S0764W0123.wav - - ./build/bin/test_decode_features \ - ./icefall-aishell-pruned-transducer-stateless3-2022-06-20/exp/cpu_jit-epoch-29-avg-5-torch-1.6.0.pt \ - ./icefall-aishell-pruned-transducer-stateless3-2022-06-20/data/lang_char/tokens.txt \ - ./icefall-aishell-pruned-transducer-stateless3-2022-06-20/test_wavs/BAC009S0764W0121.wav - - ./build/bin/test_decode_features \ - ./icefall-aishell-pruned-transducer-stateless3-2022-06-20/exp/cpu_jit-epoch-29-avg-5-torch-1.6.0.pt \ - ./icefall-aishell-pruned-transducer-stateless3-2022-06-20/data/lang_char/tokens.txt \ - ./icefall-aishell-pruned-transducer-stateless3-2022-06-20/test_wavs/BAC009S0764W0121.wav \ - ./icefall-aishell-pruned-transducer-stateless3-2022-06-20/test_wavs/BAC009S0764W0122.wav \ - ./icefall-aishell-pruned-transducer-stateless3-2022-06-20/test_wavs/BAC009S0764W0123.wav - - - name: Test decoding a single wave (aishell) - shell: bash - run: | - echo "Test greedy search" - - ./build/bin/sherpa \ - --decoding-method=greedy_search \ - --nn-model=./icefall-aishell-pruned-transducer-stateless3-2022-06-20/exp/cpu_jit-epoch-29-avg-5-torch-1.6.0.pt \ - --tokens=./icefall-aishell-pruned-transducer-stateless3-2022-06-20/data/lang_char/tokens.txt \ - icefall-aishell-pruned-transducer-stateless3-2022-06-20/test_wavs/BAC009S0764W0121.wav - - echo "Test modified_beam_search" - - ./build/bin/sherpa \ - --decoding-method=modified_beam_search \ - --nn-model=./icefall-aishell-pruned-transducer-stateless3-2022-06-20/exp/cpu_jit-epoch-29-avg-5-torch-1.6.0.pt \ - --tokens=./icefall-aishell-pruned-transducer-stateless3-2022-06-20/data/lang_char/tokens.txt \ - icefall-aishell-pruned-transducer-stateless3-2022-06-20/test_wavs/BAC009S0764W0121.wav - - - name: Test decoding multiple waves (aishell) - shell: bash - run: | - echo "Test greedy search" - - ./build/bin/sherpa \ - --decoding-method=greedy_search \ - --nn-model=./icefall-aishell-pruned-transducer-stateless3-2022-06-20/exp/cpu_jit-epoch-29-avg-5-torch-1.6.0.pt \ - --tokens=./icefall-aishell-pruned-transducer-stateless3-2022-06-20/data/lang_char/tokens.txt \ - icefall-aishell-pruned-transducer-stateless3-2022-06-20/test_wavs/BAC009S0764W0121.wav \ - icefall-aishell-pruned-transducer-stateless3-2022-06-20/test_wavs/BAC009S0764W0122.wav \ - icefall-aishell-pruned-transducer-stateless3-2022-06-20/test_wavs/BAC009S0764W0123.wav - - echo "Test modified_beam_search" - - ./build/bin/sherpa \ - --decoding-method=modified_beam_search \ - --num-active-paths=4 \ - --nn-model=./icefall-aishell-pruned-transducer-stateless3-2022-06-20/exp/cpu_jit-epoch-29-avg-5-torch-1.6.0.pt \ - --tokens=./icefall-aishell-pruned-transducer-stateless3-2022-06-20/data/lang_char/tokens.txt \ - icefall-aishell-pruned-transducer-stateless3-2022-06-20/test_wavs/BAC009S0764W0121.wav \ - icefall-aishell-pruned-transducer-stateless3-2022-06-20/test_wavs/BAC009S0764W0122.wav \ - icefall-aishell-pruned-transducer-stateless3-2022-06-20/test_wavs/BAC009S0764W0123.wav - - - name: Test decoding wav.scp (aishell) - shell: bash - run: | - .github/scripts/generate_wav_scp_aishell.sh - - echo "Test greedy search" - - ./build/bin/sherpa \ - --decoding-method=greedy_search \ - --use-wav-scp=true \ - --nn-model=./icefall-aishell-pruned-transducer-stateless3-2022-06-20/exp/cpu_jit-epoch-29-avg-5-torch-1.6.0.pt \ - 
--tokens=./icefall-aishell-pruned-transducer-stateless3-2022-06-20/data/lang_char/tokens.txt \ - scp:wav_aishell.scp \ - ark,scp,t:results-aishell.ark,results-aishell.scp - - head results-aishell.scp results-aishell.ark - - echo "Test modified_beam_search" - - ./build/bin/sherpa \ - --decoding-method=modified_beam_search \ - --use-wav-scp=true \ - --nn-model=./icefall-aishell-pruned-transducer-stateless3-2022-06-20/exp/cpu_jit-epoch-29-avg-5-torch-1.6.0.pt \ - --tokens=./icefall-aishell-pruned-transducer-stateless3-2022-06-20/data/lang_char/tokens.txt \ - scp:wav_aishell.scp \ - ark,scp,t:results2-aishell.ark,results2-aishell.scp - - head results2-aishell.scp results2-aishell.ark - - - name: Test decoding feats.scp (aishell) - shell: bash - run: | - export PYTHONPATH=$HOME/tmp/kaldifeat/build/lib:$HOME/tmp/kaldifeat/kaldifeat/python:$PYTHONPATH - - .github/scripts/generate_feats_scp.py scp:wav_aishell.scp ark,scp:feats_aishell.ark,feats_aishell.scp - - ./build/bin/sherpa \ - --decoding-method=greedy_search \ - --use-feats-scp=true \ - --nn-model=./icefall-aishell-pruned-transducer-stateless3-2022-06-20/exp/cpu_jit-epoch-29-avg-5-torch-1.6.0.pt \ - --tokens=./icefall-aishell-pruned-transducer-stateless3-2022-06-20/data/lang_char/tokens.txt \ - scp:feats_aishell.scp \ - ark,scp,t:results3-aishell.ark,results3-aishell.scp - - head results3-aishell.scp results3-aishell.ark - - echo "Test modified_beam_search" - - ./build/bin/sherpa \ - --decoding-method=modified_beam_search \ - --use-feats-scp=true \ - --nn-model=./icefall-aishell-pruned-transducer-stateless3-2022-06-20/exp/cpu_jit-epoch-29-avg-5-torch-1.6.0.pt \ - --tokens=./icefall-aishell-pruned-transducer-stateless3-2022-06-20/data/lang_char/tokens.txt \ - scp:feats_aishell.scp \ - ark,scp,t:results4-aishell.ark,results4-aishell.scp - - head results4-aishell.scp results4-aishell.ark + .github/scripts/run-offline-ctc.sh diff --git a/.github/workflows/run-cpp-websocket-test.yaml b/.github/workflows/run-cpp-websocket-test.yaml new file mode 100644 index 000000000..3e91d76fb --- /dev/null +++ b/.github/workflows/run-cpp-websocket-test.yaml @@ -0,0 +1,170 @@ +name: Run C++ websocket tests + +on: + push: + branches: + - master + paths: + - '.github/workflows/run-cpp-websocket-test.yaml' + - 'CMakeLists.txt' + - 'cmake/**' + - 'sherpa/csrc/**' + - 'sherpa/cpp_api/**' + pull_request: + types: [labeled] + paths: + - '.github/workflows/run-cpp-websocket-test.yaml' + - 'CMakeLists.txt' + - 'cmake/**' + - 'sherpa/csrc/**' + - 'sherpa/cpp_api/**' + workflow_dispatch: + +concurrency: + group: run_cpp_websocket_tests-${{ github.ref }} + cancel-in-progress: true + +jobs: + run_cpp_websocket_tests: + if: github.event.label.name == 'ready' || github.event.label.name == 'websocket' || github.event_name == 'push' + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + os: [ubuntu-latest] + torch: ["1.13.1"] + python-version: ["3.10"] + build_type: ["Release", "Debug"] + + steps: + - uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: ccache + uses: hendrikmuhs/ccache-action@v1.1 + with: + key: ${{ matrix.os }}-${{ matrix.torch }}-${{ matrix.python-version }}-${{ matrix.build_type }} + + - name: Setup Python + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + + - name: Display gcc version + if: startsWith(matrix.os, 'ubuntu') + run: | + gcc --version + + - name: Display clang version + if: startsWith(matrix.os, 'macos') + run: | + clang --version + + - name: Install PyTorch ${{ matrix.torch }} + 
shell: bash + if: startsWith(matrix.os, 'ubuntu') + run: | + python3 -m pip install wheel twine typing_extensions + python3 -m pip install torch==${{ matrix.torch }} numpy -f https://download.pytorch.org/whl/cpu/torch_stable.html + + python3 -m pip install k2==1.24.4.dev20240223+cpu.torch${{ matrix.torch }} -f https://k2-fsa.github.io/k2/cpu.html + + + python3 -m torch.utils.collect_env + + - name: Install PyTorch ${{ matrix.torch }} + shell: bash + if: startsWith(matrix.os, 'macos') + run: | + python3 -m pip install wheel twine typing_extensions + python3 -m pip install torch==${{ matrix.torch }} numpy -f https://download.pytorch.org/whl/cpu/torch_stable.html + + python3 -m pip install k2==1.24.4.dev20240223+cpu.torch${{ matrix.torch }} -f https://k2-fsa.github.io/k2/cpu.html + + python3 -m torch.utils.collect_env + + - name: Cache kaldifeat + id: my-cache-2 + uses: actions/cache@v2 + with: + path: | + ~/tmp/kaldifeat + key: cache-tmp-${{ matrix.python-version }}-${{ matrix.os }}-${{ matrix.torch }} + + - name: Install kaldifeat + if: steps.my-cache-2.outputs.cache-hit != 'true' + shell: bash + run: | + .github/scripts/install-kaldifeat.sh + + - name: Build sherpa + shell: bash + env: + BUILD_TYPE: ${{ matrix.build_type }} + run: | + export KALDIFEAT_INSTALL_PREFIX=$HOME/tmp/kaldifeat/build + echo $KALDIFEAT_INSTALL_PREFIX + ls -lh $KALDIFEAT_INSTALL_PREFIX + + echo "Build type: $BUILD_TYPE" + + mkdir build + cd build + + cmake \ + -DCMAKE_C_COMPILER_LAUNCHER=ccache \ + -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \ + -DCMAKE_BUILD_TYPE=$BUILD_TYPE \ + -DSHERPA_ENABLE_TESTS=ON \ + -DSHERPA_ENABLE_WEBSOCKET=ON .. + + make -j2 VERBOSE=1 sherpa-offline-websocket-server sherpa-offline-websocket-client + + ls -lh lib + ls -lh bin + + - name: Download pretrained model and test-data + shell: bash + run: | + git lfs install + git clone https://huggingface.co/csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13 + + - name: Start the server + shell: bash + run: | + ./build/bin/sherpa-offline-websocket-server \ + --use-gpu=false \ + --port=6006 \ + --num-io-threads=1 \ + --num-work-threads=1 \ + --max-batch-size=2 \ + --nn-model=./icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/exp/cpu_jit-torch-${{ matrix.torch }}.pt \ + --tokens=./icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/data/lang_bpe_500/tokens.txt \ + --decoding-method=greedy_search \ + --doc-root=./sherpa/bin/web \ + --log-file=./log.txt & + + echo "Sleep 10 seconds to wait for the server startup" + sleep 10 + cat ./log.txt + + - name: Start the client + shell: bash + run: | + ./build/bin/sherpa-offline-websocket-client \ + --server-ip=127.0.0.1 \ + --server-port=6006 \ + ./icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/test_wavs/1089-134686-0001.wav + + ./build/bin/sherpa-offline-websocket-client \ + --server-ip=127.0.0.1 \ + --server-port=6006 \ + ./icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/test_wavs/1221-135766-0001.wav \ + + ./build/bin/sherpa-offline-websocket-client \ + --server-ip=127.0.0.1 \ + --server-port=6006 \ + ./icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/test_wavs/1221-135766-0002.wav + + cat ./log.txt diff --git a/.github/workflows/run-python-test.yaml b/.github/workflows/run-python-test.yaml new file mode 100644 index 000000000..a2167001e --- /dev/null +++ b/.github/workflows/run-python-test.yaml @@ -0,0 +1,120 @@ +name: Run Python tests + +on: + push: + branches: + - master + paths: + - 
'.github/workflows/run-python-test.yaml' + - '.github/scripts/run-python-test.sh' + - 'CMakeLists.txt' + - 'cmake/**' + - 'sherpa/csrc/**' + - 'sherpa/cpp_api/**' + - 'sherpa/python/**' + pull_request: + types: [labeled] + paths: + - '.github/workflows/run-python-test.yaml' + - '.github/scripts/run-python-test.sh' + - 'CMakeLists.txt' + - 'cmake/**' + - 'sherpa/csrc/**' + - 'sherpa/cpp_api/**' + - 'sherpa/python/**' + workflow_dispatch: + +concurrency: + group: run_python_tests-${{ github.ref }} + cancel-in-progress: true + +jobs: + run_python_tests: + if: github.event.label.name == 'ready' || github.event.label.name == 'python' || github.event_name == 'push' + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + os: [ubuntu-latest] + torch: ["1.13.1"] + torchaudio: ["0.13.1"] + python-version: ["3.10"] + build_type: ["Release"] + + steps: + - uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: ccache + uses: hendrikmuhs/ccache-action@v1.1 + with: + key: ${{ matrix.os }}-${{ matrix.torch }}-${{ matrix.python-version }}-${{ matrix.build_type }} + + - name: Setup Python + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + + - name: Display gcc version + run: | + gcc --version + + - name: Install PyTorch ${{ matrix.torch }} + shell: bash + run: | + sudo apt-get -qq install git-lfs tree sox + sox --version + + sudo apt-get install -y libsnappy-dev libzzip-dev zlib1g-dev libboost-all-dev + + python3 -m pip install --upgrade pip kaldi_native_io sentencepiece>=0.1.96 + python3 -m pip install wheel twine typing_extensions pytest + python3 -m pip install torch==${{ matrix.torch }} torchaudio==${{ matrix.torchaudio }} numpy -f https://download.pytorch.org/whl/cpu/torch_stable.html + + python3 -m pip install k2==1.24.4.dev20240223+cpu.torch${{ matrix.torch }} -f https://k2-fsa.github.io/k2/cpu.html + + python3 -m torch.utils.collect_env + + - name: Cache kaldifeat + id: my-cache-2 + uses: actions/cache@v2 + with: + path: | + ~/tmp/kaldifeat + key: cache-tmp-${{ matrix.python-version }}-${{ matrix.os }}-${{ matrix.torch }}-kaldifeat-v1.22 + + - name: Install kaldifeat + if: steps.my-cache-2.outputs.cache-hit != 'true' + shell: bash + run: | + .github/scripts/install-kaldifeat.sh + + - name: Build sherpa + shell: bash + env: + BUILD_TYPE: ${{ matrix.build_type }} + run: | + export KALDIFEAT_INSTALL_PREFIX=$HOME/tmp/kaldifeat/build + echo $KALDIFEAT_INSTALL_PREFIX + ls -lh $KALDIFEAT_INSTALL_PREFIX + + echo "Build type: $BUILD_TYPE" + export SHERPA_CMAKE_ARGS="-DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE=$BUILD_TYPE -DSHERPA_ENABLE_TESTS=ON" + + python3 setup.py bdist_wheel + ls -lh dist + pip install ./dist/*.whl + + - name: Display sherpa version + shell: bash + run: | + sherpa-version + + - name: Run Python tests + shell: bash + run: | + export PYTHONPATH=$HOME/tmp/kaldifeat/kaldifeat/python:$PYTHONPATH + export PYTHONPATH=$HOME/tmp/kaldifeat/build/lib:$PYTHONPATH + + .github/scripts/run-python-test.sh diff --git a/.github/workflows/run-streaming-conformer-test.yaml b/.github/workflows/run-streaming-conformer-test.yaml deleted file mode 100644 index 9ff7b923a..000000000 --- a/.github/workflows/run-streaming-conformer-test.yaml +++ /dev/null @@ -1,148 +0,0 @@ -# Copyright 2022 Xiaomi Corp. 
(author: Fangjun Kuang) - -# See ../../LICENSE for clarification regarding multiple authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -name: Run streaming conformer ASR tests - -on: - push: - branches: - - master - pull_request: - types: [labeled] - -jobs: - run_streaming_conformer_asr_tests: - if: github.event.label.name == 'ready' || github.event_name == 'push' - runs-on: ${{ matrix.os }} - strategy: - fail-fast: false - matrix: - os: [ubuntu-18.04, macos-latest] - torch: ["1.10.0", "1.6.0"] - torchaudio: ["0.10.0", "0.6.0"] - python-version: ["3.7", "3.8"] - decoding: ["greedy_search", "fast_beam_search", "fast_beam_search_nbest", "fast_beam_search_nbest_LG"] - exclude: - - torch: "1.10.0" - torchaudio: "0.6.0" - - torch: "1.6.0" - torchaudio: "0.10.0" - - steps: - - uses: actions/checkout@v2 - with: - fetch-depth: 0 - - - name: Setup Python - uses: actions/setup-python@v2 - with: - python-version: ${{ matrix.python-version }} - - - name: Install GCC 7 - if: startsWith(matrix.os, 'ubuntu') - run: | - sudo apt-get install -y gcc-7 g++-7 - echo "CC=/usr/bin/gcc-7" >> $GITHUB_ENV - echo "CXX=/usr/bin/g++-7" >> $GITHUB_ENV - - - name: Install PyTorch ${{ matrix.torch }} - shell: bash - if: startsWith(matrix.os, 'ubuntu') - run: | - python3 -m pip install --upgrade pip kaldi_native_io - python3 -m pip install wheel twine typing_extensions websockets sentencepiece>=0.1.96 - python3 -m pip install torch==${{ matrix.torch }}+cpu numpy -f https://download.pytorch.org/whl/cpu/torch_stable.html - - pip install k2==1.16.dev20220621+cpu.torch${{ matrix.torch }} -f https://k2-fsa.org/nightly/index.html - if [[ ${{ matrix.torchaudio }} == "0.10.0" ]]; then - pip install torchaudio==${{ matrix.torchaudio }}+cpu -f https://download.pytorch.org/whl/cpu/torch_stable.html - else - pip install torchaudio==${{ matrix.torchaudio }} - fi - - python3 -m torch.utils.collect_env - - - - name: Install PyTorch ${{ matrix.torch }} - shell: bash - if: startsWith(matrix.os, 'macos') - run: | - python3 -m pip install --upgrade pip kaldi_native_io - python3 -m pip install wheel twine typing_extensions websockets sentencepiece>=0.1.96 - python3 -m pip install torch==${{ matrix.torch }} torchaudio==${{ matrix.torchaudio }} numpy -f https://download.pytorch.org/whl/cpu/torch_stable.html - - pip install k2==1.16.dev20220621+cpu.torch${{ matrix.torch }} -f https://k2-fsa.org/nightly/index.html - - python3 -m torch.utils.collect_env - - - - name: Cache kaldifeat - id: my-cache-2 - uses: actions/cache@v2 - with: - path: | - ~/tmp/kaldifeat - key: cache-tmp-${{ matrix.python-version }}-${{ matrix.os }}-${{ matrix.torch }} - - - name: Install kaldifeat - if: steps.my-cache-2.outputs.cache-hit != 'true' - shell: bash - run: | - .github/scripts/install-kaldifeat.sh - - - name: Install sherpa - shell: bash - run: | - export KALDIFEAT_INSTALL_PREFIX=$HOME/tmp/kaldifeat/build - echo $KALDIFEAT_INSTALL_PREFIX - ls -lh $KALDIFEAT_INSTALL_PREFIX - - python3 setup.py install - - - name: Download pretrained model and test-data - 
shell: bash - run: | - git lfs install - git clone https://huggingface.co/pkufool/icefall-asr-librispeech-pruned-stateless-streaming-conformer-rnnt4-2022-06-10 - - - name: Start server - shell: bash - run: | - export PYTHONPATH=~/tmp/kaldifeat/kaldifeat/python:$PYTHONPATH - export PYTHONPATH=~/tmp/kaldifeat/build/lib:$PYTHONPATH - - ./sherpa/bin/streaming_pruned_transducer_statelessX/streaming_server.py \ - --port 6006 \ - --max-batch-size 50 \ - --max-wait-ms 5 \ - --nn-pool-size 1 \ - --nn-model-filename ./icefall-asr-librispeech-pruned-stateless-streaming-conformer-rnnt4-2022-06-10/exp/cpu_jit-epoch-29-avg-6_torch-${{ matrix.torch }}.pt \ - --decoding-method ${{ matrix.decoding }} \ - --lang-dir ./icefall-asr-librispeech-pruned-stateless-streaming-conformer-rnnt4-2022-06-10/data/lang_bpe_500/ \ - --bpe-model-filename ./icefall-asr-librispeech-pruned-stateless-streaming-conformer-rnnt4-2022-06-10/data/lang_bpe_500/bpe.model & - - echo "Sleep 10 seconds to wait for the server startup" - sleep 10 - - - name: Start client - shell: bash - run: | - ./sherpa/bin/streaming_pruned_transducer_statelessX/streaming_client.py \ - --server-addr localhost \ - --server-port 6006 \ - ./icefall-asr-librispeech-pruned-stateless-streaming-conformer-rnnt4-2022-06-10/test_wavs/1089-134686-0001.wav \ - ./icefall-asr-librispeech-pruned-stateless-streaming-conformer-rnnt4-2022-06-10/test_wavs/1221-135766-0001.wav \ - ./icefall-asr-librispeech-pruned-stateless-streaming-conformer-rnnt4-2022-06-10/test_wavs/1221-135766-0002.wav diff --git a/.github/workflows/run-streaming-conv-emformer-test.yaml b/.github/workflows/run-streaming-conv-emformer-test.yaml deleted file mode 100644 index e4eb9fe59..000000000 --- a/.github/workflows/run-streaming-conv-emformer-test.yaml +++ /dev/null @@ -1,148 +0,0 @@ -# Copyright 2022 Xiaomi Corp. (author: Fangjun Kuang, -# Zengwei Yao) -# See ../../LICENSE for clarification regarding multiple authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
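Note: the streaming-server workflows deleted above all start the server in the background and then wait a fixed `sleep 10` before launching the client. A small polling loop is a more robust way to wait for startup; the sketch below is illustrative only and assumes the `nc` utility is available on the runner and that the server listens on port 6006 as configured in those steps.

    # Poll the server port instead of sleeping a fixed 10 seconds.
    # Assumes `nc` (netcat) is installed; 6006 matches the --port flag above.
    for i in $(seq 1 30); do
      if nc -z localhost 6006; then
        echo "server became ready after ${i} attempt(s)"
        break
      fi
      sleep 1
    done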
-# -name: Run streaming ConvEmformer ASR tests - -on: - push: - branches: - - master - pull_request: - types: [labeled] - -jobs: - run_streaming_conv_emformer_asr_tests: - if: github.event.label.name == 'ready' || github.event_name == 'push' - runs-on: ${{ matrix.os }} - strategy: - fail-fast: false - matrix: - os: [ubuntu-18.04, macos-latest] - torch: ["1.10.0", "1.6.0"] - torchaudio: ["0.10.0", "0.6.0"] - python-version: ["3.7", "3.8"] - decoding: ["greedy_search", "fast_beam_search", "fast_beam_search_nbest", "fast_beam_search_nbest_LG"] - exclude: - - torch: "1.10.0" - torchaudio: "0.6.0" - - torch: "1.6.0" - torchaudio: "0.10.0" - - steps: - - uses: actions/checkout@v2 - with: - fetch-depth: 0 - - - name: Setup Python - uses: actions/setup-python@v2 - with: - python-version: ${{ matrix.python-version }} - - - name: Install GCC 7 - if: startsWith(matrix.os, 'ubuntu') - run: | - sudo apt-get install -y gcc-7 g++-7 - echo "CC=/usr/bin/gcc-7" >> $GITHUB_ENV - echo "CXX=/usr/bin/g++-7" >> $GITHUB_ENV - - - name: Install PyTorch ${{ matrix.torch }} - shell: bash - if: startsWith(matrix.os, 'ubuntu') - run: | - python3 -m pip install --upgrade pip kaldi_native_io - python3 -m pip install wheel twine typing_extensions websockets sentencepiece>=0.1.96 - python3 -m pip install torch==${{ matrix.torch }}+cpu numpy -f https://download.pytorch.org/whl/cpu/torch_stable.html - - pip install k2==1.16.dev20220621+cpu.torch${{ matrix.torch }} -f https://k2-fsa.org/nightly/index.html - if [[ ${{ matrix.torchaudio }} == "0.10.0" ]]; then - pip install torchaudio==${{ matrix.torchaudio }}+cpu -f https://download.pytorch.org/whl/cpu/torch_stable.html - else - pip install torchaudio==${{ matrix.torchaudio }} - fi - - python3 -m torch.utils.collect_env - - - - name: Install PyTorch ${{ matrix.torch }} - shell: bash - if: startsWith(matrix.os, 'macos') - run: | - python3 -m pip install --upgrade pip kaldi_native_io - python3 -m pip install wheel twine typing_extensions websockets sentencepiece>=0.1.96 - python3 -m pip install torch==${{ matrix.torch }} torchaudio==${{ matrix.torchaudio }} numpy -f https://download.pytorch.org/whl/cpu/torch_stable.html - - pip install k2==1.16.dev20220621+cpu.torch${{ matrix.torch }} -f https://k2-fsa.org/nightly/index.html - - python3 -m torch.utils.collect_env - - - - name: Cache kaldifeat - id: my-cache-2 - uses: actions/cache@v2 - with: - path: | - ~/tmp/kaldifeat - key: cache-tmp-${{ matrix.python-version }}-${{ matrix.os }}-${{ matrix.torch }} - - - name: Install kaldifeat - if: steps.my-cache-2.outputs.cache-hit != 'true' - shell: bash - run: | - .github/scripts/install-kaldifeat.sh - - - name: Install sherpa - shell: bash - run: | - export KALDIFEAT_INSTALL_PREFIX=$HOME/tmp/kaldifeat/build - echo $KALDIFEAT_INSTALL_PREFIX - ls -lh $KALDIFEAT_INSTALL_PREFIX - - python3 setup.py install - - - name: Download pretrained model and test-data - shell: bash - run: | - git lfs install - git clone https://huggingface.co/Zengwei/icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05 - - - name: Start server - shell: bash - run: | - export PYTHONPATH=~/tmp/kaldifeat/kaldifeat/python:$PYTHONPATH - export PYTHONPATH=~/tmp/kaldifeat/build/lib:$PYTHONPATH - - ./sherpa/bin/conv_emformer_transducer_stateless2/streaming_server.py \ - --port 6006 \ - --max-batch-size 50 \ - --max-wait-ms 5 \ - --nn-pool-size 1 \ - --nn-model-filename ./icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/cpu-jit-epoch-30-avg-10-torch-${{ matrix.torch }}.pt \ - 
--decoding-method ${{ matrix.decoding }} \ - --lang-dir ./icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/data/lang_bpe_500/ \ - --bpe-model-filename ./icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/data/lang_bpe_500/bpe.model & - - echo "Sleep 10 seconds to wait for the server startup" - sleep 10 - - - name: Start client - shell: bash - run: | - ./sherpa/bin/conv_emformer_transducer_stateless2/streaming_client.py \ - --server-addr localhost \ - --server-port 6006 \ - ./icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/test_wavs/1089-134686-0001.wav \ - ./icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/test_wavs/1221-135766-0001.wav \ - ./icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/test_wavs/1221-135766-0002.wav diff --git a/.github/workflows/run-streaming-test-windows-cpu.yaml b/.github/workflows/run-streaming-test-windows-cpu.yaml deleted file mode 100644 index 3216feb0f..000000000 --- a/.github/workflows/run-streaming-test-windows-cpu.yaml +++ /dev/null @@ -1,110 +0,0 @@ -# Copyright 2022 Xiaomi Corp. (author: Fangjun Kuang) - -# See ../../LICENSE for clarification regarding multiple authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
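Note: the workflows deleted above expose a cached kaldifeat build to Python purely through PYTHONPATH. A quick import check before starting the server makes a stale or incomplete cache fail fast; this is only a sketch and mirrors the export paths used in those steps.

    # Same PYTHONPATH entries as in the "Start server" steps above.
    export PYTHONPATH=~/tmp/kaldifeat/kaldifeat/python:$PYTHONPATH
    export PYTHONPATH=~/tmp/kaldifeat/build/lib:$PYTHONPATH
    # Fail fast if the cached kaldifeat build cannot be imported.
    python3 -c "import kaldifeat; print('kaldifeat loaded from', kaldifeat.__file__)"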
-# -name: Run streaming ASR tests windows cpu - -on: - push: - branches: - - master - pull_request: - types: [labeled] - -jobs: - run_streaming_asr_tests_windows_cpu: - if: github.event.label.name == 'ready' || github.event_name == 'push' - runs-on: ${{ matrix.os }} - strategy: - fail-fast: false - matrix: - os: [windows-2019] - torch: ["1.10.0"] - torchaudio: ["0.10.0"] - python-version: ["3.7", "3.8", "3.9"] - decoding: ["greedy_search", "modified_beam_search", "fast_beam_search", "fast_beam_search_nbest", "fast_beam_search_nbest_LG"] - steps: - - uses: actions/checkout@v2 - with: - fetch-depth: 0 - - # see https://github.com/microsoft/setup-msbuild - - name: Add msbuild to PATH - uses: microsoft/setup-msbuild@v1.0.2 - - - name: Setup Python - uses: actions/setup-python@v2 - with: - python-version: ${{ matrix.python-version }} - - - name: Display Python version - run: python -c "import sys; print(sys.version)" - - - name: Install PyTorch ${{ matrix.torch }} - shell: bash - run: | - python3 -m pip install --upgrade pip kaldi_native_io - python3 -m pip install wheel twine typing_extensions websockets sentencepiece>=0.1.96 soundfile - python3 -m pip install torch==${{ matrix.torch }}+cpu torchaudio==${{ matrix.torchaudio }}+cpu numpy -f https://download.pytorch.org/whl/cpu/torch_stable.html - - python3 -m pip install k2==1.16.dev20220621+cpu.torch${{ matrix.torch }} -f https://k2-fsa.org/nightly/index.html - - python3 -m torch.utils.collect_env - - - - name: Install kaldifeat - shell: bash - run: | - python3 -m pip install --verbose kaldifeat - - - name: Install sherpa - shell: bash - run: | - python3 setup.py install - - - name: Download pretrained model and test-data - shell: bash - run: | - git lfs install - git clone https://huggingface.co/csukuangfj/icefall-asr-librispeech-pruned-stateless-emformer-rnnt2-2022-06-01 - - - name: Start server - shell: bash - run: | - export PYTHONPATH=~/tmp/kaldifeat/kaldifeat/python:$PYTHONPATH - export PYTHONPATH=~/tmp/kaldifeat/build/lib:$PYTHONPATH - - ./sherpa/bin/pruned_stateless_emformer_rnnt2/streaming_server.py \ - --decoding-method ${{ matrix.decoding }} \ - --port 6006 \ - --max-batch-size 50 \ - --max-wait-ms 5 \ - --nn-pool-size 1 \ - --nn-model-filename ./icefall-asr-librispeech-pruned-stateless-emformer-rnnt2-2022-06-01/exp/cpu_jit-epoch-39-avg-6-use-averaged-model-1.pt \ - --lang-dir ./icefall-asr-librispeech-pruned-stateless-emformer-rnnt2-2022-06-01/data/lang_bpe_500/ \ - --bpe-model-filename ./icefall-asr-librispeech-pruned-stateless-emformer-rnnt2-2022-06-01/data/lang_bpe_500/bpe.model & - - echo "Sleep 10 seconds to wait for the server startup" - sleep 10 - - - name: Start client - shell: bash - run: | - ./sherpa/bin/pruned_stateless_emformer_rnnt2/streaming_client.py \ - --server-addr localhost \ - --server-port 6006 \ - ./icefall-asr-librispeech-pruned-stateless-emformer-rnnt2-2022-06-01/test_wavs/1089-134686-0001.wav \ - ./icefall-asr-librispeech-pruned-stateless-emformer-rnnt2-2022-06-01/test_wavs/1221-135766-0001.wav \ - ./icefall-asr-librispeech-pruned-stateless-emformer-rnnt2-2022-06-01/test_wavs/1221-135766-0002.wav diff --git a/.github/workflows/run-streaming-test-windows-cuda.yaml b/.github/workflows/run-streaming-test-windows-cuda.yaml deleted file mode 100644 index 727f82c2a..000000000 --- a/.github/workflows/run-streaming-test-windows-cuda.yaml +++ /dev/null @@ -1,122 +0,0 @@ -# Copyright 2022 Xiaomi Corp. 
(author: Fangjun Kuang) - -# See ../../LICENSE for clarification regarding multiple authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -name: Run streaming ASR tests windows cuda - -on: - push: - branches: - - master - pull_request: - types: [labeled] - -jobs: - run_streaming_asr_tests_windows_cuda: - if: github.event.label.name == 'ready' || github.event_name == 'push' - runs-on: ${{ matrix.os }} - strategy: - fail-fast: false - matrix: - os: [windows-2019] - torch: ["1.7.1"] - torchaudio: ["0.7.2"] - cuda: ["10.1.243"] - python-version: ["3.7", "3.8"] - steps: - - uses: actions/checkout@v2 - with: - fetch-depth: 0 - - # see https://github.com/microsoft/setup-msbuild - - name: Add msbuild to PATH - uses: microsoft/setup-msbuild@v1.0.2 - - - name: Setup Python - uses: actions/setup-python@v2 - with: - python-version: ${{ matrix.python-version }} - - - name: Display Python version - run: python -c "import sys; print(sys.version)" - - # See https://github.com/Jimver/cuda-toolkit/blob/master/src/links/windows-links.ts - # for available CUDA versions - - uses: Jimver/cuda-toolkit@v0.2.7 - id: cuda-toolkit - with: - cuda: ${{ matrix.cuda }} - - - name: Display CUDA version - shell: bash - run: | - echo "Installed cuda version is: ${{ steps.cuda-toolkit.outputs.cuda }}" - echo "Cuda install location: ${{steps.cuda-toolkit.outputs.CUDA_PATH}}" - nvcc --version - - - name: Remove CUDA installation package - shell: bash - run: | - rm "C:/hostedtoolcache/windows/cuda_installer-windows/${{ matrix.cuda }}/x64/cuda_installer_${{ matrix.cuda }}.exe" - - - name: Download cuDNN - shell: bash - run: | - GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/csukuangfj/cudnn-for-windows - cd cudnn-for-windows - git lfs pull --include="cudnn-windows-x86_64-8.4.1.50_cuda11.6-archive.zip" - unzip cudnn-windows-x86_64-8.4.1.50_cuda11.6-archive.zip - rm cudnn-windows-x86_64-8.4.1.50_cuda11.6-archive.zip - ls -lh * - ls -lh */* - - echo "PWD: $PWD" - - - name: Install PyTorch ${{ matrix.torch }} - shell: bash - run: | - python3 -m pip install --upgrade pip kaldi_native_io - python3 -m pip install wheel twine typing_extensions websockets sentencepiece>=0.1.96 soundfile - python3 -m pip install torch==${{ matrix.torch }}+cu101 torchaudio==${{ matrix.torchaudio }} -f https://download.pytorch.org/whl/torch_stable.html numpy - - # python3 -m pip install k2==1.16.dev20220621+cuda10.1.torch${{ matrix.torch }} -f https://k2-fsa.org/nightly/index.html - python3 -m pip install k2==1.16.dev20220621+cpu.torch${{ matrix.torch }} -f https://k2-fsa.org/nightly/index.html - - python3 -c "import torch; print('torch version:', torch.__version__)" - - python3 -m torch.utils.collect_env - - - - name: Display CMake version - run: | - cmake --version - cmake --help - - - name: Install kaldifeat - shell: bash - run: | - echo "PWD: $PWD" - export KALDIFEAT_CMAKE_ARGS="-DCMAKE_BUILD_TYPE=$BUILD_TYPE -DCUDNN_INCLUDE_PATH=d:/a/sherpa/sherpa/cudnn-for-windows/cudnn-windows-x86_64-8.4.1.50_cuda11.6-archive/include 
-DCUDNN_LIBRARY_PATH=d:/a/sherpa/sherpa/cudnn-for-windows/cudnn-windows-x86_64-8.4.1.50_cuda11.6-archive/lib/cudnn.lib" - python3 -m pip install --verbose kaldifeat - - - name: Install sherpa - shell: bash - run: | - echo "PWD: $PWD" - export SHERPA_CMAKE_ARGS="-DCMAKE_BUILD_TYPE=$BUILD_TYPE -DCUDNN_INCLUDE_PATH=d:/a/sherpa/sherpa/cudnn-for-windows/cudnn-windows-x86_64-8.4.1.50_cuda11.6-archive/include -DCUDNN_LIBRARY_PATH=d:/a/sherpa/sherpa/cudnn-for-windows/cudnn-windows-x86_64-8.4.1.50_cuda11.6-archive/lib/cudnn.lib" - python3 setup.py install - - python3 -c "import sherpa; print(sherpa.__version__)" - # We only test that it compiles successfully on Windows with CUDA support diff --git a/.github/workflows/run-streaming-test-with-long-waves.yaml b/.github/workflows/run-streaming-test-with-long-waves.yaml deleted file mode 100644 index 42b4cb085..000000000 --- a/.github/workflows/run-streaming-test-with-long-waves.yaml +++ /dev/null @@ -1,170 +0,0 @@ -# Copyright 2022 Xiaomi Corp. (author: Fangjun Kuang) - -# See ../../LICENSE for clarification regarding multiple authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -name: Run streaming ASR tests with very long waves - -on: - push: - branches: - - master - pull_request: - types: [labeled] - -jobs: - run_streaming_asr_tests_with_long_test_waves: - if: github.event.label.name == 'ready' || github.event_name == 'push' - runs-on: ${{ matrix.os }} - strategy: - fail-fast: false - matrix: - os: [ubuntu-18.04] - torch: ["1.10.0", "1.6.0"] - torchaudio: ["0.10.0", "0.6.0"] - python-version: ["3.8"] - decoding: ["greedy_search", "modified_beam_search"] - exclude: - - torch: "1.10.0" - torchaudio: "0.6.0" - - torch: "1.6.0" - torchaudio: "0.10.0" - - torch: "1.6.0" - python-version: "3.9" - - steps: - - uses: actions/checkout@v2 - with: - fetch-depth: 0 - - - name: Setup Python - uses: actions/setup-python@v2 - with: - python-version: ${{ matrix.python-version }} - - - name: Install GCC 7 - if: startsWith(matrix.os, 'ubuntu') - run: | - sudo apt-get install -y gcc-7 g++-7 - echo "CC=/usr/bin/gcc-7" >> $GITHUB_ENV - echo "CXX=/usr/bin/g++-7" >> $GITHUB_ENV - - - name: Install PyTorch ${{ matrix.torch }} - shell: bash - if: startsWith(matrix.os, 'ubuntu') - run: | - python3 -m pip install --upgrade pip kaldi_native_io - python3 -m pip install wheel twine typing_extensions websockets sentencepiece>=0.1.96 - python3 -m pip install torch==${{ matrix.torch }}+cpu numpy -f https://download.pytorch.org/whl/cpu/torch_stable.html - - pip install k2==1.16.dev20220621+cpu.torch${{ matrix.torch }} -f https://k2-fsa.org/nightly/index.html - - python3 -m torch.utils.collect_env - - if [[ ${{ matrix.torchaudio }} == "0.10.0" ]]; then - pip install torchaudio==${{ matrix.torchaudio }}+cpu -f https://download.pytorch.org/whl/cpu/torch_stable.html - else - pip install torchaudio==${{ matrix.torchaudio }} - fi - - - name: Install PyTorch ${{ matrix.torch }} - shell: bash - if: startsWith(matrix.os, 'macos') - run: | - python3 -m pip install --upgrade pip 
kaldi_native_io - python3 -m pip install wheel twine typing_extensions websockets sentencepiece>=0.1.96 - python3 -m pip install torch==${{ matrix.torch }} torchaudio==${{ matrix.torchaudio }} numpy -f https://download.pytorch.org/whl/cpu/torch_stable.html - - pip install k2==1.16.dev20220621+cpu.torch${{ matrix.torch }} -f https://k2-fsa.org/nightly/index.html - - python3 -m torch.utils.collect_env - - - - name: Cache kaldifeat - id: my-cache-2 - uses: actions/cache@v2 - with: - path: | - ~/tmp/kaldifeat - key: cache-tmp-${{ matrix.python-version }}-${{ matrix.os }}-${{ matrix.torch }} - - - name: Install kaldifeat - if: steps.my-cache-2.outputs.cache-hit != 'true' - shell: bash - run: | - .github/scripts/install-kaldifeat.sh - - - name: Install sherpa - shell: bash - run: | - export KALDIFEAT_INSTALL_PREFIX=$HOME/tmp/kaldifeat/build - echo $KALDIFEAT_INSTALL_PREFIX - ls -lh $KALDIFEAT_INSTALL_PREFIX - - python3 setup.py install - - - name: Download pretrained model - shell: bash - run: | - git lfs install - git clone https://huggingface.co/csukuangfj/icefall-asr-librispeech-pruned-stateless-emformer-rnnt2-2022-06-01 - - - name: Download test data - shell: bash - run: | - git lfs install - git clone https://huggingface.co/csukuangfj/sherpa-long-audio-test-data - - - name: Start server - shell: bash - run: | - export PYTHONPATH=~/tmp/kaldifeat/kaldifeat/python:$PYTHONPATH - export PYTHONPATH=~/tmp/kaldifeat/build/lib:$PYTHONPATH - - ./sherpa/bin/pruned_stateless_emformer_rnnt2/streaming_server.py \ - --decoding-method ${{ matrix.decoding }} \ - --port 6006 \ - --max-batch-size 50 \ - --max-wait-ms 5 \ - --nn-pool-size 1 \ - --nn-model-filename ./icefall-asr-librispeech-pruned-stateless-emformer-rnnt2-2022-06-01/exp/cpu_jit-epoch-39-avg-6-use-averaged-model-1-torch-${{ matrix.torch }}.pt \ - --lang-dir ./icefall-asr-librispeech-pruned-stateless-emformer-rnnt2-2022-06-01/data/lang_bpe_500/ \ - --bpe-model-filename ./icefall-asr-librispeech-pruned-stateless-emformer-rnnt2-2022-06-01/data/lang_bpe_500/bpe.model & - - echo "Sleep 10 seconds to wait for the server startup" - sleep 10 - - - name: Start client (10-minutes.wav) - shell: bash - run: | - ./sherpa/bin/pruned_stateless_emformer_rnnt2/streaming_client.py \ - --server-addr localhost \ - --server-port 6006 \ - ./sherpa-long-audio-test-data/10-minutes.wav - - # - name: Start client (20-minutes.wav) - # shell: bash - # run: | - # ./sherpa/bin/pruned_stateless_emformer_rnnt2/streaming_client.py \ - # --server-addr localhost \ - # --server-port 6006 \ - # ./sherpa-long-audio-test-data/20-minutes.wav - # - # - name: Start client (30-minutes.wav) - # shell: bash - # run: | - # ./sherpa/bin/pruned_stateless_emformer_rnnt2/streaming_client.py \ - # --server-addr localhost \ - # --server-port 6006 \ - # ./sherpa-long-audio-test-data/30-minutes.wav diff --git a/.github/workflows/run-streaming-test.yaml b/.github/workflows/run-streaming-test.yaml deleted file mode 100644 index d1138ff15..000000000 --- a/.github/workflows/run-streaming-test.yaml +++ /dev/null @@ -1,149 +0,0 @@ -# Copyright 2022 Xiaomi Corp. (author: Fangjun Kuang) - -# See ../../LICENSE for clarification regarding multiple authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -name: Run streaming ASR tests - -on: - push: - branches: - - master - pull_request: - types: [labeled] - -jobs: - run_streaming_asr_tests: - if: github.event.label.name == 'ready' || github.event_name == 'push' - runs-on: ${{ matrix.os }} - strategy: - fail-fast: false - matrix: - os: [ubuntu-latest, macos-latest] - torch: ["1.10.0", "1.6.0"] - torchaudio: ["0.10.0", "0.6.0"] - python-version: ["3.7", "3.8"] - decoding: ["greedy_search", "modified_beam_search", "fast_beam_search", "fast_beam_search_nbest", "fast_beam_search_nbest_LG"] - exclude: - - torch: "1.10.0" - torchaudio: "0.6.0" - - torch: "1.6.0" - torchaudio: "0.10.0" - - steps: - - uses: actions/checkout@v2 - with: - fetch-depth: 0 - - - name: Setup Python - uses: actions/setup-python@v2 - with: - python-version: ${{ matrix.python-version }} - - - name: Install GCC 7 - if: startsWith(matrix.os, 'ubuntu') - run: | - sudo apt-get install -y gcc-7 g++-7 - echo "CC=/usr/bin/gcc-7" >> $GITHUB_ENV - echo "CXX=/usr/bin/g++-7" >> $GITHUB_ENV - - - name: Install PyTorch ${{ matrix.torch }} - shell: bash - if: startsWith(matrix.os, 'ubuntu') - run: | - python3 -m pip install --upgrade pip kaldi_native_io - python3 -m pip install wheel twine typing_extensions websockets sentencepiece>=0.1.96 - python3 -m pip install torch==${{ matrix.torch }}+cpu numpy -f https://download.pytorch.org/whl/cpu/torch_stable.html - - pip install k2==1.16.dev20220621+cpu.torch${{ matrix.torch }} -f https://k2-fsa.org/nightly/index.html - - python3 -m torch.utils.collect_env - - - if [[ ${{ matrix.torchaudio }} == "0.10.0" ]]; then - pip install torchaudio==${{ matrix.torchaudio }}+cpu -f https://download.pytorch.org/whl/cpu/torch_stable.html - else - pip install torchaudio==${{ matrix.torchaudio }} - fi - - - name: Install PyTorch ${{ matrix.torch }} - shell: bash - if: startsWith(matrix.os, 'macos') - run: | - python3 -m pip install --upgrade pip kaldi_native_io - python3 -m pip install wheel twine typing_extensions websockets sentencepiece>=0.1.96 - python3 -m pip install torch==${{ matrix.torch }} torchaudio==${{ matrix.torchaudio }} numpy -f https://download.pytorch.org/whl/cpu/torch_stable.html - - pip install k2==1.16.dev20220621+cpu.torch${{ matrix.torch }} -f https://k2-fsa.org/nightly/index.html - - python3 -m torch.utils.collect_env - - - - name: Cache kaldifeat - id: my-cache-2 - uses: actions/cache@v2 - with: - path: | - ~/tmp/kaldifeat - key: cache-tmp-${{ matrix.python-version }}-${{ matrix.os }}-${{ matrix.torch }} - - - name: Install kaldifeat - if: steps.my-cache-2.outputs.cache-hit != 'true' - shell: bash - run: | - .github/scripts/install-kaldifeat.sh - - - name: Install sherpa - shell: bash - run: | - export KALDIFEAT_INSTALL_PREFIX=$HOME/tmp/kaldifeat/build - echo $KALDIFEAT_INSTALL_PREFIX - ls -lh $KALDIFEAT_INSTALL_PREFIX - - python3 setup.py install - - - name: Download pretrained model and test-data - shell: bash - run: | - git lfs install - git clone https://huggingface.co/csukuangfj/icefall-asr-librispeech-pruned-stateless-emformer-rnnt2-2022-06-01 - - - name: Start server - shell: bash - run: | - export 
PYTHONPATH=~/tmp/kaldifeat/kaldifeat/python:$PYTHONPATH - export PYTHONPATH=~/tmp/kaldifeat/build/lib:$PYTHONPATH - - ./sherpa/bin/pruned_stateless_emformer_rnnt2/streaming_server.py \ - --decoding-method ${{ matrix.decoding }} \ - --port 6006 \ - --max-batch-size 50 \ - --max-wait-ms 5 \ - --nn-pool-size 1 \ - --nn-model-filename ./icefall-asr-librispeech-pruned-stateless-emformer-rnnt2-2022-06-01/exp/cpu_jit-epoch-39-avg-6-use-averaged-model-1-torch-${{ matrix.torch }}.pt \ - --lang-dir ./icefall-asr-librispeech-pruned-stateless-emformer-rnnt2-2022-06-01/data/lang_bpe_500/ \ - --bpe-model-filename ./icefall-asr-librispeech-pruned-stateless-emformer-rnnt2-2022-06-01/data/lang_bpe_500/bpe.model & - - echo "Sleep 10 seconds to wait for the server startup" - sleep 10 - - - name: Start client - shell: bash - run: | - ./sherpa/bin/pruned_stateless_emformer_rnnt2/streaming_client.py \ - --server-addr localhost \ - --server-port 6006 \ - ./icefall-asr-librispeech-pruned-stateless-emformer-rnnt2-2022-06-01/test_wavs/1089-134686-0001.wav \ - ./icefall-asr-librispeech-pruned-stateless-emformer-rnnt2-2022-06-01/test_wavs/1221-135766-0001.wav \ - ./icefall-asr-librispeech-pruned-stateless-emformer-rnnt2-2022-06-01/test_wavs/1221-135766-0002.wav diff --git a/.github/workflows/run-test-aishell.yaml b/.github/workflows/run-test-aishell.yaml deleted file mode 100644 index e42e45ec8..000000000 --- a/.github/workflows/run-test-aishell.yaml +++ /dev/null @@ -1,151 +0,0 @@ -# Copyright 2022 Xiaomi Corp. (author: Fangjun Kuang) - -# See ../../LICENSE for clarification regarding multiple authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
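Note: the deleted workflows above pin a k2 nightly wheel to the exact CPU PyTorch build chosen by the matrix. A short post-install sanity check can confirm that the pairing actually imports; this is a sketch and only uses attributes that always exist on installed packages.

    # Verify the torch/k2 combination selected by the matrix imports cleanly.
    python3 -c "import torch; print('torch', torch.__version__)"
    python3 -c "import k2; print('k2 loaded from', k2.__file__)"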
-# -name: Test aishell offline asr - -on: - push: - branches: - - master - pull_request: - types: [labeled] - -jobs: - run_tests_aishell_offline_asr: - if: github.event.label.name == 'ready' || github.event_name == 'push' - runs-on: ${{ matrix.os }} - strategy: - fail-fast: false - matrix: - os: [ubuntu-18.04, macos-latest] - decoding: ["greedy_search", "modified_beam_search"] - torch: ["1.10.0", "1.6.0"] - torchaudio: ["0.10.0", "0.6.0"] - python-version: ["3.7", "3.8", "3.9"] - exclude: - - torch: "1.10.0" - torchaudio: "0.6.0" - - torch: "1.6.0" - torchaudio: "0.10.0" - - torch: "1.6.0" - python-version: "3.9" - steps: - - uses: actions/checkout@v2 - with: - fetch-depth: 0 - - - name: Setup Python - uses: actions/setup-python@v2 - with: - python-version: ${{ matrix.python-version }} - - - name: Install GCC 7 - if: startsWith(matrix.os, 'ubuntu') - run: | - sudo apt-get install -y gcc-7 g++-7 - echo "CC=/usr/bin/gcc-7" >> $GITHUB_ENV - echo "CXX=/usr/bin/g++-7" >> $GITHUB_ENV - - - name: Install PyTorch ${{ matrix.torch }} - shell: bash - if: startsWith(matrix.os, 'ubuntu') - run: | - python3 -m pip install --upgrade pip kaldi_native_io - python3 -m pip install wheel twine typing_extensions websockets sentencepiece>=0.1.96 - python3 -m pip install torch==${{ matrix.torch }}+cpu numpy -f https://download.pytorch.org/whl/cpu/torch_stable.html - - - pip install k2==1.16.dev20220621+cpu.torch${{ matrix.torch }} -f https://k2-fsa.org/nightly/index.html - if [[ ${{ matrix.torchaudio }} == "0.10.0" ]]; then - pip install torchaudio==${{ matrix.torchaudio }}+cpu -f https://download.pytorch.org/whl/cpu/torch_stable.html - else - pip install torchaudio==${{ matrix.torchaudio }} - fi - - python3 -m torch.utils.collect_env - - - - name: Install PyTorch ${{ matrix.torch }} - shell: bash - if: startsWith(matrix.os, 'macos') - run: | - python3 -m pip install --upgrade pip kaldi_native_io - python3 -m pip install wheel twine typing_extensions websockets sentencepiece>=0.1.96 - python3 -m pip install torch==${{ matrix.torch }} torchaudio==${{ matrix.torchaudio }} numpy -f https://download.pytorch.org/whl/cpu/torch_stable.html - - pip install k2==1.16.dev20220621+cpu.torch${{ matrix.torch }} -f https://k2-fsa.org/nightly/index.html - - python3 -m torch.utils.collect_env - - - - name: Cache kaldifeat - id: my-cache-2 - uses: actions/cache@v2 - with: - path: | - ~/tmp/kaldifeat - key: cache-tmp-${{ matrix.python-version }}-${{ matrix.os }}-${{ matrix.torch }} - - - name: Install kaldifeat - if: steps.my-cache-2.outputs.cache-hit != 'true' - shell: bash - run: | - .github/scripts/install-kaldifeat.sh - - - name: Install sherpa - shell: bash - run: | - export KALDIFEAT_INSTALL_PREFIX=$HOME/tmp/kaldifeat/build - echo $KALDIFEAT_INSTALL_PREFIX - ls -lh $KALDIFEAT_INSTALL_PREFIX - - python3 setup.py install - - - name: Download pretrained model and test-data - shell: bash - run: | - git lfs install - git clone https://huggingface.co/csukuangfj/icefall-aishell-pruned-transducer-stateless3-2022-06-20 - - - name: Start server - shell: bash - run: | - export PYTHONPATH=~/tmp/kaldifeat/kaldifeat/python:$PYTHONPATH - export PYTHONPATH=~/tmp/kaldifeat/build/lib:$PYTHONPATH - - sherpa/bin/pruned_transducer_statelessX/offline_server.py \ - --decoding-method ${{ matrix.decoding }} \ - --port 6006 \ - --num-device 0 \ - --max-batch-size 10 \ - --max-wait-ms 5 \ - --feature-extractor-pool-size 5 \ - --nn-pool-size 1 \ - --nn-model-filename 
./icefall-aishell-pruned-transducer-stateless3-2022-06-20/exp/cpu_jit-epoch-29-avg-5-torch-1.6.0.pt \ - --token-filename ./icefall-aishell-pruned-transducer-stateless3-2022-06-20/data/lang_char/tokens.txt & - - echo "Sleep 10 seconds to wait for the server startup" - sleep 10 - - - name: Start client - shell: bash - run: | - sherpa/bin/pruned_transducer_statelessX/offline_client.py \ - --server-addr localhost \ - --server-port 6006 \ - ./icefall-aishell-pruned-transducer-stateless3-2022-06-20/test_wavs/BAC009S0764W0121.wav \ - ./icefall-aishell-pruned-transducer-stateless3-2022-06-20/test_wavs/BAC009S0764W0122.wav \ - ./icefall-aishell-pruned-transducer-stateless3-2022-06-20/test_wavs/BAC009S0764W0123.wav diff --git a/.github/workflows/run-test-windows-cpu.yaml b/.github/workflows/run-test-windows-cpu.yaml deleted file mode 100644 index c4ff32a6e..000000000 --- a/.github/workflows/run-test-windows-cpu.yaml +++ /dev/null @@ -1,111 +0,0 @@ -# Copyright 2022 Xiaomi Corp. (author: Fangjun Kuang) - -# See ../../LICENSE for clarification regarding multiple authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -name: Run tests windows cpu - -on: - push: - branches: - - master - pull_request: - types: [labeled] - -jobs: - run_tests_windows_cpu: - if: github.event.label.name == 'ready' || github.event_name == 'push' - runs-on: ${{ matrix.os }} - strategy: - fail-fast: false - matrix: - os: [windows-2019] - decoding: ["greedy_search", "modified_beam_search"] - torch: ["1.10.0"] - torchaudio: ["0.10.0"] - python-version: [3.7, 3.8, 3.9] - steps: - - uses: actions/checkout@v2 - with: - fetch-depth: 0 - - # see https://github.com/microsoft/setup-msbuild - - name: Add msbuild to PATH - uses: microsoft/setup-msbuild@v1.0.2 - - - name: Setup Python - uses: actions/setup-python@v2 - with: - python-version: ${{ matrix.python-version }} - - - name: Display Python version - run: python -c "import sys; print(sys.version)" - - - name: Install PyTorch ${{ matrix.torch }} - shell: bash - run: | - python3 -m pip install --upgrade pip kaldi_native_io - python3 -m pip install wheel twine typing_extensions websockets sentencepiece>=0.1.96 soundfile - python3 -m pip install torch==${{ matrix.torch }}+cpu torchaudio==${{ matrix.torchaudio }}+cpu numpy -f https://download.pytorch.org/whl/cpu/torch_stable.html - - python3 -m pip install k2==1.16.dev20220621+cpu.torch${{ matrix.torch }} -f https://k2-fsa.org/nightly/index.html - - python3 -m torch.utils.collect_env - - - - name: Install kaldifeat - shell: bash - run: | - python3 -m pip install --verbose kaldifeat - - - name: Install sherpa - shell: bash - run: | - python3 setup.py install - - - name: Download pretrained model and test-data - shell: bash - run: | - git lfs install - git clone https://huggingface.co/csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13 - - - name: Start server - shell: bash - run: | - export PYTHONPATH=~/tmp/kaldifeat/kaldifeat/python:$PYTHONPATH - export PYTHONPATH=~/tmp/kaldifeat/build/lib:$PYTHONPATH - - 
sherpa/bin/pruned_transducer_statelessX/offline_server.py \ - --decoding-method ${{ matrix.decoding }} \ - --port 6006 \ - --num-device 0 \ - --max-batch-size 10 \ - --max-wait-ms 5 \ - --feature-extractor-pool-size 5 \ - --nn-pool-size 1 \ - --nn-model-filename ./icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/exp/cpu_jit.pt \ - --lang-dir ./icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/data/lang_bpe_500/ \ - --bpe-model-filename ./icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/data/lang_bpe_500/bpe.model & - echo "Sleep 10 seconds to wait for the server startup" - sleep 10 - - - name: Start client - shell: bash - run: | - sherpa/bin/pruned_transducer_statelessX/offline_client.py \ - --server-addr localhost \ - --server-port 6006 \ - icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/test_wavs/1089-134686-0001.wav \ - icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/test_wavs/1221-135766-0001.wav \ - icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/test_wavs/1221-135766-0002.wav diff --git a/.github/workflows/run-test-windows-cuda.yaml b/.github/workflows/run-test-windows-cuda.yaml deleted file mode 100644 index cf344ebc7..000000000 --- a/.github/workflows/run-test-windows-cuda.yaml +++ /dev/null @@ -1,120 +0,0 @@ -# Copyright 2022 Xiaomi Corp. (author: Fangjun Kuang) - -# See ../../LICENSE for clarification regarding multiple authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
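Note: the offline-server workflows above background the server with `&` and rely on the job teardown to stop it. When the same pattern is reproduced outside CI, it helps to record the PID and stop the server explicitly; the sketch below is illustrative and elides the model flags shown in the steps above.

    # Start the server in the background and remember its PID
    # (model/tokens flags elided; they are identical to the step above).
    sherpa/bin/pruned_transducer_statelessX/offline_server.py --port 6006 &
    server_pid=$!

    # ... run offline_client.py against localhost:6006 as in the step above ...

    # Stop the server explicitly once the client has finished.
    kill "$server_pid"
    wait "$server_pid" 2>/dev/null || true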
-# -name: Run tests windows cuda - -on: - push: - branches: - - master - pull_request: - types: [labeled] - -jobs: - run_tests_windows_cuda: - if: github.event.label.name == 'ready' || github.event_name == 'push' - runs-on: ${{ matrix.os }} - strategy: - fail-fast: false - matrix: - os: [windows-2019] - torch: ["1.7.1"] - torchaudio: ["0.7.2"] - cuda: ["10.1.243"] - python-version: [3.7, 3.8] - steps: - - uses: actions/checkout@v2 - with: - fetch-depth: 0 - - # see https://github.com/microsoft/setup-msbuild - - name: Add msbuild to PATH - uses: microsoft/setup-msbuild@v1.0.2 - - - name: Setup Python - uses: actions/setup-python@v2 - with: - python-version: ${{ matrix.python-version }} - - - name: Display Python version - run: python -c "import sys; print(sys.version)" - - # See https://github.com/Jimver/cuda-toolkit/blob/master/src/links/windows-links.ts - # for available CUDA versions - - uses: Jimver/cuda-toolkit@v0.2.7 - id: cuda-toolkit - with: - cuda: ${{ matrix.cuda }} - - - name: Display CUDA version - shell: bash - run: | - echo "Installed cuda version is: ${{ steps.cuda-toolkit.outputs.cuda }}" - echo "Cuda install location: ${{steps.cuda-toolkit.outputs.CUDA_PATH}}" - nvcc --version - - - name: Remove CUDA installation package - shell: bash - run: | - rm "C:/hostedtoolcache/windows/cuda_installer-windows/${{ matrix.cuda }}/x64/cuda_installer_${{ matrix.cuda }}.exe" - - - name: Download cuDNN - shell: bash - run: | - GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/csukuangfj/cudnn-for-windows - cd cudnn-for-windows - git lfs pull --include="cudnn-windows-x86_64-8.4.1.50_cuda11.6-archive.zip" - unzip cudnn-windows-x86_64-8.4.1.50_cuda11.6-archive.zip - rm cudnn-windows-x86_64-8.4.1.50_cuda11.6-archive.zip - ls -lh * - ls -lh */* - - echo "PWD: $PWD" - - - name: Install PyTorch ${{ matrix.torch }} - shell: bash - run: | - python3 -m pip install --upgrade pip kaldi_native_io - python3 -m pip install wheel twine typing_extensions websockets sentencepiece>=0.1.96 soundfile - python3 -m pip install torch==${{ matrix.torch }}+cu101 torchaudio==${{ matrix.torchaudio }} -f https://download.pytorch.org/whl/torch_stable.html numpy - - python3 -c "import torch; print('torch version:', torch.__version__)" - - # python3 -m pip install k2==1.16.dev20220621+cuda10.1.torch${{ matrix.torch }} -f https://k2-fsa.org/nightly/index.html - python3 -m pip install k2==1.16.dev20220621+cpu.torch${{ matrix.torch }} -f https://k2-fsa.org/nightly/index.html - python3 -m torch.utils.collect_env - - - - name: Display CMake version - run: | - cmake --version - cmake --help - - - name: Install kaldifeat - shell: bash - run: | - echo "PWD: $PWD" - export KALDIFEAT_CMAKE_ARGS="-DCMAKE_BUILD_TYPE=$BUILD_TYPE -DCUDNN_INCLUDE_PATH=d:/a/sherpa/sherpa/cudnn-for-windows/cudnn-windows-x86_64-8.4.1.50_cuda11.6-archive/include -DCUDNN_LIBRARY_PATH=d:/a/sherpa/sherpa/cudnn-for-windows/cudnn-windows-x86_64-8.4.1.50_cuda11.6-archive/lib/cudnn.lib" - python3 -m pip install --verbose kaldifeat - - - name: Install sherpa - shell: bash - run: | - echo "PWD: $PWD" - export SHERPA_CMAKE_ARGS="-DCMAKE_BUILD_TYPE=$BUILD_TYPE -DCUDNN_INCLUDE_PATH=d:/a/sherpa/sherpa/cudnn-for-windows/cudnn-windows-x86_64-8.4.1.50_cuda11.6-archive/include -DCUDNN_LIBRARY_PATH=d:/a/sherpa/sherpa/cudnn-for-windows/cudnn-windows-x86_64-8.4.1.50_cuda11.6-archive/lib/cudnn.lib" - python3 setup.py install - - python3 -c "import sherpa; print(sherpa.__version__)" diff --git a/.github/workflows/run-test.yaml b/.github/workflows/run-test.yaml deleted file 
mode 100644 index e7a7a2385..000000000 --- a/.github/workflows/run-test.yaml +++ /dev/null @@ -1,151 +0,0 @@ -# Copyright 2022 Xiaomi Corp. (author: Fangjun Kuang) - -# See ../../LICENSE for clarification regarding multiple authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -name: Run tests - -on: - push: - branches: - - master - pull_request: - types: [labeled] - -jobs: - run_tests: - if: github.event.label.name == 'ready' || github.event_name == 'push' - runs-on: ${{ matrix.os }} - strategy: - fail-fast: false - matrix: - os: [ubuntu-18.04, macos-latest] - decoding: ["greedy_search", "modified_beam_search"] - torch: ["1.10.0", "1.6.0"] - torchaudio: ["0.10.0", "0.6.0"] - python-version: ["3.7", "3.8", "3.9"] - exclude: - - torch: "1.10.0" - torchaudio: "0.6.0" - - torch: "1.6.0" - torchaudio: "0.10.0" - - torch: "1.6.0" - python-version: "3.9" - steps: - - uses: actions/checkout@v2 - with: - fetch-depth: 0 - - - name: Setup Python - uses: actions/setup-python@v2 - with: - python-version: ${{ matrix.python-version }} - - - name: Install GCC 7 - if: startsWith(matrix.os, 'ubuntu') - run: | - sudo apt-get install -y gcc-7 g++-7 - echo "CC=/usr/bin/gcc-7" >> $GITHUB_ENV - echo "CXX=/usr/bin/g++-7" >> $GITHUB_ENV - - - name: Install PyTorch ${{ matrix.torch }} - shell: bash - if: startsWith(matrix.os, 'ubuntu') - run: | - python3 -m pip install --upgrade pip kaldi_native_io - python3 -m pip install wheel twine typing_extensions websockets sentencepiece>=0.1.96 - python3 -m pip install torch==${{ matrix.torch }}+cpu numpy -f https://download.pytorch.org/whl/cpu/torch_stable.html - - python3 -m pip install k2==1.16.dev20220621+cpu.torch${{ matrix.torch }} -f https://k2-fsa.org/nightly/index.html - - if [[ ${{ matrix.torchaudio }} == "0.10.0" ]]; then - pip install torchaudio==${{ matrix.torchaudio }}+cpu -f https://download.pytorch.org/whl/cpu/torch_stable.html - else - pip install torchaudio==${{ matrix.torchaudio }} - fi - - python3 -m torch.utils.collect_env - - - - name: Install PyTorch ${{ matrix.torch }} - shell: bash - if: startsWith(matrix.os, 'macos') - run: | - python3 -m pip install --upgrade pip kaldi_native_io - python3 -m pip install wheel twine typing_extensions websockets sentencepiece>=0.1.96 - python3 -m pip install torch==${{ matrix.torch }} torchaudio==${{ matrix.torchaudio }} numpy -f https://download.pytorch.org/whl/cpu/torch_stable.html - - python3 -m pip install k2==1.16.dev20220621+cpu.torch${{ matrix.torch }} -f https://k2-fsa.org/nightly/index.html - - python3 -m torch.utils.collect_env - - - - name: Cache kaldifeat - id: my-cache-2 - uses: actions/cache@v2 - with: - path: | - ~/tmp/kaldifeat - key: cache-tmp-${{ matrix.python-version }}-${{ matrix.os }}-${{ matrix.torch }} - - - name: Install kaldifeat - if: steps.my-cache-2.outputs.cache-hit != 'true' - shell: bash - run: | - .github/scripts/install-kaldifeat.sh - - - name: Install sherpa - shell: bash - run: | - export KALDIFEAT_INSTALL_PREFIX=$HOME/tmp/kaldifeat/build - echo $KALDIFEAT_INSTALL_PREFIX - ls -lh 
$KALDIFEAT_INSTALL_PREFIX - - python3 setup.py install - - - name: Download pretrained model and test-data - shell: bash - run: | - git lfs install - git clone https://huggingface.co/csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13 - - - name: Start server - shell: bash - run: | - export PYTHONPATH=~/tmp/kaldifeat/kaldifeat/python:$PYTHONPATH - export PYTHONPATH=~/tmp/kaldifeat/build/lib:$PYTHONPATH - - sherpa/bin/pruned_transducer_statelessX/offline_server.py \ - --decoding-method ${{ matrix.decoding }} \ - --port 6006 \ - --num-device 0 \ - --max-batch-size 10 \ - --max-wait-ms 5 \ - --feature-extractor-pool-size 5 \ - --nn-pool-size 1 \ - --nn-model-filename ./icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/exp/cpu_jit-torch-${{ matrix.torch }}.pt \ - --lang-dir ./icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/data/lang_bpe_500/ \ - --bpe-model-filename ./icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/data/lang_bpe_500/bpe.model & - echo "Sleep 10 seconds to wait for the server startup" - sleep 10 - - - name: Start client - shell: bash - run: | - sherpa/bin/pruned_transducer_statelessX/offline_client.py \ - --server-addr localhost \ - --server-port 6006 \ - icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/test_wavs/1089-134686-0001.wav \ - icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/test_wavs/1221-135766-0001.wav \ - icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/test_wavs/1221-135766-0002.wav diff --git a/.github/workflows/run-wenetspeech-streaming-conformer-rnnt-test.yaml b/.github/workflows/run-wenetspeech-streaming-conformer-rnnt-test.yaml deleted file mode 100644 index 428da6f32..000000000 --- a/.github/workflows/run-wenetspeech-streaming-conformer-rnnt-test.yaml +++ /dev/null @@ -1,138 +0,0 @@ -# Copyright 2022 Xiaomi Corp. (author: Fangjun Kuang) - -# See ../../LICENSE for clarification regarding multiple authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
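Note: the Windows CUDA workflows removed in this diff hand cuDNN paths to the build through the KALDIFEAT_CMAKE_ARGS and SHERPA_CMAKE_ARGS environment variables before running setup.py. The same options can be given to a direct CMake configure, as the C++ build step earlier in this diff does; the sketch below is illustrative and reuses the runner-specific cuDNN paths from those workflows.

    # Equivalent direct CMake configure with the same cuDNN hints.
    mkdir -p build
    cd build
    cmake \
      -DCMAKE_BUILD_TYPE=Release \
      -DCUDNN_INCLUDE_PATH=d:/a/sherpa/sherpa/cudnn-for-windows/cudnn-windows-x86_64-8.4.1.50_cuda11.6-archive/include \
      -DCUDNN_LIBRARY_PATH=d:/a/sherpa/sherpa/cudnn-for-windows/cudnn-windows-x86_64-8.4.1.50_cuda11.6-archive/lib/cudnn.lib \
      ..
    make -j2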
-# -name: Run wenetspeech streaming conformer ASR tests - -on: - push: - branches: - - master - pull_request: - types: [labeled] - -jobs: - run_wenetspeech_streaming_conformer_asr_tests: - if: github.event.label.name == 'ready' || github.event_name == 'push' - runs-on: ${{ matrix.os }} - strategy: - fail-fast: false - matrix: - os: [ubuntu-18.04, macos-latest] - torch: ["1.11.0", "1.7.1"] - torchaudio: ["0.11.0", "0.7.2"] - python-version: ["3.7", "3.8"] - decoding: ["greedy_search", "fast_beam_search", "fast_beam_search_nbest", "fast_beam_search_nbest_LG"] - exclude: - - torch: "1.11.0" - torchaudio: "0.7.2" - - torch: "1.7.1" - torchaudio: "0.11.0" - - steps: - - uses: actions/checkout@v2 - with: - fetch-depth: 0 - - - name: Setup Python - uses: actions/setup-python@v2 - with: - python-version: ${{ matrix.python-version }} - - - name: Install GCC 7 - if: startsWith(matrix.os, 'ubuntu') - run: | - sudo apt-get install -y gcc-7 g++-7 - echo "CC=/usr/bin/gcc-7" >> $GITHUB_ENV - echo "CXX=/usr/bin/g++-7" >> $GITHUB_ENV - - name: Install PyTorch ${{ matrix.torch }} - shell: bash - if: startsWith(matrix.os, 'ubuntu') - run: | - python3 -m pip install --upgrade pip kaldi_native_io - python3 -m pip install wheel twine typing_extensions websockets sentencepiece>=0.1.96 - python3 -m pip install torch==${{ matrix.torch }}+cpu numpy -f https://download.pytorch.org/whl/cpu/torch_stable.html - pip install k2==1.16.dev20220621+cpu.torch${{ matrix.torch }} -f https://k2-fsa.org/nightly/index.html - if [[ ${{ matrix.torchaudio }} == "0.11.0" ]]; then - pip install torchaudio==${{ matrix.torchaudio }}+cpu -f https://download.pytorch.org/whl/cpu/torch_stable.html - else - pip install torchaudio==${{ matrix.torchaudio }} - fi - python3 -m torch.utils.collect_env - - - - name: Install PyTorch ${{ matrix.torch }} - shell: bash - if: startsWith(matrix.os, 'macos') - run: | - python3 -m pip install --upgrade pip kaldi_native_io - python3 -m pip install wheel twine typing_extensions websockets sentencepiece>=0.1.96 - python3 -m pip install torch==${{ matrix.torch }} torchaudio==${{ matrix.torchaudio }} numpy -f https://download.pytorch.org/whl/cpu/torch_stable.html - pip install k2==1.16.dev20220621+cpu.torch${{ matrix.torch }} -f https://k2-fsa.org/nightly/index.html - python3 -m torch.utils.collect_env - - - name: Cache kaldifeat - id: my-cache-2 - uses: actions/cache@v2 - with: - path: | - ~/tmp/kaldifeat - key: cache-tmp-${{ matrix.python-version }}-${{ matrix.os }}-${{ matrix.torch }} - - - name: Install kaldifeat - if: steps.my-cache-2.outputs.cache-hit != 'true' - shell: bash - run: | - .github/scripts/install-kaldifeat.sh - - name: Install sherpa - shell: bash - run: | - export KALDIFEAT_INSTALL_PREFIX=$HOME/tmp/kaldifeat/build - echo $KALDIFEAT_INSTALL_PREFIX - ls -lh $KALDIFEAT_INSTALL_PREFIX - - python3 setup.py install - - - name: Download pretrained model and test-data - shell: bash - run: | - git lfs install - git clone https://huggingface.co/luomingshuang/icefall_asr_wenetspeech_pruned_transducer_stateless5_streaming - - name: Start server - shell: bash - run: | - export PYTHONPATH=~/tmp/kaldifeat/kaldifeat/python:$PYTHONPATH - export PYTHONPATH=~/tmp/kaldifeat/build/lib:$PYTHONPATH - - ./sherpa/bin/streaming_pruned_transducer_statelessX/streaming_server.py \ - --port 6006 \ - --max-batch-size 50 \ - --max-wait-ms 5 \ - --nn-pool-size 1 \ - --nn-model-filename ./icefall_asr_wenetspeech_pruned_transducer_stateless5_streaming/exp/cpu_jit_epoch_5_avg_1_torch.${{ matrix.torch }}.pt \ - 
--decoding-method ${{ matrix.decoding }} \ - --lang-dir ./icefall_asr_wenetspeech_pruned_transducer_stateless5_streaming/data/lang_char/ \ - --token-filename ./icefall_asr_wenetspeech_pruned_transducer_stateless5_streaming/data/lang_char/tokens.txt & - echo "Sleep 10 seconds to wait for the server startup" - sleep 10 - - name: Start client - shell: bash - run: | - ./sherpa/bin/streaming_pruned_transducer_statelessX/streaming_client.py \ - --server-addr localhost \ - --server-port 6006 \ - ./icefall_asr_wenetspeech_pruned_transducer_stateless5_streaming/test_wavs/DEV_T0000000000.wav \ - ./icefall_asr_wenetspeech_pruned_transducer_stateless5_streaming/test_wavs/DEV_T0000000001.wav \ - ./icefall_asr_wenetspeech_pruned_transducer_stateless5_streaming/test_wavs/DEV_T0000000002.wav diff --git a/.github/workflows/style_check.yml b/.github/workflows/style_check.yml index 85c76b044..5e60e94a1 100644 --- a/.github/workflows/style_check.yml +++ b/.github/workflows/style_check.yml @@ -14,31 +14,44 @@ # See the License for the specific language governing permissions and # limitations under the License. -name: style_check +name: cpp_style_check on: push: branches: - master + paths: + - '.github/workflows/style_check.yml' + - 'sherpa/bin/**' + - 'sherpa/python/**' pull_request: branches: - master + paths: + - '.github/workflows/style_check.yml' + - 'sherpa/bin/**' + - 'sherpa/python/**' + workflow_dispatch: + +concurrency: + group: cpp_style_check-${{ github.ref }} + cancel-in-progress: true jobs: - style_check: - runs-on: ubuntu-18.04 + cpp_style_check: + runs-on: ubuntu-latest strategy: matrix: - python-version: [3.8] + python-version: ["3.10"] fail-fast: false steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 with: fetch-depth: 0 - name: Setup Python ${{ matrix.python-version }} - uses: actions/setup-python@v1 + uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} diff --git a/.github/workflows/test-conda-install.yaml b/.github/workflows/test-conda-install.yaml deleted file mode 100644 index bf9447e01..000000000 --- a/.github/workflows/test-conda-install.yaml +++ /dev/null @@ -1,189 +0,0 @@ -# Copyright 2022 Xiaomi Corp. (author: Fangjun Kuang) - -# See ../../LICENSE for clarification regarding multiple authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
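Note: the wenetspeech workflow above clones the whole model repository with LFS smudging enabled. The selective pattern used elsewhere in this diff (GIT_LFS_SKIP_SMUDGE plus a targeted `git lfs pull`) keeps the download small when only one checkpoint is needed; the sketch below substitutes torch 1.11.0 into the checkpoint name pattern from that workflow and is illustrative only.

    # Clone without downloading LFS payloads, then fetch just the needed file.
    GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/luomingshuang/icefall_asr_wenetspeech_pruned_transducer_stateless5_streaming
    cd icefall_asr_wenetspeech_pruned_transducer_stateless5_streaming
    git lfs pull --include "exp/cpu_jit_epoch_5_avg_1_torch.1.11.0.pt"
    cd ..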
-# -name: Test conda install - -on: - push: - branches: - - master - pull_request: - types: [labeled] - - -jobs: - test-conda-install: - if: github.event.label.name == 'ready' || github.event.label.name == 'cpp' || github.event_name == 'push' - runs-on: ${{ matrix.os }} - strategy: - fail-fast: false - matrix: - os: [ubuntu-latest, windows-latest] - torch: ["1.12.0"] - python-version: ["3.8"] - - steps: - - uses: conda-incubator/setup-miniconda@v2 - with: - auto-update-conda: true - python-version: ${{ matrix.python-version }} - activate-environment: sherpa - - - name: Display CONDA_PREFIX - shell: bash -l {0} - run: | - echo "CONDA_PREFIX: $CONDA_PREFIX" - ls -l $CONDA_PREFIX - - - name: Install conda dependencies - shell: bash -l {0} - run: | - conda install -c k2-fsa -c k2-fsa-sherpa -c kaldifeat -c kaldi_native_io -c pytorch k2 sherpa kaldifeat kaldi_native_io pytorch=${{ matrix.torch }} python=${{ matrix.python-version }} - - - name: Display sherpa version - if: startsWith(matrix.os, 'ubuntu') - shell: bash -l {0} - run: | - sherpa-version - - sherpa --help - - - name: Display sherpa version - if: startsWith(matrix.os, 'windows') - shell: cmd - run: | - set path=C:\Miniconda\envs\sherpa\lib\site-packages\sherpa\bin;%path% - set path=C:\Miniconda\envs\sherpa\lib\site-packages\sherpa\lib;%path% - set path=C:\Miniconda\envs\sherpa\lib\site-packages\torch\lib;%path% - set path=C:\Miniconda\envs\sherpa\Library\bin;%path% - - sherpa-version - - sherpa --help - - - name: Download pretrained model and test-data - shell: bash - run: | - git lfs install - git clone https://huggingface.co/csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13 - - - name: Test decoding a single wave - if: startsWith(matrix.os, 'ubuntu') - shell: bash -l {0} - run: | - export PATH=$CONDA_PREFIX/lib/site-packages/sherpa/bin:$PATH - export PATH=$CONDA_PREFIX/lib/site-packages/sherpa/lib:$PATH - export PATH=$CONDA_PREFIX/lib/site-packages/torch/lib:$PATH - - echo "Test greedy search" - - sherpa \ - --decoding-method=greedy_search \ - --nn-model=./icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/exp/cpu_jit.pt \ - --tokens=./icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/data/lang_bpe_500/tokens.txt \ - icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/test_wavs/1089-134686-0001.wav - - echo "Test modified_beam_search" - - sherpa \ - --decoding-method=modified_beam_search \ - --num-active-paths=4 \ - --nn-model=./icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/exp/cpu_jit.pt \ - --tokens=./icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/data/lang_bpe_500/tokens.txt \ - icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/test_wavs/1089-134686-0001.wav - - - name: Test decoding multiple waves - if: startsWith(matrix.os, 'ubuntu') - shell: bash -l {0} - run: | - export PATH=$CONDA_PREFIX/Library/bin:$PATH - - echo "Test greedy search" - - sherpa \ - --decoding-method=greedy_search \ - --nn-model=./icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/exp/cpu_jit.pt \ - --tokens=./icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/data/lang_bpe_500/tokens.txt \ - icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/test_wavs/1089-134686-0001.wav \ - icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/test_wavs/1221-135766-0001.wav \ - icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/test_wavs/1221-135766-0002.wav - - echo "Test modified_beam_search" - - 
sherpa \ - --decoding-method=modified_beam_search \ - --num-active-paths=4 \ - --nn-model=./icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/exp/cpu_jit.pt \ - --tokens=./icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/data/lang_bpe_500/tokens.txt \ - icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/test_wavs/1089-134686-0001.wav \ - icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/test_wavs/1221-135766-0001.wav \ - icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/test_wavs/1221-135766-0002.wav - - - name: Test decoding a single wave - if: startsWith(matrix.os, 'windows') - shell: cmd - run: | - set path=C:\Miniconda\envs\sherpa\lib\site-packages\sherpa\bin;%path% - set path=C:\Miniconda\envs\sherpa\lib\site-packages\sherpa\lib;%path% - set path=C:\Miniconda\envs\sherpa\lib\site-packages\torch\lib;%path% - set path=C:\Miniconda\envs\sherpa\Library\bin;%path% - - echo "Test greedy search" - - sherpa ^ - --decoding-method=greedy_search ^ - --nn-model=./icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/exp/cpu_jit.pt ^ - --tokens=./icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/data/lang_bpe_500/tokens.txt ^ - icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/test_wavs/1089-134686-0001.wav - - echo "Test modified_beam_search" - - sherpa ^ - --decoding-method=modified_beam_search ^ - --num-active-paths=4 ^ - --nn-model=./icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/exp/cpu_jit.pt ^ - --tokens=./icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/data/lang_bpe_500/tokens.txt ^ - icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/test_wavs/1089-134686-0001.wav - - - name: Test decoding multiple waves - if: startsWith(matrix.os, 'windows') - shell: cmd - run: | - set path=C:\Miniconda\envs\sherpa\lib\site-packages\sherpa\bin;%path% - set path=C:\Miniconda\envs\sherpa\lib\site-packages\sherpa\lib;%path% - set path=C:\Miniconda\envs\sherpa\lib\site-packages\torch\lib;%path% - set path=C:\Miniconda\envs\sherpa\Library\bin;%path% - - echo "Test greedy search" - - sherpa ^ - --decoding-method=greedy_search ^ - --nn-model=./icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/exp/cpu_jit.pt ^ - --tokens=./icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/data/lang_bpe_500/tokens.txt ^ - icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/test_wavs/1089-134686-0001.wav ^ - icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/test_wavs/1221-135766-0001.wav ^ - icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/test_wavs/1221-135766-0002.wav - - echo "Test modified_beam_search" - - sherpa ^ - --decoding-method=modified_beam_search ^ - --num-active-paths=4 ^ - --nn-model=./icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/exp/cpu_jit.pt ^ - --tokens=./icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/data/lang_bpe_500/tokens.txt ^ - icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/test_wavs/1089-134686-0001.wav ^ - icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/test_wavs/1221-135766-0001.wav ^ - icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/test_wavs/1221-135766-0002.wav diff --git a/.github/workflows/test-offline-websocket-rtf-wer.yaml b/.github/workflows/test-offline-websocket-rtf-wer.yaml new file mode 100644 index 000000000..24c359b40 --- /dev/null +++ 
b/.github/workflows/test-offline-websocket-rtf-wer.yaml @@ -0,0 +1,121 @@ +name: Test RTF and WER of C++ offline websocket server + +on: + push: + branches: + - master + paths: + - '.github/workflows/test-offline-websocket-rtf-wer.yaml' + - '.github/scripts/test-offline-websocket-rtf-wer.sh' + - 'CMakeLists.txt' + - 'cmake/**' + - 'sherpa/csrc/**' + - 'sherpa/cpp_api/**' + pull_request: + types: [labeled] + paths: + - '.github/workflows/test-offline-websocket-rtf-wer.yaml' + - '.github/scripts/test-offline-websocket-rtf-wer.sh' + - 'CMakeLists.txt' + - 'cmake/**' + - 'sherpa/csrc/**' + - 'sherpa/cpp_api/**' + workflow_dispatch: + +concurrency: + group: test_rtf_and_wer_cpp_offline_websocket_server-${{ github.ref }} + cancel-in-progress: true + +jobs: + test_rtf_and_wer_cpp_offline_websocket_server: + if: github.event.label.name == 'rtf' || github.event_name == 'push' + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + os: [ubuntu-latest] + torch: ["1.13.1"] + torchaudio: ["0.13.1"] + python-version: ["3.10"] + decoding_method: ["greedy_search", "modified_beam_search"] + num_connections: [50, 100, 200] + + steps: + - uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Get number of CPU cores + uses: SimenB/github-actions-cpu-cores@v1 + id: cpu-cores + + - name: Display number of CPU cores + shell: bash + run: | + echo "Number of CPU cores: ${{ steps.cpu-cores.outputs.count }}" + + - name: Setup Python + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + + - name: Display gcc version + if: startsWith(matrix.os, 'ubuntu') + shell: bash + run: | + gcc --version + + - name: Display clang version + if: startsWith(matrix.os, 'macos') + shell: bash + run: | + clang --version + + - name: Install apt dependencies + if: startsWith(matrix.os, 'ubuntu') + run: | + sudo apt update + sudo apt install libsndfile1-dev libsndfile1 + + - name: Install PyTorch ${{ matrix.torch }} + shell: bash + run: | + python3 -m pip install wheel twine typing_extensions + python3 -m pip install torch==${{ matrix.torch }} torchaudio==${{ matrix.torchaudio }} numpy -f https://download.pytorch.org/whl/cpu/torch_stable.html + + python3 -m pip install k2==1.24.4.dev20240223+cpu.torch${{ matrix.torch }} -f https://k2-fsa.github.io/k2/cpu.html + + python3 -m torch.utils.collect_env + + - name: Cache kaldifeat + id: my-cache-2 + uses: actions/cache@v2 + with: + path: | + ~/tmp/kaldifeat + key: cache-tmp-${{ matrix.python-version }}-${{ matrix.os }}-${{ matrix.torch }} + + - name: Install kaldifeat + if: steps.my-cache-2.outputs.cache-hit != 'true' + shell: bash + run: | + .github/scripts/install-kaldifeat.sh + + - name: Run RTF test + shell: bash + env: + DECODING_METHOD: ${{ matrix.decoding_method }} + NUM_CONNECTIONS: ${{ matrix.num_connections }} + run: | + export KALDIFEAT_INSTALL_PREFIX=$HOME/tmp/kaldifeat/build + echo $KALDIFEAT_INSTALL_PREFIX + ls -lh $KALDIFEAT_INSTALL_PREFIX + + .github/scripts/test-offline-websocket-rtf-wer.sh + cat ./log.txt + + - name: Upload decoding results + uses: actions/upload-artifact@v4 + with: + name: os-${{ matrix.os }}-decoding-method-${{ matrix.decoding_method }} + path: ./*.txt diff --git a/.github/workflows/test-online-websocket-rtf-wer.yaml b/.github/workflows/test-online-websocket-rtf-wer.yaml new file mode 100644 index 000000000..bcb7a927d --- /dev/null +++ b/.github/workflows/test-online-websocket-rtf-wer.yaml @@ -0,0 +1,120 @@ +name: Test RTF and WER of C++ online websocket server + +on: + push: + branches: + - master + 
paths: + - '.github/workflows/test-online-websocket-rtf-wer.yaml' + - '.github/scripts/test-online-websocket-rtf-wer.sh' + - 'CMakeLists.txt' + - 'cmake/**' + - 'sherpa/csrc/**' + - 'sherpa/cpp_api/**' + pull_request: + types: [labeled] + paths: + - '.github/workflows/test-online-websocket-rtf-wer.yaml' + - '.github/scripts/test-online-websocket-rtf-wer.sh' + - 'CMakeLists.txt' + - 'cmake/**' + - 'sherpa/csrc/**' + - 'sherpa/cpp_api/**' + workflow_dispatch: + +concurrency: + group: test_rtf_and_wer_cpp_online_websocket_server-${{ github.ref }} + cancel-in-progress: true + +jobs: + test_rtf_and_wer_cpp_online_websocket_server: + if: github.event.label.name == 'rtf' || github.event_name == 'push' + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + os: [ubuntu-latest] + torch: ["1.13.1"] + torchaudio: ["0.13.1"] + python-version: ["3.10"] + decoding_method: ["greedy_search", "modified_beam_search"] + num_connections: [50, 100, 200] + + steps: + - uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Get number of CPU cores + uses: SimenB/github-actions-cpu-cores@v1 + id: cpu-cores + + - name: Display number of CPU cores + shell: bash + run: | + echo "Number of CPU cores: ${{ steps.cpu-cores.outputs.count }}" + + - name: Setup Python + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + + - name: Display gcc version + if: startsWith(matrix.os, 'ubuntu') + shell: bash + run: | + gcc --version + + - name: Display clang version + if: startsWith(matrix.os, 'macos') + shell: bash + run: | + clang --version + + - name: Install apt dependencies + if: startsWith(matrix.os, 'ubuntu') + run: | + sudo apt update + sudo apt install libsndfile1-dev libsndfile1 + + - name: Install PyTorch ${{ matrix.torch }} + shell: bash + run: | + python3 -m pip install wheel twine typing_extensions + python3 -m pip install torch==${{ matrix.torch }} torchaudio==${{ matrix.torchaudio }} numpy -f https://download.pytorch.org/whl/cpu/torch_stable.html + + python3 -m pip install k2==1.24.4.dev20240223+cpu.torch${{ matrix.torch }} -f https://k2-fsa.github.io/k2/cpu.html + python3 -m torch.utils.collect_env + + - name: Cache kaldifeat + id: my-cache-2 + uses: actions/cache@v2 + with: + path: | + ~/tmp/kaldifeat + key: cache-tmp-${{ matrix.python-version }}-${{ matrix.os }}-${{ matrix.torch }} + + - name: Install kaldifeat + if: steps.my-cache-2.outputs.cache-hit != 'true' + shell: bash + run: | + .github/scripts/install-kaldifeat.sh + + - name: Run RTF test + shell: bash + env: + DECODING_METHOD: ${{ matrix.decoding_method }} + NUM_CONNECTIONS: ${{ matrix.num_connections }} + run: | + export KALDIFEAT_INSTALL_PREFIX=$HOME/tmp/kaldifeat/build + echo $KALDIFEAT_INSTALL_PREFIX + ls -lh $KALDIFEAT_INSTALL_PREFIX + + .github/scripts/test-online-websocket-rtf-wer.sh + cat ./log.txt + + - name: Upload decoding results + uses: actions/upload-artifact@v4 + with: + name: os-${{ matrix.os }}-decoding-method-${{ matrix.decoding_method }} + path: ./*.txt diff --git a/.github/workflows/ubuntu-cpu-wheels.yml b/.github/workflows/ubuntu-cpu-wheels.yml new file mode 100644 index 000000000..c1eaccea2 --- /dev/null +++ b/.github/workflows/ubuntu-cpu-wheels.yml @@ -0,0 +1,123 @@ +name: build-wheels-cpu-ubuntu + +on: + push: + tags: + - '*' + workflow_dispatch: + +concurrency: + group: build-wheels-cpu-ubuntu-${{ github.ref }} + cancel-in-progress: true + +jobs: + generate_build_matrix: + # see https://github.com/pytorch/pytorch/pull/50633 + runs-on: ubuntu-latest + outputs: + matrix: 
${{ steps.set-matrix.outputs.matrix }} + steps: + - uses: actions/checkout@v4 + with: + fetch-depth: 0 + - name: Generating build matrix + id: set-matrix + run: | + # outputting for debugging purposes + python ./scripts/github_actions/generate_build_matrix.py + MATRIX=$(python ./scripts/github_actions/generate_build_matrix.py) + echo "::set-output name=matrix::${MATRIX}" + + build-manylinux-wheels: + needs: generate_build_matrix + name: ${{ matrix.torch }} ${{ matrix.python-version }} + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + ${{ fromJson(needs.generate_build_matrix.outputs.matrix) }} + + steps: + - uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Setup Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + + - name: Display Python version + run: python -c "import sys; print(sys.version)" + + - name: Install patchelf + shell: bash + run: | + sudo apt-get update -q + sudo apt-get install -q -y patchelf + patchelf --help + + # see https://github.com/pytorch/test-infra/blob/9e3d392690719fac85bad0c9b67f530e48375ca1/tools/scripts/generate_binary_build_matrix.py + # https://github.com/pytorch/builder/tree/main/manywheel + # https://github.com/pytorch/builder/pull/476 + # https://github.com/k2-fsa/k2/issues/733 + # https://github.com/pytorch/pytorch/pull/50633 (generate build matrix) + - name: Run the build process with Docker + uses: addnab/docker-run-action@v3 + with: + image: ${{ matrix.image }} + options: -v ${{ github.workspace }}:/var/www -e PYTHON_VERSION=${{ matrix.python-version }} -e TORCH_VERSION=${{ matrix.torch }} + run: | + echo "pwd: $PWD" + uname -a + id + cat /etc/*release + gcc --version + python3 --version + which python3 + + /var/www/scripts/github_actions/build-ubuntu-cpu.sh + + - name: Patch wheels + shell: bash + run: | + ls -lh + ls -lh ./wheels/ + sudo ./scripts/github_actions/patch_wheel.py --in-dir ./wheels --out-dir ./wheelhouse + + ls -lh ./wheelhouse/ + + - name: Upload Wheel + uses: actions/upload-artifact@v4 + with: + name: torch-${{ matrix.torch }}-python-${{ matrix.python-version }}-ubuntu-latest-cpu + path: wheelhouse/*.whl + + # https://huggingface.co/docs/hub/spaces-github-actions + - name: Publish to huggingface + if: github.repository_owner == 'csukuangfj' || github.repository_owner == 'k2-fsa' + env: + HF_TOKEN: ${{ secrets.HF_TOKEN }} + uses: nick-fields/retry@v2 + with: + max_attempts: 20 + timeout_seconds: 200 + shell: bash + command: | + git config --global user.email "csukuangfj@gmail.com" + git config --global user.name "Fangjun Kuang" + + rm -rf huggingface + export GIT_LFS_SKIP_SMUDGE=1 + + git clone https://huggingface.co/csukuangfj/sherpa huggingface + cd huggingface + git pull + + mkdir -p ubuntu-cpu + cp -v ../wheelhouse/*.whl ./ubuntu-cpu + git status + git lfs track "*.whl" + git add . 
+ git commit -m "upload ubuntu-cpu wheel for torch ${{ matrix.torch }} python ${{ matrix.python-version }}" + git push https://csukuangfj:$HF_TOKEN@huggingface.co/csukuangfj/sherpa main diff --git a/.github/workflows/ubuntu-cuda-wheels.yml b/.github/workflows/ubuntu-cuda-wheels.yml new file mode 100644 index 000000000..0e8469eca --- /dev/null +++ b/.github/workflows/ubuntu-cuda-wheels.yml @@ -0,0 +1,157 @@ +name: build-wheels-cuda-ubuntu + +on: + push: + tags: + - '*' + workflow_dispatch: + +concurrency: + group: build-wheels-cuda-ubuntu-${{ github.ref }} + cancel-in-progress: true + +jobs: + generate_build_matrix: + # see https://github.com/pytorch/pytorch/pull/50633 + runs-on: ubuntu-latest + outputs: + matrix: ${{ steps.set-matrix.outputs.matrix }} + steps: + - uses: actions/checkout@v4 + with: + fetch-depth: 0 + - name: Generating build matrix + id: set-matrix + run: | + # outputting for debugging purposes + python ./scripts/github_actions/generate_build_matrix.py --enable-cuda + MATRIX=$(python ./scripts/github_actions/generate_build_matrix.py --enable-cuda) + echo "::set-output name=matrix::${MATRIX}" + + build-manylinux-wheels: + needs: generate_build_matrix + name: ${{ matrix.torch }} ${{ matrix.python-version }} cuda${{ matrix.cuda }} + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + ${{ fromJson(needs.generate_build_matrix.outputs.matrix) }} + + steps: + - uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Free space + shell: bash + run: | + df -h + rm -rf /opt/hostedtoolcache + df -h + echo "pwd: $PWD" + echo "github.workspace ${{ github.workspace }}" + + - name: Setup Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + + - name: Display Python version + run: python -c "import sys; print(sys.version)" + + - name: Install patchelf + shell: bash + run: | + id + ls -lh + sudo apt-get update -q + sudo apt-get install -q -y patchelf + patchelf --help + + # see https://github.com/pytorch/test-infra/blob/9e3d392690719fac85bad0c9b67f530e48375ca1/tools/scripts/generate_binary_build_matrix.py + # https://github.com/pytorch/builder/tree/main/manywheel + # https://github.com/pytorch/builder/pull/476 + # https://github.com/k2-fsa/k2/issues/733 + # https://github.com/pytorch/pytorch/pull/50633 (generate build matrix) + - name: Run the build process with Docker + uses: addnab/docker-run-action@v3 + with: + image: ${{ matrix.image }} + options: -v ${{ github.workspace }}:/var/www -e PYTHON_VERSION=${{ matrix.python-version }} -e TORCH_VERSION=${{ matrix.torch }} -e CUDA_VERSION=${{ matrix.cuda }} + run: | + echo "pwd: $PWD" + uname -a + id + cat /etc/*release + gcc --version + python3 --version + which python3 + + pushd /usr/local + rm cuda + ln -s cuda-$CUDA_VERSION cuda + popd + which nvcc + nvcc --version + + find /usr/local -name libcuda.so* + echo "libnvrtc.so*" + + find /usr/local -name libnvrtc.so* + + pushd /usr/local/cuda-$CUDA_VERSION/targets/x86_64-linux/lib/stubs + ln -s libcuda.so libcuda.so.1 + popd + export LD_LIBRARY_PATH=/usr/local/cuda-$CUDA_VERSION/targets/x86_64-linux/lib/stubs:$LD_LIBRARY_PATH + + export LD_LIBRARY_PATH=/usr/local/cuda-$CUDA_VERSION/targets/x86_64-linux/lib:$LD_LIBRARY_PATH + + cp /var/www/scripts/github_actions/install_torch.sh . 
+ chmod +x install_torch.sh + + /var/www/scripts/github_actions/build-ubuntu-cuda.sh + + - name: Patch wheels + shell: bash + run: | + id + ls -lh + ls -lh ./wheels/ + sudo ./scripts/github_actions/patch_wheel.py --in-dir ./wheels --out-dir ./wheelhouse + + ls -lh ./wheelhouse/ + + - name: Upload Wheel + uses: actions/upload-artifact@v4 + with: + name: torch-${{ matrix.torch }}-python-${{ matrix.python-version }}-ubuntu-latest-cuda + path: wheelhouse/*.whl + + # https://huggingface.co/docs/hub/spaces-github-actions + - name: Publish to huggingface + if: github.repository_owner == 'csukuangfj' || github.repository_owner == 'k2-fsa' + env: + HF_TOKEN: ${{ secrets.HF_TOKEN }} + uses: nick-fields/retry@v2 + with: + max_attempts: 20 + timeout_seconds: 200 + shell: bash + command: | + git config --global user.email "csukuangfj@gmail.com" + git config --global user.name "Fangjun Kuang" + + rm -rf huggingface + export GIT_LFS_SKIP_SMUDGE=1 + + git clone https://huggingface.co/csukuangfj/sherpa huggingface + cd huggingface + git pull + + mkdir -p ubuntu-cuda + cp -v ../wheelhouse/*.whl ./ubuntu-cuda + git status + git lfs track "*.whl" + git add . + git commit -m "upload ubuntu-cuda wheel for torch ${{ matrix.torch }} python ${{ matrix.python-version }}" + git push https://csukuangfj:$HF_TOKEN@huggingface.co/csukuangfj/sherpa main diff --git a/.github/workflows/windows-x64-cpu-wheels.yml b/.github/workflows/windows-x64-cpu-wheels.yml new file mode 100644 index 000000000..e647f1187 --- /dev/null +++ b/.github/workflows/windows-x64-cpu-wheels.yml @@ -0,0 +1,95 @@ +name: build-wheels-cpu-win64 + +on: + push: + branches: + - release + tags: + - '*' + workflow_dispatch: + +concurrency: + group: build-wheels-cpu-win64-${{ github.ref }} + cancel-in-progress: true + +jobs: + generate_build_matrix: + # see https://github.com/pytorch/pytorch/pull/50633 + runs-on: ubuntu-latest + outputs: + matrix: ${{ steps.set-matrix.outputs.matrix }} + steps: + - uses: actions/checkout@v4 + with: + fetch-depth: 0 + - name: Generating build matrix + id: set-matrix + run: | + # outputting for debugging purposes + python ./scripts/github_actions/generate_build_matrix.py --for-windows + MATRIX=$(python ./scripts/github_actions/generate_build_matrix.py --for-windows) + echo "::set-output name=matrix::${MATRIX}" + + build_wheels_win64_cpu: + needs: generate_build_matrix + name: ${{ matrix.torch }} ${{ matrix.python-version }} + runs-on: windows-latest + strategy: + fail-fast: false + matrix: + ${{ fromJson(needs.generate_build_matrix.outputs.matrix) }} + + steps: + - uses: actions/checkout@v4 + with: + fetch-depth: 0 + + # see https://cibuildwheel.readthedocs.io/en/stable/changelog/ + # for a list of versions + - name: Build wheels + uses: pypa/cibuildwheel@v2.11.4 + env: + CIBW_BEFORE_BUILD: pip install torch==${{ matrix.torch}}+cpu cmake numpy k2==1.24.3.dev20230726+cpu.torch${{ matrix.torch }} kaldifeat==1.25.0.dev20230726+cpu.torch${{ matrix.torch }} -f https://csukuangfj.github.io/kaldifeat/cpu.html -f https://k2-fsa.github.io/k2/cpu.html -f https://download.pytorch.org/whl/torch_stable.html + CIBW_BUILD: ${{ matrix.python-version }}-win_amd64 + CIBW_REPAIR_WHEEL_COMMAND_WINDOWS: "" + CIBW_BUILD_VERBOSITY: 3 + + - name: Display wheels + shell: bash + run: | + ls -lh ./wheelhouse/ + + - name: Upload Wheel + uses: actions/upload-artifact@v4 + with: + name: torch-${{ matrix.torch }}-python-${{ matrix.python-version }}-windows-latest-cpu + path: wheelhouse/*.whl + + # https://huggingface.co/docs/hub/spaces-github-actions + - 
name: Publish to huggingface + if: github.repository_owner == 'csukuangfj' + env: + HF_TOKEN: ${{ secrets.HF_TOKEN }} + uses: nick-fields/retry@v2 + with: + max_attempts: 20 + timeout_seconds: 200 + shell: bash + command: | + git config --global user.email "csukuangfj@gmail.com" + git config --global user.name "Fangjun Kuang" + + rm -rf huggingface + export GIT_LFS_SKIP_SMUDGE=1 + + git clone https://huggingface.co/csukuangfj/sherpa huggingface + cd huggingface + git pull + + mkdir -p windows-cpu + cp -v ../wheelhouse/*.whl ./windows-cpu + git status + git lfs track "*.whl" + git add . + git commit -m "upload windows-cpu wheel for torch ${{ matrix.torch }} python ${{ matrix.python-version }}" + git push https://csukuangfj:$HF_TOKEN@huggingface.co/csukuangfj/sherpa main diff --git a/.gitignore b/.gitignore index 209d75236..57687d8bf 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ +.DS_Store path.sh build dist @@ -6,3 +7,23 @@ __pycache__ torch_version.py *.ark *.scp +*.onnx +log/ +icefall-* +icefall_ +recogs-* +errs-* +rtf-* +*.wav +Testing +run-offline-ctc*.sh +run-offline-asr*.sh +sherpa-nemo-ctc* +/.cache +/.idea +/.vscode +*.pt +tokens.txt +*.bin +sherpa-sense-voice-zh-en-ja-ko-yue-2025-01-06 +sherpa-whisper-tiny.en diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index b1700c7ed..ec5a24adc 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -14,7 +14,7 @@ repos: args: [--max-line-length=80] - repo: https://github.com/pycqa/isort - rev: 5.9.2 + rev: 5.13.2 hooks: - id: isort args: [--profile=black, --line-length=80] diff --git a/CMakeLists.txt b/CMakeLists.txt index 6756b741c..4f3fc5f4e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,9 +1,9 @@ -cmake_minimum_required(VERSION 3.8 FATAL_ERROR) +cmake_minimum_required(VERSION 3.13 FATAL_ERROR) project(sherpa) # Remember to change # scripts/conda-cpu/sherpa/meta.yaml -set(SHERPA_VERSION "0.8") +set(SHERPA_VERSION "1.3") set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib") set(CMAKE_LIBRARY_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib") @@ -16,28 +16,39 @@ set(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE) if(NOT APPLE) set(SHERPA_RPATH_ORIGIN "$ORIGIN") else() + set(CMAKE_MACOSX_RPATH ON) set(SHERPA_RPATH_ORIGIN "@loader_path") endif() set(CMAKE_INSTALL_RPATH ${SHERPA_RPATH_ORIGIN}) set(CMAKE_BUILD_RPATH ${SHERPA_RPATH_ORIGIN}) -set(BUILD_SHARED_LIBS ON) -if(WIN32) - message(STATUS "Set BUILD_SHARED_LIBS to OFF for Windows") - set(BUILD_SHARED_LIBS OFF CACHE BOOL "" FORCE) -endif() - if(NOT CMAKE_BUILD_TYPE) message(STATUS "No CMAKE_BUILD_TYPE given, default to Release") set(CMAKE_BUILD_TYPE Release) endif() message(STATUS "CMAKE_BUILD_TYPE: ${CMAKE_BUILD_TYPE}") -set(CMAKE_CXX_STANDARD 14 CACHE STRING "The C++ version to be used.") +if(NOT CMAKE_CXX_STANDARD) + set(CMAKE_CXX_STANDARD 17 CACHE STRING "The C++ version to be used.") +endif() + set(CMAKE_CXX_EXTENSIONS OFF) -option(SHERPA_ENABLE_TESTS "Whether to build tests" ON) +option(SHERPA_ENABLE_TESTS "Whether to build tests" OFF) +option(SHERPA_ENABLE_PORTAUDIO "Whether to build with portaudio" ON) +option(SHERPA_ENABLE_WEBSOCKET "Whether to build with websocket" ON) +option(SHERPA_ENABLE_GRPC "Whether to build with grpc" OFF) +option(BUILD_SHARED_LIBS "Whether to build shared libraries" ON) + +message(STATUS "SHERPA_ENABLE_TESTS: ${SHERPA_ENABLE_TESTS}") +message(STATUS "SHERPA_ENABLE_PORTAUDIO: ${SHERPA_ENABLE_PORTAUDIO}") +message(STATUS "SHERPA_ENABLE_WEBSOCKET: ${SHERPA_ENABLE_WEBSOCKET}") +message(STATUS "SHERPA_ENABLE_GRPC: 
${SHERPA_ENABLE_GRPC}") + +if(BUILD_SHARED_LIBS AND MSVC) + set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON) +endif() include(CheckIncludeFileCXX) check_include_file_cxx(cxxabi.h SHERPA_HAVE_CXXABI_H) @@ -87,6 +98,10 @@ string(REGEX REPLACE "^\"+|\"+$" "" SHERPA_OS "${SHERPA_OS}") message(STATUS "SHERPA_OS: ${SHERPA_OS}") message(STATUS "CMake version: ${CMAKE_VERSION}") +if(WIN32) + add_definitions(-DNOMINMAX) # Otherwise, std::max() and std::min() won't work +endif() + find_package(Git REQUIRED) execute_process(COMMAND "${GIT_EXECUTABLE}" describe --always --abbrev=40 @@ -116,11 +131,36 @@ message(STATUS "SHERPA_GIT_BRANCH: ${SHERPA_GIT_BRANCH}") list(APPEND CMAKE_MODULE_PATH ${CMAKE_SOURCE_DIR}/cmake/Modules) list(APPEND CMAKE_MODULE_PATH ${CMAKE_SOURCE_DIR}/cmake) +set(CMAKE_CXX_FLAGS " ${CMAKE_CXX_FLAGS} -Wall ") +if(NOT WIN32) + set(CMAKE_CXX_FLAGS " ${CMAKE_CXX_FLAGS} -g ") +endif() + include(pybind11) +include(json) +# to prevent cmake from trying to link with system installed mkl since we not directly use it +# mkl libraries should be linked with pytorch already +# ref: https://github.com/pytorch/pytorch/blob/master/cmake/public/mkl.cmake +set(CMAKE_DISABLE_FIND_PACKAGE_MKL TRUE) include(torch) include(k2) include(kaldifeat) include(kaldi_native_io) +if(SHERPA_ENABLE_PORTAUDIO) + include(portaudio) +endif() + +if(SHERPA_ENABLE_WEBSOCKET OR SHERPA_ENABLE_GRPC) + include(asio) +endif() + +if(SHERPA_ENABLE_WEBSOCKET) + include(websocketpp) +endif() + +if(SHERPA_ENABLE_GRPC) + include(grpc) +endif() if(SHERPA_ENABLE_TESTS) enable_testing() diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 000000000..ef49d016c --- /dev/null +++ b/Dockerfile @@ -0,0 +1,28 @@ +from nvcr.io/nvidia/pytorch:22.12-py3 +# https://docs.nvidia.com/deeplearning/frameworks/support-matrix/index.html +# Please choose previous pytorch:xx.xx if you encounter cuda driver mismatch issue +RUN pip3 install torchaudio +RUN pip3 install \ + k2==1.24.4.dev20240223+cuda11.7.torch1.13.1 -f https://k2-fsa.github.io/k2/cuda.html +# #install k2 from source +# #"sed -i ..." line tries to turn off the cuda check +# RUN git clone https://github.com/k2-fsa/k2.git && \ +# cd k2 && \ +# sed -i 's/FATAL_ERROR/STATUS/g' cmake/torch.cmake && \ +# sed -i 's/in running_cuda_version//g' get_version.py && \ +# python3 setup.py install && \ +# cd - +WORKDIR /workspace + +RUN git clone https://github.com/k2-fsa/icefall.git +ENV PYTHONPATH "${PYTHONPATH}:/workspace/icefall" +# https://github.com/k2-fsa/icefall/issues/674 +ENV PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION "python" + +RUN git clone https://github.com/k2-fsa/sherpa.git && \ + cd sherpa && \ + pip3 install -r ./requirements.txt && \ + python3 setup.py bdist_wheel && \ + pip3 install ./dist/k2_sherpa-*.whl --force + + diff --git a/LICENSE b/LICENSE index ee06cfc77..d64569567 100644 --- a/LICENSE +++ b/LICENSE @@ -1,13 +1,4 @@ - Legal Notices - - NOTE (this is not from the Apache License): The copyright model is that - authors (or their employers, if noted in individual files) own their - individual contributions. The authors' contributions can be discerned - from the git history. - - ------------------------------------------------------------------------- - Apache License Version 2.0, January 2004 http://www.apache.org/licenses/ diff --git a/README.md b/README.md index 910fabfc9..0d57b0dfa 100644 --- a/README.md +++ b/README.md @@ -1,359 +1,30 @@ -
+# sherpa -[![Documentation Status](https://github.com/k2-fsa/sherpa/actions/workflows/build-doc.yml/badge.svg)](https://k2-fsa.github.io/sherpa/) -**Documentation**: +`sherpa` is an open-source speech-text-text inference framework using +PyTorch, focusing **exclusively** on end-to-end (E2E) models, +namely transducer- and CTC-based models. It provides both C++ and Python APIs. -Try `sherpa` from within your browser without installing anything: - - -See for more details. - -## Introduction - -An ASR server framework in **Python**, supporting both streaming -and non-streaming recognition. - -CPU-bound tasks, such as neural network computation, are implemented in -C++; while IO-bound tasks, such as socket communication, are implemented -in Python. - -**Caution**: For offline ASR, we assume the model is trained using pruned -stateless RNN-T from [icefall][icefall] and it is from a directory like -`pruned_transducer_statelessX` where `X` >=2. For streaming ASR, we -assume the model is using `pruned_stateless_emformer_rnnt2`. - -For the offline ASR, we provide a Colab notebook, containing how to start the -server, how to start the client, and how to decode `test-clean` of LibriSpeech. - -[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1JX5Ph2onYm1ZjNP_94eGqZ-DIRMLlIca?usp=sharing) - -For the streaming ASR, we provide a YouTube demo, showing you how to use it. -See - -## Installation - -Please refer to - -for installation. - -## Usage - -First, check that `sherpa` has been installed successfully: - -```bash -python3 -c "import sherpa; print(sherpa.__version__)" -``` - -It should print the version of `sherpa`. - -Visit - -to see more tutorials of `sherpa`. - -#### Streaming ASR with pruned stateless Emformer RNN-T - -#### Start the server - -To start the server, you need to first generate two files: - -- (1) The torch script model file. You can use `export.py --jit=1` in -`pruned_stateless_emformer_rnnt2` from [icefall][icefall]. - -- (2) The BPE model file. You can find it in `data/lang_bpe_XXX/bpe.model` -in [icefall][icefall], where `XXX` is the number of BPE tokens used in -the training. - -With the above two files ready, you can start the server with the -following command: - -```bash -./sherpa/bin/pruned_stateless_emformer_rnnt2/streaming_server.py \ - --port 6006 \ - --max-batch-size 50 \ - --max-wait-ms 5 \ - --max-active-connections 500 \ - --nn-pool-size 1 \ - --nn-model-filename ./path/to/exp/cpu_jit.pt \ - --bpe-model-filename ./path/to/data/lang_bpe_500/bpe.model -``` - -You can use `./sherpa/bin/pruned_stateless_emformer_rnnt2/streaming_server.py --help` -to view the help message. - -**Hint**: You can use the environment variable `CUDA_VISIBLE_DEVICES` to control -which GPU is used. For instance, to use GPU 3 in the server, just set -`export CUDA_VISIBLE_DEVICES="3"` before starting the server. - -**Note**: To keep the server from OOM error, please tune `--max-batch-size` -and `--max-active-connections`. - -We provide a pretrained model using the LibriSpeech dataset at - - -The following shows how to use the above pretrained model to start the server. 
- -```bash -git lfs install -git clone https://huggingface.co/csukuangfj/icefall-asr-librispeech-pruned-stateless-emformer-rnnt2-2022-06-01 - -./sherpa/bin/pruned_stateless_emformer_rnnt2/streaming_server.py \ - --port 6006 \ - --max-batch-size 50 \ - --max-wait-ms 5 \ - --nn-pool-size 1 \ - --nn-model-filename ./icefall-asr-librispeech-pruned-stateless-emformer-rnnt2-2022-06-01/exp/cpu_jit-epoch-39-avg-6-use-averaged-model-1.pt \ - --bpe-model-filename ./icefall-asr-librispeech-pruned-stateless-emformer-rnnt2-2022-06-01/data/lang_bpe_500/bpe.model -``` -Here, before running the web client, you need to map your server ports to your local ports in the server terminal firstly with the following command: -``` -ssh -R 6006:localhost:6006 -R 6008:localhost:6008 your_local_username@your_local_ip -``` -**Note**: -(1) You only need to do this if the asr server is running on a machine different from the client. -(2) The command is run in the terminal on the server machine. -#### Start the client - -We provide two clients at present: - - - (1) [./sherpa/bin/pruned_stateless_emformer_rnnt2/streaming_client.py](./sherpa/bin/pruned_stateless_emformer_rnnt2/streaming_client.py) - It shows how to decode a single sound file. - - - (2) [./sherpa/bin/pruned_stateless_emformer_rnnt2/web](./sherpa/bin/pruned_stateless_emformer_rnnt2/web) - You can record your speech in real-time within a browser and send it to the server for recognition. - -##### streaming_client.py - -```bash -./sherpa/bin/pruned_stateless_emformer_rnnt2/streaming_client.py --help - -./sherpa/bin/pruned_stateless_emformer_rnnt2/streaming_client.py \ - --server-addr localhost \ - --server-port 6006 \ - ./icefall-asr-librispeech-pruned-stateless-emformer-rnnt2-2022-06-01/test_wavs/1221-135766-0001.wav -``` - -##### Web client - -```bash -cd ./sherpa/bin/web -python3 -m http.server 6008 -``` - -Then open your browser and go to `http://localhost:6008/record.html`. You will -see a UI like the following screenshot. - -![web client screenshot](./docs/source/_static/emformer-streaming-asr-web-client.png) - -Click the button `Record`. +This project focuses on deployment, i.e., using pre-trained models to +transcribe speech. If you are interested in how to train or fine-tune your own +models, please refer to [icefall][icefall]. -Now you can `speak` and you will get recognition results from the -server in real-time. +We also have other **similar** projects that don't depend on PyTorch: -**Caution**: For the web client, we hard-code the server port to `6006`. -You can change the file [./sherpa/bin/web/record.js](./sherpa/bin/web/record.js) -to replace `6006` in it to whatever port the server is using. + - [sherpa-onnx][sherpa-onnx] + - [sherpa-ncnn][sherpa-ncnn] -**Caution**: `http://0.0.0.0:6008/record.html` or `http://127.0.0.1:6008/record.html` -won't work. You have to use `localhost`. Otherwise, you won't be able to use -your microphone in your browser since we are not using `https` which requires -a certificate. +> `sherpa-onnx` and `sherpa-ncnn` also support iOS, Android and embedded systems. -### Offline ASR +## Installation and Usage -#### Start the server +Please refer to the **documentation** at -To start the server, you need to first generate two files: +## Try it in your browser -- (1) The torch script model file. You can use `export.py --jit=1` in -`pruned_transducer_statelessX` from [icefall][icefall]. - -- (2) The BPE model file. 
You can find it in `data/lang_bpe_XXX/bpe.model` -in [icefall][icefall], where `XXX` is the number of BPE tokens used in -the training. If you use a dataset like aishell to train your model where -the modeling unit is Chinese characters, you need to provide a `tokens.txt` -file which can be found in `data/lang_char/tokens.txt` in [icefall][icefall]. - -With the above two files ready, you can start the server with the -following command: - -```bash -# If you provide a bpe.model, e.g., for LibriSpeech, -# you can use the following command: -# -sherpa/bin/pruned_transducer_statelessX/offline_server.py \ - --port 6006 \ - --num-device 1 \ - --max-batch-size 10 \ - --max-wait-ms 5 \ - --max-active-connections 500 \ - --feature-extractor-pool-size 5 \ - --nn-pool-size 1 \ - --nn-model-filename ./path/to/exp/cpu_jit.pt \ - --bpe-model-filename ./path/to/data/lang_bpe_500/bpe.model -``` - -```bash -# If you provide a tokens.txt, e.g., for aishell, -# you can use the following command: -# -sherpa/bin/pruned_transducer_statelessX/offline_server.py \ - --port 6006 \ - --num-device 1 \ - --max-batch-size 10 \ - --max-wait-ms 5 \ - --max-active-connections 500 \ - --feature-extractor-pool-size 5 \ - --nn-pool-size 1 \ - --nn-model-filename ./path/to/exp/cpu_jit.pt \ - --token-filename ./path/to/data/lang_char/tokens.txt -``` - -You can use `./sherpa/bin/pruned_transducer_statelessX/offline_server.py --help` to view the help message. - -**HINT**: If you don't have GPU, please set `--num-device` to `0`. - -**Caution**: To keep the server from out-of-memory error, you can tune -`--max-batch-size` and `--max-active-connections`. - -We provide pretrained models for the following two datasets: - -- (1) LibriSpeech: - It uses a BPE model with vocabulary size 500. - -- (2) aishell: - It uses Chinese characters as models units. The vocabulary size is 4336. - -The following shows how to use the above pretrained models to start the server. 
- -- **Use the pretrained model trained with the Librispeech dataset** - -```bash -git lfs install -git clone https://huggingface.co/csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13 - -sherpa/bin/pruned_transducer_statelessX/offline_server.py \ - --port 6006 \ - --num-device 1 \ - --max-batch-size 10 \ - --max-wait-ms 5 \ - --max-active-connections 500 \ - --feature-extractor-pool-size 5 \ - --nn-pool-size 1 \ - --nn-model-filename ./icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/exp/cpu_jit.pt \ - --bpe-model-filename ./icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/data/lang_bpe_500/bpe.model -``` - -- **For the pretrained model trained with the aishell dataset** - -```bash -git lfs install -git clone https://huggingface.co/csukuangfj/icefall-aishell-pruned-transducer-stateless3-2022-06-20 - -sherpa/bin/pruned_transducer_statelessX/offline_server.py \ - --port 6006 \ - --num-device 1 \ - --max-batch-size 10 \ - --max-wait-ms 5 \ - --max-active-connections 500 \ - --feature-extractor-pool-size 5 \ - --nn-pool-size 1 \ - --nn-model-filename ./icefall-aishell-pruned-transducer-stateless3-2022-06-20/exp/cpu_jit-epoch-29-avg-5-torch-1.6.0.pt \ - --token-filename ./icefall-aishell-pruned-transducer-stateless3-2022-06-20/data/lang_char/tokens.txt -``` - -#### Start the client -After starting the server, you can use the following command to start the client: - -```bash -./sherpa/bin/pruned_transducer_statelessX/offline_client.py \ - --server-addr localhost \ - --server-port 6006 \ - /path/to/foo.wav \ - /path/to/bar.wav -``` - -You can use `./sherpa/bin/pruned_transducer_statelessX/offline_client.py --help` to view the usage message. - -The following shows how to use the client to send some test waves to the server -for recognition. - -```bash -# If you use the pretrained model from the LibriSpeech dataset -sherpa/bin/pruned_transducer_statelessX/offline_client.py \ - --server-addr localhost \ - --server-port 6006 \ - icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13//test_wavs/1089-134686-0001.wav \ - icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13//test_wavs/1221-135766-0001.wav \ - icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13//test_wavs/1221-135766-0002.wav -``` - -```bash -# If you use the pretrained model from the aishell dataset -sherpa/bin/pruned_transducer_statelessX/offline_client.py \ - --server-addr localhost \ - --server-port 6006 \ - ./icefall-aishell-pruned-transducer-stateless3-2022-06-20/test_wavs/BAC009S0764W0121.wav \ - ./icefall-aishell-pruned-transducer-stateless3-2022-06-20/test_wavs/BAC009S0764W0122.wav \ - ./icefall-aishell-pruned-transducer-stateless3-2022-06-20/test_wavs/BAC009S0764W0123.wav -``` - -#### RTF test - -We provide a demo [./sherpa/bin/pruned_transducer_statelessX/decode_manifest.py](./sherpa/bin/pruned_transducer_statelessX/decode_manifest.py) -to decode the `test-clean` dataset from the LibriSpeech corpus. - -It creates 50 connections to the server using websockets and sends audio files -to the server for recognition. - -At the end, it will display the RTF and the WER. 
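For readers unfamiliar with the metric, the real-time factor (RTF) reported by this demo is simply the processing time divided by the total duration of the decoded audio; values below 1 mean the server decodes faster than real time. Below is a minimal sketch of that calculation (the helper function is illustrative only and is not part of `sherpa` or `decode_manifest.py`; the numbers are taken from the Colab results quoted next):

```python
# Real-time factor (RTF) = processing time / total audio duration.
# RTF < 1 means the system transcribes audio faster than real time.
def real_time_factor(processing_seconds: float, audio_seconds: float) -> float:
    return processing_seconds / audio_seconds


# Figures from the Colab notebook results shown below:
# 183.305 s of processing for 19452.481 s (~5.4 h) of audio.
print(f"RTF: {real_time_factor(183.305, 19452.481):.4f}")  # -> RTF: 0.0094
```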
- -To give you an idea of the performance of the pretrained model, -the Colab notebook -[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1JX5Ph2onYm1ZjNP_94eGqZ-DIRMLlIca?usp=sharing) -shows the following results: - -``` -RTF: 0.0094 -total_duration: 19452.481 seconds (5.40 hours) -processing time: 183.305 seconds (0.05 hours) -%WER = 2.06 - -Errors: 112 insertions, 93 deletions, 876 substitutions, over 52576 reference words (51607 correct) -``` - -If you have a GPU with a larger RAM (e.g., 32 GB), you can get an even **lower** RTF. - -[icefall]: https://github.com/k2-fsa/icefall/ - - -### Contributing - -Contributions to `sherpa` are very welcomed. There are many possible ways to make contributions -and two of them are: -- To write documentation -- To write code: - - To follow the code style in the repository - - To write a new features (support new architectures, new beam search, etc) - -### Follow the code style - -We use the following tools to make the code style to be as consistent as possible: - - - [black](https://github.com/psf/black), to format the code - - [flake8](https://github.com/PyCQA/flake8), to check the style and quality of the code - - [isort](https://github.com/PyCQA/isort), to sort ``imports`` - -After running the following commands: - - $ git clone https://github.com/k2-fsa/sherpa - $ cd sherpa - $ pip install pre-commit - $ pre-commit install +Try `sherpa` from within your browser without installing anything: + -it will run the checks whenever you run ``git commit`` **automatically** +[icefall]: https://github.com/k2-fsa/icefall +[sherpa-onnx]: https://github.com/k2-fsa/sherpa-onnx +[sherpa-ncnn]: https://github.com/k2-fsa/sherpa-ncnn diff --git a/sherpa/bin/pruned_transducer_statelessX/__init__.py b/__init__.py similarity index 100% rename from sherpa/bin/pruned_transducer_statelessX/__init__.py rename to __init__.py diff --git a/cmake/Modules/FetchContent.cmake b/cmake/Modules/FetchContent.cmake deleted file mode 100644 index 98cdf6cb9..000000000 --- a/cmake/Modules/FetchContent.cmake +++ /dev/null @@ -1,916 +0,0 @@ -# Distributed under the OSI-approved BSD 3-Clause License. See accompanying -# file Copyright.txt or https://cmake.org/licensing for details. - -#[=======================================================================[.rst: -FetchContent ------------------- - -.. only:: html - - .. contents:: - -Overview -^^^^^^^^ - -This module enables populating content at configure time via any method -supported by the :module:`ExternalProject` module. Whereas -:command:`ExternalProject_Add` downloads at build time, the -``FetchContent`` module makes content available immediately, allowing the -configure step to use the content in commands like :command:`add_subdirectory`, -:command:`include` or :command:`file` operations. - -Content population details would normally be defined separately from the -command that performs the actual population. Projects should also -check whether the content has already been populated somewhere else in the -project hierarchy. Typical usage would look something like this: - -.. 
code-block:: cmake - - FetchContent_Declare( - googletest - GIT_REPOSITORY https://github.com/google/googletest.git - GIT_TAG release-1.8.0 - ) - - FetchContent_GetProperties(googletest) - if(NOT googletest_POPULATED) - FetchContent_Populate(googletest) - add_subdirectory(${googletest_SOURCE_DIR} ${googletest_BINARY_DIR}) - endif() - -When using the above pattern with a hierarchical project arrangement, -projects at higher levels in the hierarchy are able to define or override -the population details of content specified anywhere lower in the project -hierarchy. The ability to detect whether content has already been -populated ensures that even if multiple child projects want certain content -to be available, the first one to populate it wins. The other child project -can simply make use of the already available content instead of repeating -the population for itself. See the -:ref:`Examples ` section which demonstrates -this scenario. - -The ``FetchContent`` module also supports defining and populating -content in a single call, with no check for whether the content has been -populated elsewhere in the project already. This is a more low level -operation and would not normally be the way the module is used, but it is -sometimes useful as part of implementing some higher level feature or to -populate some content in CMake's script mode. - - -Declaring Content Details -^^^^^^^^^^^^^^^^^^^^^^^^^ - -.. command:: FetchContent_Declare - - .. code-block:: cmake - - FetchContent_Declare( ...) - - The ``FetchContent_Declare()`` function records the options that describe - how to populate the specified content, but if such details have already - been recorded earlier in this project (regardless of where in the project - hierarchy), this and all later calls for the same content ```` are - ignored. This "first to record, wins" approach is what allows hierarchical - projects to have parent projects override content details of child projects. - - The content ```` can be any string without spaces, but good practice - would be to use only letters, numbers and underscores. The name will be - treated case-insensitively and it should be obvious for the content it - represents, often being the name of the child project or the value given - to its top level :command:`project` command (if it is a CMake project). - For well-known public projects, the name should generally be the official - name of the project. Choosing an unusual name makes it unlikely that other - projects needing that same content will use the same name, leading to - the content being populated multiple times. - - The ```` can be any of the download or update/patch options - that the :command:`ExternalProject_Add` command understands. The configure, - build, install and test steps are explicitly disabled and therefore options - related to them will be ignored. In most cases, ```` will - just be a couple of options defining the download method and method-specific - details like a commit tag or archive hash. For example: - - .. code-block:: cmake - - FetchContent_Declare( - googletest - GIT_REPOSITORY https://github.com/google/googletest.git - GIT_TAG release-1.8.0 - ) - - FetchContent_Declare( - myCompanyIcons - URL https://intranet.mycompany.com/assets/iconset_1.12.tar.gz - URL_HASH 5588a7b18261c20068beabfb4f530b87 - ) - - FetchContent_Declare( - myCompanyCertificates - SVN_REPOSITORY svn+ssh://svn.mycompany.com/srv/svn/trunk/certs - SVN_REVISION -r12345 - ) - -Populating The Content -^^^^^^^^^^^^^^^^^^^^^^ - -.. 
command:: FetchContent_Populate - - .. code-block:: cmake - - FetchContent_Populate( ) - - In most cases, the only argument given to ``FetchContent_Populate()`` is the - ````. When used this way, the command assumes the content details have - been recorded by an earlier call to :command:`FetchContent_Declare`. The - details are stored in a global property, so they are unaffected by things - like variable or directory scope. Therefore, it doesn't matter where in the - project the details were previously declared, as long as they have been - declared before the call to ``FetchContent_Populate()``. Those saved details - are then used to construct a call to :command:`ExternalProject_Add` in a - private sub-build to perform the content population immediately. The - implementation of ``ExternalProject_Add()`` ensures that if the content has - already been populated in a previous CMake run, that content will be reused - rather than repopulating them again. For the common case where population - involves downloading content, the cost of the download is only paid once. - - An internal global property records when a particular content population - request has been processed. If ``FetchContent_Populate()`` is called more - than once for the same content name within a configure run, the second call - will halt with an error. Projects can and should check whether content - population has already been processed with the - :command:`FetchContent_GetProperties` command before calling - ``FetchContent_Populate()``. - - ``FetchContent_Populate()`` will set three variables in the scope of the - caller; ``_POPULATED``, ``_SOURCE_DIR`` and - ``_BINARY_DIR``, where ```` is the lowercased ````. - ``_POPULATED`` will always be set to ``True`` by the call. - ``_SOURCE_DIR`` is the location where the - content can be found upon return (it will have already been populated), while - ``_BINARY_DIR`` is a directory intended for use as a corresponding - build directory. The main use case for the two directory variables is to - call :command:`add_subdirectory` immediately after population, i.e.: - - .. code-block:: cmake - - FetchContent_Populate(FooBar ...) - add_subdirectory(${foobar_SOURCE_DIR} ${foobar_BINARY_DIR}) - - The values of the three variables can also be retrieved from anywhere in the - project hierarchy using the :command:`FetchContent_GetProperties` command. - - A number of cache variables influence the behavior of all content population - performed using details saved from a :command:`FetchContent_Declare` call: - - ``FETCHCONTENT_BASE_DIR`` - In most cases, the saved details do not specify any options relating to the - directories to use for the internal sub-build, final source and build areas. - It is generally best to leave these decisions up to the ``FetchContent`` - module to handle on the project's behalf. The ``FETCHCONTENT_BASE_DIR`` - cache variable controls the point under which all content population - directories are collected, but in most cases developers would not need to - change this. The default location is ``${CMAKE_BINARY_DIR}/_deps``, but if - developers change this value, they should aim to keep the path short and - just below the top level of the build tree to avoid running into path - length problems on Windows. - - ``FETCHCONTENT_QUIET`` - The logging output during population can be quite verbose, making the - configure stage quite noisy. This cache option (``ON`` by default) hides - all population output unless an error is encountered. 
If experiencing - problems with hung downloads, temporarily switching this option off may - help diagnose which content population is causing the issue. - - ``FETCHCONTENT_FULLY_DISCONNECTED`` - When this option is enabled, no attempt is made to download or update - any content. It is assumed that all content has already been populated in - a previous run or the source directories have been pointed at existing - contents the developer has provided manually (using options described - further below). When the developer knows that no changes have been made to - any content details, turning this option ``ON`` can significantly speed up - the configure stage. It is ``OFF`` by default. - - ``FETCHCONTENT_UPDATES_DISCONNECTED`` - This is a less severe download/update control compared to - ``FETCHCONTENT_FULLY_DISCONNECTED``. Instead of bypassing all download and - update logic, the ``FETCHCONTENT_UPDATES_DISCONNECTED`` only disables the - update stage. Therefore, if content has not been downloaded previously, - it will still be downloaded when this option is enabled. This can speed up - the configure stage, but not as much as - ``FETCHCONTENT_FULLY_DISCONNECTED``. It is ``OFF`` by default. - - In addition to the above cache variables, the following cache variables are - also defined for each content name (```` is the uppercased value of - ````): - - ``FETCHCONTENT_SOURCE_DIR_`` - If this is set, no download or update steps are performed for the specified - content and the ``_SOURCE_DIR`` variable returned to the caller is - pointed at this location. This gives developers a way to have a separate - checkout of the content that they can modify freely without interference - from the build. The build simply uses that existing source, but it still - defines ``_BINARY_DIR`` to point inside its own build area. - Developers are strongly encouraged to use this mechanism rather than - editing the sources populated in the default location, as changes to - sources in the default location can be lost when content population details - are changed by the project. - - ``FETCHCONTENT_UPDATES_DISCONNECTED_`` - This is the per-content equivalent of - ``FETCHCONTENT_UPDATES_DISCONNECTED``. If the global option or this option - is ``ON``, then updates will be disabled for the named content. - Disabling updates for individual content can be useful for content whose - details rarely change, while still leaving other frequently changing - content with updates enabled. - - - The ``FetchContent_Populate()`` command also supports a syntax allowing the - content details to be specified directly rather than using any saved - details. This is more low-level and use of this form is generally to be - avoided in favour of using saved content details as outlined above. - Nevertheless, in certain situations it can be useful to invoke the content - population as an isolated operation (typically as part of implementing some - other higher level feature or when using CMake in script mode): - - .. code-block:: cmake - - FetchContent_Populate( - [QUIET] - [SUBBUILD_DIR ] - [SOURCE_DIR ] - [BINARY_DIR ] - ... - ) - - This form has a number of key differences to that where only ```` is - provided: - - - All required population details are assumed to have been provided directly - in the call to ``FetchContent_Populate()``. Any saved details for - ```` are ignored. - - No check is made for whether content for ```` has already been - populated. - - No global property is set to record that the population has occurred. 
- - No global properties record the source or binary directories used for the - populated content. - - The ``FETCHCONTENT_FULLY_DISCONNECTED`` and - ``FETCHCONTENT_UPDATES_DISCONNECTED`` cache variables are ignored. - - The ``_SOURCE_DIR`` and ``_BINARY_DIR`` variables are still - returned to the caller, but since these locations are not stored as global - properties when this form is used, they are only available to the calling - scope and below rather than the entire project hierarchy. No - ``_POPULATED`` variable is set in the caller's scope with this form. - - The supported options for ``FetchContent_Populate()`` are the same as those - for :command:`FetchContent_Declare()`. Those few options shown just - above are either specific to ``FetchContent_Populate()`` or their behavior is - slightly modified from how :command:`ExternalProject_Add` treats them. - - ``QUIET`` - The ``QUIET`` option can be given to hide the output associated with - populating the specified content. If the population fails, the output will - be shown regardless of whether this option was given or not so that the - cause of the failure can be diagnosed. The global ``FETCHCONTENT_QUIET`` - cache variable has no effect on ``FetchContent_Populate()`` calls where the - content details are provided directly. - - ``SUBBUILD_DIR`` - The ``SUBBUILD_DIR`` argument can be provided to change the location of the - sub-build created to perform the population. The default value is - ``${CMAKE_CURRENT_BINARY_DIR}/-subbuild`` and it would be unusual - to need to override this default. If a relative path is specified, it will - be interpreted as relative to :variable:`CMAKE_CURRENT_BINARY_DIR`. - - ``SOURCE_DIR``, ``BINARY_DIR`` - The ``SOURCE_DIR`` and ``BINARY_DIR`` arguments are supported by - :command:`ExternalProject_Add`, but different default values are used by - ``FetchContent_Populate()``. ``SOURCE_DIR`` defaults to - ``${CMAKE_CURRENT_BINARY_DIR}/-src`` and ``BINARY_DIR`` defaults to - ``${CMAKE_CURRENT_BINARY_DIR}/-build``. If a relative path is - specified, it will be interpreted as relative to - :variable:`CMAKE_CURRENT_BINARY_DIR`. - - In addition to the above explicit options, any other unrecognized options are - passed through unmodified to :command:`ExternalProject_Add` to perform the - download, patch and update steps. The following options are explicitly - prohibited (they are disabled by the ``FetchContent_Populate()`` command): - - - ``CONFIGURE_COMMAND`` - - ``BUILD_COMMAND`` - - ``INSTALL_COMMAND`` - - ``TEST_COMMAND`` - - If using ``FetchContent_Populate()`` within CMake's script mode, be aware - that the implementation sets up a sub-build which therefore requires a CMake - generator and build tool to be available. If these cannot be found by - default, then the :variable:`CMAKE_GENERATOR` and/or - :variable:`CMAKE_MAKE_PROGRAM` variables will need to be set appropriately - on the command line invoking the script. - - -Retrieve Population Properties -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -.. command:: FetchContent_GetProperties - - When using saved content details, a call to :command:`FetchContent_Populate` - records information in global properties which can be queried at any time. - This information includes the source and binary directories associated with - the content and also whether or not the content population has been processed - during the current configure run. - - .. 
code-block:: cmake - - FetchContent_GetProperties( - [SOURCE_DIR ] - [BINARY_DIR ] - [POPULATED ] - ) - - The ``SOURCE_DIR``, ``BINARY_DIR`` and ``POPULATED`` options can be used to - specify which properties should be retrieved. Each option accepts a value - which is the name of the variable in which to store that property. Most of - the time though, only ```` is given, in which case the call will then - set the same variables as a call to - :command:`FetchContent_Populate(name) `. This allows - the following canonical pattern to be used, which ensures that the relevant - variables will always be defined regardless of whether or not the population - has been performed elsewhere in the project already: - - .. code-block:: cmake - - FetchContent_GetProperties(foobar) - if(NOT foobar_POPULATED) - FetchContent_Populate(foobar) - - # Set any custom variables, etc. here, then - # populate the content as part of this build - - add_subdirectory(${foobar_SOURCE_DIR} ${foobar_BINARY_DIR}) - endif() - - The above pattern allows other parts of the overall project hierarchy to - re-use the same content and ensure that it is only populated once. - - -.. _`fetch-content-examples`: - -Examples -^^^^^^^^ - -Consider a project hierarchy where ``projA`` is the top level project and it -depends on projects ``projB`` and ``projC``. Both ``projB`` and ``projC`` -can be built standalone and they also both depend on another project -``projD``. For simplicity, this example will assume that all four projects -are available on a company git server. The ``CMakeLists.txt`` of each project -might have sections like the following: - -*projA*: - -.. code-block:: cmake - - include(FetchContent) - FetchContent_Declare( - projB - GIT_REPOSITORY git@mycompany.com/git/projB.git - GIT_TAG 4a89dc7e24ff212a7b5167bef7ab079d - ) - FetchContent_Declare( - projC - GIT_REPOSITORY git@mycompany.com/git/projC.git - GIT_TAG 4ad4016bd1d8d5412d135cf8ceea1bb9 - ) - FetchContent_Declare( - projD - GIT_REPOSITORY git@mycompany.com/git/projD.git - GIT_TAG origin/integrationBranch - ) - - FetchContent_GetProperties(projB) - if(NOT projb_POPULATED) - FetchContent_Populate(projB) - add_subdirectory(${projb_SOURCE_DIR} ${projb_BINARY_DIR}) - endif() - - FetchContent_GetProperties(projC) - if(NOT projc_POPULATED) - FetchContent_Populate(projC) - add_subdirectory(${projc_SOURCE_DIR} ${projc_BINARY_DIR}) - endif() - -*projB*: - -.. code-block:: cmake - - include(FetchContent) - FetchContent_Declare( - projD - GIT_REPOSITORY git@mycompany.com/git/projD.git - GIT_TAG 20b415f9034bbd2a2e8216e9a5c9e632 - ) - - FetchContent_GetProperties(projD) - if(NOT projd_POPULATED) - FetchContent_Populate(projD) - add_subdirectory(${projd_SOURCE_DIR} ${projd_BINARY_DIR}) - endif() - - -*projC*: - -.. code-block:: cmake - - include(FetchContent) - FetchContent_Declare( - projD - GIT_REPOSITORY git@mycompany.com/git/projD.git - GIT_TAG 7d9a17ad2c962aa13e2fbb8043fb6b8a - ) - - FetchContent_GetProperties(projD) - if(NOT projd_POPULATED) - FetchContent_Populate(projD) - add_subdirectory(${projd_SOURCE_DIR} ${projd_BINARY_DIR}) - endif() - -A few key points should be noted in the above: - -- ``projB`` and ``projC`` define different content details for ``projD``, - but ``projA`` also defines a set of content details for ``projD`` and - because ``projA`` will define them first, the details from ``projB`` and - ``projC`` will not be used. 
The override details defined by ``projA`` - are not required to match either of those from ``projB`` or ``projC``, but - it is up to the higher level project to ensure that the details it does - define still make sense for the child projects. -- While ``projA`` defined content details for ``projD``, it did not need - to explicitly call ``FetchContent_Populate(projD)`` itself. Instead, it - leaves that to a child project to do (in this case it will be ``projB`` - since it is added to the build ahead of ``projC``). If ``projA`` needed to - customize how the ``projD`` content was brought into the build as well - (e.g. define some CMake variables before calling - :command:`add_subdirectory` after populating), it would do the call to - ``FetchContent_Populate()``, etc. just as it did for the ``projB`` and - ``projC`` content. For higher level projects, it is usually enough to - just define the override content details and leave the actual population - to the child projects. This saves repeating the same thing at each level - of the project hierarchy unnecessarily. -- Even though ``projA`` is the top level project in this example, it still - checks whether ``projB`` and ``projC`` have already been populated before - going ahead to do those populations. This makes ``projA`` able to be more - easily incorporated as a child of some other higher level project in the - future if required. Always protect a call to - :command:`FetchContent_Populate` with a check to - :command:`FetchContent_GetProperties`, even in what may be considered a top - level project at the time. - - -The following example demonstrates how one might download and unpack a -firmware tarball using CMake's :manual:`script mode `. The call to -:command:`FetchContent_Populate` specifies all the content details and the -unpacked firmware will be placed in a ``firmware`` directory below the -current working directory. - -*getFirmware.cmake*: - -.. code-block:: cmake - - # NOTE: Intended to be run in script mode with cmake -P - include(FetchContent) - FetchContent_Populate( - firmware - URL https://mycompany.com/assets/firmware-1.23-arm.tar.gz - URL_HASH MD5=68247684da89b608d466253762b0ff11 - SOURCE_DIR firmware - ) - -#]=======================================================================] - - -set(__FetchContent_privateDir "${CMAKE_CURRENT_LIST_DIR}/FetchContent") - -#======================================================================= -# Recording and retrieving content details for later population -#======================================================================= - -# Internal use, projects must not call this directly. It is -# intended for use by FetchContent_Declare() only. -# -# Sets a content-specific global property (not meant for use -# outside of functions defined here in this file) which can later -# be retrieved using __FetchContent_getSavedDetails() with just the -# same content name. If there is already a value stored in the -# property, it is left unchanged and this call has no effect. -# This allows parent projects to define the content details, -# overriding anything a child project may try to set (properties -# are not cached between runs, so the first thing to set it in a -# build will be in control). 
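A minimal sketch of the canonical pattern described above, which is also how the new cmake/*.cmake helpers later in this diff consume their dependencies (the content name ``foo`` and the URL are placeholders, not part of the diff; a real call would add ``URL_HASH`` to pin the archive, as those helpers do):

    include(FetchContent)
    FetchContent_Declare(foo
      URL https://example.com/foo-1.0.tar.gz  # placeholder URL
    )
    FetchContent_GetProperties(foo)
    if(NOT foo_POPULATED)
      FetchContent_Populate(foo)
      add_subdirectory(${foo_SOURCE_DIR} ${foo_BINARY_DIR} EXCLUDE_FROM_ALL)
    endif()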
-function(__FetchContent_declareDetails contentName) - - string(TOLOWER ${contentName} contentNameLower) - set(propertyName "_FetchContent_${contentNameLower}_savedDetails") - get_property(alreadyDefined GLOBAL PROPERTY ${propertyName} DEFINED) - if(NOT alreadyDefined) - define_property(GLOBAL PROPERTY ${propertyName} - BRIEF_DOCS "Internal implementation detail of FetchContent_Populate()" - FULL_DOCS "Details used by FetchContent_Populate() for ${contentName}" - ) - set_property(GLOBAL PROPERTY ${propertyName} ${ARGN}) - endif() - -endfunction() - - -# Internal use, projects must not call this directly. It is -# intended for use by the FetchContent_Declare() function. -# -# Retrieves details saved for the specified content in an -# earlier call to __FetchContent_declareDetails(). -function(__FetchContent_getSavedDetails contentName outVar) - - string(TOLOWER ${contentName} contentNameLower) - set(propertyName "_FetchContent_${contentNameLower}_savedDetails") - get_property(alreadyDefined GLOBAL PROPERTY ${propertyName} DEFINED) - if(NOT alreadyDefined) - message(FATAL_ERROR "No content details recorded for ${contentName}") - endif() - get_property(propertyValue GLOBAL PROPERTY ${propertyName}) - set(${outVar} "${propertyValue}" PARENT_SCOPE) - -endfunction() - - -# Saves population details of the content, sets defaults for the -# SOURCE_DIR and BUILD_DIR. -function(FetchContent_Declare contentName) - - set(options "") - set(oneValueArgs SVN_REPOSITORY) - set(multiValueArgs "") - - cmake_parse_arguments(ARG "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) - - unset(srcDirSuffix) - unset(svnRepoArgs) - if(ARG_SVN_REPOSITORY) - # Add a hash of the svn repository URL to the source dir. This works - # around the problem where if the URL changes, the download would - # fail because it tries to checkout/update rather than switch the - # old URL to the new one. We limit the hash to the first 7 characters - # so that the source path doesn't get overly long (which can be a - # problem on windows due to path length limits). - string(SHA1 urlSHA ${ARG_SVN_REPOSITORY}) - string(SUBSTRING ${urlSHA} 0 7 urlSHA) - set(srcDirSuffix "-${urlSHA}") - set(svnRepoArgs SVN_REPOSITORY ${ARG_SVN_REPOSITORY}) - endif() - - string(TOLOWER ${contentName} contentNameLower) - __FetchContent_declareDetails( - ${contentNameLower} - SOURCE_DIR "${FETCHCONTENT_BASE_DIR}/${contentNameLower}-src${srcDirSuffix}" - BINARY_DIR "${FETCHCONTENT_BASE_DIR}/${contentNameLower}-build" - ${svnRepoArgs} - # List these last so they can override things we set above - ${ARG_UNPARSED_ARGUMENTS} - ) - -endfunction() - - -#======================================================================= -# Set/get whether the specified content has been populated yet. -# The setter also records the source and binary dirs used. -#======================================================================= - -# Internal use, projects must not call this directly. It is -# intended for use by the FetchContent_Populate() function to -# record when FetchContent_Populate() is called for a particular -# content name. 
-function(__FetchContent_setPopulated contentName sourceDir binaryDir) - - string(TOLOWER ${contentName} contentNameLower) - set(prefix "_FetchContent_${contentNameLower}") - - set(propertyName "${prefix}_sourceDir") - define_property(GLOBAL PROPERTY ${propertyName} - BRIEF_DOCS "Internal implementation detail of FetchContent_Populate()" - FULL_DOCS "Details used by FetchContent_Populate() for ${contentName}" - ) - set_property(GLOBAL PROPERTY ${propertyName} ${sourceDir}) - - set(propertyName "${prefix}_binaryDir") - define_property(GLOBAL PROPERTY ${propertyName} - BRIEF_DOCS "Internal implementation detail of FetchContent_Populate()" - FULL_DOCS "Details used by FetchContent_Populate() for ${contentName}" - ) - set_property(GLOBAL PROPERTY ${propertyName} ${binaryDir}) - - set(propertyName "${prefix}_populated") - define_property(GLOBAL PROPERTY ${propertyName} - BRIEF_DOCS "Internal implementation detail of FetchContent_Populate()" - FULL_DOCS "Details used by FetchContent_Populate() for ${contentName}" - ) - set_property(GLOBAL PROPERTY ${propertyName} True) - -endfunction() - - -# Set variables in the calling scope for any of the retrievable -# properties. If no specific properties are requested, variables -# will be set for all retrievable properties. -# -# This function is intended to also be used by projects as the canonical -# way to detect whether they should call FetchContent_Populate() -# and pull the populated source into the build with add_subdirectory(), -# if they are using the populated content in that way. -function(FetchContent_GetProperties contentName) - - string(TOLOWER ${contentName} contentNameLower) - - set(options "") - set(oneValueArgs SOURCE_DIR BINARY_DIR POPULATED) - set(multiValueArgs "") - - cmake_parse_arguments(ARG "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) - - if(NOT ARG_SOURCE_DIR AND - NOT ARG_BINARY_DIR AND - NOT ARG_POPULATED) - # No specific properties requested, provide them all - set(ARG_SOURCE_DIR ${contentNameLower}_SOURCE_DIR) - set(ARG_BINARY_DIR ${contentNameLower}_BINARY_DIR) - set(ARG_POPULATED ${contentNameLower}_POPULATED) - endif() - - set(prefix "_FetchContent_${contentNameLower}") - - if(ARG_SOURCE_DIR) - set(propertyName "${prefix}_sourceDir") - get_property(value GLOBAL PROPERTY ${propertyName}) - if(value) - set(${ARG_SOURCE_DIR} ${value} PARENT_SCOPE) - endif() - endif() - - if(ARG_BINARY_DIR) - set(propertyName "${prefix}_binaryDir") - get_property(value GLOBAL PROPERTY ${propertyName}) - if(value) - set(${ARG_BINARY_DIR} ${value} PARENT_SCOPE) - endif() - endif() - - if(ARG_POPULATED) - set(propertyName "${prefix}_populated") - get_property(value GLOBAL PROPERTY ${propertyName} DEFINED) - set(${ARG_POPULATED} ${value} PARENT_SCOPE) - endif() - -endfunction() - - -#======================================================================= -# Performing the population -#======================================================================= - -# The value of contentName will always have been lowercased by the caller. -# All other arguments are assumed to be options that are understood by -# ExternalProject_Add(), except for QUIET and SUBBUILD_DIR. 
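For a content name of ``foo`` (a placeholder), the bookkeeping above records three global properties; a minimal sketch of reading them directly, although projects are expected to go through ``FetchContent_GetProperties()`` instead:

    # Properties recorded by __FetchContent_setPopulated(foo <srcDir> <binDir>):
    #   _FetchContent_foo_sourceDir
    #   _FetchContent_foo_binaryDir
    #   _FetchContent_foo_populated
    get_property(foo_src  GLOBAL PROPERTY _FetchContent_foo_sourceDir)
    get_property(foo_done GLOBAL PROPERTY _FetchContent_foo_populated)
    message(STATUS "foo populated=${foo_done} source=${foo_src}")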
-function(__FetchContent_directPopulate contentName) - - set(options - QUIET - ) - set(oneValueArgs - SUBBUILD_DIR - SOURCE_DIR - BINARY_DIR - # Prevent the following from being passed through - CONFIGURE_COMMAND - BUILD_COMMAND - INSTALL_COMMAND - TEST_COMMAND - ) - set(multiValueArgs "") - - cmake_parse_arguments(ARG "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) - - if(NOT ARG_SUBBUILD_DIR) - message(FATAL_ERROR "Internal error: SUBBUILD_DIR not set") - elseif(NOT IS_ABSOLUTE "${ARG_SUBBUILD_DIR}") - set(ARG_SUBBUILD_DIR "${CMAKE_CURRENT_BINARY_DIR}/${ARG_SUBBUILD_DIR}") - endif() - - if(NOT ARG_SOURCE_DIR) - message(FATAL_ERROR "Internal error: SOURCE_DIR not set") - elseif(NOT IS_ABSOLUTE "${ARG_SOURCE_DIR}") - set(ARG_SOURCE_DIR "${CMAKE_CURRENT_BINARY_DIR}/${ARG_SOURCE_DIR}") - endif() - - if(NOT ARG_BINARY_DIR) - message(FATAL_ERROR "Internal error: BINARY_DIR not set") - elseif(NOT IS_ABSOLUTE "${ARG_BINARY_DIR}") - set(ARG_BINARY_DIR "${CMAKE_CURRENT_BINARY_DIR}/${ARG_BINARY_DIR}") - endif() - - # Ensure the caller can know where to find the source and build directories - # with some convenient variables. Doing this here ensures the caller sees - # the correct result in the case where the default values are overridden by - # the content details set by the project. - set(${contentName}_SOURCE_DIR "${ARG_SOURCE_DIR}" PARENT_SCOPE) - set(${contentName}_BINARY_DIR "${ARG_BINARY_DIR}" PARENT_SCOPE) - - # The unparsed arguments may contain spaces, so build up ARG_EXTRA - # in such a way that it correctly substitutes into the generated - # CMakeLists.txt file with each argument quoted. - unset(ARG_EXTRA) - foreach(arg IN LISTS ARG_UNPARSED_ARGUMENTS) - set(ARG_EXTRA "${ARG_EXTRA} \"${arg}\"") - endforeach() - - # Hide output if requested, but save it to a variable in case there's an - # error so we can show the output upon failure. When not quiet, don't - # capture the output to a variable because the user may want to see the - # output as it happens (e.g. progress during long downloads). Combine both - # stdout and stderr in the one capture variable so the output stays in order. - if (ARG_QUIET) - set(outputOptions - OUTPUT_VARIABLE capturedOutput - ERROR_VARIABLE capturedOutput - ) - else() - set(capturedOutput) - set(outputOptions) - message(STATUS "Populating ${contentName}") - endif() - - if(CMAKE_GENERATOR) - set(generatorOpts "-G${CMAKE_GENERATOR}") - if(CMAKE_GENERATOR_PLATFORM) - list(APPEND generatorOpts "-A${CMAKE_GENERATOR_PLATFORM}") - endif() - if(CMAKE_GENERATOR_TOOLSET) - list(APPEND generatorOpts "-T${CMAKE_GENERATOR_TOOLSET}") - endif() - - if(CMAKE_MAKE_PROGRAM) - list(APPEND generatorOpts "-DCMAKE_MAKE_PROGRAM:FILEPATH=${CMAKE_MAKE_PROGRAM}") - endif() - - else() - # Likely we've been invoked via CMake's script mode where no - # generator is set (and hence CMAKE_MAKE_PROGRAM could not be - # trusted even if provided). We will have to rely on being - # able to find the default generator and build tool. - unset(generatorOpts) - endif() - - # Create and build a separate CMake project to carry out the population. - # If we've already previously done these steps, they will not cause - # anything to be updated, so extra rebuilds of the project won't occur. - # Make sure to pass through CMAKE_MAKE_PROGRAM in case the main project - # has this set to something not findable on the PATH. - configure_file("${__FetchContent_privateDir}/CMakeLists.cmake.in" - "${ARG_SUBBUILD_DIR}/CMakeLists.txt") - execute_process( - COMMAND ${CMAKE_COMMAND} ${generatorOpts} . 
- RESULT_VARIABLE result - ${outputOptions} - WORKING_DIRECTORY "${ARG_SUBBUILD_DIR}" - ) - if(result) - if(capturedOutput) - message("${capturedOutput}") - endif() - message(FATAL_ERROR "CMake step for ${contentName} failed: ${result}") - endif() - execute_process( - COMMAND ${CMAKE_COMMAND} --build . - RESULT_VARIABLE result - ${outputOptions} - WORKING_DIRECTORY "${ARG_SUBBUILD_DIR}" - ) - if(result) - if(capturedOutput) - message("${capturedOutput}") - endif() - message(FATAL_ERROR "Build step for ${contentName} failed: ${result}") - endif() - -endfunction() - - -option(FETCHCONTENT_FULLY_DISCONNECTED "Disables all attempts to download or update content and assumes source dirs already exist") -option(FETCHCONTENT_UPDATES_DISCONNECTED "Enables UPDATE_DISCONNECTED behavior for all content population") -option(FETCHCONTENT_QUIET "Enables QUIET option for all content population" ON) -set(FETCHCONTENT_BASE_DIR "${CMAKE_BINARY_DIR}/_deps" CACHE PATH "Directory under which to collect all populated content") - -# Populate the specified content using details stored from -# an earlier call to FetchContent_Declare(). -function(FetchContent_Populate contentName) - - if(NOT contentName) - message(FATAL_ERROR "Empty contentName not allowed for FetchContent_Populate()") - endif() - - string(TOLOWER ${contentName} contentNameLower) - - if(ARGN) - # This is the direct population form with details fully specified - # as part of the call, so we already have everything we need - __FetchContent_directPopulate( - ${contentNameLower} - SUBBUILD_DIR "${CMAKE_CURRENT_BINARY_DIR}/${contentNameLower}-subbuild" - SOURCE_DIR "${CMAKE_CURRENT_BINARY_DIR}/${contentNameLower}-src" - BINARY_DIR "${CMAKE_CURRENT_BINARY_DIR}/${contentNameLower}-build" - ${ARGN} # Could override any of the above ..._DIR variables - ) - - # Pass source and binary dir variables back to the caller - set(${contentNameLower}_SOURCE_DIR "${${contentNameLower}_SOURCE_DIR}" PARENT_SCOPE) - set(${contentNameLower}_BINARY_DIR "${${contentNameLower}_BINARY_DIR}" PARENT_SCOPE) - - # Don't set global properties, or record that we did this population, since - # this was a direct call outside of the normal declared details form. - # We only want to save values in the global properties for content that - # honours the hierarchical details mechanism so that projects are not - # robbed of the ability to override details set in nested projects. - return() - endif() - - # No details provided, so assume they were saved from an earlier call - # to FetchContent_Declare(). Do a check that we haven't already - # populated this content before in case the caller forgot to check. 
- FetchContent_GetProperties(${contentName}) - if(${contentNameLower}_POPULATED) - message(FATAL_ERROR "Content ${contentName} already populated in ${${contentNameLower}_SOURCE_DIR}") - endif() - - string(TOUPPER ${contentName} contentNameUpper) - set(FETCHCONTENT_SOURCE_DIR_${contentNameUpper} - "${FETCHCONTENT_SOURCE_DIR_${contentNameUpper}}" - CACHE PATH "When not empty, overrides where to find pre-populated content for ${contentName}") - - if(FETCHCONTENT_SOURCE_DIR_${contentNameUpper}) - # The source directory has been explicitly provided in the cache, - # so no population is required - set(${contentNameLower}_SOURCE_DIR "${FETCHCONTENT_SOURCE_DIR_${contentNameUpper}}") - set(${contentNameLower}_BINARY_DIR "${FETCHCONTENT_BASE_DIR}/${contentNameLower}-build") - - elseif(FETCHCONTENT_FULLY_DISCONNECTED) - # Bypass population and assume source is already there from a previous run - set(${contentNameLower}_SOURCE_DIR "${FETCHCONTENT_BASE_DIR}/${contentNameLower}-src") - set(${contentNameLower}_BINARY_DIR "${FETCHCONTENT_BASE_DIR}/${contentNameLower}-build") - - else() - # Support both a global "disconnect all updates" and a per-content - # update test (either one being set disables updates for this content). - option(FETCHCONTENT_UPDATES_DISCONNECTED_${contentNameUpper} - "Enables UPDATE_DISCONNECTED behavior just for population of ${contentName}") - if(FETCHCONTENT_UPDATES_DISCONNECTED OR - FETCHCONTENT_UPDATES_DISCONNECTED_${contentNameUpper}) - set(disconnectUpdates True) - else() - set(disconnectUpdates False) - endif() - - if(FETCHCONTENT_QUIET) - set(quietFlag QUIET) - else() - unset(quietFlag) - endif() - - __FetchContent_getSavedDetails(${contentName} contentDetails) - if("${contentDetails}" STREQUAL "") - message(FATAL_ERROR "No details have been set for content: ${contentName}") - endif() - - __FetchContent_directPopulate( - ${contentNameLower} - ${quietFlag} - UPDATE_DISCONNECTED ${disconnectUpdates} - SUBBUILD_DIR "${FETCHCONTENT_BASE_DIR}/${contentNameLower}-subbuild" - SOURCE_DIR "${FETCHCONTENT_BASE_DIR}/${contentNameLower}-src" - BINARY_DIR "${FETCHCONTENT_BASE_DIR}/${contentNameLower}-build" - # Put the saved details last so they can override any of the - # the options we set above (this can include SOURCE_DIR or - # BUILD_DIR) - ${contentDetails} - ) - endif() - - __FetchContent_setPopulated( - ${contentName} - ${${contentNameLower}_SOURCE_DIR} - ${${contentNameLower}_BINARY_DIR} - ) - - # Pass variables back to the caller. The variables passed back here - # must match what FetchContent_GetProperties() sets when it is called - # with just the content name. - set(${contentNameLower}_SOURCE_DIR "${${contentNameLower}_SOURCE_DIR}" PARENT_SCOPE) - set(${contentNameLower}_BINARY_DIR "${${contentNameLower}_BINARY_DIR}" PARENT_SCOPE) - set(${contentNameLower}_POPULATED True PARENT_SCOPE) - -endfunction() diff --git a/cmake/Modules/FetchContent/CMakeLists.cmake.in b/cmake/Modules/FetchContent/CMakeLists.cmake.in deleted file mode 100644 index 9a7a7715a..000000000 --- a/cmake/Modules/FetchContent/CMakeLists.cmake.in +++ /dev/null @@ -1,21 +0,0 @@ -# Distributed under the OSI-approved BSD 3-Clause License. See accompanying -# file Copyright.txt or https://cmake.org/licensing for details. - -cmake_minimum_required(VERSION ${CMAKE_VERSION}) - -# We name the project and the target for the ExternalProject_Add() call -# to something that will highlight to the user what we are working on if -# something goes wrong and an error message is produced. 
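The cache variables handled by ``FetchContent_Populate()`` above let a user skip the download step entirely; CMake's own FetchContent module supports the same variables. A hedged sketch, with a hypothetical local path:

    # Hypothetical: point FetchContent at an already-extracted googletest tree
    # instead of downloading it (the path is an example only). The override can
    # be set per content name, or downloads can be disabled globally.
    set(FETCHCONTENT_SOURCE_DIR_GOOGLETEST "/opt/src/googletest-1.13.0" CACHE PATH "" FORCE)
    set(FETCHCONTENT_FULLY_DISCONNECTED ON CACHE BOOL "" FORCE)
    # Equivalently, pass -DFETCHCONTENT_SOURCE_DIR_GOOGLETEST=... on the cmake command line.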
- -project(${contentName}-populate NONE) - -include(ExternalProject) -ExternalProject_Add(${contentName}-populate - ${ARG_EXTRA} - SOURCE_DIR "${ARG_SOURCE_DIR}" - BINARY_DIR "${ARG_BINARY_DIR}" - CONFIGURE_COMMAND "" - BUILD_COMMAND "" - INSTALL_COMMAND "" - TEST_COMMAND "" -) diff --git a/cmake/Modules/README.md b/cmake/Modules/README.md deleted file mode 100644 index c8d275f11..000000000 --- a/cmake/Modules/README.md +++ /dev/null @@ -1,5 +0,0 @@ - -## FetchContent - -`FetchContent.cmake` and `FetchContent/CMakeLists.cmake.in` -are copied from `cmake/3.11.0/share/cmake-3.11/Modules`. diff --git a/cmake/asio.cmake b/cmake/asio.cmake new file mode 100644 index 000000000..c1d01cdc7 --- /dev/null +++ b/cmake/asio.cmake @@ -0,0 +1,44 @@ +function(download_asio) + include(FetchContent) + + set(asio_URL "https://github.com/chriskohlhoff/asio/archive/refs/tags/asio-1-24-0.tar.gz") + set(asio_URL2 "https://huggingface.co/csukuangfj/sherpa-cmake-deps/resolve/main/asio-asio-1-24-0.tar.gz") + set(asio_HASH "SHA256=cbcaaba0f66722787b1a7c33afe1befb3a012b5af3ad7da7ff0f6b8c9b7a8a5b") + + # If you don't have access to the Internet, + # please pre-download asio + set(possible_file_locations + $ENV{HOME}/Downloads/asio-asio-1-24-0.tar.gz + ${PROJECT_SOURCE_DIR}/asio-asio-1-24-0.tar.gz + ${PROJECT_BINARY_DIR}/asio-asio-1-24-0.tar.gz + /tmp/asio-asio-1-24-0.tar.gz + /star-fj/fangjun/download/github/asio-asio-1-24-0.tar.gz + ) + + foreach(f IN LISTS possible_file_locations) + if(EXISTS ${f}) + set(asio_URL "${f}") + file(TO_CMAKE_PATH "${asio_URL}" asio_URL) + set(asio_URL2) + break() + endif() + endforeach() + + FetchContent_Declare(asio + URL + ${asio_URL} + ${asio_URL2} + URL_HASH ${asio_HASH} + ) + + FetchContent_GetProperties(asio) + if(NOT asio_POPULATED) + message(STATUS "Downloading asio ${asio_URL}") + FetchContent_Populate(asio) + endif() + message(STATUS "asio is downloaded to ${asio_SOURCE_DIR}") + # add_subdirectory(${asio_SOURCE_DIR} ${asio_BINARY_DIR} EXCLUDE_FROM_ALL) + include_directories(${asio_SOURCE_DIR}/asio/include) +endfunction() + +download_asio() diff --git a/cmake/cmake_extension.py b/cmake/cmake_extension.py index f508fa3b2..9d419eab3 100644 --- a/cmake/cmake_extension.py +++ b/cmake/cmake_extension.py @@ -1,4 +1,5 @@ # Copyright (c) 2021-2022 Xiaomi Corporation (author: Fangjun Kuang) +# flake8: noqa import os import platform @@ -8,7 +9,7 @@ import setuptools from setuptools.command.build_ext import build_ext - +import torch def is_for_pypi(): ans = os.environ.get("SHERPA_IS_FOR_PYPI", None) @@ -23,6 +24,11 @@ def is_windows(): return platform.system() == "Windows" +def get_pytorch_version(): + # if it is 1.7.1+cuda101, then strip +cuda101 + return torch.__version__.split("+")[0] + + try: from wheel.bdist_wheel import bdist_wheel as _bdist_wheel @@ -76,6 +82,12 @@ def build_extension(self, ext: setuptools.extension.Extension): print(f"Setting PYTHON_EXECUTABLE to {sys.executable}") cmake_args += f" -DPYTHON_EXECUTABLE={sys.executable}" + major, minor = get_pytorch_version().split(".")[:2] + major = int(major) + minor = int(minor) + if major == 2 and minor >= 1: + extra_cmake_args += f" -DCMAKE_CXX_STANDARD=17 " + cmake_args += extra_cmake_args if is_windows(): @@ -107,7 +119,7 @@ def build_extension(self, ext: setuptools.extension.Extension): cmake {cmake_args} {sherpa_dir} - make {make_args} install + make {make_args} install/strip """ print(f"build command is:\n{build_cmd}") @@ -119,7 +131,20 @@ def build_extension(self, ext: setuptools.extension.Extension): 
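asio is consumed header-only above: ``download_asio()`` only populates the sources and adds ``${asio_SOURCE_DIR}/asio/include`` to the include path, without building a library target (json and websocketpp later in this diff are handled the same way). A minimal sketch of a consumer, assuming ``download_asio()`` has already run; the target and source file are hypothetical:

    # Any target configured after download_asio() can include the headers directly.
    add_executable(hello-asio hello-asio.cc)   # hypothetical target/source
    # hello-asio.cc may then use:  #include "asio.hpp"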
"\nClick:\n\thttps://github.com/k2-fsa/sherpa/issues/new\n" # noqa ) - for f in ["sherpa", "sherpa-version"]: - src_file = install_dir / "bin" / f + suffix = ".exe" if is_windows() else "" + # Remember to also change setup.py + binaries = ["sherpa-offline"] + binaries += ["sherpa-online", "sherpa-version"] + binaries += ["sherpa-online-microphone"] + binaries += ["sherpa-offline-microphone"] + binaries += ["sherpa-offline-websocket-server"] + binaries += ["sherpa-offline-websocket-client"] + binaries += ["sherpa-online-websocket-server"] + binaries += ["sherpa-online-websocket-client"] + binaries += ["sherpa-online-websocket-client-microphone"] + + for f in binaries: + src_file = install_dir / "bin" / (f + suffix) print(f"Copying {src_file} to {out_bin_dir}/") shutil.copy(f"{src_file}", f"{out_bin_dir}/") + src_file.unlink() diff --git a/cmake/googletest.cmake b/cmake/googletest.cmake index 0252268cd..d5ff6c620 100644 --- a/cmake/googletest.cmake +++ b/cmake/googletest.cmake @@ -1,31 +1,28 @@ -# Copyright 2020 Fangjun Kuang (csukuangfj@gmail.com) -# See ../LICENSE for clarification regarding multiple authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - function(download_googltest) - if(CMAKE_VERSION VERSION_LESS 3.11) - # FetchContent is available since 3.11, - # we've copied it to ${CMAKE_SOURCE_DIR}/cmake/Modules - # so that it can be used in lower CMake versions. 
- message(STATUS "Use FetchContent provided by k2") - list(APPEND CMAKE_MODULE_PATH ${CMAKE_SOURCE_DIR}/cmake/Modules) - endif() - include(FetchContent) - set(googletest_URL "https://github.com/google/googletest/archive/release-1.10.0.tar.gz") - set(googletest_HASH "SHA256=9dc9157a9a1551ec7a7e43daea9a694a0bb5fb8bec81235d8a1e6ef64c716dcb") + set(googletest_URL "https://github.com/google/googletest/archive/refs/tags/v1.13.0.tar.gz") + set(googletest_URL2 "https://huggingface.co/csukuangfj/sherpa-cmake-deps/resolve/main/googletest-1.13.0.tar.gz") + set(googletest_HASH "SHA256=ad7fdba11ea011c1d925b3289cf4af2c66a352e18d4c7264392fead75e919363") + + # If you don't have access to the Internet, + # please pre-download googletest + set(possible_file_locations + $ENV{HOME}/Downloads/googletest-1.13.0.tar.gz + ${PROJECT_SOURCE_DIR}/googletest-1.13.0.tar.gz + ${PROJECT_BINARY_DIR}/googletest-1.13.0.tar.gz + /tmp/googletest-1.13.0.tar.gz + /star-fj/fangjun/download/github/googletest-1.13.0.tar.gz + ) + + foreach(f IN LISTS possible_file_locations) + if(EXISTS ${f}) + set(googletest_URL "${f}") + file(TO_CMAKE_PATH "${googletest_URL}" googletest_URL) + set(googletest_URL2) + break() + endif() + endforeach() set(BUILD_GMOCK ON CACHE BOOL "" FORCE) set(INSTALL_GTEST OFF CACHE BOOL "" FORCE) @@ -33,13 +30,15 @@ function(download_googltest) set(gtest_force_shared_crt ON CACHE BOOL "" FORCE) FetchContent_Declare(googletest - URL ${googletest_URL} + URL + ${googletest_URL} + ${googletest_URL2} URL_HASH ${googletest_HASH} ) FetchContent_GetProperties(googletest) if(NOT googletest_POPULATED) - message(STATUS "Downloading googletest") + message(STATUS "Downloading googletest from ${googletest_URL}") FetchContent_Populate(googletest) endif() message(STATUS "googletest is downloaded to ${googletest_SOURCE_DIR}") diff --git a/cmake/grpc.cmake b/cmake/grpc.cmake new file mode 100644 index 000000000..4451b260d --- /dev/null +++ b/cmake/grpc.cmake @@ -0,0 +1,17 @@ +function(download_grpc) + message(STATUS "Using gRPC via add_subdirectory") + include(FetchContent) + #SET(CMAKE_CXX_FLAGS "-DBUILD_SHARED_LIBS=ON") + + set(ABSL_ENABLE_INSTALL ON) + FetchContent_Declare(gRPC + GIT_REPOSITORY https://github.com/grpc/grpc + GIT_TAG v1.57.0 + ) + set(FETCHCONTENT_QUIET OFF) + FetchContent_MakeAvailable(gRPC) + + message(STATUS "grpc is downloaded to ${grpc_SOURCE_DIR}") +endfunction() + +download_grpc() diff --git a/cmake/json.cmake b/cmake/json.cmake new file mode 100644 index 000000000..3ec935b30 --- /dev/null +++ b/cmake/json.cmake @@ -0,0 +1,44 @@ +function(download_json) + include(FetchContent) + + set(json_URL "https://github.com/nlohmann/json/archive/refs/tags/v3.11.2.tar.gz") + set(json_URL2 "https://huggingface.co/csukuangfj/sherpa-cmake-deps/resolve/main/json-3.11.2.tar.gz") + set(json_HASH "SHA256=d69f9deb6a75e2580465c6c4c5111b89c4dc2fa94e3a85fcd2ffcd9a143d9273") + + # If you don't have access to the Internet, + # please pre-download json + set(possible_file_locations + $ENV{HOME}/Downloads/json-3.11.2.tar.gz + ${PROJECT_SOURCE_DIR}/json-3.11.2.tar.gz + ${PROJECT_BINARY_DIR}/json-3.11.2.tar.gz + /tmp/json-3.11.2.tar.gz + /star-fj/fangjun/download/github/json-3.11.2.tar.gz + ) + + foreach(f IN LISTS possible_file_locations) + if(EXISTS ${f}) + set(json_URL "${f}") + file(TO_CMAKE_PATH "${json_URL}" json_URL) + set(json_URL2) + break() + endif() + endforeach() + + FetchContent_Declare(json + URL + ${json_URL} + ${json_URL2} + URL_HASH ${json_HASH} + ) + + FetchContent_GetProperties(json) + if(NOT json_POPULATED) + 
message(STATUS "Downloading json from ${json_URL}") + FetchContent_Populate(json) + endif() + message(STATUS "json is downloaded to ${json_SOURCE_DIR}") + include_directories(${json_SOURCE_DIR}/include) + # Use #include "nlohmann/json.hpp" +endfunction() + +download_json() diff --git a/cmake/k2.cmake b/cmake/k2.cmake index b80438db3..ca4234e37 100644 --- a/cmake/k2.cmake +++ b/cmake/k2.cmake @@ -1,4 +1,3 @@ - if(DEFINED ENV{K2_INSTALL_PREFIX}) message(STATUS "Using environment variable K2_INSTALL_PREFIX: $ENV{K2_INSTALL_PREFIX}") set(K2_CMAKE_PREFIX_PATH $ENV{K2_INSTALL_PREFIX}) @@ -16,7 +15,7 @@ endif() message(STATUS "K2_CMAKE_PREFIX_PATH: ${K2_CMAKE_PREFIX_PATH}") list(APPEND CMAKE_PREFIX_PATH "${K2_CMAKE_PREFIX_PATH}") -find_package(k2 REQUIRED) +find_package(k2 1.23.2 REQUIRED) message(STATUS "K2_FOUND: ${K2_FOUND}") message(STATUS "K2_INCLUDE_DIRS: ${K2_INCLUDE_DIRS}") diff --git a/cmake/kaldi_native_io.cmake b/cmake/kaldi_native_io.cmake index b729e415d..c08535c03 100644 --- a/cmake/kaldi_native_io.cmake +++ b/cmake/kaldi_native_io.cmake @@ -1,28 +1,42 @@ function(download_kaldi_native_io) - if(CMAKE_VERSION VERSION_LESS 3.11) - # FetchContent is available since 3.11, - # we've copied it to ${CMAKE_SOURCE_DIR}/cmake/Modules - # so that it can be used in lower CMake versions. - message(STATUS "Use FetchContent provided by sherpa") - list(APPEND CMAKE_MODULE_PATH ${CMAKE_SOURCE_DIR}/cmake/Modules) - endif() - include(FetchContent) - set(kaldi_native_io_URL "https://github.com/csukuangfj/kaldi_native_io/archive/refs/tags/v1.14.tar.gz") - set(kaldi_native_io_HASH "SHA256=c7dc0a2cda061751a121094ad850f8575f3552d223747021aba0b3abd3827622") + set(kaldi_native_io_URL "https://github.com/csukuangfj/kaldi_native_io/archive/refs/tags/v1.22.1.tar.gz") + set(kaldi_native_io_URL2 "https://huggingface.co/csukuangfj/sherpa-cmake-deps/resolve/main/kaldi_native_io-1.22.1.tar.gz") + set(kaldi_native_io_HASH "SHA256=de8ad3398162870b4e1f3ac78101af209c981a8628555710ed4f1adbfed0af43") + + # If you don't have access to the Internet, + # please pre-download kaldi_native_io + set(possible_file_locations + $ENV{HOME}/Downloads/kaldi_native_io-1.22.1.tar.gz + ${PROJECT_SOURCE_DIR}/kaldi_native_io-1.22.1.tar.gz + ${PROJECT_BINARY_DIR}/kaldi_native_io-1.22.1.tar.gz + /tmp/kaldi_native_io-1.22.1.tar.gz + /star-fj/fangjun/download/github/kaldi_native_io-1.22.1.tar.gz + ) + + foreach(f IN LISTS possible_file_locations) + if(EXISTS ${f}) + set(kaldi_native_io_URL "${f}") + file(TO_CMAKE_PATH "${kaldi_native_io_URL}" kaldi_native_io_URL) + set(kaldi_native_io_URL2) + break() + endif() + endforeach() set(KALDI_NATIVE_IO_BUILD_TESTS OFF CACHE BOOL "" FORCE) set(KALDI_NATIVE_IO_BUILD_PYTHON OFF CACHE BOOL "" FORCE) FetchContent_Declare(kaldi_native_io - URL ${kaldi_native_io_URL} + URL + ${kaldi_native_io_URL} + ${kaldi_native_io_URL2} URL_HASH ${kaldi_native_io_HASH} ) FetchContent_GetProperties(kaldi_native_io) if(NOT kaldi_native_io_POPULATED) - message(STATUS "Downloading kaldi_native_io${kaldi_native_io_URL}") + message(STATUS "Downloading kaldi_native_io from ${kaldi_native_io_URL}") FetchContent_Populate(kaldi_native_io) endif() message(STATUS "kaldi_native_io is downloaded to ${kaldi_native_io_SOURCE_DIR}") @@ -34,6 +48,10 @@ function(download_kaldi_native_io) PUBLIC ${kaldi_native_io_SOURCE_DIR}/ ) + + set_target_properties(kaldi_native_io_core PROPERTIES OUTPUT_NAME "sherpa_kaldi_native_io_core") + + install(TARGETS kaldi_native_io_core DESTINATION lib) endfunction() download_kaldi_native_io() diff --git 
a/cmake/portaudio.cmake b/cmake/portaudio.cmake new file mode 100644 index 000000000..f14534bc5 --- /dev/null +++ b/cmake/portaudio.cmake @@ -0,0 +1,66 @@ +function(download_portaudio) + include(FetchContent) + + set(portaudio_URL "http://files.portaudio.com/archives/pa_stable_v190700_20210406.tgz") + set(portaudio_URL2 "https://huggingface.co/csukuangfj/sherpa-cmake-deps/resolve/main/pa_stable_v190700_20210406.tgz") + set(portaudio_HASH "SHA256=47efbf42c77c19a05d22e627d42873e991ec0c1357219c0d74ce6a2948cb2def") + + # If you don't have access to the Internet, please download it to your + # local drive and modify the following line according to your needs. + set(possible_file_locations + $ENV{HOME}/Downloads/pa_stable_v190700_20210406.tgz + $ENV{HOME}/asr/pa_stable_v190700_20210406.tgz + ${PROJECT_SOURCE_DIR}/pa_stable_v190700_20210406.tgz + ${PROJECT_BINARY_DIR}/pa_stable_v190700_20210406.tgz + /tmp/pa_stable_v190700_20210406.tgz + /star-fj/fangjun/download/github/pa_stable_v190700_20210406.tgz + ) + + foreach(f IN LISTS possible_file_locations) + if(EXISTS ${f}) + set(portaudio_URL "${f}") + file(TO_CMAKE_PATH "${portaudio_URL}" portaudio_URL) + set(portaudio_URL2) + break() + endif() + endforeach() + + if(BUILD_SHARED_LIBS) + set(PA_BUILD_SHARED ON CACHE BOOL "" FORCE) + set(PA_BUILD_STATIC OFF CACHE BOOL "" FORCE) + else() + set(PA_BUILD_SHARED OFF CACHE BOOL "" FORCE) + set(PA_BUILD_STATIC ON CACHE BOOL "" FORCE) + endif() + + FetchContent_Declare(portaudio + URL + ${portaudio_URL} + ${portaudio_URL2} + URL_HASH ${portaudio_HASH} + ) + + FetchContent_GetProperties(portaudio) + if(NOT portaudio_POPULATED) + message(STATUS "Downloading portaudio from ${portaudio_URL}") + FetchContent_Populate(portaudio) + endif() + message(STATUS "portaudio is downloaded to ${portaudio_SOURCE_DIR}") + message(STATUS "portaudio's binary dir is ${portaudio_BINARY_DIR}") + + add_subdirectory(${portaudio_SOURCE_DIR} ${portaudio_BINARY_DIR} EXCLUDE_FROM_ALL) + + if(BUILD_SHARED_LIBS) + set_target_properties(portaudio PROPERTIES OUTPUT_NAME "sherpa_portaudio") + install(TARGETS portaudio DESTINATION lib) + else() + set_target_properties(portaudio_static PROPERTIES OUTPUT_NAME "sherpa_portaudio_static") + install(TARGETS portaudio_static DESTINATION lib) + endif() +endfunction() + +download_portaudio() + +# Note +# See http://portaudio.com/docs/v19-doxydocs/tutorial_start.html +# for how to use portaudio diff --git a/cmake/pybind11.cmake b/cmake/pybind11.cmake index 76cf090e4..eeeb8fe5a 100644 --- a/cmake/pybind11.cmake +++ b/cmake/pybind11.cmake @@ -1,21 +1,39 @@ function(download_pybind11) - if(CMAKE_VERSION VERSION_LESS 3.11) - list(APPEND CMAKE_MODULE_PATH ${CMAKE_SOURCE_DIR}/cmake/Modules) - endif() - include(FetchContent) - set(pybind11_URL "https://github.com/pybind/pybind11/archive/v2.9.2.tar.gz") - set(pybind11_HASH "SHA256=6bd528c4dbe2276635dc787b6b1f2e5316cf6b49ee3e150264e455a0d68d19c1") + set(pybind11_URL "https://github.com/pybind/pybind11/archive/5bc0943ed96836f46489f53961f6c438d2935357.zip") + set(pybind11_URL2 "https://huggingface.co/csukuangfj/k2-cmake-deps/resolve/main/pybind11-5bc0943ed96836f46489f53961f6c438d2935357.zip") + set(pybind11_HASH "SHA256=ff65a1a8c9e6ceec11e7ed9d296f2e22a63e9ff0c4264b3af29c72b4f18f25a0") + + # If you don't have access to the Internet, + # please pre-download pybind11 + set(possible_file_locations + $ENV{HOME}/Downloads/pybind11-5bc0943ed96836f46489f53961f6c438d2935357.zip + ${PROJECT_SOURCE_DIR}/pybind11-5bc0943ed96836f46489f53961f6c438d2935357.zip + 
${PROJECT_BINARY_DIR}/pybind11-5bc0943ed96836f46489f53961f6c438d2935357.zip + /tmp/pybind11-5bc0943ed96836f46489f53961f6c438d2935357.zip + /star-fj/fangjun/download/github/pybind11-5bc0943ed96836f46489f53961f6c438d2935357.zip + ) + + foreach(f IN LISTS possible_file_locations) + if(EXISTS ${f}) + set(pybind11_URL "${f}") + file(TO_CMAKE_PATH "${pybind11_URL}" pybind11_URL) + set(pybind11_URL2) + break() + endif() + endforeach() FetchContent_Declare(pybind11 - URL ${pybind11_URL} + URL + ${pybind11_URL} + ${pybind11_URL2} URL_HASH ${pybind11_HASH} ) FetchContent_GetProperties(pybind11) if(NOT pybind11_POPULATED) - message(STATUS "Downloading pybind11") + message(STATUS "Downloading pybind11 from ${pybind11_URL}") FetchContent_Populate(pybind11) endif() message(STATUS "pybind11 is downloaded to ${pybind11_SOURCE_DIR}") diff --git a/cmake/websocketpp.cmake b/cmake/websocketpp.cmake new file mode 100644 index 000000000..33bbbc348 --- /dev/null +++ b/cmake/websocketpp.cmake @@ -0,0 +1,45 @@ +function(download_websocketpp) + include(FetchContent) + + # The latest commit on the develop branch os as 2022-10-22 + set(websocketpp_URL "https://github.com/zaphoyd/websocketpp/archive/b9aeec6eaf3d5610503439b4fae3581d9aff08e8.zip") + set(websocketpp_URL2 "https://huggingface.co/csukuangfj/sherpa-cmake-deps/resolve/main/websocketpp-b9aeec6eaf3d5610503439b4fae3581d9aff08e8.zip") + set(websocketpp_HASH "SHA256=1385135ede8191a7fbef9ec8099e3c5a673d48df0c143958216cd1690567f583") + + # If you don't have access to the Internet, + # please pre-download websocketpp + set(possible_file_locations + $ENV{HOME}/Downloads/websocketpp-b9aeec6eaf3d5610503439b4fae3581d9aff08e8.zip + ${PROJECT_SOURCE_DIR}/websocketpp-b9aeec6eaf3d5610503439b4fae3581d9aff08e8.zip + ${PROJECT_BINARY_DIR}/websocketpp-b9aeec6eaf3d5610503439b4fae3581d9aff08e8.zip + /tmp/websocketpp-b9aeec6eaf3d5610503439b4fae3581d9aff08e8.zip + /star-fj/fangjun/download/github/websocketpp-b9aeec6eaf3d5610503439b4fae3581d9aff08e8.zip + ) + + foreach(f IN LISTS possible_file_locations) + if(EXISTS ${f}) + set(websocketpp_URL "${f}") + file(TO_CMAKE_PATH "${websocketpp_URL}" websocketpp_URL) + set(websocketpp_URL2) + break() + endif() + endforeach() + + FetchContent_Declare(websocketpp + URL + ${websocketpp_URL} + ${websocketpp_URL2} + URL_HASH ${websocketpp_HASH} + ) + + FetchContent_GetProperties(websocketpp) + if(NOT websocketpp_POPULATED) + message(STATUS "Downloading websocketpp from ${websocketpp_URL}") + FetchContent_Populate(websocketpp) + endif() + message(STATUS "websocketpp is downloaded to ${websocketpp_SOURCE_DIR}") + # add_subdirectory(${websocketpp_SOURCE_DIR} ${websocketpp_BINARY_DIR} EXCLUDE_FROM_ALL) + include_directories(${websocketpp_SOURCE_DIR}) +endfunction() + +download_websocketpp() diff --git a/docs/requirements.txt b/docs/requirements.txt index 2ec5d24f2..504c3f938 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -1,6 +1,7 @@ recommonmark==0.7.1 -sphinx==4.3.2 +sphinx==5.0.0 sphinx-autodoc-typehints==1.12.0 sphinx_rtd_theme==1.0.0 sphinxcontrib-bibtex==2.4.1 sphinxcontrib-youtube==1.1.0 +sphinx-tabs diff --git a/docs/source/_static/.gitignore b/docs/source/_static/.gitignore new file mode 100644 index 000000000..9c42fcdd4 --- /dev/null +++ b/docs/source/_static/.gitignore @@ -0,0 +1 @@ +!*.wav diff --git a/docs/source/_static/audio-tagging/zipformer-small/1.wav b/docs/source/_static/audio-tagging/zipformer-small/1.wav new file mode 100644 index 000000000..9ce6c583c Binary files /dev/null and 
b/docs/source/_static/audio-tagging/zipformer-small/1.wav differ diff --git a/docs/source/_static/audio-tagging/zipformer-small/10.wav b/docs/source/_static/audio-tagging/zipformer-small/10.wav new file mode 100644 index 000000000..eaee99c2d Binary files /dev/null and b/docs/source/_static/audio-tagging/zipformer-small/10.wav differ diff --git a/docs/source/_static/audio-tagging/zipformer-small/11.wav b/docs/source/_static/audio-tagging/zipformer-small/11.wav new file mode 100644 index 000000000..deba35a31 Binary files /dev/null and b/docs/source/_static/audio-tagging/zipformer-small/11.wav differ diff --git a/docs/source/_static/audio-tagging/zipformer-small/12.wav b/docs/source/_static/audio-tagging/zipformer-small/12.wav new file mode 100644 index 000000000..cc0ab100a Binary files /dev/null and b/docs/source/_static/audio-tagging/zipformer-small/12.wav differ diff --git a/docs/source/_static/audio-tagging/zipformer-small/13.wav b/docs/source/_static/audio-tagging/zipformer-small/13.wav new file mode 100644 index 000000000..a6466de72 Binary files /dev/null and b/docs/source/_static/audio-tagging/zipformer-small/13.wav differ diff --git a/docs/source/_static/audio-tagging/zipformer-small/2.wav b/docs/source/_static/audio-tagging/zipformer-small/2.wav new file mode 100644 index 000000000..2eef217cb Binary files /dev/null and b/docs/source/_static/audio-tagging/zipformer-small/2.wav differ diff --git a/docs/source/_static/audio-tagging/zipformer-small/3.wav b/docs/source/_static/audio-tagging/zipformer-small/3.wav new file mode 100644 index 000000000..ed79deabd Binary files /dev/null and b/docs/source/_static/audio-tagging/zipformer-small/3.wav differ diff --git a/docs/source/_static/audio-tagging/zipformer-small/4.wav b/docs/source/_static/audio-tagging/zipformer-small/4.wav new file mode 100644 index 000000000..a44dae3b3 Binary files /dev/null and b/docs/source/_static/audio-tagging/zipformer-small/4.wav differ diff --git a/docs/source/_static/audio-tagging/zipformer-small/5.wav b/docs/source/_static/audio-tagging/zipformer-small/5.wav new file mode 100644 index 000000000..f846a8ac0 Binary files /dev/null and b/docs/source/_static/audio-tagging/zipformer-small/5.wav differ diff --git a/docs/source/_static/audio-tagging/zipformer-small/6.wav b/docs/source/_static/audio-tagging/zipformer-small/6.wav new file mode 100644 index 000000000..587746bbc Binary files /dev/null and b/docs/source/_static/audio-tagging/zipformer-small/6.wav differ diff --git a/docs/source/_static/audio-tagging/zipformer-small/7.wav b/docs/source/_static/audio-tagging/zipformer-small/7.wav new file mode 100644 index 000000000..60105566d Binary files /dev/null and b/docs/source/_static/audio-tagging/zipformer-small/7.wav differ diff --git a/docs/source/_static/audio-tagging/zipformer-small/8.wav b/docs/source/_static/audio-tagging/zipformer-small/8.wav new file mode 100644 index 000000000..71f448b15 Binary files /dev/null and b/docs/source/_static/audio-tagging/zipformer-small/8.wav differ diff --git a/docs/source/_static/audio-tagging/zipformer-small/9.wav b/docs/source/_static/audio-tagging/zipformer-small/9.wav new file mode 100644 index 000000000..9ddec091b Binary files /dev/null and b/docs/source/_static/audio-tagging/zipformer-small/9.wav differ diff --git a/docs/source/_static/css/custom.css b/docs/source/_static/css/custom.css new file mode 100644 index 000000000..2c6806323 --- /dev/null +++ b/docs/source/_static/css/custom.css @@ -0,0 +1,12 @@ +.toggle .header { + display: block; + clear: both; +} + +.toggle 
.header:after { + content: " ▶"; +} + +.toggle .header.open:after { + content: " ▼"; +} diff --git a/docs/source/_static/kokoro-en-v0_19/10-bm_lewis.wav b/docs/source/_static/kokoro-en-v0_19/10-bm_lewis.wav new file mode 100644 index 000000000..d3b6761cb Binary files /dev/null and b/docs/source/_static/kokoro-en-v0_19/10-bm_lewis.wav differ diff --git a/docs/source/_static/kokoro-en-v0_19/2-af_nicole.wav b/docs/source/_static/kokoro-en-v0_19/2-af_nicole.wav new file mode 100644 index 000000000..749adc1d5 Binary files /dev/null and b/docs/source/_static/kokoro-en-v0_19/2-af_nicole.wav differ diff --git a/docs/source/_static/kokoro-en-v0_19/sid/0-af.wav b/docs/source/_static/kokoro-en-v0_19/sid/0-af.wav new file mode 100644 index 000000000..d013847f6 Binary files /dev/null and b/docs/source/_static/kokoro-en-v0_19/sid/0-af.wav differ diff --git a/docs/source/_static/kokoro-en-v0_19/sid/1-af_bella.wav b/docs/source/_static/kokoro-en-v0_19/sid/1-af_bella.wav new file mode 100644 index 000000000..31de45503 Binary files /dev/null and b/docs/source/_static/kokoro-en-v0_19/sid/1-af_bella.wav differ diff --git a/docs/source/_static/kokoro-en-v0_19/sid/10-bm_lewis.wav b/docs/source/_static/kokoro-en-v0_19/sid/10-bm_lewis.wav new file mode 100644 index 000000000..7e2ca34fa Binary files /dev/null and b/docs/source/_static/kokoro-en-v0_19/sid/10-bm_lewis.wav differ diff --git a/docs/source/_static/kokoro-en-v0_19/sid/2-af_nicole.wav b/docs/source/_static/kokoro-en-v0_19/sid/2-af_nicole.wav new file mode 100644 index 000000000..871420565 Binary files /dev/null and b/docs/source/_static/kokoro-en-v0_19/sid/2-af_nicole.wav differ diff --git a/docs/source/_static/kokoro-en-v0_19/sid/3-af_sarah.wav b/docs/source/_static/kokoro-en-v0_19/sid/3-af_sarah.wav new file mode 100644 index 000000000..0af228b58 Binary files /dev/null and b/docs/source/_static/kokoro-en-v0_19/sid/3-af_sarah.wav differ diff --git a/docs/source/_static/kokoro-en-v0_19/sid/4-af_sky.wav b/docs/source/_static/kokoro-en-v0_19/sid/4-af_sky.wav new file mode 100644 index 000000000..bcbe45da4 Binary files /dev/null and b/docs/source/_static/kokoro-en-v0_19/sid/4-af_sky.wav differ diff --git a/docs/source/_static/kokoro-en-v0_19/sid/5-am_adam.wav b/docs/source/_static/kokoro-en-v0_19/sid/5-am_adam.wav new file mode 100644 index 000000000..9b582996d Binary files /dev/null and b/docs/source/_static/kokoro-en-v0_19/sid/5-am_adam.wav differ diff --git a/docs/source/_static/kokoro-en-v0_19/sid/6-am_michael.wav b/docs/source/_static/kokoro-en-v0_19/sid/6-am_michael.wav new file mode 100644 index 000000000..3c79b767e Binary files /dev/null and b/docs/source/_static/kokoro-en-v0_19/sid/6-am_michael.wav differ diff --git a/docs/source/_static/kokoro-en-v0_19/sid/7-bf_emma.wav b/docs/source/_static/kokoro-en-v0_19/sid/7-bf_emma.wav new file mode 100644 index 000000000..4945b70a2 Binary files /dev/null and b/docs/source/_static/kokoro-en-v0_19/sid/7-bf_emma.wav differ diff --git a/docs/source/_static/kokoro-en-v0_19/sid/8-bf_isabella.wav b/docs/source/_static/kokoro-en-v0_19/sid/8-bf_isabella.wav new file mode 100644 index 000000000..331025d8a Binary files /dev/null and b/docs/source/_static/kokoro-en-v0_19/sid/8-bf_isabella.wav differ diff --git a/docs/source/_static/kokoro-en-v0_19/sid/9-bm_george.wav b/docs/source/_static/kokoro-en-v0_19/sid/9-bm_george.wav new file mode 100644 index 000000000..52f264671 Binary files /dev/null and b/docs/source/_static/kokoro-en-v0_19/sid/9-bm_george.wav differ diff --git 
a/docs/source/_static/kokoro-multi-lang-v1_0/.gitignore b/docs/source/_static/kokoro-multi-lang-v1_0/.gitignore new file mode 100644 index 000000000..d8dd7532a --- /dev/null +++ b/docs/source/_static/kokoro-multi-lang-v1_0/.gitignore @@ -0,0 +1 @@ +*.wav diff --git a/docs/source/_static/matcha-icefall-baker-zh/matcha-baker-0.wav b/docs/source/_static/matcha-icefall-baker-zh/matcha-baker-0.wav new file mode 100644 index 000000000..2c8be048b Binary files /dev/null and b/docs/source/_static/matcha-icefall-baker-zh/matcha-baker-0.wav differ diff --git a/docs/source/_static/matcha-icefall-baker-zh/matcha-baker-1.wav b/docs/source/_static/matcha-icefall-baker-zh/matcha-baker-1.wav new file mode 100644 index 000000000..e27d15383 Binary files /dev/null and b/docs/source/_static/matcha-icefall-baker-zh/matcha-baker-1.wav differ diff --git a/docs/source/_static/matcha-icefall-baker-zh/matcha-baker-2.wav b/docs/source/_static/matcha-icefall-baker-zh/matcha-baker-2.wav new file mode 100644 index 000000000..50c9a9bec Binary files /dev/null and b/docs/source/_static/matcha-icefall-baker-zh/matcha-baker-2.wav differ diff --git a/docs/source/_static/matcha-icefall-en_US-ljspeech/matcha-ljspeech-0.wav b/docs/source/_static/matcha-icefall-en_US-ljspeech/matcha-ljspeech-0.wav new file mode 100644 index 000000000..d6726715b Binary files /dev/null and b/docs/source/_static/matcha-icefall-en_US-ljspeech/matcha-ljspeech-0.wav differ diff --git a/docs/source/_static/matcha-icefall-en_US-ljspeech/matcha-ljspeech-1.wav b/docs/source/_static/matcha-icefall-en_US-ljspeech/matcha-ljspeech-1.wav new file mode 100644 index 000000000..76c3f7b2b Binary files /dev/null and b/docs/source/_static/matcha-icefall-en_US-ljspeech/matcha-ljspeech-1.wav differ diff --git a/docs/source/_static/mms/mms-eng.wav b/docs/source/_static/mms/mms-eng.wav new file mode 100644 index 000000000..cef186916 Binary files /dev/null and b/docs/source/_static/mms/mms-eng.wav differ diff --git a/docs/source/_static/onnx/riscv64/a-test.wav b/docs/source/_static/onnx/riscv64/a-test.wav new file mode 100644 index 000000000..7b590c823 Binary files /dev/null and b/docs/source/_static/onnx/riscv64/a-test.wav differ diff --git a/docs/source/_static/piper/test.wav b/docs/source/_static/piper/test.wav new file mode 100644 index 000000000..8c2a118ed Binary files /dev/null and b/docs/source/_static/piper/test.wav differ diff --git a/docs/source/_static/sense-voice/.gitignore b/docs/source/_static/sense-voice/.gitignore new file mode 100644 index 000000000..d8dd7532a --- /dev/null +++ b/docs/source/_static/sense-voice/.gitignore @@ -0,0 +1 @@ +*.wav diff --git a/docs/source/_static/sense-voice/python-websocket/test.wav b/docs/source/_static/sense-voice/python-websocket/test.wav new file mode 100644 index 000000000..01c3cb3c1 Binary files /dev/null and b/docs/source/_static/sense-voice/python-websocket/test.wav differ diff --git a/docs/source/_static/sherpa-onnx-vits-zh-ll/0-value-2x.wav b/docs/source/_static/sherpa-onnx-vits-zh-ll/0-value-2x.wav new file mode 100644 index 000000000..0e17d2946 Binary files /dev/null and b/docs/source/_static/sherpa-onnx-vits-zh-ll/0-value-2x.wav differ diff --git a/docs/source/_static/sherpa-onnx-vits-zh-ll/1-numbers.wav b/docs/source/_static/sherpa-onnx-vits-zh-ll/1-numbers.wav new file mode 100644 index 000000000..fe8807949 Binary files /dev/null and b/docs/source/_static/sherpa-onnx-vits-zh-ll/1-numbers.wav differ diff --git a/docs/source/_static/sherpa-onnx-vits-zh-ll/2-numbers.wav 
b/docs/source/_static/sherpa-onnx-vits-zh-ll/2-numbers.wav new file mode 100644 index 000000000..a21e85494 Binary files /dev/null and b/docs/source/_static/sherpa-onnx-vits-zh-ll/2-numbers.wav differ diff --git a/docs/source/_static/sherpa-onnx-vits-zh-ll/3-wo-mi.wav b/docs/source/_static/sherpa-onnx-vits-zh-ll/3-wo-mi.wav new file mode 100644 index 000000000..cb8147897 Binary files /dev/null and b/docs/source/_static/sherpa-onnx-vits-zh-ll/3-wo-mi.wav differ diff --git a/docs/source/_static/sherpa-onnx-vits-zh-ll/4-heteronym.wav b/docs/source/_static/sherpa-onnx-vits-zh-ll/4-heteronym.wav new file mode 100644 index 000000000..542d33eaa Binary files /dev/null and b/docs/source/_static/sherpa-onnx-vits-zh-ll/4-heteronym.wav differ diff --git a/docs/source/_static/speech-enhancement/gtcrn-simple/enhanced-16k.wav b/docs/source/_static/speech-enhancement/gtcrn-simple/enhanced-16k.wav new file mode 100644 index 000000000..c49ac49b2 Binary files /dev/null and b/docs/source/_static/speech-enhancement/gtcrn-simple/enhanced-16k.wav differ diff --git a/docs/source/_static/speech-enhancement/gtcrn-simple/speech_with_noise.wav b/docs/source/_static/speech-enhancement/gtcrn-simple/speech_with_noise.wav new file mode 100644 index 000000000..425d7c043 Binary files /dev/null and b/docs/source/_static/speech-enhancement/gtcrn-simple/speech_with_noise.wav differ diff --git a/docs/source/_static/vits-ljs/armstrong.wav b/docs/source/_static/vits-ljs/armstrong.wav new file mode 100644 index 000000000..3b790b1eb Binary files /dev/null and b/docs/source/_static/vits-ljs/armstrong.wav differ diff --git a/docs/source/_static/vits-ljs/liliana.wav b/docs/source/_static/vits-ljs/liliana.wav new file mode 100644 index 000000000..cc9920804 Binary files /dev/null and b/docs/source/_static/vits-ljs/liliana.wav differ diff --git a/docs/source/_static/vits-melo-tts/zh-en-0.wav b/docs/source/_static/vits-melo-tts/zh-en-0.wav new file mode 100644 index 000000000..ade6bdd40 Binary files /dev/null and b/docs/source/_static/vits-melo-tts/zh-en-0.wav differ diff --git a/docs/source/_static/vits-melo-tts/zh-en-1.wav b/docs/source/_static/vits-melo-tts/zh-en-1.wav new file mode 100644 index 000000000..b56a5967c Binary files /dev/null and b/docs/source/_static/vits-melo-tts/zh-en-1.wav differ diff --git a/docs/source/_static/vits-melo-tts/zh-en-2.wav b/docs/source/_static/vits-melo-tts/zh-en-2.wav new file mode 100644 index 000000000..016662b7a Binary files /dev/null and b/docs/source/_static/vits-melo-tts/zh-en-2.wav differ diff --git a/docs/source/_static/vits-melo-tts/zh-en-3.wav b/docs/source/_static/vits-melo-tts/zh-en-3.wav new file mode 100644 index 000000000..e85288bce Binary files /dev/null and b/docs/source/_static/vits-melo-tts/zh-en-3.wav differ diff --git a/docs/source/_static/vits-piper-glados/glados-bug.wav b/docs/source/_static/vits-piper-glados/glados-bug.wav new file mode 100644 index 000000000..6c7453b32 Binary files /dev/null and b/docs/source/_static/vits-piper-glados/glados-bug.wav differ diff --git a/docs/source/_static/vits-piper-glados/glados-code.wav b/docs/source/_static/vits-piper-glados/glados-code.wav new file mode 100644 index 000000000..c667542b9 Binary files /dev/null and b/docs/source/_static/vits-piper-glados/glados-code.wav differ diff --git a/docs/source/_static/vits-piper-glados/glados-liliana.wav b/docs/source/_static/vits-piper-glados/glados-liliana.wav new file mode 100644 index 000000000..d3308c8af Binary files /dev/null and b/docs/source/_static/vits-piper-glados/glados-liliana.wav differ 
diff --git a/docs/source/_static/vits-piper-glados/glados-men.wav b/docs/source/_static/vits-piper-glados/glados-men.wav new file mode 100644 index 000000000..11c89b203 Binary files /dev/null and b/docs/source/_static/vits-piper-glados/glados-men.wav differ diff --git a/docs/source/_static/vits-piper-glados/glados-ship.wav b/docs/source/_static/vits-piper-glados/glados-ship.wav new file mode 100644 index 000000000..d70c94075 Binary files /dev/null and b/docs/source/_static/vits-piper-glados/glados-ship.wav differ diff --git a/docs/source/_static/vits-piper-libritts/libritts-armstrong-200.wav b/docs/source/_static/vits-piper-libritts/libritts-armstrong-200.wav new file mode 100644 index 000000000..c3644bb68 Binary files /dev/null and b/docs/source/_static/vits-piper-libritts/libritts-armstrong-200.wav differ diff --git a/docs/source/_static/vits-piper-libritts/libritts-armstrong-500.wav b/docs/source/_static/vits-piper-libritts/libritts-armstrong-500.wav new file mode 100644 index 000000000..dd120ac32 Binary files /dev/null and b/docs/source/_static/vits-piper-libritts/libritts-armstrong-500.wav differ diff --git a/docs/source/_static/vits-piper-libritts/libritts-liliana-109.wav b/docs/source/_static/vits-piper-libritts/libritts-liliana-109.wav new file mode 100644 index 000000000..8accdba57 Binary files /dev/null and b/docs/source/_static/vits-piper-libritts/libritts-liliana-109.wav differ diff --git a/docs/source/_static/vits-piper-libritts/libritts-liliana-900.wav b/docs/source/_static/vits-piper-libritts/libritts-liliana-900.wav new file mode 100644 index 000000000..1e9a9c5d2 Binary files /dev/null and b/docs/source/_static/vits-piper-libritts/libritts-liliana-900.wav differ diff --git a/docs/source/_static/vits-piper/armstrong-piper-en_US-lessac-medium.wav b/docs/source/_static/vits-piper/armstrong-piper-en_US-lessac-medium.wav new file mode 100644 index 000000000..9b953a082 Binary files /dev/null and b/docs/source/_static/vits-piper/armstrong-piper-en_US-lessac-medium.wav differ diff --git a/docs/source/_static/vits-piper/liliana-piper-en_US-lessac-medium.wav b/docs/source/_static/vits-piper/liliana-piper-en_US-lessac-medium.wav new file mode 100644 index 000000000..f1b84ac29 Binary files /dev/null and b/docs/source/_static/vits-piper/liliana-piper-en_US-lessac-medium.wav differ diff --git a/docs/source/_static/vits-vctk/einstein-30.wav b/docs/source/_static/vits-vctk/einstein-30.wav new file mode 100644 index 000000000..3f4ab09a8 Binary files /dev/null and b/docs/source/_static/vits-vctk/einstein-30.wav differ diff --git a/docs/source/_static/vits-vctk/franklin-66.wav b/docs/source/_static/vits-vctk/franklin-66.wav new file mode 100644 index 000000000..ae827e5c9 Binary files /dev/null and b/docs/source/_static/vits-vctk/franklin-66.wav differ diff --git a/docs/source/_static/vits-vctk/kennedy-0.wav b/docs/source/_static/vits-vctk/kennedy-0.wav new file mode 100644 index 000000000..8b248cc6b Binary files /dev/null and b/docs/source/_static/vits-vctk/kennedy-0.wav differ diff --git a/docs/source/_static/vits-vctk/kennedy-10.wav b/docs/source/_static/vits-vctk/kennedy-10.wav new file mode 100644 index 000000000..761eddb3c Binary files /dev/null and b/docs/source/_static/vits-vctk/kennedy-10.wav differ diff --git a/docs/source/_static/vits-vctk/kennedy-108.wav b/docs/source/_static/vits-vctk/kennedy-108.wav new file mode 100644 index 000000000..c563e8890 Binary files /dev/null and b/docs/source/_static/vits-vctk/kennedy-108.wav differ diff --git 
a/docs/source/_static/vits-vctk/martin-99.wav b/docs/source/_static/vits-vctk/martin-99.wav new file mode 100644 index 000000000..9441385bc Binary files /dev/null and b/docs/source/_static/vits-vctk/martin-99.wav differ diff --git a/docs/source/_static/vits-zh-aishell3/demokelite-41.wav b/docs/source/_static/vits-zh-aishell3/demokelite-41.wav new file mode 100644 index 000000000..31889432f Binary files /dev/null and b/docs/source/_static/vits-zh-aishell3/demokelite-41.wav differ diff --git a/docs/source/_static/vits-zh-aishell3/liliana-10.wav b/docs/source/_static/vits-zh-aishell3/liliana-10.wav new file mode 100644 index 000000000..9ab24281a Binary files /dev/null and b/docs/source/_static/vits-zh-aishell3/liliana-10.wav differ diff --git a/docs/source/_static/vits-zh-aishell3/liliana-33.wav b/docs/source/_static/vits-zh-aishell3/liliana-33.wav new file mode 100644 index 000000000..4a40607b1 Binary files /dev/null and b/docs/source/_static/vits-zh-aishell3/liliana-33.wav differ diff --git a/docs/source/_static/vits-zh-aishell3/liliana-99.wav b/docs/source/_static/vits-zh-aishell3/liliana-99.wav new file mode 100644 index 000000000..6ec1732c3 Binary files /dev/null and b/docs/source/_static/vits-zh-aishell3/liliana-99.wav differ diff --git a/docs/source/_static/vits-zh-aishell3/liubei-21.wav b/docs/source/_static/vits-zh-aishell3/liubei-21.wav new file mode 100644 index 000000000..d4dffd226 Binary files /dev/null and b/docs/source/_static/vits-zh-aishell3/liubei-21.wav differ diff --git a/docs/source/_static/vits-zh-aishell3/rule-103.wav b/docs/source/_static/vits-zh-aishell3/rule-103.wav new file mode 100644 index 000000000..8eee53171 Binary files /dev/null and b/docs/source/_static/vits-zh-aishell3/rule-103.wav differ diff --git a/docs/source/_static/vits-zh-aishell3/rule-66.wav b/docs/source/_static/vits-zh-aishell3/rule-66.wav new file mode 100644 index 000000000..bedaf6874 Binary files /dev/null and b/docs/source/_static/vits-zh-aishell3/rule-66.wav differ diff --git a/docs/source/_static/vits-zh-aishell3/zhugeliang-45.wav b/docs/source/_static/vits-zh-aishell3/zhugeliang-45.wav new file mode 100644 index 000000000..0f43dc478 Binary files /dev/null and b/docs/source/_static/vits-zh-aishell3/zhugeliang-45.wav differ diff --git a/docs/source/_static/vits-zh-hf-eula/news-666.wav b/docs/source/_static/vits-zh-hf-eula/news-666.wav new file mode 100644 index 000000000..5bcc9fbf9 Binary files /dev/null and b/docs/source/_static/vits-zh-hf-eula/news-666.wav differ diff --git a/docs/source/_static/vits-zh-hf-eula/news-99.wav b/docs/source/_static/vits-zh-hf-eula/news-99.wav new file mode 100644 index 000000000..5ae5d4c03 Binary files /dev/null and b/docs/source/_static/vits-zh-hf-eula/news-99.wav differ diff --git a/docs/source/_static/vits-zh-hf-fanchen-C/heteronym-102.wav b/docs/source/_static/vits-zh-hf-fanchen-C/heteronym-102.wav new file mode 100644 index 000000000..3ce71dd7c Binary files /dev/null and b/docs/source/_static/vits-zh-hf-fanchen-C/heteronym-102.wav differ diff --git a/docs/source/_static/vits-zh-hf-fanchen-C/numbers-100.wav b/docs/source/_static/vits-zh-hf-fanchen-C/numbers-100.wav new file mode 100644 index 000000000..4bfe50d4f Binary files /dev/null and b/docs/source/_static/vits-zh-hf-fanchen-C/numbers-100.wav differ diff --git a/docs/source/_static/vits-zh-hf-fanchen-C/numbers.wav b/docs/source/_static/vits-zh-hf-fanchen-C/numbers.wav new file mode 100644 index 000000000..0f34a7fa3 Binary files /dev/null and b/docs/source/_static/vits-zh-hf-fanchen-C/numbers.wav differ 
diff --git a/docs/source/_static/vits-zh-hf-fanchen-C/value-2x.wav b/docs/source/_static/vits-zh-hf-fanchen-C/value-2x.wav new file mode 100644 index 000000000..649440caa Binary files /dev/null and b/docs/source/_static/vits-zh-hf-fanchen-C/value-2x.wav differ diff --git a/docs/source/_static/vits-zh-hf-fanchen-C/wo-mi-14.wav b/docs/source/_static/vits-zh-hf-fanchen-C/wo-mi-14.wav new file mode 100644 index 000000000..49f48f70a Binary files /dev/null and b/docs/source/_static/vits-zh-hf-fanchen-C/wo-mi-14.wav differ diff --git a/docs/source/_static/vits-zh-hf-fanchen-wnj/kuayue.wav b/docs/source/_static/vits-zh-hf-fanchen-wnj/kuayue.wav new file mode 100644 index 000000000..fbbc0144f Binary files /dev/null and b/docs/source/_static/vits-zh-hf-fanchen-wnj/kuayue.wav differ diff --git a/docs/source/_static/vits-zh-hf-fanchen-wnj/os.wav b/docs/source/_static/vits-zh-hf-fanchen-wnj/os.wav new file mode 100644 index 000000000..2eda3cf4f Binary files /dev/null and b/docs/source/_static/vits-zh-hf-fanchen-wnj/os.wav differ diff --git a/docs/source/_static/vits-zh-hf-theresa/mi14-88.wav b/docs/source/_static/vits-zh-hf-theresa/mi14-88.wav new file mode 100644 index 000000000..b2c45e273 Binary files /dev/null and b/docs/source/_static/vits-zh-hf-theresa/mi14-88.wav differ diff --git a/docs/source/_static/vits-zh-hf-theresa/reai-0.wav b/docs/source/_static/vits-zh-hf-theresa/reai-0.wav new file mode 100644 index 000000000..fd96b2151 Binary files /dev/null and b/docs/source/_static/vits-zh-hf-theresa/reai-0.wav differ diff --git a/triton/model_repo/conformer_transducer/1/.gitkeep b/docs/source/_templates/.gitkeep similarity index 100% rename from triton/model_repo/conformer_transducer/1/.gitkeep rename to docs/source/_templates/.gitkeep diff --git a/docs/source/_templates/page.html b/docs/source/_templates/page.html new file mode 100644 index 000000000..399237b5f --- /dev/null +++ b/docs/source/_templates/page.html @@ -0,0 +1,15 @@ + +{% extends "!page.html" %} + +{% block footer %} + +{% endblock %} diff --git a/docs/source/conf.py b/docs/source/conf.py index e5cc8caba..13d1a7e46 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -10,6 +10,7 @@ # add these directories to sys.path here. If the directory is relative to the # documentation root, use os.path.abspath to make it absolute, like shown here. # +import datetime import os import re import sys @@ -20,8 +21,9 @@ # -- Project information ----------------------------------------------------- +year = datetime.date.today().year project = "sherpa" -copyright = "2022, sherpa development team" +copyright = f"2022-{year}, sherpa development team" author = "sherpa development team" @@ -45,14 +47,15 @@ def get_version(): # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom # ones. extensions = [ + "_ext.rst_roles", "recommonmark", "sphinx.ext.autodoc", "sphinx.ext.githubpages", "sphinx.ext.napoleon", "sphinx_autodoc_typehints", "sphinx_rtd_theme", + "sphinx_tabs.tabs", "sphinxcontrib.youtube", - "_ext.rst_roles", ] # Add any paths that contain templates here, relative to this directory. @@ -110,15 +113,73 @@ def get_version(): .. _ConvEmformer: https://arxiv.org/pdf/2110.05241.pdf .. _Emformer: https://arxiv.org/pdf/2010.10759.pdf .. _LibriSpeech: https://www.openslr.org/12 +.. _CSJ: https://clrd.ninjal.ac.jp/csj/en/index.html .. _aishell: https://www.openslr.org/33 .. _sherpa: https://github.com/k2-fsa/sherpa .. _transducer: https://arxiv.org/pdf/1211.3711.pdf +.. _CTC: https://www.cs.toronto.edu/~graves/icml_2006.pdf .. 
_asyncio: https://docs.python.org/3/library/asyncio.html .. _k2: https://github.com/k2-fsa/k2 .. _icefall: https://github.com/k2-fsa/icefall .. _PyTorch: https://pytorch.org/ .. _Huggingface: https://huggingface.co .. _WenetSpeech: https://github.com/wenet-e2e/WenetSpeech +.. _WeNet: https://github.com/wenet-e2e/wenet .. _GigaSpeech: https://github.com/SpeechColab/GigaSpeech .. _Kaldi: https://github.com/kaldi-asr/kaldi +.. _kaldifeat: https://csukuangfj.github.io/kaldifeat/installation/index.html +.. _ncnn: https://github.com/tencent/ncnn +.. _sherpa-ncnn: https://github.com/k2-fsa/sherpa-ncnn +.. _onnx: https://github.com/onnx/onnx +.. _onnxruntime: https://github.com/microsoft/onnxruntime +.. _sherpa-onnx: https://github.com/k2-fsa/sherpa-onnx +.. _torchaudio: https://github.com/pytorch/audio +.. _Docker: https://www.docker.com +.. _Triton: https://github.com/triton-inference-server +.. _Triton-server: https://github.com/triton-inference-server/server +.. _Triton-client: https://github.com/triton-inference-server/client +.. _WebSocket: https://en.wikipedia.org/wiki/WebSocket +.. _websocketpp: https://github.com/zaphoyd/websocketpp +.. _asio: https://github.com/chriskohlhoff/asio +.. _boost: https://github.com/boostorg/boost +.. _NeMo: https://github.com/NVIDIA/NeMo +.. _CommonVoice: https://commonvoice.mozilla.org +.. _Zipformer: https://github.com/k2-fsa/icefall/tree/master/egs/librispeech/ASR/pruned_transducer_stateless7_streaming +.. _VisionFive2: https://www.starfivetech.com/en/site/boards +.. _k2-fsa/sherpa: http://github.com/k2-fsa/sherpa +.. _k2-fsa/sherpa-onnx: http://github.com/k2-fsa/sherpa-onnx +.. _k2-fsa/sherpa-ncnn: http://github.com/k2-fsa/sherpa-ncnn +.. _srs: https://github.com/ossrs/srs +.. _RTMP: https://en.wikipedia.org/wiki/Real-Time_Messaging_Protocol +.. _Whisper: https://github.com/openai/whisper/ +.. _Go: https://en.wikipedia.org/wiki/Go_(programming_language) +.. _sherpa-onnx-go: https://github.com/k2-fsa/sherpa-onnx-go +.. _yesno: https://www.openslr.org/1/ +.. _vits: https://github.com/jaywalnut310/vits +.. _ljspeech: https://keithito.com/LJ-Speech-Dataset/ +.. _LJ Speech: https://keithito.com/LJ-Speech-Dataset/ +.. _VCTK: https://datashare.ed.ac.uk/handle/10283/2950 +.. _piper: https://github.com/rhasspy/piper +.. _aishell3: https://www.openslr.org/93/ +.. _lessac_blizzard2013: https://www.cstr.ed.ac.uk/projects/blizzard/2013/lessac_blizzard2013/ +.. _OpenFst: https://www.openfst.org/ +.. _MMS: https://huggingface.co/spaces/mms-meta/MMS +.. _WebAssembly: https://en.wikipedia.org/wiki/WebAssembly +.. _emscripten: https://emscripten.org/index.html +.. _audioset: https://research.google.com/audioset/ +.. _silero-vad: https://github.com/snakers4/silero-vad +.. _Flutter: https://flutter.dev/ +.. _Dart: https://dart.dev/ +.. _Node: https://nodejs.org/en +.. _SenseVoice: https://github.com/FunAudioLLM/SenseVoice +.. _LibriTTS-R: https://www.openslr.org/141/ +.. _ReazonSpeech: https://github.com/reazon-research/ReazonSpeech +.. _Lazarus: https://www.lazarus-ide.org/ +.. _Moonshine: https://github.com/usefulsensors/moonshine +.. _moonshine: https://github.com/usefulsensors/moonshine +.. 
_FireRedAsr: https://github.com/FireRedTeam/FireRedASR """ + + +def setup(app): + app.add_css_file("custom.css") diff --git a/docs/source/cpp/installation/conda-linux.rst b/docs/source/cpp/installation/conda-linux.rst deleted file mode 100644 index 1f476bcac..000000000 --- a/docs/source/cpp/installation/conda-linux.rst +++ /dev/null @@ -1,111 +0,0 @@ -conda for Linux -=============== - -.. note:: - - We recommend creating a new virtual environment to install ``sherpa``. - - -CPU version ------------ - -The command to install a CPU version of ``sherpa`` for Linux using ``conda`` is: - -.. code-block:: bash - - conda install \ - -c k2-fsa \ - -c k2-fsa-sherpa \ - -c kaldifeat \ - -c kaldi_native_io \ - -c pytorch \ - cpuonly \ - k2 \ - sherpa \ - kaldifeat \ - kaldi_native_io \ - pytorch=1.12.0 \ - python=3.8 - -or the following command in one line: - -.. code-block:: bash - - conda install -c k2-fsa -c k2-fsa-sherpa -c kaldifeat -c kaldi_native_io -c pytorch cpuonly k2 sherpa kaldifeat kaldi_native_io pytorch=1.12.0 python=3.8 - -.. note:: - - You have to specify ``cpuonly`` to install a CPU version of ``sherpa``. - -.. caution:: - - It is of paramount importance that you specify the ``-c`` options while - installing ``sherpa``. Otherwise, you will be SAD. - - You can switch the orders of different options for ``-c``, but you cannot - omit them. - -We provide pre-built conda packages for ``Python >= 3.7`` and ``PyTorch >= 1.6.0``. -Please consider installing ``sherpa`` from source if you have other requirements. - -You can use: - -.. code-block:: bash - - conda search -c k2-fsa-sherpa sherpa - -to check all available ``sherpa`` packages for different combinations of -``Python`` and ``PyTorch``. A sample output of the above command is listed below: - -.. code-block:: bash - - Loading channels: done - # Name Version Build Channel - sherpa 0.6 cpu_py3.10_torch1.11.0 k2-fsa-sherpa - sherpa 0.6 cpu_py3.10_torch1.12.0 k2-fsa-sherpa - sherpa 0.6 cpu_py3.10_torch1.12.1 k2-fsa-sherpa - sherpa 0.6 cpu_py3.7_torch1.10.0 k2-fsa-sherpa - sherpa 0.6 cpu_py3.7_torch1.10.1 k2-fsa-sherpa - sherpa 0.6 cpu_py3.7_torch1.10.2 k2-fsa-sherpa - sherpa 0.6 cpu_py3.7_torch1.11.0 k2-fsa-sherpa - sherpa 0.6 cpu_py3.7_torch1.12.0 k2-fsa-sherpa - sherpa 0.6 cpu_py3.7_torch1.12.1 k2-fsa-sherpa - sherpa 0.6 cpu_py3.7_torch1.6.0 k2-fsa-sherpa - -To check whether you have installed ``sherpa`` successfully, you can run: - -.. code-block:: bash - - sherpa --help - -which should show the usage information of ``sherpa``. - -To display the information about the environment used to build ``sherpa``, you -can use: - -.. code-block:: bash - - sherpa-version - -Read :ref:`cpp_non_streaming_asr` to find more. - -CUDA version ------------- - -To be done. - -If you have any issues about installing ``sherpa``, please create an issue -at the following address: - - ``_ - -.. hint:: - - If you have a `WeChat `_ account, you can scan - the following QR code to join the WeChat group of next-gen Kaldi to get - help. - - .. image:: pic/wechat-group-for-next-gen-kaldi.jpg - :width: 200 - :align: center - :alt: WeChat group of next-gen Kaldi diff --git a/docs/source/cpp/installation/conda-windows.rst b/docs/source/cpp/installation/conda-windows.rst deleted file mode 100644 index 0fa78df78..000000000 --- a/docs/source/cpp/installation/conda-windows.rst +++ /dev/null @@ -1,152 +0,0 @@ -conda for Windows -================= - -.. note:: - - We recommend creating a new virtual environment to install ``sherpa``. - -.. 
hint:: - - At present, we only provide CPU version of pre-built conda packages for - Windows. If you want to use a CUDA version of ``sherpa``, please consider - installing ``sherpa`` from source. - -The command to install ``sherpa`` for Windows using ``conda`` is: - -.. code-block:: bash - - conda install \ - -c k2-fsa \ - -c k2-fsa-sherpa \ - -c kaldifeat \ - -c kaldi_native_io \ - -c pytorch \ - k2 \ - sherpa \ - kaldifeat \ - kaldi_native_io \ - pytorch=1.12.0 \ - python=3.8 - -or the following command in one line: - -.. code-block:: bash - - conda install -c k2-fsa -c k2-fsa-sherpa -c kaldifeat -c kaldi_native_io -c pytorch k2 sherpa kaldifeat kaldi_native_io pytorch=1.12.0 python=3.8 - -.. caution:: - - It is of paramount importance that you specify the ``-c`` options while - installing ``sherpa``. Otherwise, you will be SAD. - - You can switch the orders of different options for ``-c``, but you cannot - omit them. - -We provide pre-built conda packages for ``Python >= 3.7`` and ``PyTorch >= 1.6.0``. -Please consider installing ``sherpa`` from source if you have other requirements. - - -You can use: - -.. code-block:: bash - - conda search -c k2-fsa-sherpa sherpa - -to check all available ``sherpa`` packages for different combinations of -``Python`` and ``PyTorch``. A sample output of the above command is listed below: - -.. code-block:: bash - - Loading channels: done - # Name Version Build Channel - sherpa 0.6 cpu_py3.10_torch1.11.0 k2-fsa-sherpa - sherpa 0.6 cpu_py3.10_torch1.12.0 k2-fsa-sherpa - sherpa 0.6 cpu_py3.10_torch1.12.1 k2-fsa-sherpa - sherpa 0.6 cpu_py3.7_torch1.10.0 k2-fsa-sherpa - sherpa 0.6 cpu_py3.7_torch1.10.1 k2-fsa-sherpa - sherpa 0.6 cpu_py3.7_torch1.10.2 k2-fsa-sherpa - sherpa 0.6 cpu_py3.7_torch1.11.0 k2-fsa-sherpa - sherpa 0.6 cpu_py3.7_torch1.12.0 k2-fsa-sherpa - sherpa 0.6 cpu_py3.7_torch1.12.1 k2-fsa-sherpa - sherpa 0.6 cpu_py3.7_torch1.8.0 k2-fsa-sherpa - sherpa 0.6 cpu_py3.7_torch1.8.1 k2-fsa-sherpa - sherpa 0.6 cpu_py3.7_torch1.9.0 k2-fsa-sherpa - -Settings after installation ---------------------------- - -Suppose that you have created an environment named ``sherpa`` for installation -using the following commands: - -.. code-block:: bash - - conda create -n sherpa - conda activate sherpa - -To check whether you have installed ``sherpa`` successfully or not, please run -the following command: - -.. code-block:: bash - - (sherpa) fangjun@M-0LQSDCC2RV398 C:\Users\fangjun>sherpa - 'sherpa' is not recognized as an internal or external command, - operable program or batch file. - -It reports that Windows cannot find the executable ``sherpa.exe``. We have -to set the following environment variable: - -.. code-block:: bash - - (sherpa) fangjun@M-0LQSDCC2RV398 C:\Users\fangjun>set path=%conda_prefix%\lib\site-packages\sherpa\bin;%path% - -After setting the ``path`` environment variable, we can run ``sherpa`` again: - -.. code-block:: bash - - (sherpa) fangjun@M-0LQSDCC2RV398 C:\Users\fangjun>sherpa - - (sherpa) fangjun@M-0LQSDCC2RV398 C:\Users\fangjun>sherpa --help - - (sherpa) fangjun@M-0LQSDCC2RV398 C:\Users\fangjun> - -It does not complain about being not able to find ``sherpa.exe``. However, it -prints nothing. - -The reason is that ``sherpa.exe`` cannot find ``torch_cpu.dll``. You have to -add another directory to the environment variable ``path`` using: - -.. code-block:: bash - - set path=%conda_prefix%\lib\site-packages\torch\lib;%path% - -Now you can run ``sherpa`` in the commandline: - -.. 
code-block:: bash - - sherpa --help - -You will get something like the following screenshot: - - .. image:: pic/conda-windows-2.png - :align: center - :alt: Output of ``shepa --help`` - -Congratulations! You have succeeded in installing ``sherpa`` on Windows. - -Read :ref:`cpp_non_streaming_asr` to find more. - -If you have any issues about installing ``sherpa``, please create an issue -at the following address: - - ``_ - -.. hint:: - - If you have a `WeChat `_ account, you can scan - the following QR code to join the WeChat group of next-gen Kaldi to get - help. - - .. image:: pic/wechat-group-for-next-gen-kaldi.jpg - :width: 200 - :align: center - :alt: WeChat group of next-gen Kaldi diff --git a/docs/source/cpp/installation/from-source.rst b/docs/source/cpp/installation/from-source.rst deleted file mode 100644 index 2ec838406..000000000 --- a/docs/source/cpp/installation/from-source.rst +++ /dev/null @@ -1,107 +0,0 @@ -.. role:: strike - -.. _cpp_installation: - -Install from source (Linux/macOS/Windows) -========================================= - -Install dependencies --------------------- - -Install k2 -^^^^^^^^^^ - -First, please refer to ``_ -to install `k2`_. - -.. hint:: - - If you are using macOS, you can dowload pre-built wheels from - ``_ - - -Install other dependencies -^^^^^^^^^^^^^^^^^^^^^^^^^^ - -.. code-block:: bash - - pip install -U kaldifeat kaldi_native_io - -Install from source -------------------- - -You can select ``one`` of the following methods to install ``sherpa`` -from source. - -Option 1 -^^^^^^^^ - -.. code-block:: bash - - git clone https://github.com/k2-fsa/sherpa - cd sherpa - mkdir build - cd build - cmake -DCMAKE_BUILD_TYPE=Release .. - make -j - - -After running the above commands, you will get two executables: -``./bin/sherpa`` and ``./bin/sherpa-version`` in the build directory. - -You can use - -.. code-block:: bash - - ./bin/sherpa --help - -to view usage information. - -``./bin/sherpa-version`` displays the information about the environment that -was used to build ``sherpa``. - -Please read the section :ref:`cpp_non_streaming_asr` for more details. - -Option 2 -^^^^^^^^ - -.. code-block:: bash - - git clone https://github.com/k2-fsa/sherpa - cd sherpa - python3 setup.py bdist_wheel - - # It will generate a file in ./dist - # For instance, if the file is ./dist/k2_sherpa-0.7.1-cp38-cp38-linux_x86_64.whl - # You can use - - pip install ./dist/k2_sherpa-0.7.1-cp38-cp38-linux_x86_64.whl - - # If you want to uninstall it, use - # - # pip uninstall k2-sherpa - # - - -.. caution:: - - The command to uninstall ``sherpa`` is ``pip uninstall k2-sherpa``, - **NOT** :strike:`pip uninstall sherpa` - -.. hint:: - - If you use ``python3 setup.py install``, you won't find the executable - ``sherpa`` and ``sherpa-version`` in your PATH. - -To check that you have installed ``sherpa`` successfully, you can use: - -.. code-block:: bash - - which sherpa - which sherpa-version - - sherpa-version - - - sherpa --help - diff --git a/docs/source/cpp/installation/index.rst b/docs/source/cpp/installation/index.rst index 1e26b4bdf..adc354b99 100644 --- a/docs/source/cpp/installation/index.rst +++ b/docs/source/cpp/installation/index.rst @@ -1,14 +1,86 @@ -.. _cpp_fronted_installation: +.. _cpp_installation: -C++ frontend Installation -========================= +Installation +============ +Before installing `sherpa`_, we assume you have installed: -You can select ``one`` of the following methods for installation. +- `PyTorch`_ +- `k2`_ +- `kaldifeat`_ -.. 
toctree:: - :maxdepth: 1 +You can use the following commands to install `sherpa`_: - conda-windows - conda-linux - from-source +.. code-block:: bash + + git clone http://github.com/k2-fsa/sherpa + cd sherpa + python3 setup.py bdist_wheel + ls -lh dist + pip install ./dist/k2_sherpa*.whl + +.. caution:: + + Please don't use ``python3 setup.py install``. Otherwise, you won't get + `sherpa`_ related binaries installed, such as ``sherpa-offline`` and + ``sherpa-online``. + +To uninstall `sherpa`_, please use + +.. code-block:: bash + + pip uninstall k2-sherpa + +To test that you have installed `sherpa`_ successfully, you can run the +following commands: + +.. code-block:: bash + + sherpa-version + + sherpa-offline --help + sherpa-online --help + sherpa-online-microphone --help + + sherpa-offline-websocket-server --help + sherpa-offline-websocket-client --help + + sherpa-online-websocket-server --help + sherpa-online-websocket-client --help + sherpa-online-websocket-client-microphone --help + +If you have any issues about the installation, please create an issue +at the following address: + + ``_ + +.. hint:: + + If you have a `WeChat `_ account, you can scan + the following QR code to join the WeChat group of next-gen Kaldi to get + help. + + .. image:: pic/wechat-group-for-next-gen-kaldi.jpg + :width: 200 + :align: center + :alt: WeChat group of next-gen Kaldi + + +Installation for advanced users/developers +------------------------------------------ + +As an advanced user/developer, you can use the following method to +install `sherpa`_: + + +.. code-block:: bash + + git clone http://github.com/k2-fsa/sherpa + cd sherpa + mkdir build + cd build + cmake .. + make -j + + export PATH=$PWD/bin:$PATH + export PYTHONPATH=$PWD/lib:$PWD/../sherpa/python:$PYTHONPATH diff --git a/docs/source/cpp/offline_asr/gigaspeech.rst b/docs/source/cpp/offline_asr/gigaspeech.rst index 43425848d..fc50914f0 100644 --- a/docs/source/cpp/offline_asr/gigaspeech.rst +++ b/docs/source/cpp/offline_asr/gigaspeech.rst @@ -4,7 +4,7 @@ Pretrained model with GigaSpeech .. hint:: We assume you have installed ``sherpa`` by following - :ref:`cpp_fronted_installation` before you start this section. + :ref:`cpp_installation` before you start this section. Download the pretrained model ----------------------------- diff --git a/docs/source/cpp/offline_asr/wenetspeech.rst b/docs/source/cpp/offline_asr/wenetspeech.rst index 1838dcc52..fcc3485d0 100644 --- a/docs/source/cpp/offline_asr/wenetspeech.rst +++ b/docs/source/cpp/offline_asr/wenetspeech.rst @@ -4,7 +4,7 @@ Pretrained model with WenetSpeech .. hint:: We assume you have installed ``sherpa`` by following - :ref:`cpp_fronted_installation` before you start this section. + :ref:`cpp_installation` before you start this section. Download the pretrained model ----------------------------- diff --git a/docs/source/cpp/online_asr/index.rst b/docs/source/cpp/online_asr/index.rst new file mode 100644 index 000000000..a35089298 --- /dev/null +++ b/docs/source/cpp/online_asr/index.rst @@ -0,0 +1,59 @@ +.. _cpp_streaming_asr: + +Streaming ASR +============= + +This page describes how to use the C++ API of `sherpa`_ for +streaming/online ASR. + +.. warning:: + + It supports only models from + ``_ + at present. + +Please refer to :ref:`cpp_installation` for installation. 
+ + +After running ``make -j``, you should find the following files: + + - ``lib/libsherpa_online_recognizer.so`` + - ``include/sherpa/cpp_api/online_recognizer.h`` + - ``include/sherpa/cpp_api/online_stream.h`` + +You can include the above two header files in your application and link +``libsherpa_online_recognizer.so`` with your executable to use the C++ APIs. + + +``_ +shows how to use the C++ API for real-time speech recognition with a microphone. +After running ``make -j``, you can also find an executable ``bin/test_online_recognizer_microphone``. +The following shows how to use it: + +.. code-block:: bash + + cd /path/to/sherpa/build + + git lfs install + git clone https://huggingface.co/Zengwei/icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05 + + ./bin/test_online_recognizer_microphone \ + ./icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/cpu-jit-epoch-30-avg-10-torch-1.10.0.pt \ + ./icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/data/lang_bpe_500/tokens.txt + +It will print something like the following: + +.. code-block:: + + num devices: 4 + Use default device: 2 + Name: MacBook Pro Microphone + Max input channels: 1 + Started + +Say something and you will see the recognition result printed to the console in real-time. + +You can find a demo below: + +.. youtube:: 86-YLg3u-WY + :width: 120% diff --git a/docs/source/cpp/pretrained_models/index.rst b/docs/source/cpp/pretrained_models/index.rst new file mode 100644 index 000000000..28963252a --- /dev/null +++ b/docs/source/cpp/pretrained_models/index.rst @@ -0,0 +1,60 @@ +.. _pretrained_models: + +Pre-trained models +================== + +Two kinds of end-to-end (E2E) models are supported by `sherpa`_: + +- CTC +- Transducer + +.. hint:: + + For transducer-based models, we only support stateless transducers. + To the best of our knowledge, only `icefall`_ supports that. In other words, + only transducer models from `icefall`_ are currently supported. + + For CTC-based models, we support any type of models trained using CTC loss + as long as you can export the model via torchscript. Models from the following + frameworks are currently supported: `icefall`_, `WeNet`_, and `torchaudio`_ (Wav2Vec 2.0). + If you have a CTC model and want it to be supported in `sherpa`_, please + create an issue at ``_. + +.. hint:: + + You can try the pre-trained models in your browser without installing + anything. See ``_. + + +This page lists all available pre-trained models that you can download. + +.. hint:: + + We provide pre-trained models for the following languages: + + - Arabic + - Chinese + - English + - German + - Tibetan + + +.. hint:: + + We provide a colab notebook + |Sherpa offline recognition python api colab notebook| + for you to try offline recognition step by step. + + It shows how to install sherpa and use it as an offline recognizer, + which supports the models from icefall, the `WeNet`_ framework and torchaudio. + +.. |Sherpa offline recognition python api colab notebook| image:: https://colab.research.google.com/assets/colab-badge.svg + :target: https://colab.research.google.com/drive/1RdU06GcytTpI-r8vkQ7NkI0ugytnwJVB?usp=sharing + +.. 
toctree:: + :maxdepth: 5 + :caption: Pretrained models + + offline_ctc/index + offline_transducer + online_transducer diff --git a/docs/source/cpp/pretrained_models/offline_ctc/icefall.rst b/docs/source/cpp/pretrained_models/offline_ctc/icefall.rst new file mode 100644 index 000000000..072aaf292 --- /dev/null +++ b/docs/source/cpp/pretrained_models/offline_ctc/icefall.rst @@ -0,0 +1,215 @@ +icefall +======= + +.. hint:: + + We use the binary ``sherpa-offline`` below for demonstration. + You can replace ``sherpa-offline`` with ``sherpa-offline-websocket-server``. + +In this section, we list all pre-trained CTC models from `icefall`_. + +icefall-asr-gigaspeech-conformer-ctc (English) +---------------------------------------------- + +.. code-block:: bash + + # This model is trained using GigaSpeech + GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/wgb14/icefall-asr-gigaspeech-conformer-ctc + cd icefall-asr-gigaspeech-conformer-ctc + git lfs pull --include "exp/cpu_jit.pt" + git lfs pull --include "data/lang_bpe_500/HLG.pt" + git lfs pull --include "data/lang_bpe_500/tokens.txt" + mkdir test_wavs + cd test_wavs + wget https://huggingface.co/csukuangfj/wav2vec2.0-torchaudio/resolve/main/test_wavs/1089-134686-0001.wav + wget https://huggingface.co/csukuangfj/wav2vec2.0-torchaudio/resolve/main/test_wavs/1221-135766-0001.wav + wget https://huggingface.co/csukuangfj/wav2vec2.0-torchaudio/resolve/main/test_wavs/1221-135766-0002.wav + cd .. + + # Decode with H + sherpa-offline \ + --nn-model=./exp/cpu_jit.pt \ + --tokens=./data/lang_bpe_500/tokens.txt \ + ./test_wavs/1089-134686-0001.wav \ + ./test_wavs/1221-135766-0001.wav \ + ./test_wavs/1221-135766-0002.wav + + # Decode with HLG + sherpa-offline \ + --nn-model=./exp/cpu_jit.pt \ + --hlg=./data/lang_bpe_500/HLG.pt \ + --tokens=./data/lang_bpe_500/tokens.txt \ + ./test_wavs/1089-134686-0001.wav \ + ./test_wavs/1221-135766-0001.wav \ + ./test_wavs/1221-135766-0002.wav + +icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09 (English) +---------------------------------------------------------------------- + +.. code-block:: bash + + GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/csukuangfj/icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09 + cd icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09 + + git lfs pull --include "exp/cpu_jit.pt" + git lfs pull --include "data/lang_bpe_500/tokens.txt" + git lfs pull --include "data/lang_bpe_500/HLG.pt" + git lfs pull --include "data/lang_bpe_500/HLG_modified.pt" + + # Decode with H + sherpa-offline \ + --nn-model=./exp/cpu_jit.pt \ + --tokens=./data/lang_bpe_500/tokens.txt \ + --use-gpu=false \ + ./test_wavs/1089-134686-0001.wav \ + ./test_wavs/1221-135766-0001.wav \ + ./test_wavs/1221-135766-0002.wav + + # Decode with HLG + sherpa-offline \ + --nn-model=./exp/cpu_jit.pt \ + --tokens=./data/lang_bpe_500/tokens.txt \ + --hlg=./data/lang_bpe_500/HLG.pt \ + --use-gpu=false \ + ./test_wavs/1089-134686-0001.wav \ + ./test_wavs/1221-135766-0001.wav \ + ./test_wavs/1221-135766-0002.wav + + # Decode with HLG (modified) + sherpa-offline \ + --nn-model=./exp/cpu_jit.pt \ + --tokens=./data/lang_bpe_500/tokens.txt \ + --hlg=./data/lang_bpe_500/HLG_modified.pt \ + --use-gpu=false \ + ./test_wavs/1089-134686-0001.wav \ + ./test_wavs/1221-135766-0001.wav \ + ./test_wavs/1221-135766-0002.wav + +icefall-asr-tedlium3-conformer-ctc2 (English) +--------------------------------------------- + +.. 
code-block:: bash + + # This model is trained using Tedlium3 + # + # See https://github.com/k2-fsa/icefall/pull/696 + # + + GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/videodanchik/icefall-asr-tedlium3-conformer-ctc2 + cd icefall-asr-tedlium3-conformer-ctc2 + git lfs pull --include "exp/cpu_jit.pt" + + git lfs pull --include "data/lang_bpe/HLG.pt" + git lfs pull --include "data/lang_bpe/tokens.txt" + + git lfs pull --include "test_wavs/DanBarber_2010-219.wav" + git lfs pull --include "test_wavs/DanielKahneman_2010-157.wav" + git lfs pull --include "test_wavs/RobertGupta_2010U-15.wav" + + # Decode with H + sherpa-offline \ + --nn-model=./exp/cpu_jit.pt \ + --tokens=./data/lang_bpe/tokens.txt \ + ./test_wavs/DanBarber_2010-219.wav \ + ./test_wavs/DanielKahneman_2010-157.wav \ + ./test_wavs/RobertGupta_2010U-15.wav + + # Decode with HLG + sherpa-offline \ + --nn-model=./exp/cpu_jit.pt \ + --hlg=./data/lang_bpe/HLG.pt \ + --tokens=./data/lang_bpe/tokens.txt \ + ./test_wavs/DanBarber_2010-219.wav \ + ./test_wavs/DanielKahneman_2010-157.wav \ + ./test_wavs/RobertGupta_2010U-15.wav + +icefall_asr_librispeech_conformer_ctc (English) +----------------------------------------------- + +.. code-block:: bash + + # This model is trained using LibriSpeech + # + # See https://github.com/k2-fsa/icefall/pull/13 + # + + GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/pkufool/icefall_asr_librispeech_conformer_ctc + cd icefall_asr_librispeech_conformer_ctc + + git lfs pull --include "exp/cpu_jit.pt" + git lfs pull --include "data/lang_bpe/HLG.pt" + + # Decode with H + sherpa-offline \ + --nn-model=./exp/cpu_jit.pt \ + --tokens=./data/lang_bpe/tokens.txt \ + ./test_wavs/1089-134686-0001.wav \ + ./test_wavs/1221-135766-0001.wav \ + ./test_wavs/1221-135766-0002.wav + + # Decode with HLG + sherpa-offline \ + --nn-model=./exp/cpu_jit.pt \ + --hlg=./data/lang_bpe/HLG.pt \ + --tokens=./data/lang_bpe/tokens.txt \ + ./test_wavs/1089-134686-0001.wav \ + ./test_wavs/1221-135766-0001.wav \ + ./test_wavs/1221-135766-0002.wav + +.. - ``_ + +icefall_asr_aishell_conformer_ctc (Chinese) +------------------------------------------- + +.. code-block:: bash + + GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/pkufool/icefall_asr_aishell_conformer_ctc + cd icefall_asr_aishell_conformer_ctc + git lfs pull --include "exp/cpu_jit.pt" + git lfs pull --include "data/lang_char/HLG.pt" + + # Decode with an H graph + sherpa-offline \ + --nn-model=./exp/cpu_jit.pt \ + --tokens=./data/lang_char/tokens.txt \ + ./test_waves/BAC009S0764W0121.wav \ + ./test_waves/BAC009S0764W0122.wav \ + ./test_waves/BAC009S0764W0123.wav + + # Decode with an HLG graph + sherpa-offline \ + --nn-model=./exp/cpu_jit.pt \ + --tokens=./data/lang_char/tokens.txt \ + --hlg=./data/lang_char/HLG.pt \ + ./test_waves/BAC009S0764W0121.wav \ + ./test_waves/BAC009S0764W0122.wav \ + ./test_waves/BAC009S0764W0123.wav + + +icefall-asr-mgb2-conformer_ctc-2022-27-06 (Arabic) +-------------------------------------------------- + +.. 
code-block:: bash + + GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/AmirHussein/icefall-asr-mgb2-conformer_ctc-2022-27-06 + cd icefall-asr-mgb2-conformer_ctc-2022-27-06 + git lfs pull --include "exp/cpu_jit.pt" + git lfs pull --include "data/lang_bpe_5000/HLG.pt" + git lfs pull --include "data/lang_bpe_5000/tokens.txt" + + # Decode with an H graph + sherpa-offline \ + --nn-model=./exp/cpu_jit.pt \ + --tokens=./data/lang_bpe_5000/tokens.txt \ + ./test_wavs/94D37D38-B203-4FC0-9F3A-538F5C174920_spk-0001_seg-0053813:0054281.wav \ + ./test_wavs/94D37D38-B203-4FC0-9F3A-538F5C174920_spk-0001_seg-0051454:0052244.wav \ + ./test_wavs/94D37D38-B203-4FC0-9F3A-538F5C174920_spk-0001_seg-0052244:0053004.wav + + # Decode with an HLG graph + sherpa-offline \ + --nn-model=./exp/cpu_jit.pt \ + --tokens=./data/lang_bpe_5000/tokens.txt \ + --hlg=./data/lang_bpe_5000/HLG.pt \ + ./test_wavs/94D37D38-B203-4FC0-9F3A-538F5C174920_spk-0001_seg-0053813:0054281.wav \ + ./test_wavs/94D37D38-B203-4FC0-9F3A-538F5C174920_spk-0001_seg-0051454:0052244.wav \ + ./test_wavs/94D37D38-B203-4FC0-9F3A-538F5C174920_spk-0001_seg-0052244:0053004.wav diff --git a/docs/source/cpp/pretrained_models/offline_ctc/index.rst b/docs/source/cpp/pretrained_models/offline_ctc/index.rst new file mode 100644 index 000000000..09b3bc587 --- /dev/null +++ b/docs/source/cpp/pretrained_models/offline_ctc/index.rst @@ -0,0 +1,13 @@ +Offline CTC models +================== + +This section lists pre-trained CTC models from the following frameworks: + +.. toctree:: + :maxdepth: 2 + + icefall + wenet + torchaudio + nemo + diff --git a/docs/source/cpp/pretrained_models/offline_ctc/nemo.rst b/docs/source/cpp/pretrained_models/offline_ctc/nemo.rst new file mode 100644 index 000000000..1acb46b9a --- /dev/null +++ b/docs/source/cpp/pretrained_models/offline_ctc/nemo.rst @@ -0,0 +1,315 @@ +NeMo +==== + +This section lists models from `NeMo`_. + + +sherpa-nemo-ctc-en-citrinet-512 (English) +----------------------------------------- + +This model is converted from + + ``_ + +.. code-block:: bash + + GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/csukuangfj/sherpa-nemo-ctc-en-citrinet-512 + cd sherpa-nemo-ctc-en-citrinet-512 + git lfs pull --include "model.pt" + + sherpa-offline \ + --nn-model=./model.pt \ + --tokens=./tokens.txt \ + --use-gpu=false \ + --modified=false \ + --nemo-normalize=per_feature \ + ./test_wavs/0.wav \ + ./test_wavs/1.wav \ + ./test_wavs/2.wav + +.. code-block:: bash + + ls -lh model.pt + -rw-r--r-- 1 kuangfangjun root 142M Mar 9 21:23 model.pt + +.. caution:: + + It is of paramount importance to specify ``--nemo-normalize=per_feature``. + +sherpa-nemo-ctc-zh-citrinet-512 (Chinese) +----------------------------------------- + +This model is converted from + + ``_ + +.. code-block:: bash + + GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/csukuangfj/sherpa-nemo-ctc-zh-citrinet-512 + cd sherpa-nemo-ctc-zh-citrinet-512 + git lfs pull --include "model.pt" + + sherpa-offline \ + --nn-model=./model.pt \ + --tokens=./tokens.txt \ + --use-gpu=false \ + --modified=true \ + --nemo-normalize=per_feature \ + ./test_wavs/0.wav \ + ./test_wavs/1.wav \ + ./test_wavs/2.wav + +.. code-block:: bash + + ls -lh model.pt + -rw-r--r-- 1 kuangfangjun root 153M Mar 10 15:07 model.pt + +.. hint:: + + Since the vocabulary size of this model is very large, i.e., 5207, we use + ``--modified=true`` to use a + `modified CTC topology `_ + +.. caution:: + + It is of paramount importance to specify ``--nemo-normalize=per_feature``.
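+.. hint::
+
+   Roughly speaking, ``--nemo-normalize=per_feature`` asks `sherpa`_ to apply
+   the same per-feature normalization that `NeMo`_ models expect: each feature
+   dimension of an utterance is normalized to zero mean and unit variance
+   across time before the features are passed to the model. The snippet below
+   is only a sketch of that idea; the function name and the epsilon value are
+   made up for illustration and it is not the actual implementation used in
+   `sherpa`_.
+
+.. code-block:: python
+
+   import torch
+
+   def per_feature_normalize(features: torch.Tensor) -> torch.Tensor:
+       # features: (num_frames, feature_dim) fbank features of one utterance
+       mean = features.mean(dim=0, keepdim=True)
+       std = features.std(dim=0, keepdim=True)
+       # a small constant (assumed value) avoids division by zero
+       return (features - mean) / (std + 1e-5)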
+ +sherpa-nemo-ctc-zh-citrinet-1024-gamma-0-25 (Chinese) +----------------------------------------------------- + +This model is converted from + + ``_ + +.. code-block:: bash + + GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/csukuangfj/sherpa-nemo-ctc-zh-citrinet-1024-gamma-0-25 + cd sherpa-nemo-ctc-zh-citrinet-1024-gamma-0-25 + git lfs pull --include "model.pt" + + sherpa-offline \ + --nn-model=./model.pt \ + --tokens=./tokens.txt \ + --use-gpu=false \ + --modified=true \ + --nemo-normalize=per_feature \ + ./test_wavs/0.wav \ + ./test_wavs/1.wav \ + ./test_wavs/2.wav + +.. code-block:: bash + + ls -lh model.pt + -rw-r--r-- 1 kuangfangjun root 557M Mar 10 16:29 model.pt + +.. hint:: + + Since the vocabulary size of this model is very large, i.e, 5207, we use + ``--modified=true`` to use a + `modified CTC topology `_ + +.. caution:: + + It is of paramount importance to specify ``--nemo-normalize=per_feature``. + +sherpa-nemo-ctc-de-citrinet-1024 (German) +----------------------------------------- + +This model is converted from + + ``_ + +.. code-block:: bash + + GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/csukuangfj/sherpa-nemo-ctc-de-citrinet-1024 + cd sherpa-nemo-ctc-de-citrinet-1024 + git lfs pull --include "model.pt" + + sherpa-offline \ + --nn-model=./model.pt \ + --tokens=./tokens.txt \ + --use-gpu=false \ + --modified=false \ + --nemo-normalize=per_feature \ + ./test_wavs/0.wav \ + ./test_wavs/1.wav \ + ./test_wavs/2.wav + +.. code-block:: bash + + ls -lh model.pt + -rw-r--r-- 1 kuangfangjun root 541M Mar 10 16:55 model.pt + +.. caution:: + + It is of paramount importance to specify ``--nemo-normalize=per_feature``. + + +sherpa-nemo-ctc-en-conformer-small (English) +-------------------------------------------- + +This model is converted from + + ``_ + +.. code-block:: + + GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/csukuangfj/sherpa-nemo-ctc-en-conformer-small + cd sherpa-nemo-ctc-en-conformer-small + git lfs pull --include "model.pt" + + sherpa-offline \ + --nn-model=./model.pt \ + --tokens=./tokens.txt \ + --use-gpu=false \ + --modified=false \ + --nemo-normalize=per_feature \ + ./test_wavs/0.wav \ + ./test_wavs/1.wav \ + ./test_wavs/2.wav + +.. code-block:: bash + + ls -lh model.pt + -rw-r--r-- 1 fangjun staff 82M Mar 10 19:55 model.pt + +.. caution:: + + It is of paramount importance to specify ``--nemo-normalize=per_feature``. + +sherpa-nemo-ctc-en-conformer-medium (English) +--------------------------------------------- + +This model is converted from + + ``_ + +.. code-block:: + + GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/csukuangfj/sherpa-nemo-ctc-en-conformer-medium + cd sherpa-nemo-ctc-en-conformer-medium + git lfs pull --include "model.pt" + + sherpa-offline \ + --nn-model=./model.pt \ + --tokens=./tokens.txt \ + --use-gpu=false \ + --modified=false \ + --nemo-normalize=per_feature \ + ./test_wavs/0.wav \ + ./test_wavs/1.wav \ + ./test_wavs/2.wav + +.. code-block:: bash + + ls -lh model.pt + -rw-r--r-- 1 fangjun staff 152M Mar 10 20:26 model.pt + +.. caution:: + + It is of paramount importance to specify ``--nemo-normalize=per_feature``. + +sherpa-nemo-ctc-en-conformer-large (English) +-------------------------------------------- + +This model is converted from + + ``_ + +.. hint:: + + The vocabulary size is 129 + +.. 
code-block:: + + GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/csukuangfj/sherpa-nemo-ctc-en-conformer-large + cd sherpa-nemo-ctc-en-conformer-large + git lfs pull --include "model.pt" + + sherpa-offline \ + --nn-model=./model.pt \ + --tokens=./tokens.txt \ + --use-gpu=false \ + --modified=false \ + --nemo-normalize=per_feature \ + ./test_wavs/0.wav \ + ./test_wavs/1.wav \ + ./test_wavs/2.wav + +.. code-block:: bash + + ls -lh model.pt + -rw-r--r-- 1 fangjun staff 508M Mar 10 20:44 model.pt + +.. caution:: + + It is of paramount importance to specify ``--nemo-normalize=per_feature``. + +sherpa-nemo-ctc-de-conformer-large (German) +------------------------------------------- + +This model is converted from + + ``_ + +.. hint:: + + The vocabulary size is 129 + +.. code-block:: + + GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/csukuangfj/sherpa-nemo-ctc-de-conformer-large + cd sherpa-nemo-ctc-de-conformer-large + git lfs pull --include "model.pt" + + sherpa-offline \ + --nn-model=./model.pt \ + --tokens=./tokens.txt \ + --use-gpu=false \ + --modified=false \ + --nemo-normalize=per_feature \ + ./test_wavs/0.wav \ + ./test_wavs/1.wav \ + ./test_wavs/2.wav + +.. code-block:: bash + + ls -lh model.pt + -rw-r--r-- 1 fangjun staff 508M Mar 10 21:34 model.pt + +.. caution:: + + It is of paramount importance to specify ``--nemo-normalize=per_feature``. + +How to convert NeMo models to sherpa +------------------------------------ + +This section describes how to export `NeMo`_ pre-trained CTC models to `sherpa`_. + +You can find a list of pre-trained models from `NeMo`_ by visiting: + + ``_. + +Let us take ``stt_en_conformer_ctc_small`` as an example. + +You can use the following code to obtain ``model.pt`` and ``tokens.txt``: + +.. code-block:: bash + + import nemo.collections.asr as nemo_asr + m = nemo_asr.models.EncDecCTCModelBPE.from_pretrained('stt_en_conformer_ctc_small') + m.export("model.pt") + + with open('tokens.txt', 'w', encoding='utf-8') as f: + f.write(" 0\n") + for i, s in enumerate(m.decoder.vocabulary): + f.write(f"{s} {i+1}\n") + +One thing to note is that the blank token has the largest token ID in ``NeMo``. +However, it is always ``0`` in `sherpa`_. During network computation, we shift +the last column of the ``log_prob`` tensor to the first column so that +it matches the convention about using 0 for the blank in `sherpa`_. + +You can find the exported ``model.pt`` and ``tokens.txt`` by visiting + + ``_ diff --git a/docs/source/cpp/pretrained_models/offline_ctc/torchaudio.rst b/docs/source/cpp/pretrained_models/offline_ctc/torchaudio.rst new file mode 100644 index 000000000..65e7381f1 --- /dev/null +++ b/docs/source/cpp/pretrained_models/offline_ctc/torchaudio.rst @@ -0,0 +1,42 @@ +torchaudio +========== + +This section lists models from `torchaudio`_. + + +wav2vec2_asr_base (English) +--------------------------- + +.. code-block:: bash + + GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/csukuangfj/wav2vec2.0-torchaudio + cd wav2vec2.0-torchaudio + + # Note: There are other kinds of models fine-tuned with different + # amount of data. We use a model that is fine-tuned with 10 minutes of data. + + git lfs pull --include "wav2vec2_asr_base_10m.pt" + + sherpa-offline \ + --nn-model=wav2vec2_asr_base_10m.pt \ + --tokens=tokens.txt \ + --use-gpu=false \ + ./test_wavs/1089-134686-0001.wav \ + ./test_wavs/1221-135766-0001.wav \ + ./test_wavs/1221-135766-0002.wav + +voxpopuli_asr_base (German) +--------------------------- + +.. 
code-block:: bash + + GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/csukuangfj/wav2vec2.0-torchaudio + cd wav2vec2.0-torchaudio + git lfs pull --include "voxpopuli_asr_base_10k_de.pt" + + sherpa-offline \ + --nn-model=voxpopuli_asr_base_10k_de.pt \ + --tokens=tokens-de.txt \ + --use-gpu=false \ + ./test_wavs/20120315-0900-PLENARY-14-de_20120315.wav \ + ./test_wavs/20170517-0900-PLENARY-16-de_20170517.wav diff --git a/docs/source/cpp/pretrained_models/offline_ctc/wenet.rst b/docs/source/cpp/pretrained_models/offline_ctc/wenet.rst new file mode 100644 index 000000000..ccf0bcb6a --- /dev/null +++ b/docs/source/cpp/pretrained_models/offline_ctc/wenet.rst @@ -0,0 +1,44 @@ +WeNet +===== + +This section lists models from `WeNet`_. + +wenet-english-model (English) +----------------------------- + +.. code-block:: bash + + GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/csukuangfj/wenet-english-model + cd wenet-english-model + git lfs pull --include "final.zip" + + sherpa-offline \ + --normalize-samples=false \ + --modified=true \ + --nn-model=./final.zip \ + --tokens=./units.txt \ + --use-gpu=false \ + ./test_wavs/1089-134686-0001.wav \ + ./test_wavs/1221-135766-0001.wav \ + ./test_wavs/1221-135766-0002.wav + +wenet-chinese-model (Chinese) +----------------------------- + +.. code-block:: bash + + GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/csukuangfj/wenet-chinese-model + cd wenet-chinese-model + git lfs pull --include "final.zip" + + sherpa-offline \ + --normalize-samples=false \ + --modified=true \ + --nn-model=./final.zip \ + --tokens=./units.txt \ + ./test_wavs/BAC009S0764W0121.wav \ + ./test_wavs/BAC009S0764W0122.wav \ + ./test_wavs/BAC009S0764W0123.wav \ + ./test_wavs/DEV_T0000000000.wav \ + ./test_wavs/DEV_T0000000001.wav \ + ./test_wavs/DEV_T0000000002.wav diff --git a/docs/source/cpp/pretrained_models/offline_transducer.rst b/docs/source/cpp/pretrained_models/offline_transducer.rst new file mode 100644 index 000000000..fb8e5ffda --- /dev/null +++ b/docs/source/cpp/pretrained_models/offline_transducer.rst @@ -0,0 +1,605 @@ +.. _offline_transducer_pretrained_models: + +Offline transducer models +========================= + +.. hint:: + + We use the binary ``sherpa-offline`` below for demonstration. + You can replace ``sherpa-offline`` with ``sherpa-offline-websocket-server``. + +.. hint:: + + Please visit ``_ + to try the pre-trained models in your browser. You don't need to install + anything. + +icefall +------- + +This section lists models trained using `icefall`_. + +English +^^^^^^^ + + +icefall-asr-cv-corpus-13.0-2023-03-09-en-pruned-transducer-stateless7-2023-04-17 +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: bash + + # This model is trained using Common Voice 13.0 with zipformer transducer + # + # See https://github.com/k2-fsa/icefall/pull/997 + # + GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/yfyeung/icefall-asr-cv-corpus-13.0-2023-03-09-en-pruned-transducer-stateless7-2023-04-17 + cd icefall-asr-cv-corpus-13.0-2023-03-09-en-pruned-transducer-stateless7-2023-04-17 + + git lfs pull --include "cpu_jit-epoch-60-avg-20.pt" + + for m in greedy_search modified_beam_search fast_beam_search; do + sherpa-offline \ + --decoding-method=$m \ + --nn-model=./exp/cpu_jit-epoch-60-avg-20.pt \ + --tokens=./data/lang_bpe_500/tokens.txt \ + ./test_wavs/1089-134686-0001.wav \ + ./test_wavs/1221-135766-0001.wav \ + ./test_wavs/1221-135766-0002.wav + done + +.. 
_icefall-asr-librispeech-zipformer-2023-05-15: + +icefall-asr-librispeech-zipformer-2023-05-15 +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: bash + + # This model is trained using LibriSpeech with zipformer transducer + # + # See https://github.com/k2-fsa/icefall/pull/1058 + # + # normal-scaled model, number of model parameters: 65549011, i.e., 65.55 M + # + GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/Zengwei/icefall-asr-librispeech-zipformer-2023-05-15 + cd icefall-asr-librispeech-zipformer-2023-05-15 + + git lfs pull --include "exp/jit_script.pt" + git lfs pull --include "data/lang_bpe_500/LG.pt" + + for m in greedy_search modified_beam_search fast_beam_search; do + sherpa-offline \ + --decoding-method=$m \ + --nn-model=./exp/jit_script.pt \ + --tokens=./data/lang_bpe_500/tokens.txt \ + ./test_wavs/1089-134686-0001.wav \ + ./test_wavs/1221-135766-0001.wav \ + ./test_wavs/1221-135766-0002.wav + done + + sherpa-offline \ + --decoding-method=fast_beam_search \ + --nn-model=./exp/jit_script.pt \ + --lg=./data/lang_bpe_500/LG.pt \ + --tokens=./data/lang_bpe_500/tokens.txt \ + ./test_wavs/1089-134686-0001.wav \ + ./test_wavs/1221-135766-0001.wav \ + ./test_wavs/1221-135766-0002.wav + +.. _icefall-asr-librispeech-zipformer-small-2023-05-16: + +icefall-asr-librispeech-zipformer-small-2023-05-16 +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: bash + + # This model is trained using LibriSpeech with zipformer transducer + # + # See https://github.com/k2-fsa/icefall/pull/1058 + # + # small-scaled model, number of model parameters: 23285615, i.e., 23.3 M + # + GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/Zengwei/icefall-asr-librispeech-zipformer-small-2023-05-16 + cd icefall-asr-librispeech-zipformer-small-2023-05-16 + + git lfs pull --include "exp/jit_script.pt" + git lfs pull --include "data/lang_bpe_500/LG.pt" + + for m in greedy_search modified_beam_search fast_beam_search; do + sherpa-offline \ + --decoding-method=$m \ + --nn-model=./exp/jit_script.pt \ + --tokens=./data/lang_bpe_500/tokens.txt \ + ./test_wavs/1089-134686-0001.wav \ + ./test_wavs/1221-135766-0001.wav \ + ./test_wavs/1221-135766-0002.wav + done + + sherpa-offline \ + --decoding-method=fast_beam_search \ + --nn-model=./exp/jit_script.pt \ + --lg=./data/lang_bpe_500/LG.pt \ + --tokens=./data/lang_bpe_500/tokens.txt \ + ./test_wavs/1089-134686-0001.wav \ + ./test_wavs/1221-135766-0001.wav \ + ./test_wavs/1221-135766-0002.wav + + +.. _icefall-asr-librispeech-zipformer-large-2023-05-16: + +icefall-asr-librispeech-zipformer-large-2023-05-16 +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. 
code-block:: bash + + # This model is trained using LibriSpeech with zipformer transducer + # + # See https://github.com/k2-fsa/icefall/pull/1058 + # + # large-scaled model, number of model parameters: 148439574, i.e., 148.4 M + # + GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/Zengwei/icefall-asr-librispeech-zipformer-large-2023-05-16 + cd icefall-asr-librispeech-zipformer-large-2023-05-16 + + git lfs pull --include "exp/jit_script.pt" + git lfs pull --include "data/lang_bpe_500/LG.pt" + + for m in greedy_search modified_beam_search fast_beam_search; do + sherpa-offline \ + --decoding-method=$m \ + --nn-model=./exp/jit_script.pt \ + --tokens=./data/lang_bpe_500/tokens.txt \ + ./test_wavs/1089-134686-0001.wav \ + ./test_wavs/1221-135766-0001.wav \ + ./test_wavs/1221-135766-0002.wav + done + + sherpa-offline \ + --decoding-method=fast_beam_search \ + --nn-model=./exp/jit_script.pt \ + --lg=./data/lang_bpe_500/LG.pt \ + --tokens=./data/lang_bpe_500/tokens.txt \ + ./test_wavs/1089-134686-0001.wav \ + ./test_wavs/1221-135766-0001.wav \ + ./test_wavs/1221-135766-0002.wav + +.. _icefall-asr-multidataset-pruned_transducer_stateless7-2023-05-04: + +icefall-asr-multidataset-pruned_transducer_stateless7-2023-05-04 +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: bash + + # This model is trained using GigaSpeech + LibriSpeech + Common Voice 13.0 with zipformer + # + # See https://github.com/k2-fsa/icefall/pull/1010 + # + GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/yfyeung/icefall-asr-multidataset-pruned_transducer_stateless7-2023-05-04 + cd icefall-asr-multidataset-pruned_transducer_stateless7-2023-05-04 + git lfs pull --include "exp/cpu_jit-epoch-30-avg-4.pt" + cd exp + ln -s cpu_jit-epoch-30-avg-4.pt cpu_jit.pt + cd .. + + for m in greedy_search modified_beam_search fast_beam_search; do + sherpa-offline \ + --decoding-method=$m \ + --nn-model=./exp/cpu_jit.pt \ + --tokens=./data/lang_bpe_500/tokens.txt \ + ./test_wavs/1089-134686-0001.wav \ + ./test_wavs/1221-135766-0001.wav \ + ./test_wavs/1221-135766-0002.wav + done + +.. _icefall-asr-librispeech-pruned-transducer-stateless8-2022-12-02: + +icefall-asr-librispeech-pruned-transducer-stateless8-2022-12-02 +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: bash + + # This model is trained using GigaSpeech + LibriSpeech with zipformer + # + # See https://github.com/k2-fsa/icefall/pull/728 + # + GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/WeijiZhuang/icefall-asr-librispeech-pruned-transducer-stateless8-2022-12-02 + cd icefall-asr-librispeech-pruned-transducer-stateless8-2022-12-02 + git lfs pull --include "exp/cpu_jit-torch-1.10.pt" + git lfs pull --include "data/lang_bpe_500/LG.pt" + + cd exp + rm cpu_jit.pt + ln -sv cpu_jit-torch-1.10.pt cpu_jit.pt + cd .. 
+ + for m in greedy_search modified_beam_search fast_beam_search; do + sherpa-offline \ + --decoding-method=$m \ + --nn-model=./exp/cpu_jit.pt \ + --tokens=./data/lang_bpe_500/tokens.txt \ + ./test_wavs/1089-134686-0001.wav \ + ./test_wavs/1221-135766-0001.wav \ + ./test_wavs/1221-135766-0002.wav + done + + sherpa-offline \ + --decoding-method=fast_beam_search \ + --nn-model=./exp/cpu_jit.pt \ + --lg=./data/lang_bpe_500/LG.pt \ + --tokens=./data/lang_bpe_500/tokens.txt \ + ./test_wavs/1089-134686-0001.wav \ + ./test_wavs/1221-135766-0001.wav \ + ./test_wavs/1221-135766-0002.wav + +icefall-asr-librispeech-pruned-transducer-stateless8-2022-11-14 +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: bash + + # This model is trained using GigaSpeech + LibriSpeech with zipformer + # + # See https://github.com/k2-fsa/icefall/pull/675 + # + GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless8-2022-11-14 + cd icefall-asr-librispeech-pruned-transducer-stateless8-2022-11-14 + git lfs pull --include "exp/cpu_jit.pt" + git lfs pull --include "data/lang_bpe_500/LG.pt" + + for m in greedy_search modified_beam_search fast_beam_search; do + sherpa-offline \ + --decoding-method=$m \ + --nn-model=./exp/cpu_jit.pt \ + --tokens=./data/lang_bpe_500/tokens.txt \ + ./test_wavs/1089-134686-0001.wav \ + ./test_wavs/1221-135766-0001.wav \ + ./test_wavs/1221-135766-0002.wav + done + + sherpa-offline \ + --decoding-method=fast_beam_search \ + --nn-model=./exp/cpu_jit.pt \ + --lg=./data/lang_bpe_500/LG.pt \ + --tokens=./data/lang_bpe_500/tokens.txt \ + ./test_wavs/1089-134686-0001.wav \ + ./test_wavs/1221-135766-0001.wav \ + ./test_wavs/1221-135766-0002.wav + +icefall-asr-librispeech-pruned-transducer-stateless7-2022-11-11 +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: bash + + # This model is trained using LibriSpeech with zipformer + # + # See https://github.com/k2-fsa/icefall/pull/672 + # + GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless7-2022-11-11 + cd icefall-asr-librispeech-pruned-transducer-stateless7-2022-11-11 + git lfs pull --include "exp/cpu_jit-torch-1.10.0.pt" + git lfs pull --include "data/lang_bpe_500/LG.pt" + cd exp + ln -s cpu_jit-torch-1.10.0.pt cpu_jit.pt + cd .. + + for m in greedy_search modified_beam_search fast_beam_search; do + sherpa-offline \ + --decoding-method=$m \ + --nn-model=./exp/cpu_jit.pt \ + --tokens=./data/lang_bpe_500/tokens.txt \ + ./test_wavs/1089-134686-0001.wav \ + ./test_wavs/1221-135766-0001.wav \ + ./test_wavs/1221-135766-0002.wav + done + + sherpa-offline \ + --decoding-method=fast_beam_search \ + --nn-model=./exp/cpu_jit.pt \ + --lg=./data/lang_bpe_500/LG.pt \ + --tokens=./data/lang_bpe_500/tokens.txt \ + ./test_wavs/1089-134686-0001.wav \ + ./test_wavs/1221-135766-0001.wav \ + ./test_wavs/1221-135766-0002.wav + +icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13 +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. 
code-block:: + + # This model is trained using LibriSpeech + GigaSpeech + # + # See https://github.com/k2-fsa/icefall/pull/363 + # + GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13 + cd icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13 + git lfs pull --include "exp/cpu_jit.pt" + git lfs pull --include "data/lang_bpe_500/LG.pt" + + for m in greedy_search modified_beam_search fast_beam_search; do + sherpa-offline \ + --decoding-method=$m \ + --nn-model=./exp/cpu_jit.pt \ + --tokens=./data/lang_bpe_500/tokens.txt \ + ./test_wavs/1089-134686-0001.wav \ + ./test_wavs/1221-135766-0001.wav \ + ./test_wavs/1221-135766-0002.wav + done + + sherpa-offline \ + --decoding-method=fast_beam_search \ + --nn-model=./exp/cpu_jit.pt \ + --lg=./data/lang_bpe_500/LG.pt \ + --tokens=./data/lang_bpe_500/tokens.txt \ + ./test_wavs/1089-134686-0001.wav \ + ./test_wavs/1221-135766-0001.wav \ + ./test_wavs/1221-135766-0002.wav + + +icefall-asr-gigaspeech-pruned-transducer-stateless2 +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: + + # This model is trained using GigaSpeech + # + # See https://github.com/k2-fsa/icefall/pull/318 + # + GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/csukuangfj/icefall-asr-gigaspeech-pruned-transducer-stateless2 + cd icefall-asr-gigaspeech-pruned-transducer-stateless2 + git lfs pull --include "exp/cpu_jit-iter-3488000-avg-15.pt" + git lfs pull --include "data/lang_bpe_500/bpe.model" + + cd ../exp + ln -s cpu_jit-iter-3488000-avg-15.pt cpu_jit.pt + cd .. + + # Since this repo does not provide tokens.txt, we generate it from bpe.model + # by ourselves + /path/to/sherpa/scripts/bpe_model_to_tokens.py ./data/lang_bpe_500/bpe.model > ./data/lang_bpe_500/tokens.txt + + mkdir test_wavs + cd test_wavs + wget https://huggingface.co/csukuangfj/wav2vec2.0-torchaudio/resolve/main/test_wavs/1089-134686-0001.wav + wget https://huggingface.co/csukuangfj/wav2vec2.0-torchaudio/resolve/main/test_wavs/1221-135766-0001.wav + wget https://huggingface.co/csukuangfj/wav2vec2.0-torchaudio/resolve/main/test_wavs/1221-135766-0002.wav + + for m in greedy_search modified_beam_search fast_beam_search; do + sherpa-offline \ + --decoding-method=$m \ + --nn-model=./exp/cpu_jit.pt \ + --tokens=./data/lang_bpe_500/tokens.txt \ + ./test_wavs/1089-134686-0001.wav \ + ./test_wavs/1221-135766-0001.wav \ + ./test_wavs/1221-135766-0002.wav + done + +Chinese +^^^^^^^ + +icefall-asr-zipformer-wenetspeech-20230615 +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. 
code-block:: + + # This model is trained using WenetSpeech + # + # See https://github.com/k2-fsa/icefall/pull/1130 + # + GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/pkufool/icefall-asr-zipformer-wenetspeech-20230615 + + cd icefall-asr-zipformer-wenetspeech-20230615 + git lfs pull --include "exp/jit_script.pt" + git lfs pull --include "data/lang_char/LG.pt" + + for m in greedy_search modified_beam_search fast_beam_search; do + sherpa-offline \ + --decoding-method=$m \ + --nn-model=./exp/jit_script.pt \ + --tokens=./data/lang_char/tokens.txt \ + ./test_wavs/DEV_T0000000000.wav \ + ./test_wavs/DEV_T0000000001.wav \ + ./test_wavs/DEV_T0000000002.wav + done + + sherpa-offline \ + --decoding-method=fast_beam_search \ + --nn-model=./exp/jit_script.pt \ + --lg=./data/lang_char/LG.pt \ + --tokens=./data/lang_char/tokens.txt \ + ./test_wavs/DEV_T0000000000.wav \ + ./test_wavs/DEV_T0000000001.wav \ + ./test_wavs/DEV_T0000000002.wav + + +icefall_asr_wenetspeech_pruned_transducer_stateless2 +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: bash + + # This model is trained using WenetSpeech + # + # See https://github.com/k2-fsa/icefall/pull/349 + # + GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/csukuangfj/icefall_asr_wenetspeech_pruned_transducer_stateless2 + + cd icefall_asr_wenetspeech_pruned_transducer_stateless2 + git lfs pull --include "exp/cpu_jit_epoch_10_avg_2_torch_1.7.1.pt" + git lfs pull --include "data/lang_char/LG.pt" + cd exp + ln -s cpu_jit_epoch_10_avg_2_torch_1.7.1.pt cpu_jit.pt + cd .. + + for m in greedy_search modified_beam_search fast_beam_search; do + sherpa-offline \ + --decoding-method=$m \ + --nn-model=./exp/cpu_jit.pt \ + --tokens=./data/lang_char/tokens.txt \ + ./test_wavs/DEV_T0000000000.wav \ + ./test_wavs/DEV_T0000000001.wav \ + ./test_wavs/DEV_T0000000002.wav + done + + sherpa-offline \ + --decoding-method=fast_beam_search \ + --nn-model=./exp/cpu_jit.pt \ + --lg=./data/lang_char/LG.pt \ + --tokens=./data/lang_char/tokens.txt \ + ./test_wavs/DEV_T0000000000.wav \ + ./test_wavs/DEV_T0000000001.wav \ + ./test_wavs/DEV_T0000000002.wav + +icefall_asr_aidatatang-200zh_pruned_transducer_stateless2 +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: bash + + # This model is trained using aidatatang_200zh + # + # See https://github.com/k2-fsa/icefall/pull/355 + # + GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/csukuangfj/icefall_asr_aidatatang-200zh_pruned_transducer_stateless2 + cd icefall_asr_aidatatang-200zh_pruned_transducer_stateless2 + git lfs pull --include "exp/cpu_jit_torch.1.7.1.pt" + + cd exp + ln -sv cpu_jit_torch.1.7.1.pt cpu_jit.pt + cd .. + + for m in greedy_search modified_beam_search fast_beam_search; do + sherpa-offline \ + --decoding-method=$m \ + --nn-model=./exp/cpu_jit.pt \ + --tokens=./data/lang_char/tokens.txt \ + ./test_wavs/T0055G0036S0002.wav \ + ./test_wavs/T0055G0036S0003.wav \ + ./test_wavs/T0055G0036S0004.wav + done + +icefall-asr-alimeeting-pruned-transducer-stateless7 +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. 
code-block:: bash + + # This model is trained using alimeeting (https://www.openslr.org/119/) + # + # See https://github.com/k2-fsa/icefall/pull/751 + # + GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/desh2608/icefall-asr-alimeeting-pruned-transducer-stateless7 + cd icefall-asr-alimeeting-pruned-transducer-stateless7 + + git lfs pull --include "exp/cpu_jit.pt" + + for m in greedy_search modified_beam_search fast_beam_search; do + sherpa-offline \ + --decoding-method=$m \ + --nn-model=./exp/cpu_jit.pt \ + --tokens=./data/lang_char/tokens.txt \ + ./test_wavs/165.wav \ + ./test_wavs/74.wav \ + ./test_wavs/209.wav + done + +Chinese + English +^^^^^^^^^^^^^^^^^ + +icefall_asr_tal-csasr_pruned_transducer_stateless5 +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: bash + + # This model is trained using TAL_CSASR dataset from + # https://ai.100tal.com/dataset + # where each utterance contains both English and Chinese. + # + # See https://github.com/k2-fsa/icefall/pull/428 + # + GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/csukuangfj/icefall_asr_tal-csasr_pruned_transducer_stateless5 + cd icefall_asr_tal-csasr_pruned_transducer_stateless5 + git lfs pull --include "exp/cpu_jit.pt" + + for m in greedy_search modified_beam_search fast_beam_search; do + sherpa-offline \ + --decoding-method=$m \ + --nn-model=./exp/cpu_jit.pt \ + --tokens=./data/lang_char/tokens.txt \ + ./test_wavs/210_36476_210_8341_1_1533271973_7057520_132.wav \ + ./test_wavs/210_36476_210_8341_1_1533271973_7057520_138.wav \ + ./test_wavs/210_36476_210_8341_1_1533271973_7057520_145.wav \ + ./test_wavs/210_36476_210_8341_1_1533271973_7057520_148.wav + done + +Tibetan +^^^^^^^ + +icefall-asr-xbmu-amdo31-pruned-transducer-stateless7-2022-12-02 +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: bash + + # This model is trained using the XBMU-AMDO31 corpus + # + # See https://github.com/k2-fsa/icefall/pull/706 + # + GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/syzym/icefall-asr-xbmu-amdo31-pruned-transducer-stateless7-2022-12-02 + cd icefall-asr-xbmu-amdo31-pruned-transducer-stateless7-2022-12-02 + git lfs pull --include "exp/cpu_jit.pt" + git lfs pull --include "data/lang_bpe_500/LG.pt" + + for m in greedy_search modified_beam_search fast_beam_search; do + sherpa-offline \ + --decoding-method=$m \ + --nn-model=./exp/cpu_jit.pt \ + --tokens=./data/lang_bpe_500/tokens.txt \ + ./test_wavs/a_0_cacm-A70_31116.wav \ + ./test_wavs/a_0_cacm-A70_31117.wav \ + ./test_wavs/a_0_cacm-A70_31118.wav + done + + sherpa-offline \ + --decoding-method=fast_beam_search \ + --nn-model=./exp/cpu_jit.pt \ + --lg=./data/lang_bpe_500/LG.pt \ + --tokens=./data/lang_bpe_500/tokens.txt \ + ./test_wavs/a_0_cacm-A70_31116.wav \ + ./test_wavs/a_0_cacm-A70_31117.wav \ + ./test_wavs/a_0_cacm-A70_31118.wav + +icefall-asr-xbmu-amdo31-pruned-transducer-stateless5-2022-11-29 +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. 
code-block:: bash + + # This model is trained using the XBMU-AMDO31 corpus + # + # See https://github.com/k2-fsa/icefall/pull/706 + # + GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/syzym/icefall-asr-xbmu-amdo31-pruned-transducer-stateless5-2022-11-29 + cd icefall-asr-xbmu-amdo31-pruned-transducer-stateless5-2022-11-29 + git lfs pull --include "data/lang_bpe_500/LG.pt" + git lfs pull --include "data/lang_bpe_500/tokens.txt" + git lfs pull --include "exp/cpu_jit-epoch-28-avg-23-torch-1.10.0.pt" + git lfs pull --include "test_wavs/a_0_cacm-A70_31116.wav" + git lfs pull --include "test_wavs/a_0_cacm-A70_31117.wav" + git lfs pull --include "test_wavs/a_0_cacm-A70_31118.wav" + + cd exp + rm cpu_jit.pt + ln -sv cpu_jit-epoch-28-avg-23-torch-1.10.0.pt cpu_jit.pt + cd .. + + for m in greedy_search modified_beam_search fast_beam_search; do + sherpa-offline \ + --decoding-method=$m \ + --nn-model=./exp/cpu_jit.pt \ + --tokens=./data/lang_bpe_500/tokens.txt \ + ./test_wavs/a_0_cacm-A70_31116.wav \ + ./test_wavs/a_0_cacm-A70_31117.wav \ + ./test_wavs/a_0_cacm-A70_31118.wav + done + + sherpa-offline \ + --decoding-method=fast_beam_search \ + --nn-model=./exp/cpu_jit.pt \ + --lg=./data/lang_bpe_500/LG.pt \ + --tokens=./data/lang_bpe_500/tokens.txt \ + ./test_wavs/a_0_cacm-A70_31116.wav \ + ./test_wavs/a_0_cacm-A70_31117.wav \ + ./test_wavs/a_0_cacm-A70_31118.wav diff --git a/docs/source/cpp/pretrained_models/online_transducer.rst b/docs/source/cpp/pretrained_models/online_transducer.rst new file mode 100644 index 000000000..65a136ff6 --- /dev/null +++ b/docs/source/cpp/pretrained_models/online_transducer.rst @@ -0,0 +1,396 @@ +.. _online_transducer_pretrained_models: + +Online transducer models +======================== + +.. hint:: + + We use the binary ``sherpa-online`` below for demonstration. + You can replace ``sherpa-online`` with ``sherpa-online-websocket-server`` + and ``sherpa-online-microphone``. + +.. hint:: + + At present, only streaming transducer models from `icefall`_ are supported. + +icefall +------- + +This section lists models trained using `icefall`_. + + +English +^^^^^^^ + +.. _icefall-asr-librispeech-streaming-zipformer-2023-05-17: + +icefall-asr-librispeech-streaming-zipformer-2023-05-17 +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: bash + + # This model is trained using LibriSpeech with zipformer transducer + # + # See https://github.com/k2-fsa/icefall/pull/1058 + # + # normal-scaled model, number of model parameters: 66110931, i.e., 66.11 M + # + GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/Zengwei/icefall-asr-librispeech-streaming-zipformer-2023-05-17 + cd icefall-asr-librispeech-streaming-zipformer-2023-05-17 + + git lfs pull --include "exp/jit_script_chunk_16_left_128.pt" + git lfs pull --include "data/lang_bpe_500/LG.pt" + + for m in greedy_search modified_beam_search fast_beam_search; do + sherpa-online \ + --decoding-method=$m \ + --nn-model=./exp/jit_script_chunk_16_left_128.pt \ + --tokens=./data/lang_bpe_500/tokens.txt \ + ./test_wavs/1089-134686-0001.wav \ + ./test_wavs/1221-135766-0001.wav \ + ./test_wavs/1221-135766-0002.wav + done + + # For fast_beam_search with LG + sherpa-online \ + --decoding-method=fast_beam_search \ + --nn-model=./exp/jit_script_chunk_16_left_128.pt \ + --lg=./data/lang_bpe_500/LG.pt \ + --tokens=./data/lang_bpe_500/tokens.txt \ + ./test_wavs/1089-134686-0001.wav \ + ./test_wavs/1221-135766-0001.wav \ + ./test_wavs/1221-135766-0002.wav + +.. 
_icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29: + +icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29 +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: bash + + # This model is trained using LibriSpeech with streaming zipformer transducer + # + # See https://github.com/k2-fsa/icefall/pull/787 + # + GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/Zengwei/icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29 + cd icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29 + + git lfs pull --include "exp/cpu_jit.pt" + git lfs pull --include "data/lang_bpe_500/LG.pt" + + for m in greedy_search modified_beam_search fast_beam_search; do + sherpa-online \ + --decoding-method=$m \ + --nn-model=./exp/cpu_jit.pt \ + --tokens=./data/lang_bpe_500/tokens.txt \ + ./test_wavs/1089-134686-0001.wav \ + ./test_wavs/1221-135766-0001.wav \ + ./test_wavs/1221-135766-0002.wav + done + + # For fast_beam_search with LG + sherpa-online \ + --decoding-method=fast_beam_search \ + --nn-model=./exp/cpu_jit.pt \ + --lg=./data/lang_bpe_500/LG.pt \ + --tokens=./data/lang_bpe_500/tokens.txt \ + ./test_wavs/1089-134686-0001.wav \ + ./test_wavs/1221-135766-0001.wav \ + ./test_wavs/1221-135766-0002.wav + +icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05 +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: bash + + # This model is trained using LibriSpeech with ConvEmformer transducer + # + # See https://github.com/k2-fsa/icefall/pull/440 + # + GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/csukuangfj/icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05 + cd icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05 + + git lfs pull --include "exp/cpu-jit-epoch-30-avg-10-torch-1.10.0.pt" + git lfs pull --include "data/lang_bpe_500/LG.pt" + cd exp + ln -sv cpu-jit-epoch-30-avg-10-torch-1.10.0.pt cpu_jit.pt + cd .. + + for m in greedy_search modified_beam_search fast_beam_search; do + sherpa-online \ + --decoding-method=$m \ + --nn-model=./exp/cpu_jit.pt \ + --tokens=./data/lang_bpe_500/tokens.txt \ + ./test_wavs/1089-134686-0001.wav \ + ./test_wavs/1221-135766-0001.wav \ + ./test_wavs/1221-135766-0002.wav + done + + # For fast_beam_search with LG + + ./build/bin/sherpa-online \ + --decoding-method=fast_beam_search \ + --nn-model=./exp/cpu_jit.pt \ + --lg=./data/lang_bpe_500/LG.pt \ + --tokens=./data/lang_bpe_500/tokens.txt \ + ./test_wavs/1089-134686-0001.wav \ + ./test_wavs/1221-135766-0001.wav \ + ./test_wavs/1221-135766-0002.wav + +icefall-asr-librispeech-lstm-transducer-stateless2-2022-09-03 +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. 
code-block:: bash + + # This model is trained using LibriSpeech with LSTM transducer + # + # See https://github.com/k2-fsa/icefall/pull/558 + # + GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/csukuangfj/icefall-asr-librispeech-lstm-transducer-stateless2-2022-09-03 + cd icefall-asr-librispeech-lstm-transducer-stateless2-2022-09-03 + + git lfs pull --include "exp/encoder_jit_trace-iter-468000-avg-16.pt" + git lfs pull --include "exp/decoder_jit_trace-iter-468000-avg-16.pt" + git lfs pull --include "exp/joiner_jit_trace-iter-468000-avg-16.pt" + git lfs pull --include "data/lang_bpe_500/LG.pt" + + cd exp + ln -sv encoder_jit_trace-iter-468000-avg-16.pt encoder_jit_trace.pt + ln -sv decoder_jit_trace-iter-468000-avg-16.pt decoder_jit_trace.pt + ln -sv joiner_jit_trace-iter-468000-avg-16.pt joiner_jit_trace.pt + cd .. + + for m in greedy_search modified_beam_search fast_beam_search; do + sherpa-online \ + --decoding-method=$m \ + --encoder-model=./exp/encoder_jit_trace.pt \ + --decoder-model=./exp/decoder_jit_trace.pt \ + --joiner-model=./exp/joiner_jit_trace.pt \ + --tokens=./data/lang_bpe_500/tokens.txt \ + ./test_wavs/1089-134686-0001.wav \ + ./test_wavs/1221-135766-0001.wav \ + ./test_wavs/1221-135766-0002.wav + done + + # For fast_beam_search with LG + sherpa-online \ + --decoding-method=fast_beam_search \ + --encoder-model=./exp/encoder_jit_trace.pt \ + --decoder-model=./exp/decoder_jit_trace.pt \ + --joiner-model=./exp/joiner_jit_trace.pt \ + --lg=./data/lang_bpe_500/LG.pt \ + --tokens=./data/lang_bpe_500/tokens.txt \ + ./test_wavs/1089-134686-0001.wav \ + ./test_wavs/1221-135766-0001.wav \ + ./test_wavs/1221-135766-0002.wav + +icefall-asr-librispeech-pruned-stateless-emformer-rnnt2-2022-06-01 +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: bash + + # This model is trained using LibriSpeech with Emformer transducer + # + # See https://github.com/k2-fsa/icefall/pull/390 + # + GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/csukuangfj/icefall-asr-librispeech-pruned-stateless-emformer-rnnt2-2022-06-01 + cd icefall-asr-librispeech-pruned-stateless-emformer-rnnt2-2022-06-01 + + git lfs pull --include "exp/cpu_jit-epoch-39-avg-6-use-averaged-model-1.pt" + git lfs pull --include "data/lang_bpe_500/LG.pt" + cd exp + ln -sv cpu_jit-epoch-39-avg-6-use-averaged-model-1.pt cpu_jit.pt + cd .. + + for m in greedy_search modified_beam_search fast_beam_search; do + sherpa-online \ + --decoding-method=$m \ + --nn-model=./exp/cpu_jit.pt \ + --tokens=./data/lang_bpe_500/tokens.txt \ + ./test_wavs/1089-134686-0001.wav \ + ./test_wavs/1221-135766-0001.wav \ + ./test_wavs/1221-135766-0002.wav + done + + # For fast_beam_search with LG + + sherpa-online \ + --decoding-method=fast_beam_search \ + --nn-model=./exp/cpu_jit.pt \ + --lg=./data/lang_bpe_500/LG.pt \ + --tokens=./data/lang_bpe_500/tokens.txt \ + ./test_wavs/1089-134686-0001.wav \ + ./test_wavs/1221-135766-0001.wav \ + ./test_wavs/1221-135766-0002.wav + + +icefall_librispeech_streaming_pruned_transducer_stateless4_20220625 +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. 
code-block:: bash + + # This model is trained using LibriSpeech with Conformer transducer + # + # See https://github.com/k2-fsa/icefall/pull/440 + # + GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/csukuangfj/icefall_librispeech_streaming_pruned_transducer_stateless4_20220625 + cd icefall_librispeech_streaming_pruned_transducer_stateless4_20220625 + + git lfs pull --include "exp/cpu_jit-epoch-25-avg-3.pt" + git lfs pull --include "data/lang_bpe_500/LG.pt" + cd exp + ln -sv cpu_jit-epoch-25-avg-3.pt cpu_jit.pt + cd .. + + for m in greedy_search modified_beam_search fast_beam_search; do + sherpa-online \ + --decoding-method=$m \ + --nn-model=./exp/cpu_jit.pt \ + --tokens=./data/lang_bpe_500/tokens.txt \ + ./test_waves/1089-134686-0001.wav \ + ./test_waves/1221-135766-0001.wav \ + ./test_waves/1221-135766-0002.wav + done + + # For fast_beam_search with LG + + sherpa-online \ + --decoding-method=fast_beam_search \ + --nn-model=./exp/cpu_jit.pt \ + --lg=./data/lang_bpe_500/LG.pt \ + --tokens=./data/lang_bpe_500/tokens.txt \ + ./test_waves/1089-134686-0001.wav \ + ./test_waves/1221-135766-0001.wav \ + ./test_waves/1221-135766-0002.wav + +Chinese +^^^^^^^ + +icefall-asr-zipformer-wenetspeech-20230615 +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: bash + + # This model is trained using WenetSpeech + # + # See https://github.com/k2-fsa/icefall/pull/1130 + # + GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/pkufool/icefall-asr-zipformer-streaming-wenetspeech-20230615 + cd icefall-asr-zipformer-streaming-wenetspeech-20230615 + + git lfs pull --include exp/jit_script_chunk_16_left_128.pt + git lfs pull --include "data/lang_char/LG.pt" + + for m in greedy_search modified_beam_search fast_beam_search; do + sherpa-online \ + --decoding-method=$m \ + --nn-model=./exp/jit_script_chunk_16_left_128.pt \ + --tokens=./data/lang_char/tokens.txt \ + ./test_wavs/DEV_T0000000000.wav \ + ./test_wavs/DEV_T0000000001.wav \ + ./test_wavs/DEV_T0000000002.wav + done + + # For fast_beam_search with LG + + sherpa-online \ + --decoding-method=fast_beam_search \ + --nn-model=./exp/jit_script_chunk_16_left_128.pt \ + --lg=./data/lang_char/LG.pt \ + --tokens=./data/lang_char/tokens.txt \ + ./test_wavs/DEV_T0000000000.wav \ + ./test_wavs/DEV_T0000000001.wav \ + ./test_wavs/DEV_T0000000002.wav + +icefall_asr_wenetspeech_pruned_transducer_stateless5_streaming +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: bash + + # This model is trained using WenetSpeech with Conformer transducer + # + # See https://github.com/k2-fsa/icefall/pull/447 + # + GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/csukuangfj/icefall_asr_wenetspeech_pruned_transducer_stateless5_streaming + cd icefall_asr_wenetspeech_pruned_transducer_stateless5_streaming + + git lfs pull --include "exp/cpu_jit_epoch_7_avg_1_torch.1.7.1.pt" + git lfs pull --include "data/lang_char/LG.pt" + cd exp + ln -sv cpu_jit_epoch_7_avg_1_torch.1.7.1.pt cpu_jit.pt + cd .. 
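+
+   # Note: the decoding commands below pass --nn-model=./exp/cpu_jit.pt,
+   # which is why the symlink above points the generic name cpu_jit.pt
+   # at the downloaded checkpoint; it does not modify the model itself.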
+ + for m in greedy_search modified_beam_search fast_beam_search; do + sherpa-online \ + --decoding-method=$m \ + --nn-model=./exp/cpu_jit.pt \ + --tokens=./data/lang_char/tokens.txt \ + ./test_wavs/DEV_T0000000000.wav \ + ./test_wavs/DEV_T0000000001.wav \ + ./test_wavs/DEV_T0000000002.wav + done + + # For fast_beam_search with LG + + sherpa-online \ + --decoding-method=fast_beam_search \ + --nn-model=./exp/cpu_jit.pt \ + --lg=./data/lang_char/LG.pt \ + --tokens=./data/lang_char/tokens.txt \ + ./test_wavs/DEV_T0000000000.wav \ + ./test_wavs/DEV_T0000000001.wav \ + ./test_wavs/DEV_T0000000002.wav + +Chinese + English (all-in-one) +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +pfluo/k2fsa-zipformer-chinese-english-mixed +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +It is a `streaming zipformer model `_ + +.. code-block:: bash + + # This model supports both Chinese and English + GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/csukuangfj/k2fsa-zipformer-chinese-english-mixed + cd k2fsa-zipformer-chinese-english-mixed + git lfs pull --include "exp/cpu_jit.pt" + + for m in greedy_search modified_beam_search fast_beam_search; do + sherpa-online \ + --decoding-method=$m \ + --nn-model=./exp/cpu_jit.pt \ + --tokens=./data/lang_char_bpe/tokens.txt \ + ./test_wavs/0.wav \ + ./test_wavs/1.wav \ + ./test_wavs/2.wav \ + ./test_wavs/3.wav \ + ./test_wavs/4.wav + done + +icefall-asr-conv-emformer-transducer-stateless2-zh +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +It is a `ConvEmformer model `_ + +.. code-block:: bash + + # This model supports both Chinese and English + GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/ptrnull/icefall-asr-conv-emformer-transducer-stateless2-zh + cd icefall-asr-conv-emformer-transducer-stateless2-zh + git lfs pull --include "exp/cpu_jit-epoch-11-avg-1.pt" + cd exp + ln -sv cpu_jit-epoch-11-avg-1.pt cpu_jit.pt + cd .. + + for m in greedy_search modified_beam_search fast_beam_search; do + sherpa-online \ + --decoding-method=$m \ + --nn-model=./exp/cpu_jit.pt \ + --tokens=./data/lang_char_bpe/tokens.txt \ + ./test_wavs/0.wav \ + ./test_wavs/1.wav \ + ./test_wavs/2.wav \ + ./test_wavs/3.wav \ + ./test_wavs/4.wav + done diff --git a/docs/source/python/huggingface/index.rst b/docs/source/huggingface/index.rst similarity index 80% rename from docs/source/python/huggingface/index.rst rename to docs/source/huggingface/index.rst index 5e24baf5f..11a6f3d7c 100644 --- a/docs/source/python/huggingface/index.rst +++ b/docs/source/huggingface/index.rst @@ -1,10 +1,9 @@ .. _try sherpa with huggingface: -Try sherpa with Huggingface -=========================== +Run Next-gen Kaldi in your browser +================================== -This page describes how to use `sherpa`_ for automatic speech recognition -with `Huggingface`_. +This page describes how to try Next-gen Kaldi in your browser. .. hint:: @@ -15,8 +14,9 @@ The server is running on CPU within a docker container provided by `Huggingface`_ and you use a browser to interact with it. The browser can be run on Windows, macOS, Linux, or even on your phone or iPad. -You can either upload a file for recognition or record your speech via -a microphone from within the browser and submit it for recognition. +You can upload a file for recognition, record your speech via +a microphone from within the browser and submit it for recognition, or even +provider an URL to an audio file for speech recognition. Now let's get started. 
@@ -33,6 +33,12 @@ and you will see a page like the following screenshot: :alt: screenshot of ``_ :target: https://huggingface.co/spaces/k2-fsa/automatic-speech-recognition +.. hint:: + + If you don't have access to `Huggingface`_, please visit the following mirror: + + ``_ + You can: 1. Select a language for recognition. Currently, we provide pre-trained models @@ -76,3 +82,9 @@ We provide the following YouTube video demonstrating how to use ``_ .. youtube:: ElN3r9dkKE4 + +Other Huggingface spaces +------------------------ + +- ASR + WebAssembly + sherpa-ncnn: Please see :ref:`try sherpa ncnn wasm with huggingface` +- TTS: Please see: ``_ diff --git a/docs/source/python/huggingface/pic/hugging-face-sherpa-2.png b/docs/source/huggingface/pic/hugging-face-sherpa-2.png similarity index 100% rename from docs/source/python/huggingface/pic/hugging-face-sherpa-2.png rename to docs/source/huggingface/pic/hugging-face-sherpa-2.png diff --git a/docs/source/python/huggingface/pic/hugging-face-sherpa-3.png b/docs/source/huggingface/pic/hugging-face-sherpa-3.png similarity index 100% rename from docs/source/python/huggingface/pic/hugging-face-sherpa-3.png rename to docs/source/huggingface/pic/hugging-face-sherpa-3.png diff --git a/docs/source/python/huggingface/pic/hugging-face-sherpa.png b/docs/source/huggingface/pic/hugging-face-sherpa.png similarity index 100% rename from docs/source/python/huggingface/pic/hugging-face-sherpa.png rename to docs/source/huggingface/pic/hugging-face-sherpa.png diff --git a/docs/source/index.rst b/docs/source/index.rst index 16c344131..25a408348 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -6,33 +6,37 @@ sherpa ====== -`sherpa`_ is a framework for streaming and non-streaming -automatic speech recognition (ASR). +.. toctree:: + :maxdepth: 2 -CPU-bound tasks, such as neural network computation, are implemented in -C++; while IO-bound tasks, such as socket communication, are implemented -in Python with `asyncio`_. + ./intro.rst + ./pdf.rst + ./social-groups.rst + ./huggingface/index.rst + ./pretrained-models.rst -Python is responsible for managing threads, which call into C++ extensions -with the `global interpreter lock (GIL) `_ -released so that multiple threads can run concurrently. -The following content describes how to install `sherpa`_ and its usage -for both streaming ASR and offline ASR (i.e., non-streaming ASR). +.. toctree:: + :maxdepth: 5 + :caption: k2-fsa/sherpa + + ./sherpa/index .. toctree:: - :maxdepth: 2 - :caption: For Python users + :maxdepth: 5 + :caption: k2-fsa/sherpa-ncnn - ./python/huggingface/index - ./python/installation/index - ./python/streaming_asr/index - ./python/offline_asr/index - ./python/faq + ./ncnn/index .. toctree:: - :maxdepth: 2 - :caption: For C++ users + :maxdepth: 5 + :caption: k2-fsa/sherpa-onnx + + ./onnx/index + +.. toctree:: + :maxdepth: 5 + :caption: Triton + + ./triton/overview - ./cpp/installation/index - ./cpp/offline_asr/index diff --git a/docs/source/intro.rst b/docs/source/intro.rst new file mode 100644 index 000000000..3c6f4a2cc --- /dev/null +++ b/docs/source/intro.rst @@ -0,0 +1,88 @@ +Introduction +============ + +`sherpa`_ is the deployment framework of the ``Next-gen Kaldi`` project. + +`sherpa`_ supports deploying speech related pre-trained models on various platforms +with various language bindings. + +If you are interested in how to train your own model or fine tune a pre-trained +model, please refer to `icefall`_. 
+ +At present, `sherpa`_ has the following sub-projects: + + - `k2-fsa/sherpa`_ + - `k2-fsa/sherpa-onnx`_ + - `k2-fsa/sherpa-ncnn`_ + + +The differences are compared below: + +.. list-table:: + + * - **** + - `k2-fsa/sherpa`_ + - `k2-fsa/sherpa-onnx`_ + - `k2-fsa/sherpa-ncnn`_ + * - Installation difficulty + - **hard** + - ``easy`` + - ``easy`` + * - NN lib + - `PyTorch`_ + - `onnxruntime`_ + - `ncnn`_ + * - CPU Support + - x86, x86_64 + - | x86, x86_64, + | ``arm32``, ``arm64`` + - | x86, x86_64, + | ``arm32``, ``arm64``, + | ``**RISC-V**`` + * - GPU Support + - | Yes + | (with ``CUDA`` for NVIDIA GPUs) + - Yes + - | Yes + | (with ``Vulkan`` for ARM GPUs) + * - OS Support + - | Linux, Windows, + | macOS + - | Linux, Windows, + | macOS, ``iOS``, + | ``Android`` + - | Linux, Windows, + | macOS, ``iOS``, + | ``Android`` + * - Support batch_size > 1 + - Yes + - Yes + - ``No`` + * - Provided APIs + - C++, Python + - | C, C++, Python, + | C#, Java, Kotlin, + | Swift, Go, + | JavaScript, Dart + | Pascal, Rust + - | C, C++, Python, + | C#, Kotlin, + | Swift, Go + * - Supported functions + - | streaming speech recognition, + | non-streaming speech recognition + - | streaming speech recognition, + | non-streaming speech recognition, + | text-to-speech, + | speaker diarization, + | speaker identification, + | speaker verification, + | spoken language identification, + | audio tagging, + | VAD, + | keyword spotting, + - | streaming speech recognition, + | VAD, + + +We also support `Triton`_. Please see :ref:`triton_overview`. diff --git a/docs/source/ncnn/android/build-sherpa-ncnn.rst b/docs/source/ncnn/android/build-sherpa-ncnn.rst new file mode 100644 index 000000000..109d71686 --- /dev/null +++ b/docs/source/ncnn/android/build-sherpa-ncnn.rst @@ -0,0 +1,385 @@ +.. _sherpa-ncnn-install-android-studio: + +Build sherpa-ncnn for Android +============================= + +Install Android Studio +---------------------- + +The first step is to download and install Android Studio. + +Please refer to ``_ for how to install +Android Studio. + +.. hint:: + + Any recent version of Android Studio should work fine. Also, you can use + the default settings of Android Studio during installation. + + For reference, we post the version we are using below: + + .. image:: ./pic/android-studio-version.png + :alt: screenshot of my version of Android Studio + :width: 600 + + +Download sherpa-ncnn +-------------------- + +Next, download the source code of `sherpa-ncnn`_: + +.. code-block:: bash + + git clone https://github.com/k2-fsa/sherpa-ncnn + + +Install NDK +----------- + +Step 1, start Android Studio. + + .. figure:: ./pic/start-android-studio.png + :alt: Start Android Studio + :width: 600 + + Step 1: Click ``Open`` to select ``sherpa-ncnn/android/SherpaNcnn`` + +Step 2, Open ``sherpa-ncnn/android/SherpaNcnn``. + + .. figure:: ./pic/open-sherpa-ncnn.png + :alt: Open SherpaNCNN + :width: 600 + + Step 2: Open ``SherpaNcnn``. + + +Step 3, Select ``Tools -> SDK Manager``. + + .. figure:: ./pic/select-sdk-manager.png + :alt: Select Tools -> SDK Manager + :width: 600 + + Step 3: Select ``Tools -> SDK Manager``. + +Step 4, ``Install NDK``. + + .. figure:: ./pic/ndk-tools.png + :alt: Install NDK + :width: 600 + + Step 4: Install NDK. + +In the following, we assume ``Android SDK location`` was set to +``/Users/fangjun/software/my-android``. You can change it accordingly below. + +After installing NDK, you can find it in + +.. code-block:: + + /Users/fangjun/software/my-android/ndk/22.1.7171670 + +.. 
warning:: + + If you selected a different version of NDK, please replace ``22.1.7171670`` + accordingly. + +Next, let us set the environment variable ``ANDROID_NDK`` for later use. + +.. code-block:: bash + + export ANDROID_NDK=/Users/fangjun/software/my-android/ndk/22.1.7171670 + +.. note:: + + Note from https://github.com/Tencent/ncnn/wiki/how-to-build#build-for-android + + (Important) remove the hardcoded debug flag in Android NDK to fix + the android-ndk issue: https://github.com/android/ndk/issues/243 + + 1. open ``$ANDROID_NDK/build/cmake/android.toolchain.cmake`` for ndk < r23 + or ``$ANDROID_NDK/build/cmake/android-legacy.toolchain.cmake`` for ndk >= r23 + + 2. delete the line containing "-g" + + .. code-block:: + + list(APPEND ANDROID_COMPILER_FLAGS + -g + -DANDROID + +.. caution:: + + If you don't delete the line containin ``-g`` above, the generated + library ``libncnn.so`` can be as large as ``21 MB`` or even larger! + +Build sherpa-ncnn (C++ code) +---------------------------- + +After installing ``NDK``, it is time to build the C++ code of `sherpa-ncnn`_. + +In the following, we show how to build `sherpa-ncnn`_ for the following +Android ABIs: + + - ``arm64-v8a`` + - ``armeabi-v7a`` + - ``x86_64`` + - ``x86`` + +.. caution:: + + You only need to select one and only one ABI. ``arm64-v8a`` is probably the + most common one. + + If you want to test the app on an emulator, you probably need ``x86_64``. + +.. hint:: + + Building scripts for this section are for macOS and Linux. If you are + using Windows or if you don't want to build the shared libraries by yourself, + you can download pre-compiled shared libraries for this section by visiting + + ``_ + +.. hint:: + + We provide a colab notebook + |build sherpa-ncnn for android colab notebook| + for you to try this section step by step. + + If you are using Windows or you don't want to setup your local environment + to build the C++ libraries, please use the above colab notebook. + +.. |build sherpa-ncnn for android colab notebook| image:: https://colab.research.google.com/assets/colab-badge.svg + :target: https://github.com/k2-fsa/colab/blob/master/sherpa-ncnn/build_sherpa_ncnn_for_android.ipynb + +Build for arm64-v8a +^^^^^^^^^^^^^^^^^^^ + +.. code-block:: bash + + cd sherpa-ncnn # Go to the root repo + ./build-android-arm64-v8a.sh + +After building, you will find the following shared libraries: + +.. code-block:: bash + + $ ls -lh build-android-arm64-v8a/install/lib/lib*.so + -rwxr-xr-x 1 fangjun staff 848K Dec 18 16:49 build-android-arm64-v8a/install/lib/libkaldi-native-fbank-core.so + -rwxr-xr-x 1 fangjun staff 3.4M Dec 18 16:49 build-android-arm64-v8a/install/lib/libncnn.so + -rwxr-xr-x 1 fangjun staff 195K Dec 18 16:49 build-android-arm64-v8a/install/lib/libsherpa-ncnn-core.so + -rwxr-xr-x 1 fangjun staff 19K Dec 18 16:49 build-android-arm64-v8a/install/lib/libsherpa-ncnn-jni.so + +Please copy them to ``android/SherpaNcnn/app/src/main/jniLibs/arm64-v8a/``: + +.. code-block:: bash + + $ cp build-android-arm64-v8a/install/lib/lib*.so android/SherpaNcnn/app/src/main/jniLibs/arm64-v8a/ + +You should see the following screen shot after running the above copy ``cp`` command. + +.. figure:: ./pic/so-libs-for-arm64-v8a.png + :alt: Generated shared libraries for arm64-v8a + :width: 600 + +.. 
note:: + + If you have ``Android >= 7.0`` and want to run `sherpa-ncnn`_ on GPU, please replace + ``./build-android-arm64-v8a.sh`` with ``./build-android-arm64-v8a-with-vulkan.sh`` + and replace ``build-android-arm64-v8a/install/lib/lib*.so`` with + ``./build-android-arm64-v8a-with-vulkan/install/lib/lib*.so``. That is all + you need to do and you don't need to change any code. + + + Also, you need to install Vulkan sdk. Please see + ``_ + for details. + + + +Build for armeabi-v7a +^^^^^^^^^^^^^^^^^^^^^ + +.. code-block:: bash + + cd sherpa-ncnn # Go to the root repo + ./build-android-armv7-eabi.sh + +After building, you will find the following shared libraries: + +.. code-block:: bash + + $ ls -lh build-android-armv7-eabi/install/lib/lib*.so + -rwxr-xr-x 1 fangjun staff 513K Dec 18 17:04 build-android-armv7-eabi/install/lib/libkaldi-native-fbank-core.so + -rwxr-xr-x 1 fangjun staff 1.9M Dec 18 17:04 build-android-armv7-eabi/install/lib/libncnn.so + -rwxr-xr-x 1 fangjun staff 163K Dec 18 17:04 build-android-armv7-eabi/install/lib/libsherpa-ncnn-core.so + -rwxr-xr-x 1 fangjun staff 28K Dec 18 17:04 build-android-armv7-eabi/install/lib/libsherpa-ncnn-jni.so + +Please copy them to ``android/SherpaNcnn/app/src/main/jniLibs/armeabi-v7a/``: + +.. code-block:: bash + + cp build-android-armv7-eabi/install/lib/lib*.so android/SherpaNcnn/app/src/main/jniLibs/armeabi-v7a/ + +You should see the following screen shot after running the above copy ``cp`` command. + +.. figure:: ./pic/so-libs-for-armeabi-v7a.png + :alt: Generated shared libraries for armeabi-v7a + :width: 600 + +Build for x86_64 +^^^^^^^^^^^^^^^^ + +.. code-block:: bash + + cd sherpa-ncnn # Go to the root repo + ./build-android-x86-64.sh + +After building, you will find the following shared libraries: + +.. code-block:: bash + + $ ls -lh build-android-x86-64/install/lib/lib*.so + -rwxr-xr-x 1 fangjun staff 901K Dec 18 17:14 build-android-x86-64/install/lib/libkaldi-native-fbank-core.so + -rwxr-xr-x 1 fangjun staff 6.9M Dec 18 17:14 build-android-x86-64/install/lib/libncnn.so + -rwxr-xr-x 1 fangjun staff 208K Dec 18 17:14 build-android-x86-64/install/lib/libsherpa-ncnn-core.so + -rwxr-xr-x 1 fangjun staff 19K Dec 18 17:14 build-android-x86-64/install/lib/libsherpa-ncnn-jni.so + +Please copy them to ``android/SherpaNcnn/app/src/main/jniLibs/x86_64/``: + +.. code-block:: bash + + cp build-android-x86-64/install/lib/lib*.so android/SherpaNcnn/app/src/main/jniLibs/x86_64/ + +You should see the following screen shot after running the above copy ``cp`` command. + +.. figure:: ./pic/so-libs-for-x86-64.png + :alt: Generated shared libraries for x86_64 + :width: 600 + +Build for x86 +^^^^^^^^^^^^^ + +.. code-block:: bash + + cd sherpa-ncnn # Go to the root repo + ./build-android-x86.sh + +Download pre-trained models +--------------------------- + +Please read :ref:`sherpa-ncnn-pre-trained-models` for all available pre-trained +models. + +In the following, we use a pre-trained model from +``_, +which supports both Chinese and English. + +.. hint:: + + The model is trained using `icefall`_ and the original torchscript model + is from ``_. + +Use the following command to download the pre-trained model and place it into +``android/SherpaNcnn/app/src/main/assets/``: + +.. 
code-block:: bash + + cd android/SherpaNcnn/app/src/main/assets/ + + sudo apt-get install git-lfs + + GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/csukuangfj/sherpa-ncnn-conv-emformer-transducer-2022-12-06 + cd sherpa-ncnn-conv-emformer-transducer-2022-12-06 + git lfs pull --include "*.bin" + + # Now, remove extra files to reduce the file size of the generated apk + rm -rf .git test_wavs scripts/ + rm export-for-ncnn.sh *.png README.md + +In the end, you should have the following files: + +.. code-block:: bash + + $ ls -lh + total 525224 + -rw-r--r-- 1 fangjun staff 5.9M Dec 18 17:40 decoder_jit_trace-pnnx.ncnn.bin + -rw-r--r-- 1 fangjun staff 439B Dec 18 17:39 decoder_jit_trace-pnnx.ncnn.param + -rw-r--r-- 1 fangjun staff 141M Dec 18 17:40 encoder_jit_trace-pnnx.ncnn.bin + -rw-r--r-- 1 fangjun staff 99M Dec 18 17:40 encoder_jit_trace-pnnx.ncnn.int8.bin + -rw-r--r-- 1 fangjun staff 78K Dec 18 17:40 encoder_jit_trace-pnnx.ncnn.int8.param + -rw-r--r-- 1 fangjun staff 79K Dec 18 17:39 encoder_jit_trace-pnnx.ncnn.param + -rw-r--r-- 1 fangjun staff 6.9M Dec 18 17:40 joiner_jit_trace-pnnx.ncnn.bin + -rw-r--r-- 1 fangjun staff 3.5M Dec 18 17:40 joiner_jit_trace-pnnx.ncnn.int8.bin + -rw-r--r-- 1 fangjun staff 498B Dec 18 17:40 joiner_jit_trace-pnnx.ncnn.int8.param + -rw-r--r-- 1 fangjun staff 490B Dec 18 17:39 joiner_jit_trace-pnnx.ncnn.param + -rw-r--r-- 1 fangjun staff 53K Dec 18 17:39 tokens.txt + + $ du -h -d1 . + 256M . + +You should see the following screen shot after downloading the pre-trained model: + +.. figure:: ./pic/pre-trained-model-2022-12-06.png + :alt: Files after downloading the pre-trained model + :width: 600 + +.. hint:: + + If you select a different pre-trained model, make sure that you also change the + corresponding code listed in the following screen shot: + + .. figure:: ./pic/type-for-pre-trained-model-2022-12-06.png + :alt: Change code if you select a different model + :width: 600 + +Generate APK +------------ + +Finally, it is time to build `sherpa-ncnn`_ to generate an APK package. + +Select ``Build -> Make Project``, as shown in the following screen shot. + +.. figure:: ./pic/build-make-project.png + :alt: Select ``Build -> Make Project`` + :width: 600 + +You can find the generated APK in ``android/SherpaNcnn/app/build/outputs/apk/debug/app-debug.apk``: + +.. code-block:: bash + + $ ls -lh android/SherpaNcnn/app/build/outputs/apk/debug/app-debug.apk + -rw-r--r-- 1 fangjun staff 152M Dec 18 17:53 android/SherpaNcnn/app/build/outputs/apk/debug/app-debug.apk + +Congratulations! You have successfully built an APK for Android. + +Read below to learn more. + +.. _sherpa-ncnn-analyze-apk-result: + +Analyze the APK +--------------- + +.. figure:: ./pic/analyze-apk.png + :alt: Select ``Build -> Analyze APK ...`` + :width: 600 + +Select ``Build -> Analyze APK ...`` in the above screen shot, in the +popped-up dialog select the generated APK ``app-debug.apk``, +and you will see the following screen shot: + +.. figure:: ./pic/analyze-apk-result.png + :alt: Result of analyzing apk + :width: 700 + +You can see from the above screen shot that most part of the APK +is occupied by the pre-trained model, while the runtime, including the shared +libraries, is only ``1.7 MB``. + +.. hint:: + + We have pre-built APKs that can be downloaded from + ``_ + + Please refer to demo videos about using the above + APKs: :ref:`sherpa-ncnn-android-video-demos`. 
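+
+.. hint::
+
+   If you prefer the command line, you can also inspect the APK without
+   Android Studio. The commands below are only a rough sketch (they assume
+   the debug APK path shown above); since an APK is a zip archive, ``unzip``
+   is enough to list its contents:
+
+   .. code-block:: bash
+
+      cd android/SherpaNcnn/app/build/outputs/apk/debug
+
+      # List every entry in the APK together with its uncompressed size
+      unzip -l app-debug.apk
+
+      # Show only the bundled model files (assets/) and shared libraries (lib/)
+      unzip -l app-debug.apk | grep -E "assets/|lib/"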
diff --git a/docs/source/ncnn/android/demo-videos.rst b/docs/source/ncnn/android/demo-videos.rst new file mode 100644 index 000000000..4e53c0480 --- /dev/null +++ b/docs/source/ncnn/android/demo-videos.rst @@ -0,0 +1,54 @@ +.. _sherpa-ncnn-android-video-demos: + +Video demos +=========== + +In this page, we list some videos about using `sherpa-ncnn`_ for +real-time speech recognition on Android. + +.. hint:: + + You can find pre-built ``APK`` packages used by the following videos at: + + ``_ + + - CPU versions require ``Android >= 5.0`` + - GPU versions with ``Vulkan`` require ``Android >= 7.0`` + +.. note:: + + You can also find the latest APK for each release at + + ``_ + + + +Video 1: Chinese +---------------- + +.. raw:: html + + + + +Video 2: Chinese + English +-------------------------- + +.. raw:: html + + + + +Video 3: Chinese with background noise +-------------------------------------- + +.. raw:: html + + + +Video 4: Chinese poem with background music +------------------------------------------- + +.. raw:: html + + diff --git a/docs/source/ncnn/android/index.rst b/docs/source/ncnn/android/index.rst new file mode 100644 index 000000000..107be0b9c --- /dev/null +++ b/docs/source/ncnn/android/index.rst @@ -0,0 +1,19 @@ +.. _sherpa-ncnn-android: + +Android +======= + +In this section, we describe how to build an Android app for ``real-time`` speech +recognition with `sherpa-ncnn`_. We also provide real-time speech recognition +video demos. + +.. hint:: + + During speech recognition, it does not need to access the Internet. + Everyting is processed locally on your phone. + +.. toctree:: + :maxdepth: 2 + + demo-videos + build-sherpa-ncnn diff --git a/docs/source/ncnn/android/pic/analyze-apk-result.png b/docs/source/ncnn/android/pic/analyze-apk-result.png new file mode 100644 index 000000000..23a701036 Binary files /dev/null and b/docs/source/ncnn/android/pic/analyze-apk-result.png differ diff --git a/docs/source/ncnn/android/pic/analyze-apk.png b/docs/source/ncnn/android/pic/analyze-apk.png new file mode 100644 index 000000000..81d8e2edd Binary files /dev/null and b/docs/source/ncnn/android/pic/analyze-apk.png differ diff --git a/docs/source/ncnn/android/pic/android-studio-version.png b/docs/source/ncnn/android/pic/android-studio-version.png new file mode 100644 index 000000000..d2f25efee Binary files /dev/null and b/docs/source/ncnn/android/pic/android-studio-version.png differ diff --git a/docs/source/ncnn/android/pic/build-make-project.png b/docs/source/ncnn/android/pic/build-make-project.png new file mode 100644 index 000000000..80668fdd6 Binary files /dev/null and b/docs/source/ncnn/android/pic/build-make-project.png differ diff --git a/docs/source/ncnn/android/pic/ndk-tools.png b/docs/source/ncnn/android/pic/ndk-tools.png new file mode 100644 index 000000000..bd83f0a69 Binary files /dev/null and b/docs/source/ncnn/android/pic/ndk-tools.png differ diff --git a/docs/source/ncnn/android/pic/open-sherpa-ncnn.png b/docs/source/ncnn/android/pic/open-sherpa-ncnn.png new file mode 100644 index 000000000..5a6691118 Binary files /dev/null and b/docs/source/ncnn/android/pic/open-sherpa-ncnn.png differ diff --git a/docs/source/ncnn/android/pic/pre-trained-model-2022-12-06.png b/docs/source/ncnn/android/pic/pre-trained-model-2022-12-06.png new file mode 100644 index 000000000..f089f45c4 Binary files /dev/null and b/docs/source/ncnn/android/pic/pre-trained-model-2022-12-06.png differ diff --git a/docs/source/ncnn/android/pic/select-sdk-manager.png 
b/docs/source/ncnn/android/pic/select-sdk-manager.png new file mode 100644 index 000000000..76651563c Binary files /dev/null and b/docs/source/ncnn/android/pic/select-sdk-manager.png differ diff --git a/docs/source/ncnn/android/pic/so-libs-for-arm64-v8a.png b/docs/source/ncnn/android/pic/so-libs-for-arm64-v8a.png new file mode 100644 index 000000000..37ba75c82 Binary files /dev/null and b/docs/source/ncnn/android/pic/so-libs-for-arm64-v8a.png differ diff --git a/docs/source/ncnn/android/pic/so-libs-for-armeabi-v7a.png b/docs/source/ncnn/android/pic/so-libs-for-armeabi-v7a.png new file mode 100644 index 000000000..a5a76f93d Binary files /dev/null and b/docs/source/ncnn/android/pic/so-libs-for-armeabi-v7a.png differ diff --git a/docs/source/ncnn/android/pic/so-libs-for-x86-64.png b/docs/source/ncnn/android/pic/so-libs-for-x86-64.png new file mode 100644 index 000000000..a3c94b9b0 Binary files /dev/null and b/docs/source/ncnn/android/pic/so-libs-for-x86-64.png differ diff --git a/docs/source/ncnn/android/pic/start-android-studio.png b/docs/source/ncnn/android/pic/start-android-studio.png new file mode 100644 index 000000000..b683d3ff7 Binary files /dev/null and b/docs/source/ncnn/android/pic/start-android-studio.png differ diff --git a/docs/source/ncnn/android/pic/type-for-pre-trained-model-2022-12-06.png b/docs/source/ncnn/android/pic/type-for-pre-trained-model-2022-12-06.png new file mode 100644 index 000000000..e17bae3a3 Binary files /dev/null and b/docs/source/ncnn/android/pic/type-for-pre-trained-model-2022-12-06.png differ diff --git a/docs/source/ncnn/c-api/index.rst b/docs/source/ncnn/c-api/index.rst new file mode 100644 index 000000000..8684d3b2d --- /dev/null +++ b/docs/source/ncnn/c-api/index.rst @@ -0,0 +1,162 @@ +.. _sherpa-ncnn-c-api: + +C API +===== + +In this section, we describe how to use the C API of `sherpa-ncnn`_. + +Specifically, we will describe: + + - How to generate required files + - How to use ``pkg-config`` with `sherpa-ncnn`_ + +Generate required files +----------------------- + +Before using the C API of `sherpa-ncnn`_, we need to first build required +libraries. You can choose either to build static libraries or shared libraries. + +Build shared libraries +^^^^^^^^^^^^^^^^^^^^^^ + +Assume that we want to put library files and header files in the directory +``/tmp/sherpa-ncnn/shared``: + +.. code-block:: bash + + git clone https://github.com/k2-fsa/sherpa-ncnn + cd sherpa-ncnn + mkdir build-shared + cd build-shared + + cmake \ + -DSHERPA_NCNN_ENABLE_C_API=ON \ + -DCMAKE_BUILD_TYPE=Release \ + -DBUILD_SHARED_LIBS=ON \ + -DCMAKE_INSTALL_PREFIX=/tmp/sherpa-ncnn/shared \ + .. + + make -j6 + make install + +You should find the following files inside ``/tmp/sherpa-ncnn/shared``: + +.. tabs:: + + .. tab:: macOS + + .. code-block:: bash + + $ tree /tmp/sherpa-ncnn/shared/ + /tmp/sherpa-ncnn/shared/ + ├── bin + │   ├── sherpa-ncnn + │   └── sherpa-ncnn-microphone + ├── include + │   └── sherpa-ncnn + │   └── c-api + │   └── c-api.h + ├── lib + │   ├── libkaldi-native-fbank-core.dylib + │   ├── libncnn.dylib + │   ├── libsherpa-ncnn-c-api.dylib + │   └── libsherpa-ncnn-core.dylib + └── sherpa-ncnn.pc + + 5 directories, 8 files + + .. tab:: Linux + + .. 
code-block:: bash + + $ tree /tmp/sherpa-ncnn/shared/ + /tmp/sherpa-ncnn/shared/ + ├── bin + │   ├── sherpa-ncnn + │   └── sherpa-ncnn-microphone + ├── include + │   └── sherpa-ncnn + │   └── c-api + │   └── c-api.h + ├── lib + │   ├── libkaldi-native-fbank-core.so + │   ├── libncnn.so + │   ├── libsherpa-ncnn-c-api.so + │   └── libsherpa-ncnn-core.so + └── sherpa-ncnn.pc + + 5 directories, 8 files + +Build static libraries +^^^^^^^^^^^^^^^^^^^^^^ + +Assume that we want to put library files and header files in the directory +``/tmp/sherpa-ncnn/static``: + +.. code-block:: bash + + git clone https://github.com/k2-fsa/sherpa-ncnn + cd sherpa-ncnn + mkdir build-static + cd build-static + + cmake \ + -DSHERPA_NCNN_ENABLE_C_API=ON \ + -DCMAKE_BUILD_TYPE=Release \ + -DBUILD_SHARED_LIBS=OFF \ + -DCMAKE_INSTALL_PREFIX=/tmp/sherpa-ncnn/static \ + .. + + make -j6 + make install + +You should find the following files in ``/tmp/sherpa-ncnn/static``: + +.. code-block:: bash + + $ tree /tmp/sherpa-ncnn/static/ + /tmp/sherpa-ncnn/static/ + ├── bin + │   ├── sherpa-ncnn + │   └── sherpa-ncnn-microphone + ├── include + │   └── sherpa-ncnn + │   └── c-api + │   └── c-api.h + ├── lib + │   ├── libkaldi-native-fbank-core.a + │   ├── libncnn.a + │   ├── libsherpa-ncnn-c-api.a + │   └── libsherpa-ncnn-core.a + └── sherpa-ncnn.pc + + 5 directories, 8 files + +Build decode-file-c-api.c with generated files +---------------------------------------------- + +To build the following file: + + ``_ + +We can use: + +.. tabs:: + + .. tab:: static link + + .. code-block:: bash + + export PKG_CONFIG_PATH=/tmp/sherpa-ncnn/static:$PKG_CONFIG_PATH + + cd ./c-api-examples + gcc -o decode-file-c-api $(pkg-config --cflags sherpa-ncnn) ./decode-file-c-api.c $(pkg-config --libs sherpa-ncnn) + + .. tab:: dynamic link + + .. code-block:: bash + + export PKG_CONFIG_PATH=/tmp/sherpa-ncnn/shared:$PKG_CONFIG_PATH + + cd ./c-api-examples + gcc -o decode-file-c-api $(pkg-config --cflags sherpa-ncnn) ./decode-file-c-api.c $(pkg-config --libs sherpa-ncnn) diff --git a/docs/source/ncnn/endpoint.rst b/docs/source/ncnn/endpoint.rst new file mode 100644 index 000000000..ef0385a8f --- /dev/null +++ b/docs/source/ncnn/endpoint.rst @@ -0,0 +1,122 @@ +Endpointing +=========== + +We have three rules for endpoint detection. If any of them is activated, +we assume an endpoint is detected. + +.. note:: + + We borrow the implementation from + + ``_ + +Rule 1 +------ + +In ``Rule 1``, we count the duration of trailing silence. If it is larger than +a user specified value, ``Rule 1`` is activated. The following is an example, +which uses ``2.4 seconds`` as the threshold. + + .. figure:: ./pic/rule1.png + :alt: Rule 1 for endpoint detection + :width: 600 + +Two cases are given: + +(1) In the first case, nothing has been decoded when the duration of trailing + silence reaches 2.4 seconds. + +(2) In the second case, we first decode something before the duration of + trailing silence reaches 2.4 seconds. + +In both cases, ``Rule 1`` is activated. + +.. hint:: + + In the Python API, you can specify ``rule1_min_trailing_silence`` while + constructing an instance of ``sherpa_ncnn.Recognizer``. + + In the C++ API, you can specify ``rule1.min_trailing_silence`` when creating + ``EndpointConfig``. + + +Rule 2 +------ + +In ``Rule 2``, we require that it has to first decode something +before we count the trailing silence. 
In the following example, after decoding
+something, ``Rule 2`` is activated when the duration of trailing silence is
+larger than the user-specified value ``1.2`` seconds.
+
+.. figure:: ./pic/rule2.png
+   :alt: Rule 2 for endpoint detection
+   :width: 600
+
+.. hint::
+
+   In the Python API, you can specify ``rule2_min_trailing_silence`` while
+   constructing an instance of ``sherpa_ncnn.Recognizer``.
+
+   In the C++ API, you can specify ``rule2.min_trailing_silence`` when creating
+   ``EndpointConfig``.
+
+Rule 3
+------
+
+``Rule 3`` is activated when the utterance length in seconds is larger than
+a given value. In the following example, ``Rule 3`` is activated after the
+first segment reaches a given value, which is ``20`` seconds in this case.
+
+.. figure:: ./pic/rule3.png
+   :alt: Rule 3 for endpoint detection
+   :width: 600
+
+.. hint::
+
+   In the Python API, you can specify ``rule3_min_utterance_length`` while
+   constructing an instance of ``sherpa_ncnn.Recognizer``.
+
+   In the C++ API, you can specify ``rule3.min_utterance_length`` when creating
+   ``EndpointConfig``.
+
+.. note::
+
+   If you want to deactivate this rule, please provide a very large value
+   for ``rule3_min_utterance_length`` or ``rule3.min_utterance_length``.
+
+Demo
+----
+
+Multilingual (Chinese + English)
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The following video demonstrates using the Python API of `sherpa-ncnn`_
+for real-time speech recognition with endpointing.
+
+.. raw:: html
+
+
+
+.. hint::
+
+   The code is available at
+
+   ``_
+
+FAQs
+----
+
+How to compute duration of silence
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+For each frame to be decoded, we can output either a blank or a non-blank token.
+We record the number of contiguous blanks that have been decoded so far.
+In the current default setting, each frame is ``10 ms``. Thus, we can get
+the duration of trailing silence by counting the number of contiguous trailing
+blanks.
+
+.. note::
+
+   If a model uses a subsampling factor of 4, the time resolution becomes
+   ``10 * 4 = 40 ms``.
diff --git a/docs/source/ncnn/examples/index.rst b/docs/source/ncnn/examples/index.rst
new file mode 100644
index 000000000..0c9909ada
--- /dev/null
+++ b/docs/source/ncnn/examples/index.rst
@@ -0,0 +1,14 @@
+Examples
+========
+
+In this section, we describe some usage examples of `sherpa-ncnn`_ on various
+boards.
+
+.. toctree::
+   :maxdepth: 2
+
+   ./raspberry-pi-3.rst
+   ./jetson-nano.rst
+   ./jetson-nx.rst
+   ./vision-five-2.rst
+
diff --git a/docs/source/ncnn/examples/jetson-nano.rst b/docs/source/ncnn/examples/jetson-nano.rst
new file mode 100644
index 000000000..d485b2bc8
--- /dev/null
+++ b/docs/source/ncnn/examples/jetson-nano.rst
@@ -0,0 +1,25 @@
+Jetson Nano
+===========
+
+This page posts some screenshots of running `sherpa-ncnn`_ on `Jetson Nano `_.
+
+.. hint::
+
+   You can find pre-compiled binaries used in this example at
+
+   ``_
+
+Board info
+----------
+
+.. image:: ./pic/jetson-nano/lscpu2.jpg
+   :alt: Output of lscpu
+   :width: 600
+
+
+RTF (4 threads)
+---------------
+
+.. image:: ./pic/jetson-nano/rtf-4-threads.jpg
+   :alt: RTF for 4 threads
+   :width: 600
diff --git a/docs/source/ncnn/examples/jetson-nx.rst b/docs/source/ncnn/examples/jetson-nx.rst
new file mode 100644
index 000000000..716fed17f
--- /dev/null
+++ b/docs/source/ncnn/examples/jetson-nx.rst
@@ -0,0 +1,38 @@
+Jetson NX
+=========
+
+This page posts some screenshots of running `sherpa-ncnn`_ on ``Jetson NX``.
+
+..
hint:: + + You can find pre-compiled binaries used in this example at + + ``_ + +Board info +---------- + + .. image:: ./pic/jetson-nx/lscpu2.jpg + :alt: Output of lscpu + :width: 600 + +RTF (2 threads) +--------------- + + .. image:: ./pic/jetson-nx/rtf-2-threads.jpg + :alt: RTF for 2 threads + :width: 600 + +RTF (4 threads) +--------------- + + .. image:: ./pic/jetson-nx/rtf-4-threads.jpg + :alt: RTF for 4 threads + :width: 600 + +RTF (6 threads) +--------------- + + .. image:: ./pic/jetson-nx/rtf-6-threads.jpg + :alt: RTF for 6 threads + :width: 600 diff --git a/docs/source/ncnn/examples/pic/.gitignore b/docs/source/ncnn/examples/pic/.gitignore new file mode 100644 index 000000000..b5a379de9 --- /dev/null +++ b/docs/source/ncnn/examples/pic/.gitignore @@ -0,0 +1 @@ +!*.jpg diff --git a/docs/source/ncnn/examples/pic/jetson-nano/lscpu2.jpg b/docs/source/ncnn/examples/pic/jetson-nano/lscpu2.jpg new file mode 100644 index 000000000..a310c5959 Binary files /dev/null and b/docs/source/ncnn/examples/pic/jetson-nano/lscpu2.jpg differ diff --git a/docs/source/ncnn/examples/pic/jetson-nano/rtf-4-threads.jpg b/docs/source/ncnn/examples/pic/jetson-nano/rtf-4-threads.jpg new file mode 100644 index 000000000..644fc67f3 Binary files /dev/null and b/docs/source/ncnn/examples/pic/jetson-nano/rtf-4-threads.jpg differ diff --git a/docs/source/ncnn/examples/pic/jetson-nx/lscpu2.jpg b/docs/source/ncnn/examples/pic/jetson-nx/lscpu2.jpg new file mode 100644 index 000000000..c16173c0e Binary files /dev/null and b/docs/source/ncnn/examples/pic/jetson-nx/lscpu2.jpg differ diff --git a/docs/source/ncnn/examples/pic/jetson-nx/rtf-2-threads.jpg b/docs/source/ncnn/examples/pic/jetson-nx/rtf-2-threads.jpg new file mode 100644 index 000000000..c359e303e Binary files /dev/null and b/docs/source/ncnn/examples/pic/jetson-nx/rtf-2-threads.jpg differ diff --git a/docs/source/ncnn/examples/pic/jetson-nx/rtf-4-threads.jpg b/docs/source/ncnn/examples/pic/jetson-nx/rtf-4-threads.jpg new file mode 100644 index 000000000..6ccf3893a Binary files /dev/null and b/docs/source/ncnn/examples/pic/jetson-nx/rtf-4-threads.jpg differ diff --git a/docs/source/ncnn/examples/pic/jetson-nx/rtf-6-threads.jpg b/docs/source/ncnn/examples/pic/jetson-nx/rtf-6-threads.jpg new file mode 100644 index 000000000..a37b80fa6 Binary files /dev/null and b/docs/source/ncnn/examples/pic/jetson-nx/rtf-6-threads.jpg differ diff --git a/docs/source/ncnn/examples/pic/raspberry-pi-3/raspberrypi_3b_shot_1.png b/docs/source/ncnn/examples/pic/raspberry-pi-3/raspberrypi_3b_shot_1.png new file mode 100644 index 000000000..41eda75c4 Binary files /dev/null and b/docs/source/ncnn/examples/pic/raspberry-pi-3/raspberrypi_3b_shot_1.png differ diff --git a/docs/source/ncnn/examples/pic/raspberry-pi-3/raspberrypi_3b_shot_2.png b/docs/source/ncnn/examples/pic/raspberry-pi-3/raspberrypi_3b_shot_2.png new file mode 100644 index 000000000..a8b17117f Binary files /dev/null and b/docs/source/ncnn/examples/pic/raspberry-pi-3/raspberrypi_3b_shot_2.png differ diff --git a/docs/source/ncnn/examples/pic/raspberry-pi-3/raspberrypi_3b_shot_4.png b/docs/source/ncnn/examples/pic/raspberry-pi-3/raspberrypi_3b_shot_4.png new file mode 100644 index 000000000..e572ebab2 Binary files /dev/null and b/docs/source/ncnn/examples/pic/raspberry-pi-3/raspberrypi_3b_shot_4.png differ diff --git a/docs/source/ncnn/examples/pic/raspberry-pi-3/raspberrypi_3b_shot_5.png b/docs/source/ncnn/examples/pic/raspberry-pi-3/raspberrypi_3b_shot_5.png new file mode 100644 index 000000000..9cd1970a3 Binary files 
/dev/null and b/docs/source/ncnn/examples/pic/raspberry-pi-3/raspberrypi_3b_shot_5.png differ diff --git a/docs/source/ncnn/examples/pic/raspberry-pi-3/rtf-1-thread.jpg b/docs/source/ncnn/examples/pic/raspberry-pi-3/rtf-1-thread.jpg new file mode 100644 index 000000000..518a4c712 Binary files /dev/null and b/docs/source/ncnn/examples/pic/raspberry-pi-3/rtf-1-thread.jpg differ diff --git a/docs/source/ncnn/examples/pic/raspberry-pi-3/rtf-2-threads.jpg b/docs/source/ncnn/examples/pic/raspberry-pi-3/rtf-2-threads.jpg new file mode 100644 index 000000000..51262906e Binary files /dev/null and b/docs/source/ncnn/examples/pic/raspberry-pi-3/rtf-2-threads.jpg differ diff --git a/docs/source/ncnn/examples/pic/vision-five-2/arecord.jpg b/docs/source/ncnn/examples/pic/vision-five-2/arecord.jpg new file mode 100644 index 000000000..39b97b1a6 Binary files /dev/null and b/docs/source/ncnn/examples/pic/vision-five-2/arecord.jpg differ diff --git a/docs/source/ncnn/examples/pic/vision-five-2/lscpu.jpg b/docs/source/ncnn/examples/pic/vision-five-2/lscpu.jpg new file mode 100644 index 000000000..6c40b5949 Binary files /dev/null and b/docs/source/ncnn/examples/pic/vision-five-2/lscpu.jpg differ diff --git a/docs/source/ncnn/examples/pic/vision-five-2/microphone-alsa.jpg b/docs/source/ncnn/examples/pic/vision-five-2/microphone-alsa.jpg new file mode 100644 index 000000000..b435e4f35 Binary files /dev/null and b/docs/source/ncnn/examples/pic/vision-five-2/microphone-alsa.jpg differ diff --git a/docs/source/ncnn/examples/pic/vision-five-2/rtf.jpg b/docs/source/ncnn/examples/pic/vision-five-2/rtf.jpg new file mode 100644 index 000000000..7793592df Binary files /dev/null and b/docs/source/ncnn/examples/pic/vision-five-2/rtf.jpg differ diff --git a/docs/source/ncnn/examples/raspberry-pi-3.rst b/docs/source/ncnn/examples/raspberry-pi-3.rst new file mode 100644 index 000000000..e515d3fb1 --- /dev/null +++ b/docs/source/ncnn/examples/raspberry-pi-3.rst @@ -0,0 +1,52 @@ +Raspberry Pi 3B E14 +=================== + +This page posts some screenshots of running `sherpa-ncnn`_ on Raspberry Pi 3B E14. + +.. hint:: + + You can find pre-compiled binaries used in this example at + + ``_ + +Board info +---------- + + .. image:: ./pic/raspberry-pi-3/raspberrypi_3b_shot_1.png + :alt: Board info + :width: 600 + +OS release +---------- + + .. image:: ./pic/raspberry-pi-3/raspberrypi_3b_shot_2.png + :alt: Output of /etc/os-release + :width: 600 + +lscpu +----- + + .. image:: ./pic/raspberry-pi-3/raspberrypi_3b_shot_4.png + :alt: Output of lscpu + :width: 600 + +cpuinfo +------- + + .. image:: ./pic/raspberry-pi-3/raspberrypi_3b_shot_5.png + :alt: cpuinfo + :width: 600 + +RTF (1 thread) +-------------- + + .. image:: ./pic/raspberry-pi-3/rtf-1-thread.jpg + :alt: RTF for 1 thread + :width: 600 + +RTF (2 threads) +--------------- + + .. image:: ./pic/raspberry-pi-3/rtf-2-threads.jpg + :alt: RTF for 2 threads + :width: 600 diff --git a/docs/source/ncnn/examples/vision-five-2.rst b/docs/source/ncnn/examples/vision-five-2.rst new file mode 100644 index 000000000..0f95e4bbb --- /dev/null +++ b/docs/source/ncnn/examples/vision-five-2.rst @@ -0,0 +1,103 @@ +VisionFive 2 +============ + +This page describes how to run `sherpa-ncnn`_ on `VisionFive2`_, which is a +64-bit RISC-V board with 4 CPUs. + +.. hint:: + + You can find pre-compiled binaries used in this example at + + ``_ + +.. caution:: + + The latest debian image from ``_ does not work since it does not support USB devices. 
+ + That is, you cannot use USB microphones on the board with the above debian image. + +.. note:: + + We have compiled ``_ and + the resulting ``sdcard.img`` is available at ``_. + + Please use this image for testing. It supports USB microphones. + + The username for this image is ``root`` and the password is ``starfive``. + +Board info +---------- + +.. image:: ./pic/vision-five-2/lscpu.jpg + :alt: Output of lscpu and /proc/cpuinfo + :height: 450 + +RTF (4 threads) +--------------- + +We use :ref:`sherpa_ncnn_streaming_zipformer_small_bilingual_zh_en_2023_02_16` +for testing. The RTF is given below: + +.. image:: ./pic/vision-five-2/rtf.jpg + :alt: RTF for 4 threads with greedy search + :width: 800 + +You can see that the RTF is less than 1, which means it is able to +perform streaming (i.e., real-time) speech recognition. + +The following posts the commands used for testing so that you can +copy and paste them if you want to test it by yourself. + +.. code-block:: bash + + ./sherpa-ncnn \ + ./sherpa-ncnn-streaming-zipformer-small-bilingual-zh-en-2023-02-16/tokens.txt \ + ./sherpa-ncnn-streaming-zipformer-small-bilingual-zh-en-2023-02-16/64/encoder_jit_trace-pnnx.ncnn.param \ + ./sherpa-ncnn-streaming-zipformer-small-bilingual-zh-en-2023-02-16/64/encoder_jit_trace-pnnx.ncnn.bin \ + ./sherpa-ncnn-streaming-zipformer-small-bilingual-zh-en-2023-02-16/64/decoder_jit_trace-pnnx.ncnn.param \ + ./sherpa-ncnn-streaming-zipformer-small-bilingual-zh-en-2023-02-16/64/decoder_jit_trace-pnnx.ncnn.bin \ + ./sherpa-ncnn-streaming-zipformer-small-bilingual-zh-en-2023-02-16/64/joiner_jit_trace-pnnx.ncnn.param \ + ./sherpa-ncnn-streaming-zipformer-small-bilingual-zh-en-2023-02-16/64/joiner_jit_trace-pnnx.ncnn.bin \ + ./sherpa-ncnn-streaming-zipformer-small-bilingual-zh-en-2023-02-16/test_wavs/0.wav \ + 4 \ + greedy_search + +Real-time speech recognition with a microphone +---------------------------------------------- + +Since the board does not have microphones, we use a USB microphone for testing. + +.. caution:: + + We use the image from ``_, + which provides support for USB microphones. + +After connecting a USB microphone to the board, use the following command to check it: + +.. image:: ./pic/vision-five-2/arecord.jpg + :alt: output of arecord -l + :width: 600 + +The output shows ``Card 2`` and ``device 0``, so the device name is ``hw:2,0``. + +The command to start the program for real-time speech recognition is + +.. code-block:: bash + + ./sherpa-ncnn-alsa \ + ./sherpa-ncnn-streaming-zipformer-small-bilingual-zh-en-2023-02-16/tokens.txt \ + ./sherpa-ncnn-streaming-zipformer-small-bilingual-zh-en-2023-02-16/64/encoder_jit_trace-pnnx.ncnn.param \ + ./sherpa-ncnn-streaming-zipformer-small-bilingual-zh-en-2023-02-16/64/encoder_jit_trace-pnnx.ncnn.bin \ + ./sherpa-ncnn-streaming-zipformer-small-bilingual-zh-en-2023-02-16/64/decoder_jit_trace-pnnx.ncnn.param \ + ./sherpa-ncnn-streaming-zipformer-small-bilingual-zh-en-2023-02-16/64/decoder_jit_trace-pnnx.ncnn.bin \ + ./sherpa-ncnn-streaming-zipformer-small-bilingual-zh-en-2023-02-16/64/joiner_jit_trace-pnnx.ncnn.param \ + ./sherpa-ncnn-streaming-zipformer-small-bilingual-zh-en-2023-02-16/64/joiner_jit_trace-pnnx.ncnn.bin \ + hw:2,0 \ + 4 \ + greedy_search + +A screenshot is given below: + +.. 
image:: ./pic/vision-five-2/microphone-alsa.jpg + :alt: output of sherpa-ncnn-alsa + :width: 800 diff --git a/docs/source/ncnn/faq.rst b/docs/source/ncnn/faq.rst new file mode 100644 index 000000000..caf0aa529 --- /dev/null +++ b/docs/source/ncnn/faq.rst @@ -0,0 +1,42 @@ +FAQs +==== + +Where to get help +----------------- + +If you have any questions, please create an issue +at ``_ + +We also have active social groups: + + - 微信公众号: 新一代 Kaldi + - 微信交流群:请关注新一代 Kaldi, 添加工作人员微信, 我们邀请您进群 + - QQ 群:744602236 + + +No default input device found +----------------------------- + +If you are using Linux and if ``sherpa-ncnn-microphone`` throws the following error: + +.. code-block:: + + Num device: 0 + No default input device found. + +Please consider using ``sherpa-ncnn-alsa`` to replace ``sherpa-ncnn-microphone``. +If you cannot find ``sherpa-ncnn-alsa`` in ``./build/bin``, please run the +following commands: + +.. code-block:: bash + + cd /path/to/sherpa-ncnn + sudo apt-get install alsa-utils libasound2-dev + cd build + rm CMakeCache.txt # Important, remove the cmake cache file + make -j + +After the above commands, you should see a binary file ``./build/bin/sherpa-ncnn-alsa``. + + +Please follow :ref:`sherpa-ncnn-alsa` to use ``sherpa-ncnn-alsa``. diff --git a/docs/source/ncnn/index.rst b/docs/source/ncnn/index.rst new file mode 100644 index 000000000..e1ee6e60f --- /dev/null +++ b/docs/source/ncnn/index.rst @@ -0,0 +1,36 @@ +sherpa-ncnn +=========== + +.. hint:: + + During speech recognition, it does not need to access the Internet. + Everyting is processed locally on your device. + + +We support using `ncnn`_ to replace PyTorch for neural network computation. +The code is put in a separate repository `sherpa-ncnn`_ + +`sherpa-ncnn`_ is self-contained and everything can be compiled from source. + +Please refer to ``_ +for how to export models to `ncnn`_ format. + +In the following, we describe how to build `sherpa-ncnn`_ for Linux, macOS, +Windows, embedded systems, Android, and iOS. + +Also, we show how to use it for speech recognition with pre-trained models. + +.. toctree:: + :maxdepth: 2 + + ./tutorials/index + ./install/index + ./python/index + ./wasm/index + ./c-api/index + ./endpoint + ./android/index + ./ios/index + ./pretrained_models/index + ./examples/index + ./faq diff --git a/docs/source/ncnn/install/aarch64-embedded-linux.rst b/docs/source/ncnn/install/aarch64-embedded-linux.rst new file mode 100644 index 000000000..ea2c8e50d --- /dev/null +++ b/docs/source/ncnn/install/aarch64-embedded-linux.rst @@ -0,0 +1,214 @@ +Embedded Linux (aarch64) +======================== + +This page describes how to build `sherpa-ncnn`_ for embedded Linux (aarch64, 64-bit) +with cross-compiling on an x86 machine with Ubuntu OS. + +.. caution:: + + If you want to build `sherpa-ncnn`_ directly on your board, please don't + use this document. Refer to :ref:`install_sherpa_ncnn_on_linux` instead. + +.. caution:: + + If you want to build `sherpa-ncnn`_ directly on your board, please don't + use this document. Refer to :ref:`install_sherpa_ncnn_on_linux` instead. + +.. caution:: + + If you want to build `sherpa-ncnn`_ directly on your board, please don't + use this document. Refer to :ref:`install_sherpa_ncnn_on_linux` instead. + +.. hint:: + + This page is for cross-compiling. + +.. _sherpa_ncnn_install_for_aarch64_embedded_linux: + +Install toolchain +----------------- + +The first step is to install a toolchain for cross-compiling. + +.. 
warning:: + + You can use any toolchain that is suitable for your platform. The toolchain + we use below is just an example. + +Visit ``_ +to download the toolchain. + +We are going to download ``gcc-linaro-7.5.0-2019.12-x86_64_aarch64-linux-gnu.tar.xz``, +which has been uploaded to ``_. + +Assume you want to install it in the folder ``$HOME/software``: + +.. code-block:: bash + + mkdir -p $HOME/software + cd $HOME/software + wget https://huggingface.co/csukuangfj/sherpa-ncnn-toolchains/resolve/main/gcc-linaro-7.5.0-2019.12-x86_64_aarch64-linux-gnu.tar.xz + + # For users from China + # 中国国内用户,如果访问不了 huggingface, 请使用 + # wget https://hf-mirror.com/csukuangfj/sherpa-ncnn-toolchains/resolve/main/gcc-linaro-7.5.0-2019.12-x86_64_aarch64-linux-gnu.tar.xz + tar xvf gcc-linaro-7.5.0-2019.12-x86_64_aarch64-linux-gnu.tar.xz + +Next, we need to set the following environment variable: + +.. code-block:: bash + + export PATH=$HOME/software/gcc-linaro-7.5.0-2019.12-x86_64_aarch64-linux-gnu/bin:$PATH + +To check that we have installed the cross-compiling toolchain successfully, please +run: + +.. code-block:: bash + + aarch64-linux-gnu-gcc --version + +which should print the following log: + +.. code-block:: + + aarch64-linux-gnu-gcc (Linaro GCC 7.5-2019.12) 7.5.0 + Copyright (C) 2017 Free Software Foundation, Inc. + This is free software; see the source for copying conditions. There is NO + warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + +Congratulations! You have successfully installed a toolchain for cross-compiling +`sherpa-ncnn`_. + +Build sherpa-ncnn +----------------- + +Finally, let us build `sherpa-ncnn`_. + +.. code-block:: bash + + git clone https://github.com/k2-fsa/sherpa-ncnn + cd sherpa-ncnn + ./build-aarch64-linux-gnu.sh + +After building, you will get two binaries: + +.. code-block:: bash + + $ ls -lh build-aarch64-linux-gnu/install/bin/ + total 10M + -rwxr-xr-x 1 kuangfangjun root 3.4M Jan 13 21:16 sherpa-ncnn + -rwxr-xr-x 1 kuangfangjun root 3.4M Jan 13 21:16 sherpa-ncnn-alsa + +That's it! + +.. hint:: + + - ``sherpa-ncnn`` is for decoding a single file + - ``sherpa-ncnn-alsa`` is for real-time speech recongition by reading + the microphone with `ALSA `_ + +.. _sherpa-ncnn-alsa: + +sherpa-ncnn-alsa +---------------- + +.. caution:: + + We recommend that you use ``sherpa-ncnn-alsa`` on embedded systems such + as Raspberry pi. + + You need to provide a ``device_name`` when invoking ``sherpa-ncnn-alsa``. + We describe below how to find the device name for your microphone. + + Run the following command: + + .. code-block:: bash + + arecord -l + + to list all avaliable microphones for recording. If it complains that + ``arecord: command not found``, please use ``sudo apt-get install alsa-utils`` + to install it. + + If the above command gives the following output: + + .. code-block:: bash + + **** List of CAPTURE Hardware Devices **** + card 3: UACDemoV10 [UACDemoV1.0], device 0: USB Audio [USB Audio] + Subdevices: 1/1 + Subdevice #0: subdevice #0 + + In this case, I only have 1 microphone. It is ``card 3`` and that card + has only ``device 0``. To select ``card 3`` and ``device 0`` on that card, + we need to pass ``plughw:3,0`` to ``sherpa-ncnn-alsa``. (Note: It has the format + ``plughw:card_number,device_index``.) + + For instance, you have to use + + .. 
code-block:: bash + + ./bin/sherpa-ncnn-alsa \ + ./sherpa-ncnn-conv-emformer-transducer-small-2023-01-09/tokens.txt \ + ./sherpa-ncnn-conv-emformer-transducer-small-2023-01-09/encoder_jit_trace-pnnx.ncnn.param \ + ./sherpa-ncnn-conv-emformer-transducer-small-2023-01-09/encoder_jit_trace-pnnx.ncnn.bin \ + ./sherpa-ncnn-conv-emformer-transducer-small-2023-01-09/decoder_jit_trace-pnnx.ncnn.param \ + ./sherpa-ncnn-conv-emformer-transducer-small-2023-01-09/decoder_jit_trace-pnnx.ncnn.bin \ + ./sherpa-ncnn-conv-emformer-transducer-small-2023-01-09/joiner_jit_trace-pnnx.ncnn.param \ + ./sherpa-ncnn-conv-emformer-transducer-small-2023-01-09/joiner_jit_trace-pnnx.ncnn.bin \ + "plughw:3,0" + + Please change the card number and also the device index on the selected card + accordingly in your own situation. Otherwise, you won't be able to record + with your microphone. + +Please read :ref:`sherpa-ncnn-pre-trained-models` for usages about +the generated binaries. + +.. hint:: + + If you want to select a pre-trained model for Raspberry that can be + run on real-time, we recommend you to use + :ref:`marcoyang_sherpa_ncnn_conv_emformer_transducer_small_2023_01_09_english`. + + +Read below if you want to learn more. + +.. hint:: + + By default, all external dependencies are statically linked. That means, + the generated binaries are self-contained. + + You can use the following commands to check that and you will find + they depend only on system libraries. + + .. code-block:: bash + + $ readelf -d build-aarch64-linux-gnu/install/bin/sherpa-ncnn + + Dynamic section at offset 0x302a80 contains 30 entries: + Tag Type Name/Value + 0x0000000000000001 (NEEDED) Shared library: [libgomp.so.1] + 0x0000000000000001 (NEEDED) Shared library: [libpthread.so.0] + 0x0000000000000001 (NEEDED) Shared library: [libstdc++.so.6] + 0x0000000000000001 (NEEDED) Shared library: [libm.so.6] + 0x0000000000000001 (NEEDED) Shared library: [libgcc_s.so.1] + 0x0000000000000001 (NEEDED) Shared library: [libc.so.6] + 0x000000000000000f (RPATH) Library rpath: [$ORIGIN] + + $ readelf -d build-aarch64-linux-gnu/install/bin/sherpa-ncnn-alsa + + Dynamic section at offset 0x34ea48 contains 31 entries: + Tag Type Name/Value + 0x0000000000000001 (NEEDED) Shared library: [libasound.so.2] + 0x0000000000000001 (NEEDED) Shared library: [libgomp.so.1] + 0x0000000000000001 (NEEDED) Shared library: [libpthread.so.0] + 0x0000000000000001 (NEEDED) Shared library: [libstdc++.so.6] + 0x0000000000000001 (NEEDED) Shared library: [libm.so.6] + 0x0000000000000001 (NEEDED) Shared library: [libgcc_s.so.1] + 0x0000000000000001 (NEEDED) Shared library: [libc.so.6] + 0x000000000000000f (RPATH) Library rpath: [$ORIGIN] + + +Please create an issue at ``_ +if you have any problems. diff --git a/docs/source/ncnn/install/arm-embedded-linux.rst b/docs/source/ncnn/install/arm-embedded-linux.rst new file mode 100644 index 000000000..1aaac0c35 --- /dev/null +++ b/docs/source/ncnn/install/arm-embedded-linux.rst @@ -0,0 +1,198 @@ +.. _sherpa-ncnn-embedded-linux-arm-install: + +Embedded Linux (arm) +==================== + +This page describes how to build `sherpa-ncnn`_ for embedded Linux (arm, 32-bit) +with cross-compiling on an x86 machine with Ubuntu OS. + +.. caution:: + + If you want to build `sherpa-ncnn`_ directly on your board, please don't + use this document. Refer to :ref:`install_sherpa_ncnn_on_linux` instead. + +.. caution:: + + If you want to build `sherpa-ncnn`_ directly on your board, please don't + use this document. 
Refer to :ref:`install_sherpa_ncnn_on_linux` instead. + +.. caution:: + + If you want to build `sherpa-ncnn`_ directly on your board, please don't + use this document. Refer to :ref:`install_sherpa_ncnn_on_linux` instead. + +.. hint:: + + This page is for cross-compiling. + +Install toolchain +----------------- + +The first step is to install a toolchain for cross-compiling. + +.. warning:: + + You can use any toolchain that is suitable for your platform. The toolchain + we use below is just an example. + +Visit ``_ to download the toolchain: + +We are going to download ``gcc-arm-8.3-2019.03-x86_64-arm-linux-gnueabihf.tar.xz``, +which has been uploaded to ``_. + +Assume you want to install it in the folder ``$HOME/software``: + +.. code-block:: bash + + mkdir -p $HOME/software + cd $HOME/software + wget https://huggingface.co/csukuangfj/sherpa-ncnn-toolchains/resolve/main/gcc-arm-8.3-2019.03-x86_64-arm-linux-gnueabihf.tar.xz + tar xvf gcc-arm-8.3-2019.03-x86_64-arm-linux-gnueabihf.tar.xz + +Next, we need to set the following environment variable: + +.. code-block:: bash + + export PATH=$HOME/software/gcc-arm-8.3-2019.03-x86_64-arm-linux-gnueabihf/bin:$PATH + +To check that we have installed the cross-compiling toolchain successfully, please +run: + +.. code-block:: bash + + arm-linux-gnueabihf-gcc --version + +which should print the following log: + +.. code-block:: + + arm-linux-gnueabihf-gcc (GNU Toolchain for the A-profile Architecture 8.3-2019.03 (arm-rel-8.36)) 8.3.0 + Copyright (C) 2018 Free Software Foundation, Inc. + This is free software; see the source for copying conditions. There is NO + warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + +Congratulations! You have successfully installed a toolchain for cross-compiling +`sherpa-ncnn`_. + +Build sherpa-ncnn +----------------- + +Finally, let us build `sherpa-ncnn`_. + +.. code-block:: bash + + git clone https://github.com/k2-fsa/sherpa-ncnn + cd sherpa-ncnn + ./build-arm-linux-gnueabihf.sh + +After building, you will get two binaries: + +.. code-block:: bash + + $ ls -lh build-arm-linux-gnueabihf/install/bin/ + + total 6.6M + -rwxr-xr-x 1 kuangfangjun root 2.2M Jan 14 21:46 sherpa-ncnn + -rwxr-xr-x 1 kuangfangjun root 2.2M Jan 14 21:46 sherpa-ncnn-alsa + +That's it! + +.. hint:: + + - ``sherpa-ncnn`` is for decoding a single file + - ``sherpa-ncnn-alsa`` is for real-time speech recongition by reading + the microphone with `ALSA `_ + +.. caution:: + + We recommend that you use ``sherpa-ncnn-alsa`` on embedded systems such + as Raspberry pi. + + You need to provide a ``device_name`` when invoking ``sherpa-ncnn-alsa``. + We describe below how to find the device name for your microphone. + + Run the following command: + + .. code-block:: bash + + arecord -l + + to list all avaliable microphones for recording. If it complains that + ``arecord: command not found``, please use ``sudo apt-get install alsa-utils`` + to install it. + + If the above command gives the following output: + + .. code-block:: bash + + **** List of CAPTURE Hardware Devices **** + card 0: Audio [Axera Audio], device 0: 49ac000.i2s_mst-es8328-hifi-analog es8328-hifi-analog-0 [] + Subdevices: 1/1 + Subdevice #0: subdevice #0 + + In this case, I only have 1 microphone. It is ``card 0`` and that card + has only ``device 0``. To select ``card 0`` and ``device 0`` on that card, + we need to pass ``plughw:0,0`` to ``sherpa-ncnn-alsa``. (Note: It has the format + ``plughw:card_number,device_index``.) + + For instance, you have to use + + .. 
code-block:: bash + + # Note: We use int8 models for encoder and joiner below. + ./bin/sherpa-ncnn-alsa \ + ./sherpa-ncnn-conv-emformer-transducer-small-2023-01-09/tokens.txt \ + ./sherpa-ncnn-conv-emformer-transducer-small-2023-01-09/encoder_jit_trace-pnnx.ncnn.int8.param \ + ./sherpa-ncnn-conv-emformer-transducer-small-2023-01-09/encoder_jit_trace-pnnx.ncnn.int8.bin \ + ./sherpa-ncnn-conv-emformer-transducer-small-2023-01-09/decoder_jit_trace-pnnx.ncnn.param \ + ./sherpa-ncnn-conv-emformer-transducer-small-2023-01-09/decoder_jit_trace-pnnx.ncnn.bin \ + ./sherpa-ncnn-conv-emformer-transducer-small-2023-01-09/joiner_jit_trace-pnnx.ncnn.int8.param \ + ./sherpa-ncnn-conv-emformer-transducer-small-2023-01-09/joiner_jit_trace-pnnx.ncnn.int8.bin \ + "plughw:0,0" + + Please change the card number and also the device index on the selected card + accordingly in your own situation. Otherwise, you won't be able to record + with your microphone. + +Please read :ref:`sherpa-ncnn-pre-trained-models` for usages about +the generated binaries. + +Read below if you want to learn more. + +.. hint:: + + By default, all external dependencies are statically linked. That means, + the generated binaries are self-contained. + + You can use the following commands to check that and you will find + they depend only on system libraries. + + .. code-block:: bash + + $ readelf -d build-arm-linux-gnueabihf/install/bin/sherpa-ncnn + + Dynamic section at offset 0x1c7ee8 contains 30 entries: + Tag Type Name/Value + 0x00000001 (NEEDED) Shared library: [libstdc++.so.6] + 0x00000001 (NEEDED) Shared library: [libm.so.6] + 0x00000001 (NEEDED) Shared library: [libgcc_s.so.1] + 0x00000001 (NEEDED) Shared library: [libpthread.so.0] + 0x00000001 (NEEDED) Shared library: [libc.so.6] + 0x0000000f (RPATH) Library rpath: [$ORIGIN] + + $ readelf -d build-arm-linux-gnueabihf/install/bin/sherpa-ncnn-alsa + + Dynamic section at offset 0x22ded8 contains 32 entries: + Tag Type Name/Value + 0x00000001 (NEEDED) Shared library: [libasound.so.2] + 0x00000001 (NEEDED) Shared library: [libgomp.so.1] + 0x00000001 (NEEDED) Shared library: [libpthread.so.0] + 0x00000001 (NEEDED) Shared library: [libstdc++.so.6] + 0x00000001 (NEEDED) Shared library: [libm.so.6] + 0x00000001 (NEEDED) Shared library: [libgcc_s.so.1] + 0x00000001 (NEEDED) Shared library: [libc.so.6] + 0x0000000f (RPATH) Library rpath: [$ORIGIN] + + +Please create an issue at ``_ +if you have any problems. diff --git a/docs/source/ncnn/install/index.rst b/docs/source/ncnn/install/index.rst new file mode 100644 index 000000000..847affa53 --- /dev/null +++ b/docs/source/ncnn/install/index.rst @@ -0,0 +1,25 @@ +.. _install_sherpa_ncnn: + +Installation +============ + +.. hint:: + + Please refer to :ref:`sherpa-ncnn-python-api` for its usage with Python. + +In this section, we describe how to install `sherpa-ncnn`_ for the following +platforms: + +.. toctree:: + :maxdepth: 2 + + videos + linux + macos + windows + arm-embedded-linux + aarch64-embedded-linux + riscv64-embedded-linux + +If you want to build an Android app, please refer to :ref:`sherpa-ncnn-android`. +If you want to build an iOS app, please refer to :ref:`sherpa-ncnn-ios`. diff --git a/docs/source/ncnn/install/linux.rst b/docs/source/ncnn/install/linux.rst new file mode 100644 index 000000000..b804af8af --- /dev/null +++ b/docs/source/ncnn/install/linux.rst @@ -0,0 +1,132 @@ +.. _install_sherpa_ncnn_on_linux: + +Linux +===== + +This page describes how to build `sherpa-ncnn`_ on Linux. + +.. 
hint:: + + You can follow this section if you want to build `sherpa-ncnn`_ directly + on your board. + +.. hint:: + + For the Python API, please refer to :ref:`sherpa-ncnn-python-api`. + +All you need is to run: + +.. tabs:: + + .. tab:: x86/x86_64 + + .. code-block:: bash + + git clone https://github.com/k2-fsa/sherpa-ncnn + cd sherpa-ncnn + mkdir build + cd build + cmake -DCMAKE_BUILD_TYPE=Release .. + make -j6 + + .. tab:: 32-bit ARM + + .. code-block:: bash + + git clone https://github.com/k2-fsa/sherpa-ncnn + cd sherpa-ncnn + mkdir build + cd build + cmake \ + -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_C_FLAGS="-march=armv7-a -mfloat-abi=hard -mfpu=neon" \ + -DCMAKE_CXX_FLAGS="-march=armv7-a -mfloat-abi=hard -mfpu=neon" \ + .. + make -j6 + + .. tab:: 64-bit ARM + + .. code-block:: bash + + git clone https://github.com/k2-fsa/sherpa-ncnn + cd sherpa-ncnn + mkdir build + cd build + cmake \ + -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_C_FLAGS="-march=armv8-a" \ + -DCMAKE_CXX_FLAGS="-march=armv8-a" \ + .. + make -j6 + + +After building, you will find two executables inside the ``bin`` directory: + +.. code-block:: bash + + $ ls -lh bin/ + total 13M + -rwxr-xr-x 1 kuangfangjun root 6.5M Dec 18 11:31 sherpa-ncnn + -rwxr-xr-x 1 kuangfangjun root 6.5M Dec 18 11:31 sherpa-ncnn-microphone + +That's it! + +Please read :ref:`sherpa-ncnn-pre-trained-models` for usages about +the generated binaries. + +Read below if you want to learn more. + +You can strip the binaries by + +.. code-block:: bash + + $ strip bin/sherpa-ncnn + $ strip bin/sherpa-ncnn-microphone + +After stripping, the file size of each binary is: + +.. code-block:: bash + + $ ls -lh bin/ + total 12M + -rwxr-xr-x 1 kuangfangjun root 5.8M Dec 18 11:35 sherpa-ncnn + -rwxr-xr-x 1 kuangfangjun root 5.8M Dec 18 11:36 sherpa-ncnn-microphone + +.. hint:: + + By default, all external dependencies are statically linked. That means, + the generated binaries are self-contained. + + You can use the following commands to check that and you will find + they depend only on system libraries. + + .. code-block:: + + $ readelf -d bin/sherpa-ncnn + + Dynamic section at offset 0x5c0650 contains 34 entries: + Tag Type Name/Value + 0x0000000000000001 (NEEDED) Shared library: [libgomp.so.1] + 0x0000000000000001 (NEEDED) Shared library: [libpthread.so.0] + 0x0000000000000001 (NEEDED) Shared library: [libstdc++.so.6] + 0x0000000000000001 (NEEDED) Shared library: [libm.so.6] + 0x0000000000000001 (NEEDED) Shared library: [libmvec.so.1] + 0x0000000000000001 (NEEDED) Shared library: [libgcc_s.so.1] + 0x0000000000000001 (NEEDED) Shared library: [libc.so.6] + 0x000000000000001d (RUNPATH) Library runpath: [$ORIGIN:] + + $ readelf -d bin/sherpa-ncnn-microphone + + Dynamic section at offset 0x5c45d0 contains 34 entries: + Tag Type Name/Value + 0x0000000000000001 (NEEDED) Shared library: [libpthread.so.0] + 0x0000000000000001 (NEEDED) Shared library: [libgomp.so.1] + 0x0000000000000001 (NEEDED) Shared library: [libstdc++.so.6] + 0x0000000000000001 (NEEDED) Shared library: [libm.so.6] + 0x0000000000000001 (NEEDED) Shared library: [libmvec.so.1] + 0x0000000000000001 (NEEDED) Shared library: [libgcc_s.so.1] + 0x0000000000000001 (NEEDED) Shared library: [libc.so.6] + 0x000000000000001d (RUNPATH) Library runpath: [$ORIGIN:] + +Please create an issue at ``_ +if you have any problems. 
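As a quick sanity check after building, you can decode one of the test wave files that ships with a pre-trained model. The command below is only a sketch: it assumes you have already downloaded :ref:`sherpa_ncnn_streaming_zipformer_small_bilingual_zh_en_2023_02_16` into the current directory by following :ref:`sherpa-ncnn-pre-trained-models`, and the argument order mirrors the file-decoding examples elsewhere in this documentation.

.. code-block:: bash

   # A sketch, not a required step. Adjust the model directory below to
   # whichever pre-trained model you actually downloaded, and run it from
   # the build directory where ./bin/sherpa-ncnn was generated.
   ./bin/sherpa-ncnn \
     ./sherpa-ncnn-streaming-zipformer-small-bilingual-zh-en-2023-02-16/tokens.txt \
     ./sherpa-ncnn-streaming-zipformer-small-bilingual-zh-en-2023-02-16/64/encoder_jit_trace-pnnx.ncnn.param \
     ./sherpa-ncnn-streaming-zipformer-small-bilingual-zh-en-2023-02-16/64/encoder_jit_trace-pnnx.ncnn.bin \
     ./sherpa-ncnn-streaming-zipformer-small-bilingual-zh-en-2023-02-16/64/decoder_jit_trace-pnnx.ncnn.param \
     ./sherpa-ncnn-streaming-zipformer-small-bilingual-zh-en-2023-02-16/64/decoder_jit_trace-pnnx.ncnn.bin \
     ./sherpa-ncnn-streaming-zipformer-small-bilingual-zh-en-2023-02-16/64/joiner_jit_trace-pnnx.ncnn.param \
     ./sherpa-ncnn-streaming-zipformer-small-bilingual-zh-en-2023-02-16/64/joiner_jit_trace-pnnx.ncnn.bin \
     ./sherpa-ncnn-streaming-zipformer-small-bilingual-zh-en-2023-02-16/test_wavs/0.wav \
     4 \
     greedy_search

If the recognition result is printed without errors, the build works and you can move on to the microphone binaries.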
diff --git a/docs/source/ncnn/install/macos.rst b/docs/source/ncnn/install/macos.rst new file mode 100644 index 000000000..4a0b42a23 --- /dev/null +++ b/docs/source/ncnn/install/macos.rst @@ -0,0 +1,81 @@ +macOS +===== + +This page describes how to build `sherpa-ncnn`_ on macOS. + +.. hint:: + + For the Python API, please refer to :ref:`sherpa-ncnn-python-api`. + +All you need is to run: + +.. code-block:: bash + + git clone https://github.com/k2-fsa/sherpa-ncnn + cd sherpa-ncnn + mkdir build + cd build + cmake -DCMAKE_BUILD_TYPE=Release .. + make -j6 + +After building, you will find two executables inside the ``bin`` directory: + +.. code-block:: bash + + $ ls -lh bin/ + total 24232 + -rwxr-xr-x 1 fangjun staff 5.9M Dec 18 12:39 sherpa-ncnn + -rwxr-xr-x 1 fangjun staff 6.0M Dec 18 12:39 sherpa-ncnn-microphone + +That's it! + +Please read :ref:`sherpa-ncnn-pre-trained-models` for usages about +the generated binaries. + +Read below if you want to learn more. + +You can strip the binaries by + +.. code-block:: bash + + $ strip bin/sherpa-ncnn + $ strip bin/sherpa-ncnn-microphone + +After stripping, the file size of each binary is: + +.. code-block:: bash + + $ ls -lh bin/ + total 23000 + -rwxr-xr-x 1 fangjun staff 5.6M Dec 18 12:40 sherpa-ncnn + -rwxr-xr-x 1 fangjun staff 5.6M Dec 18 12:40 sherpa-ncnn-microphone + +.. hint:: + + By default, all external dependencies are statically linked. That means, + the generated binaries are self-contained. + + You can use the following commands to check that and you will find + they depend only on system libraries. + + .. code-block:: + + $ otool -L bin/sherpa-ncnn + bin/sherpa-ncnn: + /usr/local/opt/libomp/lib/libomp.dylib (compatibility version 5.0.0, current version 5.0.0) + /usr/lib/libc++.1.dylib (compatibility version 1.0.0, current version 902.1.0) + /usr/lib/libSystem.B.dylib (compatibility version 1.0.0, current version 1281.100.1) + + $ otool -L bin/sherpa-ncnn-microphone + bin/sherpa-ncnn-microphone: + /System/Library/Frameworks/CoreAudio.framework/Versions/A/CoreAudio (compatibility version 1.0.0, current version 1.0.0) + /System/Library/Frameworks/AudioToolbox.framework/Versions/A/AudioToolbox (compatibility version 1.0.0, current version 1000.0.0) + /System/Library/Frameworks/AudioUnit.framework/Versions/A/AudioUnit (compatibility version 1.0.0, current version 1.0.0) + /System/Library/Frameworks/CoreFoundation.framework/Versions/A/CoreFoundation (compatibility version 150.0.0, current version 1677.104.0) + /System/Library/Frameworks/CoreServices.framework/Versions/A/CoreServices (compatibility version 1.0.0, current version 1069.24.0) + /usr/lib/libSystem.B.dylib (compatibility version 1.0.0, current version 1281.100.1) + /usr/local/opt/libomp/lib/libomp.dylib (compatibility version 5.0.0, current version 5.0.0) + /usr/lib/libc++.1.dylib (compatibility version 1.0.0, current version 902.1.0) + +Please create an issue at ``_ +if you have any problems. diff --git a/docs/source/ncnn/install/riscv64-embedded-linux.rst b/docs/source/ncnn/install/riscv64-embedded-linux.rst new file mode 100644 index 000000000..7fdf15723 --- /dev/null +++ b/docs/source/ncnn/install/riscv64-embedded-linux.rst @@ -0,0 +1,146 @@ +Embedded Linux (riscv64) +======================== + +This page describes how to build `sherpa-ncnn`_ for embedded Linux (RISC-V, 64-bit) +with cross-compiling on an x64 machine with Ubuntu OS. + +.. hint:: + + We provide a colab notebook + |build sherpa-ncnn for risc-v colab notebook| + for you to try this section step by step. 
+ + If you are using Windows/macOS or you don't want to setup your local environment + for cross-compiling, please use the above colab notebook. + +.. |build sherpa-ncnn for risc-v colab notebook| image:: https://colab.research.google.com/assets/colab-badge.svg + :target: https://github.com/k2-fsa/colab/blob/master/sherpa-ncnn/sherpa_ncnn_RISC_V.ipynb + + +Install toolchain +----------------- + +The first step is to install a toolchain for cross-compiling. + +.. code-block:: bash + + sudo apt-get install gcc-riscv64-linux-gnu + sudo apt-get install g++-riscv64-linux-gnu + +To check that you have installed the toolchain successfully, please run + +.. code-block:: bash + + $ riscv64-linux-gnu-gcc --version + riscv64-linux-gnu-gcc (Ubuntu 7.5.0-3ubuntu1~18.04) 7.5.0 + Copyright (C) 2017 Free Software Foundation, Inc. + This is free software; see the source for copying conditions. There is NO + warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + + $ riscv64-linux-gnu-g++ --version + riscv64-linux-gnu-g++ (Ubuntu 7.5.0-3ubuntu1~18.04) 7.5.0 + Copyright (C) 2017 Free Software Foundation, Inc. + This is free software; see the source for copying conditions. There is NO + warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + + +Build sherpa-ncnn +----------------- + +Next, let us build `sherpa-ncnn`_. + +.. code-block:: bash + + git clone https://github.com/k2-fsa/sherpa-ncnn + cd sherpa-ncnn + ./build-riscv64-linux-gnu.sh + +After building, you will get two binaries: + +.. code-block:: bash + + $ ls -lh build-riscv64-linux-gnu/install/bin/ + total 3.8M + -rwxr-xr-x 1 kuangfangjun root 1.9M May 23 22:12 sherpa-ncnn + -rwxr-xr-x 1 kuangfangjun root 1.9M May 23 22:12 sherpa-ncnn-alsa + +That's it! + +.. hint:: + + - ``sherpa-ncnn`` is for decoding a single file + - ``sherpa-ncnn-alsa`` is for real-time speech recongition by reading + the microphone with `ALSA `_ + +.. _sherpa-ncnn-alsa: + +Please read :ref:`sherpa-ncnn-pre-trained-models` for usages about +the generated binaries. + +.. hint:: + + If you want to select a pre-trained model for `VisionFive 2 `_ + that can be run on real-time, we recommend you to use + :ref:`sherpa_ncnn_streaming_zipformer_small_bilingual_zh_en_2023_02_16`. + + You can use the following command with the above model: + + .. code-block:: bash + + ./sherpa-ncnn \ + ./sherpa-ncnn-streaming-zipformer-small-bilingual-zh-en-2023-02-16/tokens.txt \ + ./sherpa-ncnn-streaming-zipformer-small-bilingual-zh-en-2023-02-16/64/encoder_jit_trace-pnnx.ncnn.param \ + ./sherpa-ncnn-streaming-zipformer-small-bilingual-zh-en-2023-02-16/64/encoder_jit_trace-pnnx.ncnn.bin \ + ./sherpa-ncnn-streaming-zipformer-small-bilingual-zh-en-2023-02-16/64/decoder_jit_trace-pnnx.ncnn.param \ + ./sherpa-ncnn-streaming-zipformer-small-bilingual-zh-en-2023-02-16/64/decoder_jit_trace-pnnx.ncnn.bin \ + ./sherpa-ncnn-streaming-zipformer-small-bilingual-zh-en-2023-02-16/64/joiner_jit_trace-pnnx.ncnn.param \ + ./sherpa-ncnn-streaming-zipformer-small-bilingual-zh-en-2023-02-16/64/joiner_jit_trace-pnnx.ncnn.bin \ + ./sherpa-ncnn-streaming-zipformer-small-bilingual-zh-en-2023-02-16/test_wavs/5.wav \ + 4 \ + greedy_search + +Read below if you want to learn more. + +.. hint:: + + By default, all external dependencies are statically linked. That means, + the generated binaries are self-contained. + + You can use the following commands to check that and you will find + they depend only on system libraries. + + .. 
code-block:: bash + + $ readelf -d build-riscv64-linux-gnu/install/bin/sherpa-ncnn + + Dynamic section at offset 0x1d6dc0 contains 31 entries: + Tag Type Name/Value + 0x0000000000000001 (NEEDED) Shared library: [libgomp.so.1] + 0x0000000000000001 (NEEDED) Shared library: [libpthread.so.0] + 0x0000000000000001 (NEEDED) Shared library: [libstdc++.so.6] + 0x0000000000000001 (NEEDED) Shared library: [libm.so.6] + 0x0000000000000001 (NEEDED) Shared library: [libgcc_s.so.1] + 0x0000000000000001 (NEEDED) Shared library: [libc.so.6] + 0x0000000000000001 (NEEDED) Shared library: [ld-linux-riscv64-lp64d.so.1] + 0x000000000000001d (RUNPATH) Library runpath: [$ORIGIN] + 0x0000000000000020 (PREINIT_ARRAY) 0x1e18e0 + 0x0000000000000021 (PREINIT_ARRAYSZ) 0x8 + + $ readelf -d build-riscv64-linux-gnu/install/bin/sherpa-ncnn-alsa + + Dynamic section at offset 0x1d3db0 contains 32 entries: + Tag Type Name/Value + 0x0000000000000001 (NEEDED) Shared library: [libasound.so.2] + 0x0000000000000001 (NEEDED) Shared library: [libgomp.so.1] + 0x0000000000000001 (NEEDED) Shared library: [libpthread.so.0] + 0x0000000000000001 (NEEDED) Shared library: [libstdc++.so.6] + 0x0000000000000001 (NEEDED) Shared library: [libm.so.6] + 0x0000000000000001 (NEEDED) Shared library: [libgcc_s.so.1] + 0x0000000000000001 (NEEDED) Shared library: [libc.so.6] + 0x0000000000000001 (NEEDED) Shared library: [ld-linux-riscv64-lp64d.so.1] + 0x000000000000001d (RUNPATH) Library runpath: [$ORIGIN] + 0x0000000000000020 (PREINIT_ARRAY) 0x1de8c8 + 0x0000000000000021 (PREINIT_ARRAYSZ) 0x8 + +Please create an issue at ``_ +if you have any problems. diff --git a/docs/source/ncnn/install/videos.rst b/docs/source/ncnn/install/videos.rst new file mode 100644 index 000000000..4c515e02f --- /dev/null +++ b/docs/source/ncnn/install/videos.rst @@ -0,0 +1,20 @@ +Installation videos +=================== + +This section presents some videos about how to install and use `sherpa-ncnn`_. + +Window (64-bit) +--------------- + +The following `video `_ +shows how to install and use `sherpa-ncnn`_ on 64-bit Windows. + +Thanks to ``_ for his contribution. + +.. caution:: + + It is in Chinese. + +.. raw:: html + + diff --git a/docs/source/ncnn/install/windows.rst b/docs/source/ncnn/install/windows.rst new file mode 100644 index 000000000..4e1ae27df --- /dev/null +++ b/docs/source/ncnn/install/windows.rst @@ -0,0 +1,82 @@ +Windows +======= + +This page describes how to build `sherpa-ncnn`_ on Windows. + +.. hint:: + + For the Python API, please refer to :ref:`sherpa-ncnn-python-api`. + +.. hint:: + + MinGW is known not to work. + Please install ``Visual Studio`` before you continue. + +64-bit Windows (x64) +-------------------- + +All you need is to run: + +.. code-block:: bash + + git clone https://github.com/k2-fsa/sherpa-ncnn + cd sherpa-ncnn + mkdir build + cd build + cmake -DCMAKE_BUILD_TYPE=Release .. + cmake --build . --config Release + +It will generate two executables inside ``./bin/Release/``: + + - ``sherpa-ncnn.exe``: For decoding a single wave file. + - ``sherpa-ncnn-microphone.exe``: For real-time speech recognition from a microphone + +That's it! + +Please read :ref:`sherpa-ncnn-pre-trained-models` for usages about +the generated binaries. + +Please create an issue at ``_ +if you have any problems. + +32-bit Windows (x86) +-------------------- + +All you need is to run: + +.. 
code-block:: bash + + git clone https://github.com/k2-fsa/sherpa-ncnn + cd sherpa-ncnn + mkdir build + cd build + + # Please select one toolset among VS 2015, 2017, 2019, and 2022 below + # We use VS 2022 as an example. + + # For Visual Studio 2015 + # cmake -T v140,host=x64 -A Win32 -D CMAKE_BUILD_TYPE=Release .. + + # For Visual Studio 2017 + # cmake -T v141,host=x64 -A Win32 -D CMAKE_BUILD_TYPE=Release .. + + # For Visual Studio 2019 + # cmake -T v142,host=x64 -A Win32 -D CMAKE_BUILD_TYPE=Release .. + + # For Visual Studio 2022 + cmake -T v143,host=x64 -A Win32 -D CMAKE_BUILD_TYPE=Release .. + + cmake --build . --config Release -- -m:6 + +It will generate two executables inside ``./bin/Release/``: + + - ``sherpa-ncnn.exe``: For decoding a single wave file. + - ``sherpa-ncnn-microphone.exe``: For real-time speech recognition from a microphone + +That's it! + +Please read :ref:`sherpa-ncnn-pre-trained-models` for usages about +the generated binaries. + +Please create an issue at ``_ +if you have any problems. diff --git a/docs/source/ncnn/ios/build-sherpa-ncnn-swift.rst b/docs/source/ncnn/ios/build-sherpa-ncnn-swift.rst new file mode 100644 index 000000000..aea0abdf1 --- /dev/null +++ b/docs/source/ncnn/ios/build-sherpa-ncnn-swift.rst @@ -0,0 +1,351 @@ +Build sherpa-ncnn for iOS +========================= + +This section describes how to build `sherpa-ncnn`_ for ``iPhone`` and ``iPad``. + +Requirement +----------- + +.. warning:: + + The minimum deployment requires the iOS version ``>= 13.0``. + + +Before we continue, please make sure the following requirements are satisfied: + +- macOS. It won't work on Windows or Linux. +- Xcode. The version ``14.2 (14C18)`` is known to work. Other versions may also work. +- CMake. CMake 3.25.1 is known to work. Other versions may also work. +- (Optional) iPhone or iPad. This is for testing the app on your device. + If you don't have a device, you can still run the app within a simulator on your Mac. + +.. caution:: + + If you get the following error:: + + CMake Error at toolchains/ios.toolchain.cmake:544 (get_filename_component): + get_filename_component called with incorrect number of arguments + Call Stack (most recent call first): + /usr/local/Cellar/cmake/3.29.0/share/cmake/Modules/CMakeDetermineSystem.cmake:146 (include) + CMakeLists.txt:2 (project) + + please run:: + + sudo xcode-select --install + sudo xcodebuild -license + + And then delete the build directory ``./build-ios`` and re-build. + + Please see also ``_. + +Download sherpa-ncnn +-------------------- + +First, let us download the source code of `sherpa-ncnn`_. + +.. note:: + + In the following, I will download `sherpa-ncnn`_ to + ``$HOME/open-source``, i.e., ``/Users/fangjun/open-source``, on my Mac. + + You can put it anywhere as you like. + +.. code-block:: bash + + mkdir -p $HOME/open-source + cd $HOME/open-source + git clone https://github.com/k2-fsa/sherpa-ncnn + +Build sherpa-ncnn (in commandline, C++ Part) +-------------------------------------------- + +After downloading `sherpa-ncnn`_, let us build the C++ part of `sherpa-ncnn`_. + +.. code-block:: bash + + cd $HOME/open-source/sherpa-ncnn/ + ./build-ios.sh + +It will generate a directory +``$HOME/open-source/sherpa-ncnn/build-ios``, which we have already pre-configured +for you in Xcode. + +.. hint:: + + You don't have to look at the generated files in ``$HOME/open-source/sherpa-ncnn/build-ios`` + to build an app. We have pre-configured it for you. 
+ + If you are eager to learn more about the generated files or want to use + `sherpa-ncnn`_ in your own iOS project, please have a look + at :ref:`sherpa_ncnn_ios_swift_for_the_more_curious`. + + +Build sherpa-ncnn (in Xcode) +---------------------------- + +Use the following command to open `sherpa-ncnn`_ in Xcode: + +.. code-block:: bash + + cd $HOME/open-source/sherpa-ncnn/ios-swift/SherpaNcnn + open SherpaNcnn.xcodeproj + +It will start Xcode and you will see the following screenshot: + + .. figure:: ./pic/start-xcode-for-sherpa-ncnn.png + :alt: Screenshot after running the command ``open SherpaNcnn.xcodeproj`` + :width: 600 + :align: center + + Screenshot after running the command ``open SherpaNcnn.xcodeproj`` + +Please select ``Product -> Build`` to build the project. See the screenshot +below: + + .. figure:: ./pic/select-product-build.png + :alt: Screenshot for selecting ``Product -> Build`` + :width: 600 + :align: center + + Screenshot for selecting ``Product -> Build`` + +After finishing the build, you should see the following screenshot: + + .. figure:: ./pic/after-finishing-build.png + :alt: Screenshot after finishing the build. + :width: 100 + :align: center + + Screenshot after finishing the build. + +Congratulations! You have successfully built the project. Let us run the +project by selecting ``Product -> Run``, which is shown in the following +screenshot: + + .. figure:: ./pic/run-the-project.png + :alt: Screenshot for ``Product -> Run``. + :width: 600 + :align: center + + Screenshot for ``Product -> Run``. + +Please wait for a few seconds before Xcode starts the simulator. + +Unfortunately, it will throw the following error: + + .. figure:: ./pic/error-no-model.png + :alt: Screenshot for the error + :width: 600 + :align: center + + Screenshot for the error + +The reason for the above error is that we have not provided the pre-trained +model yet. + +The file `ViewController.swift `_ +pre-selects the pre-trained model to be :ref:`sherpa-ncnn-mixed-english-chinese-conv-emformer-model`, +shown in the screenshot below: + + .. figure:: ./pic/pre-trained-model-1.png + :alt: Screenshot for the pre-selected pre-trained model + :width: 600 + :align: center + + Screenshot for the pre-selected pre-trained model + +Let us add the pre-trained model :ref:`sherpa-ncnn-mixed-english-chinese-conv-emformer-model` +to Xcode. Please follow :ref:`sherpa-ncnn-mixed-english-chinese-conv-emformer-model` +to download it from `huggingface `_. +You can download it to any directory as you like. + +Please right click the project ``SherpaNcnn`` and select ``Add Files to "SherpaNcnn"...`` +in the popup menu, as is shown in the screenshot below: + + .. figure:: ./pic/step-to-add-pre-trained-model-1.png + :alt: Screenshot for adding files to SherpaNcnn + :width: 600 + :align: center + + Screenshot for adding files to SherpaNcnn + +In the popup dialog, switch to the folder where you just downloaded the pre-trained +model. + +In the screenshot below, it is the +folder ``/Users/fangjun/open-source/icefall-models/sherpa-ncnn-conv-emformer-transducer-2022-12-06``: + + .. figure:: ./pic/step-to-add-pre-trained-model-2.png + :alt: Screenshot for navigating to the folder containing the downloaded pre-trained + :width: 600 + :align: center + + Screenshot for navigating to the folder containing the downloaded pre-trained + +Select required files and click the button ``Add``: + + .. 
figure:: ./pic/step-to-add-pre-trained-model-3.png + :alt: Screenshot for selecting required files + :width: 600 + :align: center + + Screenshot for selecting required files + +After adding pre-trained model files to Xcode, you should see the following +screenshot: + + .. figure:: ./pic/step-to-add-pre-trained-model-4.png + :alt: Screenshot after add pre-trained model files + :width: 600 + :align: center + + Screenshot after add pre-trained model files + +At this point, you should be able to select the menu ``Product -> Run`` +to run the project and you should finally see the following screenshot: + + .. figure:: ./pic/run.png + :alt: Screenshot for a successful run. + :width: 600 + :align: center + + Screenshot for a successful run. + +Click the button to start recording! A screenshot is given below: + + .. figure:: ./pic/run-2.png + :alt: Screenshot for recording and recognition. + :width: 600 + :align: center + + Screenshot for recording and recognition. + +Congratulations! You have finally succeeded in running `sherpa-ncnn`_ with iOS, +though it is in a simulator. + +Please read below if you want to run `sherpa-ncnn`_ on your iPhone or iPad. + +Run sherpa-ncnn on your iPhone/iPad +----------------------------------- + +First, please make sure the iOS version of your iPhone/iPad is ``>= 13.0``. + +Click the menu ``Xcode -> Settings...``, as is shown in the following screenshot: + + .. figure:: ./pic/xcode-settings.png + :alt: Screenshot for ``Xcode -> Settings...`` + :width: 600 + :align: center + + Screenshot for ``Xcode -> Settings...`` + +In the popup dialog, please select ``Account`` and click ``+`` to add +your Apple ID, as is shown in the following ``screenshots``. + + .. figure:: ./pic/add-an-account.png + :alt: Screenshot for selecting ``Account`` and click ``+``. + :width: 600 + :align: center + + Screenshot for selecting ``Account`` and click ``+``. + + .. figure:: ./pic/add-an-account-2.png + :alt: Screenshot for selecting ``Apple ID`` and click ``Continue`` + :width: 600 + :align: center + + Screenshot for selecting ``Apple ID`` and click ``Continue`` + + .. figure:: ./pic/add-an-account-3.png + :alt: Screenshot for adding your Apple ID and click ``Next`` + :width: 600 + :align: center + + Screenshot for adding your Apple ID and click ``Next`` + + .. figure:: ./pic/add-an-account-4.png + :alt: Screenshot for entering your password and click ``Next`` + :width: 600 + :align: center + + Screenshot for entering your password and click ``Next`` + + .. figure:: ./pic/add-an-account-5.png + :alt: Screenshot after adding your Apple ID + :width: 600 + :align: center + + Screenshot after adding your Apple ID + +After adding your Apple ID, please connect your iPhone or iPad to your Mac +and select your device in Xcode. The following screenshot is an example +to select my iPhone. + + .. figure:: ./pic/select-device.png + :alt: Screenshot for selecting your device + :width: 600 + :align: center + + Screenshot for selecting your device + +Now your Xcode should look like below after selecting a device: + + .. figure:: ./pic/select-device-2.png + :alt: Screenshot after selecting your device + :width: 600 + :align: center + + Screenshot after selecting your device + +Please select ``Product -> Run`` again to run `sherpa-ncnn`_ on your selected +device, as is shown in the following screenshot: + + .. 
figure:: ./pic/run-3.png + :alt: Screenshot for selecting ``Product -> Run`` + :width: 600 + :align: center + + Screenshot for selecting ``Product -> Run`` + +After a successful build, check your iPhone/iPad and you should see the following +screenshot: + + .. figure:: ./pic/run-4.jpg + :alt: Screenshot for running sherpa-ncnn on your device + :width: 300 + :align: center + + Screenshot for running sherpa-ncnn on your device + +To fix that, please select ``Settings -> General -> Device Management`` +on your device + + .. figure:: ./pic/run-5.jpg + :alt: Screenshot for selecting `Settings -> General -> Device Management` on your device + :width: 300 + :align: center + + Screenshot for selecting `Settings -> General -> Device Management` on your device + +Please click ``Apple Development: csukuangfj...`` and click ``Trust "Apple Development: csukuangfj@g..."`` +in the subsequent dialog, as is shown below: + + .. figure:: ./pic/run-6.jpg + :alt: Screenshot for "Trust "Apple Development: csukuangfj@g..."" + :width: 300 + :align: center + + Screenshot for "Trust "Apple Development: csukuangfj@g..."" + +At this point, you should be able to run the app on your device. The following is a screenshot +about running it on my iPhone: + + .. figure:: ./pic/run-7.jpg + :alt: Screenshot for running `sherpa-ncnn`_ on iPhone + :width: 300 + :align: center + + Screenshot for running `sherpa-ncnn`_ on iPhone + + +Congratulations! You have successfully run `sherpa-ncnn`_ on your device! diff --git a/docs/source/ncnn/ios/demo-videos.rst b/docs/source/ncnn/ios/demo-videos.rst new file mode 100644 index 000000000..9577ad159 --- /dev/null +++ b/docs/source/ncnn/ios/demo-videos.rst @@ -0,0 +1,22 @@ +.. _sherpa-ncnn-ios-video-demos: + +Video demos +=========== + +In this page, we list some videos about using `sherpa-ncnn`_ for +real-time speech recognition on ``iPhone`` and ``iPad``. + + +Video 1: Chinese + English on iPhone 14 Pro (simulator) +------------------------------------------------------- + +.. raw:: html + + + +Video 2: Chinese + English on iPad 11 Pro (simulator) +----------------------------------------------------- + +.. raw:: html + + diff --git a/docs/source/ncnn/ios/for-the-more-curious-swift.rst b/docs/source/ncnn/ios/for-the-more-curious-swift.rst new file mode 100644 index 000000000..e2c63e2f9 --- /dev/null +++ b/docs/source/ncnn/ios/for-the-more-curious-swift.rst @@ -0,0 +1,245 @@ +.. _sherpa_ncnn_ios_swift_for_the_more_curious: + +For the more curious +==================== + +This section is for those who want to learn more about how to use +`sherpa-ncnn`_ in an iOS project. + +Files generated by running ./build-ios.sh +----------------------------------------- + +After running: + +.. code-block:: bash + + ./build-ios.sh + +You may be curious about the generated files. + +.. hint:: + + Please have a look at ``./build-ios.sh`` so that you know what it does for you. + +The above command generates files inside the directory ``./build-ios``: + +.. 
code-block:: bash + + sherpa-ncnn fangjun$ ls -lh build-ios + total 1912 + drwxr-xr-x 6 fangjun staff 192B Feb 26 16:48 build + drwxr-xr-x 4 fangjun staff 128B Feb 26 16:46 install + drwxr-xr-x 15 fangjun staff 480B Feb 14 18:09 openmp-11.0.0.src + -rw-r--r-- 1 fangjun staff 952K Dec 8 2021 openmp-11.0.0.src.tar.xz + drwxr-xr-x 6 fangjun staff 192B Feb 26 16:44 openmp.xcframework + drwxr-xr-x 6 fangjun staff 192B Feb 26 16:48 sherpa-ncnn.xcframework + +What is interesting here is the two framework folders ``openmp.xcframework`` +and ``sherpa-ncnn.xcframework``. +All other folders can be safely removed. We only need the two framework folders. + + +In the following, we describe the content in these two framework folders. + +openmp.xcframework +~~~~~~~~~~~~~~~~~~ + +.. code-block:: + + $ tree build-ios/openmp.xcframework/ + build-ios/openmp.xcframework/ + ├── Headers + │   └── omp.h + ├── Info.plist + ├── ios-arm64 + │   └── libomp.a + └── ios-arm64_x86_64-simulator + └── libomp.a + + 3 directories, 4 files + +**Explanation**: + + - ``omp.h``: The header file, which is used by `ncnn`_ + - ``Info.plist``: A file that is dedicated for framework on macOS/iOS + - ``ios-arm64/libopm.a``: A static library for iOS device, e.g., for iPhone + - ``ios-arm64_x86_64-simulator/libomp.a``: A static library for iOS + simulators, including simulators for Intel chips and Apple Silicon (e.g., M1) + +.. code-block:: bash + + sherpa-ncnn fangjun$ file build-ios/openmp.xcframework/ios-arm64_x86_64-simulator/libomp.a + build-ios/openmp.xcframework/ios-arm64_x86_64-simulator/libomp.a: Mach-O universal binary with 2 architectures: [x86_64:current ar archive random library] [arm64:current ar archive random library] + build-ios/openmp.xcframework/ios-arm64_x86_64-simulator/libomp.a (for architecture x86_64): current ar archive random library + build-ios/openmp.xcframework/ios-arm64_x86_64-simulator/libomp.a (for architecture arm64): current ar archive random library + + sherpa-ncnn fangjun$ lipo -info build-ios/openmp.xcframework/ios-arm64_x86_64-simulator/libomp.a + Architectures in the fat file: build-ios/openmp.xcframework/ios-arm64_x86_64-simulator/libomp.a are: x86_64 arm64 + + sherpa-ncnn fangjun$ file build-ios/openmp.xcframework/ios-arm64/libomp.a + build-ios/openmp.xcframework/ios-arm64/libomp.a: current ar archive random library + + sherpa-ncnn fangjun$ lipo -info build-ios/openmp.xcframework/ios-arm64/libomp.a + Non-fat file: build-ios/openmp.xcframework/ios-arm64/libomp.a is architecture: arm64 + +sherpa-ncnn.xcframework +~~~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: bash + + sherpa-ncnn fangjun$ tree build-ios/sherpa-ncnn.xcframework/ + build-ios/sherpa-ncnn.xcframework/ + ├── Headers + │   └── sherpa-ncnn + │   └── c-api + │   └── c-api.h + ├── Info.plist + ├── ios-arm64 + │   └── sherpa-ncnn.a + └── ios-arm64_x86_64-simulator + └── sherpa-ncnn.a + + 5 directories, 4 files + +**Explanation**: + + - ``c-api.h``: The header file, which is copied from + ``_ + - ``Info.plist``: A file that is dedicated for framework on macOS/iOS + - ``ios-arm64/sherpa-ncnn.a``: A static library for iOS, e.g., iPhone + - ``ios-arm64_x86_64-simulator/sherpa-ncnn.a``: A static library for + simulators, including simulators for Intel chips and Apple Silicon (e.g., M1) + +.. 
code-block:: bash + + sherpa-ncnn fangjun$ file build-ios/sherpa-ncnn.xcframework/ios-arm64_x86_64-simulator/sherpa-ncnn.a + build-ios/sherpa-ncnn.xcframework/ios-arm64_x86_64-simulator/sherpa-ncnn.a: Mach-O universal binary with 2 architectures: [x86_64:current ar archive] [arm64] + build-ios/sherpa-ncnn.xcframework/ios-arm64_x86_64-simulator/sherpa-ncnn.a (for architecture x86_64): current ar archive + build-ios/sherpa-ncnn.xcframework/ios-arm64_x86_64-simulator/sherpa-ncnn.a (for architecture arm64): current ar archive + + sherpa-ncnn fangjun$ lipo -info build-ios/sherpa-ncnn.xcframework/ios-arm64_x86_64-simulator/sherpa-ncnn.a + Architectures in the fat file: build-ios/sherpa-ncnn.xcframework/ios-arm64_x86_64-simulator/sherpa-ncnn.a are: x86_64 arm64 + + sherpa-ncnn fangjun$ file build-ios/sherpa-ncnn.xcframework/ios-arm64/sherpa-ncnn.a + build-ios/sherpa-ncnn.xcframework/ios-arm64/sherpa-ncnn.a: current ar archive + + sherpa-ncnn fangjun$ lipo -info build-ios/sherpa-ncnn.xcframework/ios-arm64/sherpa-ncnn.a + Non-fat file: build-ios/sherpa-ncnn.xcframework/ios-arm64/sherpa-ncnn.a is architecture: arm64 + +How to use files generated by ./build-ios.sh in Xcode +----------------------------------------------------- + +In this section, we describe how to use ``openmp.xcframework`` and +``sherpa-ncnn.xcframework`` in Xcode. + +The underlying implementation of `sherpa-ncnn`_ is in C++. It also provides +`C API `_. + +To use ``C API`` in Xcode with `Swift `_, we have to +write a `bridging header `_. + +We provide a bridging header for you: `SherpaNcnn-Bridging-Header.h `_. All you need is to add this file to your iOS project +and click ``Build Settings -> Swift Compiler - General`` and set ``Objective-C Bridging Header`` +to ``${PROJECT_DIR}/../../swift-api-examples/SherpaNcnn-Bridging-Header.h``. See +the screenshot below for reference: + + .. figure:: ./pic/set-bridging-header.png + :alt: Screenshot for setting the bridging header + :width: 600 + :align: center + + Screenshot for setting the bridging header + +We list the content of the bridging header below for reference: + +.. code-block:: swift + + #ifndef SWIFT_API_EXAMPLES_SHERPANCNN_BRIDGING_HEADER_H_ + #define SWIFT_API_EXAMPLES_SHERPANCNN_BRIDGING_HEADER_H_ + + #import "sherpa-ncnn/c-api/c-api.h" + + #endif // SWIFT_API_EXAMPLES_SHERPANCNN_BRIDGING_HEADER_H_ + +After adding the bridging header to your iOS project, Xcode will complain +it cannot find ``sherpa-ncnn/c-api/c-api.h``. The fix is to add the path +``build-ios/sherpa-ncnn.xcframework/Headers`` to ``Header Search Paths`` by changing +``Build Settings -> Search Paths -> Header Search Paths``, as is shown in the +following screenshot: + + .. figure:: ./pic/header-search-path.png + :alt: Screenshot for setting the header search paths + :width: 600 + :align: center + + Screenshot for setting the header search paths + +.. hint:: + + Instead of using an absolute path, we use + ``${PROJECT_DIR}/../../build-ios/sherpa-ncnn.xcframework/Headers/`` + + For instance, my `sherpa-ncnn`_ is downloaded to + ``/Users/fangjun/open-source/sherpa-ncnn`` and the path to ``sherpa-ncnn.xcframework`` + is ``/Users/fangjun/open-source/sherpa-ncnn/build-ios/sherpa-ncnn.xcframework``. + + The value of ``PROJECT_DIR`` is + ``/Users/fangjun/open-source/sherpa-ncnn/ios-swift/SherpaNcnn``, so + we can use ``${PROJECT_DIR}/../../build-ios/sherpa-ncnn.xcframework/Headers/``. + + Also note that ``PROJECT_DIR`` is a pre-defined variable in Xcode. 
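If Xcode still complains that it cannot find ``sherpa-ncnn/c-api/c-api.h`` after setting ``Header Search Paths``, it helps to verify that the relative path actually resolves to the header on disk. The following is a minimal sketch assuming the layout used on this page (i.e., `sherpa-ncnn`_ cloned to ``$HOME/open-source``); adjust ``PROJECT_DIR`` to your own checkout:

.. code-block:: bash

   # PROJECT_DIR below mimics the value Xcode uses for this example project;
   # it is an assumption based on the layout described on this page.
   PROJECT_DIR=$HOME/open-source/sherpa-ncnn/ios-swift/SherpaNcnn

   # After running ./build-ios.sh, the expanded header search path
   # ${PROJECT_DIR}/../../build-ios/sherpa-ncnn.xcframework/Headers/
   # should contain the C API header:
   ls "$PROJECT_DIR/../../build-ios/sherpa-ncnn.xcframework/Headers/sherpa-ncnn/c-api/c-api.h"

If the ``ls`` command fails, either ``./build-ios.sh`` has not been run yet or the relative path in ``Header Search Paths`` does not match where you cloned `sherpa-ncnn`_.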
+ +Please also add `SherpaNcnn.swift `_ +to your iOS project, which is a utility to make it easier to access functions from the bridging header. + +The next thing is to add ``openmp.xcframework`` and ``sherpa-ncnn.xcframework`` +as dependencies to you iOS project. Select ``Build Phases -> Link Binary with Libraries`` +and then click ``+`` to add ``sherpa-ncnn.xcframework`` and ``openmp.xcframework``. +See the screenshot below for reference: + + .. figure:: ./pic/add-framework-to-your-project.png + :alt: Screenshot for adding a framework to your project + :width: 600 + :align: center + + Screenshot for adding a framework to your project + +.. hint:: + + After clicking ``+``, please select ``Add Other... -> Add Files..``, and then + add the path to ``sherpa-ncnn.xcframework``. + + Repeat the step for ``openmp.xcframework``. + + See the screenshot below for reference: + + .. figure:: ./pic/add-other.png + :alt: Screenshot for adding a framework + :width: 300 + :align: center + + Screenshot for adding a framework + +One more thing you need to do after adding the framework is to setup the framework +search path. Click ``Build Settings -> Search Paths -> Framework Search Paths`` +and add the path to ``build-ios/``. See the screenshot below: + + .. figure:: ./pic/set-framework-search-path.png + :alt: Screenshot for setting framework search paths + :width: 600 + :align: center + + Screenshot for setting framework search paths + +If you encounter link errors about the C++ standard library, please add +``-lc++`` to link against ``libc++`` by clicking ``Build Settings -> Linking -> Other Linker Flags`` +and adding ``-lc++``. See the screenshot below for reference: + + .. figure:: ./pic/link-libc++.png + :alt: Screenshot for adding ``-lc++`` to linker flags + :width: 600 + :align: center + + Screenshot for adding ``-lc++`` to linker flags + +That is all you need to add `sherpa-ncnn`_ to your iOS project. diff --git a/docs/source/ncnn/ios/index.rst b/docs/source/ncnn/ios/index.rst new file mode 100644 index 000000000..b46c34a63 --- /dev/null +++ b/docs/source/ncnn/ios/index.rst @@ -0,0 +1,22 @@ +.. _sherpa-ncnn-ios: + +iOS +=== + +In this section, we describe how to build an iOS app for ``real-time`` speech +recognition with `sherpa-ncnn`_ and run it within a simulator on your Mac, +run it on you iPhone or iPad. + +We also provide video demos for real-time speech recognition. + +.. hint:: + + During speech recognition, it does not need to access the Internet. + Everyting is processed locally on your iPhone or iPad. + +.. 
toctree:: + :maxdepth: 3 + + demo-videos + build-sherpa-ncnn-swift + for-the-more-curious-swift diff --git a/docs/source/ncnn/ios/pic/add-an-account-2.png b/docs/source/ncnn/ios/pic/add-an-account-2.png new file mode 100644 index 000000000..55fec26a7 Binary files /dev/null and b/docs/source/ncnn/ios/pic/add-an-account-2.png differ diff --git a/docs/source/ncnn/ios/pic/add-an-account-3.png b/docs/source/ncnn/ios/pic/add-an-account-3.png new file mode 100644 index 000000000..f8b0a5f49 Binary files /dev/null and b/docs/source/ncnn/ios/pic/add-an-account-3.png differ diff --git a/docs/source/ncnn/ios/pic/add-an-account-4.png b/docs/source/ncnn/ios/pic/add-an-account-4.png new file mode 100644 index 000000000..2181f535e Binary files /dev/null and b/docs/source/ncnn/ios/pic/add-an-account-4.png differ diff --git a/docs/source/ncnn/ios/pic/add-an-account-5.png b/docs/source/ncnn/ios/pic/add-an-account-5.png new file mode 100644 index 000000000..43595ed90 Binary files /dev/null and b/docs/source/ncnn/ios/pic/add-an-account-5.png differ diff --git a/docs/source/ncnn/ios/pic/add-an-account.png b/docs/source/ncnn/ios/pic/add-an-account.png new file mode 100644 index 000000000..2ae37a54a Binary files /dev/null and b/docs/source/ncnn/ios/pic/add-an-account.png differ diff --git a/docs/source/ncnn/ios/pic/add-framework-to-your-project.png b/docs/source/ncnn/ios/pic/add-framework-to-your-project.png new file mode 100644 index 000000000..c0ed3e70c Binary files /dev/null and b/docs/source/ncnn/ios/pic/add-framework-to-your-project.png differ diff --git a/docs/source/ncnn/ios/pic/add-other.png b/docs/source/ncnn/ios/pic/add-other.png new file mode 100644 index 000000000..fed3f8adb Binary files /dev/null and b/docs/source/ncnn/ios/pic/add-other.png differ diff --git a/docs/source/ncnn/ios/pic/after-finishing-build.png b/docs/source/ncnn/ios/pic/after-finishing-build.png new file mode 100644 index 000000000..7b8ed2379 Binary files /dev/null and b/docs/source/ncnn/ios/pic/after-finishing-build.png differ diff --git a/docs/source/ncnn/ios/pic/error-no-model.png b/docs/source/ncnn/ios/pic/error-no-model.png new file mode 100644 index 000000000..7c233f26a Binary files /dev/null and b/docs/source/ncnn/ios/pic/error-no-model.png differ diff --git a/docs/source/ncnn/ios/pic/header-search-path.png b/docs/source/ncnn/ios/pic/header-search-path.png new file mode 100644 index 000000000..e2a9ca7fb Binary files /dev/null and b/docs/source/ncnn/ios/pic/header-search-path.png differ diff --git a/docs/source/ncnn/ios/pic/link-libc++.png b/docs/source/ncnn/ios/pic/link-libc++.png new file mode 100644 index 000000000..de7d1f2b4 Binary files /dev/null and b/docs/source/ncnn/ios/pic/link-libc++.png differ diff --git a/docs/source/ncnn/ios/pic/pre-trained-model-1.png b/docs/source/ncnn/ios/pic/pre-trained-model-1.png new file mode 100644 index 000000000..1544830a2 Binary files /dev/null and b/docs/source/ncnn/ios/pic/pre-trained-model-1.png differ diff --git a/docs/source/ncnn/ios/pic/run-2.png b/docs/source/ncnn/ios/pic/run-2.png new file mode 100644 index 000000000..2853cfc6e Binary files /dev/null and b/docs/source/ncnn/ios/pic/run-2.png differ diff --git a/docs/source/ncnn/ios/pic/run-3.png b/docs/source/ncnn/ios/pic/run-3.png new file mode 100644 index 000000000..5a5ce5b57 Binary files /dev/null and b/docs/source/ncnn/ios/pic/run-3.png differ diff --git a/docs/source/ncnn/ios/pic/run-4.jpg b/docs/source/ncnn/ios/pic/run-4.jpg new file mode 100644 index 000000000..1574a1ddd Binary files /dev/null and 
b/docs/source/ncnn/ios/pic/run-4.jpg differ diff --git a/docs/source/ncnn/ios/pic/run-5.jpg b/docs/source/ncnn/ios/pic/run-5.jpg new file mode 100644 index 000000000..8461a0d7c Binary files /dev/null and b/docs/source/ncnn/ios/pic/run-5.jpg differ diff --git a/docs/source/ncnn/ios/pic/run-6.jpg b/docs/source/ncnn/ios/pic/run-6.jpg new file mode 100644 index 000000000..9cfd4a33c Binary files /dev/null and b/docs/source/ncnn/ios/pic/run-6.jpg differ diff --git a/docs/source/ncnn/ios/pic/run-7.jpg b/docs/source/ncnn/ios/pic/run-7.jpg new file mode 100644 index 000000000..c522f097b Binary files /dev/null and b/docs/source/ncnn/ios/pic/run-7.jpg differ diff --git a/docs/source/ncnn/ios/pic/run-the-project.png b/docs/source/ncnn/ios/pic/run-the-project.png new file mode 100644 index 000000000..e5c6bb42b Binary files /dev/null and b/docs/source/ncnn/ios/pic/run-the-project.png differ diff --git a/docs/source/ncnn/ios/pic/run.png b/docs/source/ncnn/ios/pic/run.png new file mode 100644 index 000000000..350b6db8a Binary files /dev/null and b/docs/source/ncnn/ios/pic/run.png differ diff --git a/docs/source/ncnn/ios/pic/select-device-2.png b/docs/source/ncnn/ios/pic/select-device-2.png new file mode 100644 index 000000000..0b7b5add2 Binary files /dev/null and b/docs/source/ncnn/ios/pic/select-device-2.png differ diff --git a/docs/source/ncnn/ios/pic/select-device.png b/docs/source/ncnn/ios/pic/select-device.png new file mode 100644 index 000000000..c2acbe6d8 Binary files /dev/null and b/docs/source/ncnn/ios/pic/select-device.png differ diff --git a/docs/source/ncnn/ios/pic/select-product-build.png b/docs/source/ncnn/ios/pic/select-product-build.png new file mode 100644 index 000000000..3ad430fe1 Binary files /dev/null and b/docs/source/ncnn/ios/pic/select-product-build.png differ diff --git a/docs/source/ncnn/ios/pic/set-bridging-header.png b/docs/source/ncnn/ios/pic/set-bridging-header.png new file mode 100644 index 000000000..7cb23c7b8 Binary files /dev/null and b/docs/source/ncnn/ios/pic/set-bridging-header.png differ diff --git a/docs/source/ncnn/ios/pic/set-framework-search-path.png b/docs/source/ncnn/ios/pic/set-framework-search-path.png new file mode 100644 index 000000000..5ffe3ccdb Binary files /dev/null and b/docs/source/ncnn/ios/pic/set-framework-search-path.png differ diff --git a/docs/source/ncnn/ios/pic/start-xcode-for-sherpa-ncnn.png b/docs/source/ncnn/ios/pic/start-xcode-for-sherpa-ncnn.png new file mode 100644 index 000000000..fe87282a2 Binary files /dev/null and b/docs/source/ncnn/ios/pic/start-xcode-for-sherpa-ncnn.png differ diff --git a/docs/source/ncnn/ios/pic/step-to-add-pre-trained-model-1.png b/docs/source/ncnn/ios/pic/step-to-add-pre-trained-model-1.png new file mode 100644 index 000000000..196dde51e Binary files /dev/null and b/docs/source/ncnn/ios/pic/step-to-add-pre-trained-model-1.png differ diff --git a/docs/source/ncnn/ios/pic/step-to-add-pre-trained-model-2.png b/docs/source/ncnn/ios/pic/step-to-add-pre-trained-model-2.png new file mode 100644 index 000000000..8fbab6a13 Binary files /dev/null and b/docs/source/ncnn/ios/pic/step-to-add-pre-trained-model-2.png differ diff --git a/docs/source/ncnn/ios/pic/step-to-add-pre-trained-model-3.png b/docs/source/ncnn/ios/pic/step-to-add-pre-trained-model-3.png new file mode 100644 index 000000000..b8d584f70 Binary files /dev/null and b/docs/source/ncnn/ios/pic/step-to-add-pre-trained-model-3.png differ diff --git a/docs/source/ncnn/ios/pic/step-to-add-pre-trained-model-4.png 
b/docs/source/ncnn/ios/pic/step-to-add-pre-trained-model-4.png new file mode 100644 index 000000000..7cd845a0a Binary files /dev/null and b/docs/source/ncnn/ios/pic/step-to-add-pre-trained-model-4.png differ diff --git a/docs/source/ncnn/ios/pic/xcode-settings.png b/docs/source/ncnn/ios/pic/xcode-settings.png new file mode 100644 index 000000000..dfb6e7f98 Binary files /dev/null and b/docs/source/ncnn/ios/pic/xcode-settings.png differ diff --git a/docs/source/ncnn/pic/rule1.png b/docs/source/ncnn/pic/rule1.png new file mode 100644 index 000000000..9273d3699 Binary files /dev/null and b/docs/source/ncnn/pic/rule1.png differ diff --git a/docs/source/ncnn/pic/rule2.png b/docs/source/ncnn/pic/rule2.png new file mode 100644 index 000000000..d606e0881 Binary files /dev/null and b/docs/source/ncnn/pic/rule2.png differ diff --git a/docs/source/ncnn/pic/rule3.png b/docs/source/ncnn/pic/rule3.png new file mode 100644 index 000000000..01a5735a4 Binary files /dev/null and b/docs/source/ncnn/pic/rule3.png differ diff --git a/docs/source/ncnn/pretrained_models/code-lstm/2022-09-05.txt b/docs/source/ncnn/pretrained_models/code-lstm/2022-09-05.txt new file mode 100644 index 000000000..5d4ca92c4 --- /dev/null +++ b/docs/source/ncnn/pretrained_models/code-lstm/2022-09-05.txt @@ -0,0 +1,16 @@ +ModelConfig(encoder_param="./sherpa-ncnn-2022-09-05/encoder_jit_trace-pnnx.ncnn.param", encoder_bin="./sherpa-ncnn-2022-09-05/encoder_jit_trace-pnnx.ncnn.bin", decoder_param="./sherpa-ncnn-2022-09-05/decoder_jit_trace-pnnx.ncnn.param", decoder_bin="./sherpa-ncnn-2022-09-05/decoder_jit_trace-pnnx.ncnn.bin", joiner_param="./sherpa-ncnn-2022-09-05/joiner_jit_trace-pnnx.ncnn.param", joiner_bin="./sherpa-ncnn-2022-09-05/joiner_jit_trace-pnnx.ncnn.bin", tokens="./sherpa-ncnn-2022-09-05/tokens.txt", encoder num_threads=2, decoder num_threads=2, joiner num_threads=2) +DecoderConfig(method="greedy_search", num_active_paths=4, enable_endpoint=False, endpoint_config=EndpointConfig(rule1=EndpointRule(must_contain_nonsilence=False, min_trailing_silence=2.4, min_utterance_length=0), rule2=EndpointRule(must_contain_nonsilence=True, min_trailing_silence=1.4, min_utterance_length=0), rule3=EndpointRule(must_contain_nonsilence=False, min_trailing_silence=0, min_utterance_length=20))) +wav filename: ./sherpa-ncnn-2022-09-05/test_wavs/1089-134686-0001.wav +wav duration (s): 6.625 +Started! +Done! 
+Recognition result for ./sherpa-ncnn-2022-09-05/test_wavs/1089-134686-0001.wav + AFTER EARLY NIGHTFALL THE YELLOW LAMPS WOULD LIGHT UP HERE AND THERE THE SQUALID QUARTER OF THE BROTHELS +ModelConfig(encoder_param="./sherpa-ncnn-2022-09-05/encoder_jit_trace-pnnx.ncnn.param", encoder_bin="./sherpa-ncnn-2022-09-05/encoder_jit_trace-pnnx.ncnn.bin", decoder_param="./sherpa-ncnn-2022-09-05/decoder_jit_trace-pnnx.ncnn.param", decoder_bin="./sherpa-ncnn-2022-09-05/decoder_jit_trace-pnnx.ncnn.bin", joiner_param="./sherpa-ncnn-2022-09-05/joiner_jit_trace-pnnx.ncnn.param", joiner_bin="./sherpa-ncnn-2022-09-05/joiner_jit_trace-pnnx.ncnn.bin", tokens="./sherpa-ncnn-2022-09-05/tokens.txt", encoder num_threads=2, decoder num_threads=2, joiner num_threads=2) +DecoderConfig(method="modified_beam_search", num_active_paths=4, enable_endpoint=False, endpoint_config=EndpointConfig(rule1=EndpointRule(must_contain_nonsilence=False, min_trailing_silence=2.4, min_utterance_length=0), rule2=EndpointRule(must_contain_nonsilence=True, min_trailing_silence=1.4, min_utterance_length=0), rule3=EndpointRule(must_contain_nonsilence=False, min_trailing_silence=0, min_utterance_length=20))) +wav filename: ./sherpa-ncnn-2022-09-05/test_wavs/1089-134686-0001.wav +wav duration (s): 6.625 +Started! +Done! +Recognition result for ./sherpa-ncnn-2022-09-05/test_wavs/1089-134686-0001.wav + AFTER EARLY NIGHTFALL THE YELLOW LAMPS WOULD LIGHT UP HERE AND THERE THE SQUALID QUARTER OF THE BROTHELS diff --git a/docs/source/ncnn/pretrained_models/code-lstm/2022-09-30.txt b/docs/source/ncnn/pretrained_models/code-lstm/2022-09-30.txt new file mode 100644 index 000000000..3424e742f --- /dev/null +++ b/docs/source/ncnn/pretrained_models/code-lstm/2022-09-30.txt @@ -0,0 +1,16 @@ +ModelConfig(encoder_param="./sherpa-ncnn-2022-09-30/encoder_jit_trace-pnnx.ncnn.param", encoder_bin="./sherpa-ncnn-2022-09-30/encoder_jit_trace-pnnx.ncnn.bin", decoder_param="./sherpa-ncnn-2022-09-30/decoder_jit_trace-pnnx.ncnn.param", decoder_bin="./sherpa-ncnn-2022-09-30/decoder_jit_trace-pnnx.ncnn.bin", joiner_param="./sherpa-ncnn-2022-09-30/joiner_jit_trace-pnnx.ncnn.param", joiner_bin="./sherpa-ncnn-2022-09-30/joiner_jit_trace-pnnx.ncnn.bin", tokens="./sherpa-ncnn-2022-09-30/tokens.txt", encoder num_threads=2, decoder num_threads=2, joiner num_threads=2) +DecoderConfig(method="greedy_search", num_active_paths=4, enable_endpoint=False, endpoint_config=EndpointConfig(rule1=EndpointRule(must_contain_nonsilence=False, min_trailing_silence=2.4, min_utterance_length=0), rule2=EndpointRule(must_contain_nonsilence=True, min_trailing_silence=1.4, min_utterance_length=0), rule3=EndpointRule(must_contain_nonsilence=False, min_trailing_silence=0, min_utterance_length=20))) +wav filename: ./sherpa-ncnn-2022-09-30/test_wavs/0.wav +wav duration (s): 5.61462 +Started! +Done! 
+Recognition result for ./sherpa-ncnn-2022-09-30/test_wavs/0.wav +对我做了介绍那么我想说的是呢大家如果对我的研究感兴趣 +ModelConfig(encoder_param="./sherpa-ncnn-2022-09-30/encoder_jit_trace-pnnx.ncnn.param", encoder_bin="./sherpa-ncnn-2022-09-30/encoder_jit_trace-pnnx.ncnn.bin", decoder_param="./sherpa-ncnn-2022-09-30/decoder_jit_trace-pnnx.ncnn.param", decoder_bin="./sherpa-ncnn-2022-09-30/decoder_jit_trace-pnnx.ncnn.bin", joiner_param="./sherpa-ncnn-2022-09-30/joiner_jit_trace-pnnx.ncnn.param", joiner_bin="./sherpa-ncnn-2022-09-30/joiner_jit_trace-pnnx.ncnn.bin", tokens="./sherpa-ncnn-2022-09-30/tokens.txt", encoder num_threads=2, decoder num_threads=2, joiner num_threads=2) +DecoderConfig(method="modified_beam_search", num_active_paths=4, enable_endpoint=False, endpoint_config=EndpointConfig(rule1=EndpointRule(must_contain_nonsilence=False, min_trailing_silence=2.4, min_utterance_length=0), rule2=EndpointRule(must_contain_nonsilence=True, min_trailing_silence=1.4, min_utterance_length=0), rule3=EndpointRule(must_contain_nonsilence=False, min_trailing_silence=0, min_utterance_length=20))) +wav filename: ./sherpa-ncnn-2022-09-30/test_wavs/0.wav +wav duration (s): 5.61462 +Started! +Done! +Recognition result for ./sherpa-ncnn-2022-09-30/test_wavs/0.wav +对我做了介绍那么我想说的是呢大家如果对我的研究感兴趣 diff --git a/docs/source/ncnn/pretrained_models/code-zipformer/sherpa-ncnn-streaming-zipformer-bilingual-zh-en-2023-02-13-sherpa-ncnn.txt b/docs/source/ncnn/pretrained_models/code-zipformer/sherpa-ncnn-streaming-zipformer-bilingual-zh-en-2023-02-13-sherpa-ncnn.txt new file mode 100644 index 000000000..ce0b11cae --- /dev/null +++ b/docs/source/ncnn/pretrained_models/code-zipformer/sherpa-ncnn-streaming-zipformer-bilingual-zh-en-2023-02-13-sherpa-ncnn.txt @@ -0,0 +1,20 @@ +ModelConfig(encoder_param="./sherpa-ncnn-streaming-zipformer-bilingual-zh-en-2023-02-13/encoder_jit_trace-pnnx.ncnn.param", encoder_bin="./sherpa-ncnn-streaming-zipformer-bilingual-zh-en-2023-02-13/encoder_jit_trace-pnnx.ncnn.bin", decoder_param="./sherpa-ncnn-streaming-zipformer-bilingual-zh-en-2023-02-13/decoder_jit_trace-pnnx.ncnn.param", decoder_bin="./sherpa-ncnn-streaming-zipformer-bilingual-zh-en-2023-02-13/decoder_jit_trace-pnnx.ncnn.bin", joiner_param="./sherpa-ncnn-streaming-zipformer-bilingual-zh-en-2023-02-13/joiner_jit_trace-pnnx.ncnn.param", joiner_bin="./sherpa-ncnn-streaming-zipformer-bilingual-zh-en-2023-02-13/joiner_jit_trace-pnnx.ncnn.bin", tokens="./sherpa-ncnn-streaming-zipformer-bilingual-zh-en-2023-02-13/tokens.txt", encoder num_threads=2, decoder num_threads=2, joiner num_threads=2) +DecoderConfig(method="greedy_search", num_active_paths=4, enable_endpoint=False, endpoint_config=EndpointConfig(rule1=EndpointRule(must_contain_nonsilence=False, min_trailing_silence=2.4, min_utterance_length=0), rule2=EndpointRule(must_contain_nonsilence=True, min_trailing_silence=1.4, min_utterance_length=0), rule3=EndpointRule(must_contain_nonsilence=False, min_trailing_silence=0, min_utterance_length=20))) +wav filename: ./sherpa-ncnn-streaming-zipformer-bilingual-zh-en-2023-02-13/test_wavs/1.wav +wav duration (s): 5.1 +Started! +Done! 
+Recognition result for ./sherpa-ncnn-streaming-zipformer-bilingual-zh-en-2023-02-13/test_wavs/1.wav +这是第一种第二种叫呃与 ALWAYS ALWAYS什么意思啊 +Elapsed seconds: 0.598 s +Real time factor (RTF): 0.598 / 5.100 = 0.117 +ModelConfig(encoder_param="./sherpa-ncnn-streaming-zipformer-bilingual-zh-en-2023-02-13/encoder_jit_trace-pnnx.ncnn.param", encoder_bin="./sherpa-ncnn-streaming-zipformer-bilingual-zh-en-2023-02-13/encoder_jit_trace-pnnx.ncnn.bin", decoder_param="./sherpa-ncnn-streaming-zipformer-bilingual-zh-en-2023-02-13/decoder_jit_trace-pnnx.ncnn.param", decoder_bin="./sherpa-ncnn-streaming-zipformer-bilingual-zh-en-2023-02-13/decoder_jit_trace-pnnx.ncnn.bin", joiner_param="./sherpa-ncnn-streaming-zipformer-bilingual-zh-en-2023-02-13/joiner_jit_trace-pnnx.ncnn.param", joiner_bin="./sherpa-ncnn-streaming-zipformer-bilingual-zh-en-2023-02-13/joiner_jit_trace-pnnx.ncnn.bin", tokens="./sherpa-ncnn-streaming-zipformer-bilingual-zh-en-2023-02-13/tokens.txt", encoder num_threads=2, decoder num_threads=2, joiner num_threads=2) +DecoderConfig(method="modified_beam_search", num_active_paths=4, enable_endpoint=False, endpoint_config=EndpointConfig(rule1=EndpointRule(must_contain_nonsilence=False, min_trailing_silence=2.4, min_utterance_length=0), rule2=EndpointRule(must_contain_nonsilence=True, min_trailing_silence=1.4, min_utterance_length=0), rule3=EndpointRule(must_contain_nonsilence=False, min_trailing_silence=0, min_utterance_length=20))) +wav filename: ./sherpa-ncnn-streaming-zipformer-bilingual-zh-en-2023-02-13/test_wavs/1.wav +wav duration (s): 5.1 +Started! +Done! +Recognition result for ./sherpa-ncnn-streaming-zipformer-bilingual-zh-en-2023-02-13/test_wavs/1.wav +这是第一种第二种叫呃与 ALWAYS ALWAYS什么意思啊 +Elapsed seconds: 0.943 s +Real time factor (RTF): 0.943 / 5.100 = 0.185 diff --git a/docs/source/ncnn/pretrained_models/code-zipformer/sherpa-ncnn-streaming-zipformer-en-2023-02-13-sherpa-ncnn.txt b/docs/source/ncnn/pretrained_models/code-zipformer/sherpa-ncnn-streaming-zipformer-en-2023-02-13-sherpa-ncnn.txt new file mode 100644 index 000000000..7d23077aa --- /dev/null +++ b/docs/source/ncnn/pretrained_models/code-zipformer/sherpa-ncnn-streaming-zipformer-en-2023-02-13-sherpa-ncnn.txt @@ -0,0 +1,20 @@ +ModelConfig(encoder_param="./sherpa-ncnn-streaming-zipformer-en-2023-02-13/encoder_jit_trace-pnnx.ncnn.param", encoder_bin="./sherpa-ncnn-streaming-zipformer-en-2023-02-13/encoder_jit_trace-pnnx.ncnn.bin", decoder_param="./sherpa-ncnn-streaming-zipformer-en-2023-02-13/decoder_jit_trace-pnnx.ncnn.param", decoder_bin="./sherpa-ncnn-streaming-zipformer-en-2023-02-13/decoder_jit_trace-pnnx.ncnn.bin", joiner_param="./sherpa-ncnn-streaming-zipformer-en-2023-02-13/joiner_jit_trace-pnnx.ncnn.param", joiner_bin="./sherpa-ncnn-streaming-zipformer-en-2023-02-13/joiner_jit_trace-pnnx.ncnn.bin", tokens="./sherpa-ncnn-streaming-zipformer-en-2023-02-13/tokens.txt", encoder num_threads=2, decoder num_threads=2, joiner num_threads=2) +DecoderConfig(method="greedy_search", num_active_paths=4, enable_endpoint=False, endpoint_config=EndpointConfig(rule1=EndpointRule(must_contain_nonsilence=False, min_trailing_silence=2.4, min_utterance_length=0), rule2=EndpointRule(must_contain_nonsilence=True, min_trailing_silence=1.4, min_utterance_length=0), rule3=EndpointRule(must_contain_nonsilence=False, min_trailing_silence=0, min_utterance_length=20))) +wav filename: ./sherpa-ncnn-streaming-zipformer-en-2023-02-13/test_wavs/1221-135766-0002.wav +wav duration (s): 4.825 +Started! +Done! 
+Recognition result for ./sherpa-ncnn-streaming-zipformer-en-2023-02-13/test_wavs/1221-135766-0002.wav + YET THESE THOUGHTS AFFECTED HESTER PRYNNE LESS WITH HOPE THAN APPREHENSION +Elapsed seconds: 0.569 s +Real time factor (RTF): 0.569 / 4.825 = 0.118 +ModelConfig(encoder_param="./sherpa-ncnn-streaming-zipformer-en-2023-02-13/encoder_jit_trace-pnnx.ncnn.param", encoder_bin="./sherpa-ncnn-streaming-zipformer-en-2023-02-13/encoder_jit_trace-pnnx.ncnn.bin", decoder_param="./sherpa-ncnn-streaming-zipformer-en-2023-02-13/decoder_jit_trace-pnnx.ncnn.param", decoder_bin="./sherpa-ncnn-streaming-zipformer-en-2023-02-13/decoder_jit_trace-pnnx.ncnn.bin", joiner_param="./sherpa-ncnn-streaming-zipformer-en-2023-02-13/joiner_jit_trace-pnnx.ncnn.param", joiner_bin="./sherpa-ncnn-streaming-zipformer-en-2023-02-13/joiner_jit_trace-pnnx.ncnn.bin", tokens="./sherpa-ncnn-streaming-zipformer-en-2023-02-13/tokens.txt", encoder num_threads=2, decoder num_threads=2, joiner num_threads=2) +DecoderConfig(method="modified_beam_search", num_active_paths=4, enable_endpoint=False, endpoint_config=EndpointConfig(rule1=EndpointRule(must_contain_nonsilence=False, min_trailing_silence=2.4, min_utterance_length=0), rule2=EndpointRule(must_contain_nonsilence=True, min_trailing_silence=1.4, min_utterance_length=0), rule3=EndpointRule(must_contain_nonsilence=False, min_trailing_silence=0, min_utterance_length=20))) +wav filename: ./sherpa-ncnn-streaming-zipformer-en-2023-02-13/test_wavs/1221-135766-0002.wav +wav duration (s): 4.825 +Started! +Done! +Recognition result for ./sherpa-ncnn-streaming-zipformer-en-2023-02-13/test_wavs/1221-135766-0002.wav + YET THESE THOUGHTS AFFECTED HESTER PRYNNE LESS WITH HOPE THAN APPREHENSION +Elapsed seconds: 0.554 s +Real time factor (RTF): 0.554 / 4.825 = 0.115 diff --git a/docs/source/ncnn/pretrained_models/code-zipformer/sherpa-ncnn-streaming-zipformer-fr-2023-04-14.txt b/docs/source/ncnn/pretrained_models/code-zipformer/sherpa-ncnn-streaming-zipformer-fr-2023-04-14.txt new file mode 100644 index 000000000..7adaf2ba4 --- /dev/null +++ b/docs/source/ncnn/pretrained_models/code-zipformer/sherpa-ncnn-streaming-zipformer-fr-2023-04-14.txt @@ -0,0 +1,20 @@ +RecognizerConfig(feat_config=FeatureExtractorConfig(sampling_rate=16000, feature_dim=80), model_config=ModelConfig(encoder_param="./sherpa-ncnn-streaming-zipformer-fr-2023-04-14/encoder_jit_trace-pnnx.ncnn.param", encoder_bin="./sherpa-ncnn-streaming-zipformer-fr-2023-04-14/encoder_jit_trace-pnnx.ncnn.bin", decoder_param="./sherpa-ncnn-streaming-zipformer-fr-2023-04-14/decoder_jit_trace-pnnx.ncnn.param", decoder_bin="./sherpa-ncnn-streaming-zipformer-fr-2023-04-14/decoder_jit_trace-pnnx.ncnn.bin", joiner_param="./sherpa-ncnn-streaming-zipformer-fr-2023-04-14/joiner_jit_trace-pnnx.ncnn.param", joiner_bin="./sherpa-ncnn-streaming-zipformer-fr-2023-04-14/joiner_jit_trace-pnnx.ncnn.bin", tokens="./sherpa-ncnn-streaming-zipformer-fr-2023-04-14/tokens.txt", encoder num_threads=2, decoder num_threads=2, joiner num_threads=2), decoder_config=DecoderConfig(method="greedy_search", num_active_paths=4), endpoint_config=EndpointConfig(rule1=EndpointRule(must_contain_nonsilence=False, min_trailing_silence=2.4, min_utterance_length=0), rule2=EndpointRule(must_contain_nonsilence=True, min_trailing_silence=1.4, min_utterance_length=0), rule3=EndpointRule(must_contain_nonsilence=False, min_trailing_silence=0, min_utterance_length=20)), enable_endpoint=False) +wav filename: 
./sherpa-ncnn-streaming-zipformer-fr-2023-04-14/test_wavs/common_voice_fr_19364697.wav +wav duration (s): 7.128 +Started! +Done! +Recognition result for ./sherpa-ncnn-streaming-zipformer-fr-2023-04-14/test_wavs/common_voice_fr_19364697.wav +text: CE SITE CONTIENT QUATRE TOMBEAUX DE LA DYNASTIE ASHÉMÉNIDE ET SEPT DES SASSANDIDES +timestamps: 0.96 1.44 1.52 1.76 1.96 2.08 2.28 2.56 2.64 2.76 2.8 2.96 3.04 3.2 3.28 3.4 3.48 3.72 3.8 4 4.16 4.24 4.32 4.44 4.6 4.68 4.92 5.2 5.52 5.84 6.04 6.12 6.24 6.56 6.68 +Elapsed seconds: 1.082 s +Real time factor (RTF): 1.082 / 7.128 = 0.152 +RecognizerConfig(feat_config=FeatureExtractorConfig(sampling_rate=16000, feature_dim=80), model_config=ModelConfig(encoder_param="./sherpa-ncnn-streaming-zipformer-fr-2023-04-14/encoder_jit_trace-pnnx.ncnn.param", encoder_bin="./sherpa-ncnn-streaming-zipformer-fr-2023-04-14/encoder_jit_trace-pnnx.ncnn.bin", decoder_param="./sherpa-ncnn-streaming-zipformer-fr-2023-04-14/decoder_jit_trace-pnnx.ncnn.param", decoder_bin="./sherpa-ncnn-streaming-zipformer-fr-2023-04-14/decoder_jit_trace-pnnx.ncnn.bin", joiner_param="./sherpa-ncnn-streaming-zipformer-fr-2023-04-14/joiner_jit_trace-pnnx.ncnn.param", joiner_bin="./sherpa-ncnn-streaming-zipformer-fr-2023-04-14/joiner_jit_trace-pnnx.ncnn.bin", tokens="./sherpa-ncnn-streaming-zipformer-fr-2023-04-14/tokens.txt", encoder num_threads=2, decoder num_threads=2, joiner num_threads=2), decoder_config=DecoderConfig(method="modified_beam_search", num_active_paths=4), endpoint_config=EndpointConfig(rule1=EndpointRule(must_contain_nonsilence=False, min_trailing_silence=2.4, min_utterance_length=0), rule2=EndpointRule(must_contain_nonsilence=True, min_trailing_silence=1.4, min_utterance_length=0), rule3=EndpointRule(must_contain_nonsilence=False, min_trailing_silence=0, min_utterance_length=20)), enable_endpoint=False) +wav filename: ./sherpa-ncnn-streaming-zipformer-fr-2023-04-14/test_wavs/common_voice_fr_19364697.wav +wav duration (s): 7.128 +Started! +Done! +Recognition result for ./sherpa-ncnn-streaming-zipformer-fr-2023-04-14/test_wavs/common_voice_fr_19364697.wav +text: CE SITE CONTIENT QUATRE TOMBEAUX DE LA DYNASTIE ASHÉMÉNIDE ET SEPT DES SASSANDIDES +timestamps: 0.96 1.44 1.52 1.76 1.96 2.08 2.28 2.56 2.64 2.76 2.8 2.96 3.04 3.2 3.28 3.4 3.48 3.72 3.8 4 4.16 4.24 4.32 4.44 4.6 4.68 4.92 5.2 5.52 5.84 6.04 6.12 6.24 6.56 6.68 +Elapsed seconds: 0.812 s +Real time factor (RTF): 0.812 / 7.128 = 0.114 diff --git a/docs/source/ncnn/pretrained_models/code-zipformer/sherpa-ncnn-streaming-zipformer-small-20M-en-2023-02-19.txt b/docs/source/ncnn/pretrained_models/code-zipformer/sherpa-ncnn-streaming-zipformer-small-20M-en-2023-02-19.txt new file mode 100644 index 000000000..ab69ef4b7 --- /dev/null +++ b/docs/source/ncnn/pretrained_models/code-zipformer/sherpa-ncnn-streaming-zipformer-small-20M-en-2023-02-19.txt @@ -0,0 +1,12 @@ +Disable fp16 for Zipformer encoder +Don't Use GPU. 
has_gpu: 0, config.use_vulkan_compute: 1 +ModelConfig(encoder_param="./sherpa-ncnn-streaming-zipformer-20M-2023-02-17/encoder_jit_trace-pnnx.ncnn.param", encoder_bin="./sherpa-ncnn-streaming-zipformer-20M-2023-02-17/encoder_jit_trace-pnnx.ncnn.bin", decoder_param="./sherpa-ncnn-streaming-zipformer-20M-2023-02-17/decoder_jit_trace-pnnx.ncnn.param", decoder_bin="./sherpa-ncnn-streaming-zipformer-20M-2023-02-17/decoder_jit_trace-pnnx.ncnn.bin", joiner_param="./sherpa-ncnn-streaming-zipformer-20M-2023-02-17/joiner_jit_trace-pnnx.ncnn.param", joiner_bin="./sherpa-ncnn-streaming-zipformer-20M-2023-02-17/joiner_jit_trace-pnnx.ncnn.bin", tokens="./sherpa-ncnn-streaming-zipformer-20M-2023-02-17/tokens.txt", encoder num_threads=2, decoder num_threads=2, joiner num_threads=2) +DecoderConfig(method="greedy_search", num_active_paths=4, enable_endpoint=False, endpoint_config=EndpointConfig(rule1=EndpointRule(must_contain_nonsilence=False, min_trailing_silence=2.4, min_utterance_length=0), rule2=EndpointRule(must_contain_nonsilence=True, min_trailing_silence=1.4, min_utterance_length=0), rule3=EndpointRule(must_contain_nonsilence=False, min_trailing_silence=0, min_utterance_length=20))) +wav filename: ./sherpa-ncnn-streaming-zipformer-20M-2023-02-17/test_wavs/0.wav +wav duration (s): 6.625 +Started! +Done! +Recognition result for ./sherpa-ncnn-streaming-zipformer-20M-2023-02-17/test_wavs/0.wav + AFTER EARLY NIGHTFALL THE YELLOW LAMPS WOULD LIGHT UP HERE AND THERE THE SQUALID QUARTER OF THE BRAFFLELS +Elapsed seconds: 0.472 s +Real time factor (RTF): 0.472 / 6.625 = 0.071 \ No newline at end of file diff --git a/docs/source/ncnn/pretrained_models/code-zipformer/sherpa-ncnn-streaming-zipformer-small-bilingual-zh-en-2023-02-16.txt b/docs/source/ncnn/pretrained_models/code-zipformer/sherpa-ncnn-streaming-zipformer-small-bilingual-zh-en-2023-02-16.txt new file mode 100644 index 000000000..05ff5b03e --- /dev/null +++ b/docs/source/ncnn/pretrained_models/code-zipformer/sherpa-ncnn-streaming-zipformer-small-bilingual-zh-en-2023-02-16.txt @@ -0,0 +1,16 @@ +ModelConfig(encoder_param="./sherpa-ncnn-streaming-zipformer-small-bilingual-zh-en-2023-02-16/encoder_jit_trace-pnnx.ncnn.param", encoder_bin="./sherpa-ncnn-streaming-zipformer-small-bilingual-zh-en-2023-02-16/encoder_jit_trace-pnnx.ncnn.bin", decoder_param="./sherpa-ncnn-streaming-zipformer-small-bilingual-zh-en-2023-02-16/decoder_jit_trace-pnnx.ncnn.param", decoder_bin="./sherpa-ncnn-streaming-zipformer-small-bilingual-zh-en-2023-02-16/decoder_jit_trace-pnnx.ncnn.bin", joiner_param="./sherpa-ncnn-streaming-zipformer-small-bilingual-zh-en-2023-02-16/joiner_jit_trace-pnnx.ncnn.param", joiner_bin="./sherpa-ncnn-streaming-zipformer-small-bilingual-zh-en-2023-02-16/joiner_jit_trace-pnnx.ncnn.bin", tokens="./sherpa-ncnn-streaming-zipformer-small-bilingual-zh-en-2023-02-16/tokens.txt", encoder num_threads=2, decoder num_threads=2, joiner num_threads=2) +DecoderConfig(method="greedy_search", num_active_paths=4, enable_endpoint=False, endpoint_config=EndpointConfig(rule1=EndpointRule(must_contain_nonsilence=False, min_trailing_silence=2.4, min_utterance_length=0), rule2=EndpointRule(must_contain_nonsilence=True, min_trailing_silence=1.4, min_utterance_length=0), rule3=EndpointRule(must_contain_nonsilence=False, min_trailing_silence=0, min_utterance_length=20))) +wav filename: ./sherpa-ncnn-streaming-zipformer-small-bilingual-zh-en-2023-02-16/test_wavs/1.wav +wav duration (s): 5.1 +Started! +Done! 
+Recognition result for ./sherpa-ncnn-streaming-zipformer-small-bilingual-zh-en-2023-02-16/test_wavs/1.wav +这是第一种第二种叫呃与 ALWAYS什么意思啊 +ModelConfig(encoder_param="./sherpa-ncnn-streaming-zipformer-small-bilingual-zh-en-2023-02-16/encoder_jit_trace-pnnx.ncnn.param", encoder_bin="./sherpa-ncnn-streaming-zipformer-small-bilingual-zh-en-2023-02-16/encoder_jit_trace-pnnx.ncnn.bin", decoder_param="./sherpa-ncnn-streaming-zipformer-small-bilingual-zh-en-2023-02-16/decoder_jit_trace-pnnx.ncnn.param", decoder_bin="./sherpa-ncnn-streaming-zipformer-small-bilingual-zh-en-2023-02-16/decoder_jit_trace-pnnx.ncnn.bin", joiner_param="./sherpa-ncnn-streaming-zipformer-small-bilingual-zh-en-2023-02-16/joiner_jit_trace-pnnx.ncnn.param", joiner_bin="./sherpa-ncnn-streaming-zipformer-small-bilingual-zh-en-2023-02-16/joiner_jit_trace-pnnx.ncnn.bin", tokens="./sherpa-ncnn-streaming-zipformer-small-bilingual-zh-en-2023-02-16/tokens.txt", encoder num_threads=2, decoder num_threads=2, joiner num_threads=2) +DecoderConfig(method="modified_beam_search", num_active_paths=4, enable_endpoint=False, endpoint_config=EndpointConfig(rule1=EndpointRule(must_contain_nonsilence=False, min_trailing_silence=2.4, min_utterance_length=0), rule2=EndpointRule(must_contain_nonsilence=True, min_trailing_silence=1.4, min_utterance_length=0), rule3=EndpointRule(must_contain_nonsilence=False, min_trailing_silence=0, min_utterance_length=20))) +wav filename: ./sherpa-ncnn-streaming-zipformer-small-bilingual-zh-en-2023-02-16/test_wavs/1.wav +wav duration (s): 5.1 +Started! +Done! +Recognition result for ./sherpa-ncnn-streaming-zipformer-small-bilingual-zh-en-2023-02-16/test_wavs/1.wav +这是第一种第二种叫呃与 ALWAYS什么意思啊 diff --git a/docs/source/ncnn/pretrained_models/code-zipformer/sherpa-ncnn-streaming-zipformer-zh-small-14M-2023-02-23.txt b/docs/source/ncnn/pretrained_models/code-zipformer/sherpa-ncnn-streaming-zipformer-zh-small-14M-2023-02-23.txt new file mode 100644 index 000000000..f6101d43b --- /dev/null +++ b/docs/source/ncnn/pretrained_models/code-zipformer/sherpa-ncnn-streaming-zipformer-zh-small-14M-2023-02-23.txt @@ -0,0 +1,38 @@ +Disable fp16 for Zipformer encoder +Don't Use GPU. has_gpu: 0, config.use_vulkan_compute: 1 +ModelConfig(encoder_param="pruned_transducer_stateless7_streaming/exp-small-L/encoder_jit_trace-pnnx.ncnn.param", encoder_bin="pruned_transducer_stateless7_streaming/exp-small-L/encoder_jit_trace-pnnx.ncnn.bin", decoder_param="pruned_transducer_stateless7_streaming/exp-small-L/decoder_jit_trace-pnnx.ncnn.param", decoder_bin="pruned_transducer_stateless7_streaming/exp-small-L/decoder_jit_trace-pnnx.ncnn.bin", joiner_param="pruned_transducer_stateless7_streaming/exp-small-L/joiner_jit_trace-pnnx.ncnn.param", joiner_bin="pruned_transducer_stateless7_streaming/exp-small-L/joiner_jit_trace-pnnx.ncnn.bin", tokens="data/lang_char/tokens.txt", encoder num_threads=4, decoder num_threads=4, joiner num_threads=4) +DecoderConfig(method="modified_beam_search", num_active_paths=4, enable_endpoint=False, endpoint_config=EndpointConfig(rule1=EndpointRule(must_contain_nonsilence=False, min_trailing_silence=2.4, min_utterance_length=0), rule2=EndpointRule(must_contain_nonsilence=True, min_trailing_silence=1.4, min_utterance_length=0), rule3=EndpointRule(must_contain_nonsilence=False, min_trailing_silence=0, min_utterance_length=20))) +wav filename: ./test_wavs_zh/0.wav +wav duration (s): 5.6115 +Started! +Done! 
+Recognition result for ./test_wavs_zh/0.wav +对我做了介绍那么我想说的是大家如果对我的研究感兴趣 +Elapsed seconds: 0.678 s +Real time factor (RTF): 0.678 / 5.611 = 0.121 + +Disable fp16 for Zipformer encoder +Don't Use GPU. has_gpu: 0, config.use_vulkan_compute: 1 +ModelConfig(encoder_param="pruned_transducer_stateless7_streaming/exp-small-L/encoder_jit_trace-pnnx.ncnn.param", encoder_bin="pruned_transducer_stateless7_streaming/exp-small-L/encoder_jit_trace-pnnx.ncnn.bin", decoder_param="pruned_transducer_stateless7_streaming/exp-small-L/decoder_jit_trace-pnnx.ncnn.param", decoder_bin="pruned_transducer_stateless7_streaming/exp-small-L/decoder_jit_trace-pnnx.ncnn.bin", joiner_param="pruned_transducer_stateless7_streaming/exp-small-L/joiner_jit_trace-pnnx.ncnn.param", joiner_bin="pruned_transducer_stateless7_streaming/exp-small-L/joiner_jit_trace-pnnx.ncnn.bin", tokens="data/lang_char/tokens.txt", encoder num_threads=4, decoder num_threads=4, joiner num_threads=4) +DecoderConfig(method="modified_beam_search", num_active_paths=4, enable_endpoint=False, endpoint_config=EndpointConfig(rule1=EndpointRule(must_contain_nonsilence=False, min_trailing_silence=2.4, min_utterance_length=0), rule2=EndpointRule(must_contain_nonsilence=True, min_trailing_silence=1.4, min_utterance_length=0), rule3=EndpointRule(must_contain_nonsilence=False, min_trailing_silence=0, min_utterance_length=20))) +wav filename: ./test_wavs_zh/1.wav +wav duration (s): 5.15306 +Started! +Done! +Recognition result for ./test_wavs_zh/1.wav +重点想谈三个问题首先就是这一轮全球金融动的表现 +Elapsed seconds: 0.676 s +Real time factor (RTF): 0.676 / 5.153 = 0.131 + +Disable fp16 for Zipformer encoder +Don't Use GPU. has_gpu: 0, config.use_vulkan_compute: 1 +ModelConfig(encoder_param="pruned_transducer_stateless7_streaming/exp-small-L/encoder_jit_trace-pnnx.ncnn.param", encoder_bin="pruned_transducer_stateless7_streaming/exp-small-L/encoder_jit_trace-pnnx.ncnn.bin", decoder_param="pruned_transducer_stateless7_streaming/exp-small-L/decoder_jit_trace-pnnx.ncnn.param", decoder_bin="pruned_transducer_stateless7_streaming/exp-small-L/decoder_jit_trace-pnnx.ncnn.bin", joiner_param="pruned_transducer_stateless7_streaming/exp-small-L/joiner_jit_trace-pnnx.ncnn.param", joiner_bin="pruned_transducer_stateless7_streaming/exp-small-L/joiner_jit_trace-pnnx.ncnn.bin", tokens="data/lang_char/tokens.txt", encoder num_threads=4, decoder num_threads=4, joiner num_threads=4) +DecoderConfig(method="modified_beam_search", num_active_paths=4, enable_endpoint=False, endpoint_config=EndpointConfig(rule1=EndpointRule(must_contain_nonsilence=False, min_trailing_silence=2.4, min_utterance_length=0), rule2=EndpointRule(must_contain_nonsilence=True, min_trailing_silence=1.4, min_utterance_length=0), rule3=EndpointRule(must_contain_nonsilence=False, min_trailing_silence=0, min_utterance_length=20))) +wav filename: ./test_wavs_zh/2.wav +wav duration (s): 4.52431 +Started! +Done! +Recognition result for ./test_wavs_zh/2.wav +深入地分析这一次全球金融动荡背后的根源 +Elapsed seconds: 0.592 s +Real time factor (RTF): 0.592 / 4.524 = 0.131 \ No newline at end of file diff --git a/docs/source/ncnn/pretrained_models/conv-emformer-transducer-models.rst b/docs/source/ncnn/pretrained_models/conv-emformer-transducer-models.rst new file mode 100644 index 000000000..6b0443a41 --- /dev/null +++ b/docs/source/ncnn/pretrained_models/conv-emformer-transducer-models.rst @@ -0,0 +1,437 @@ +Conv-Emformer-transducer-based Models +===================================== + +.. 
hint:: + + Please refer to :ref:`install_sherpa_ncnn` to install `sherpa-ncnn`_ + before you read this section. + +.. _marcoyang_sherpa_ncnn_conv_emformer_transducer_small_2023_01_09_english: + +marcoyang/sherpa-ncnn-conv-emformer-transducer-small-2023-01-09 (English) +------------------------------------------------------------------------- + +This model is a small version of `conv-emformer-transducer `_ +trained in `icefall`_. + +It only has ``8.8 million parameters`` and can be deployed on ``embedded devices`` +for real-time speech recognition. You can find the models in ``fp16`` and ``int8`` format +at ``_. + +This model is trained using `LibriSpeech`_ and thus it supports only English. + +In the following, we show you how to download it and +deploy it with `sherpa-ncnn`_ on an embedded device, whose CPU is +`RV1126 `_ +(Quad core ARM Cortex-A7). + +Please use the following commands to download it. + +.. code-block:: bash + + cd /path/to/sherpa-ncnn + + wget https://github.com/k2-fsa/sherpa-ncnn/releases/download/models/sherpa-ncnn-conv-emformer-transducer-small-2023-01-09.tar.bz2 + tar xvf sherpa-ncnn-conv-emformer-transducer-small-2023-01-09.tar.bz2 + +.. note:: + + Please refer to :ref:`sherpa-ncnn-embedded-linux-arm-install` for how to + compile `sherpa-ncnn`_ for a 32-bit ARM platform. In the following, we + test the pre-trained model on an embedded device, whose CPU is + `RV1126 `_ + (Quad core ARM Cortex-A7). + +Decode a single wave file with ./build/bin/sherpa-ncnn +:::::::::::::::::::::::::::::::::::::::::::::::::::::: + +.. hint:: + + It supports decoding only wave files with a single channel and the sampling rate + should be 16 kHz. + +.. code-block:: bash + + cd /path/to/sherpa-ncnn + + ./build/bin/sherpa-ncnn \ + ./sherpa-ncnn-conv-emformer-transducer-small-2023-01-09/tokens.txt \ + ./sherpa-ncnn-conv-emformer-transducer-small-2023-01-09/encoder_jit_trace-pnnx.ncnn.param \ + ./sherpa-ncnn-conv-emformer-transducer-small-2023-01-09/encoder_jit_trace-pnnx.ncnn.bin \ + ./sherpa-ncnn-conv-emformer-transducer-small-2023-01-09/decoder_jit_trace-pnnx.ncnn.param \ + ./sherpa-ncnn-conv-emformer-transducer-small-2023-01-09/decoder_jit_trace-pnnx.ncnn.bin \ + ./sherpa-ncnn-conv-emformer-transducer-small-2023-01-09/joiner_jit_trace-pnnx.ncnn.param \ + ./sherpa-ncnn-conv-emformer-transducer-small-2023-01-09/joiner_jit_trace-pnnx.ncnn.bin \ + ./sherpa-ncnn-conv-emformer-transducer-small-2023-01-09/test_wavs/1089-134686-0001.wav + +The outputs are shown below. The CPU used for decoding is RV1126 (Quad core ARM Cortex-A7). + +.. figure:: ./pic/2023-01-09-fp32-decoding.png + :alt: Decoding time and decoding result of float32 model + :width: 800 + +.. note:: + + The default option uses 4 threads and ``greedy_search`` for decoding. + +.. note:: + + Please use ``./build/bin/Release/sherpa-ncnn.exe`` for Windows. + +.. caution:: + + If you use Windows and get encoding issues, please run: + + .. code-block:: bash + + CHCP 65001 + + in your commandline. + +Decode a single wave file with ./build/bin/sherpa-ncnn (with int8 quantization) +::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: + +.. note:: + + We also support int8 quantization to compress the model and speed up inference. + Currently, only the encoder and joiner are quantized. + +To decode the int8-quantized model, use the following command: + +.. 
code-block:: bash + + cd /path/to/sherpa-ncnn + + ./build/bin/sherpa-ncnn \ + ./sherpa-ncnn-conv-emformer-transducer-small-2023-01-09/tokens.txt \ + ./sherpa-ncnn-conv-emformer-transducer-small-2023-01-09/encoder_jit_trace-pnnx.ncnn.int8.param \ + ./sherpa-ncnn-conv-emformer-transducer-small-2023-01-09/encoder_jit_trace-pnnx.ncnn.int8.bin \ + ./sherpa-ncnn-conv-emformer-transducer-small-2023-01-09/decoder_jit_trace-pnnx.ncnn.param \ + ./sherpa-ncnn-conv-emformer-transducer-small-2023-01-09/decoder_jit_trace-pnnx.ncnn.bin \ + ./sherpa-ncnn-conv-emformer-transducer-small-2023-01-09/joiner_jit_trace-pnnx.ncnn.int8.param \ + ./sherpa-ncnn-conv-emformer-transducer-small-2023-01-09/joiner_jit_trace-pnnx.ncnn.int8.bin \ + ./sherpa-ncnn-conv-emformer-transducer-small-2023-01-09/test_wavs/1089-134686-0001.wav + +The outputs are shown below. The CPU used for decoding is RV1126 (Quad core ARM Cortex-A7). + +.. figure:: ./pic/2023-01-09-int8-decoding.png + :alt: Decoding time and decoding result of int8 model + :width: 800 + +Compared to the original model in ``fp16`` format, +the decoding speed is significantly improved. The decoding time is reduced from +``3.26 s`` to ``2.44 s``. + +.. note:: + + When the model's weights are quantized to ``float16``, they are converted + to ``float32`` during computation. + + When the model's weights are quantized to ``int8``, they are used as ``int8`` + during computation. + +.. hint:: + + Even if we use only 1 thread for the ``int8`` model, the resulting real + time factor (RTF) is still less than ``1``. + +.. _sherpa-ncnn-mixed-english-chinese-conv-emformer-model: + +csukuangfj/sherpa-ncnn-conv-emformer-transducer-2022-12-06 (Chinese + English) +------------------------------------------------------------------------------ + +This model is converted from ``_, +which supports both Chinese and English. + +.. hint:: + + If you want to train your own model that is able to support both Chinese and + English, please refer to our training code: + + ``_ + + You can also try the pre-trained models in your browser without installing anything + by visiting: + + ``_ + +In the following, we describe how to download and use it with `sherpa-ncnn`_. + +Please use the following commands to download it. + +.. code-block:: bash + + cd /path/to/sherpa-ncnn + + wget https://github.com/k2-fsa/sherpa-ncnn/releases/download/models/sherpa-ncnn-conv-emformer-transducer-2022-12-06.tar.bz2 + tar xvf sherpa-ncnn-conv-emformer-transducer-2022-12-06.tar.bz2 + +Decode a single wave file with ./build/bin/sherpa-ncnn +:::::::::::::::::::::::::::::::::::::::::::::::::::::: + +.. hint:: + + It supports decoding only wave files with a single channel and the sampling rate + should be 16 kHz. + +.. code-block:: bash + + cd /path/to/sherpa-ncnn + + ./build/bin/sherpa-ncnn \ + ./sherpa-ncnn-conv-emformer-transducer-2022-12-06/tokens.txt \ + ./sherpa-ncnn-conv-emformer-transducer-2022-12-06/encoder_jit_trace-pnnx.ncnn.param \ + ./sherpa-ncnn-conv-emformer-transducer-2022-12-06/encoder_jit_trace-pnnx.ncnn.bin \ + ./sherpa-ncnn-conv-emformer-transducer-2022-12-06/decoder_jit_trace-pnnx.ncnn.param \ + ./sherpa-ncnn-conv-emformer-transducer-2022-12-06/decoder_jit_trace-pnnx.ncnn.bin \ + ./sherpa-ncnn-conv-emformer-transducer-2022-12-06/joiner_jit_trace-pnnx.ncnn.param \ + ./sherpa-ncnn-conv-emformer-transducer-2022-12-06/joiner_jit_trace-pnnx.ncnn.bin \ + ./sherpa-ncnn-conv-emformer-transducer-2022-12-06/test_wavs/0.wav + +.. note:: + + Please use ``./build/bin/Release/sherpa-ncnn.exe`` for Windows. +
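+.. hint::
+
+   The number of threads and the decoding method can also be passed explicitly as two
+   optional trailing arguments, in the same way as the LSTM and Zipformer examples later
+   in this document do. The following is only a sketch; it decodes the same file with
+   2 threads and ``modified_beam_search``:
+
+   .. code-block:: bash
+
+      ./build/bin/sherpa-ncnn \
+        ./sherpa-ncnn-conv-emformer-transducer-2022-12-06/tokens.txt \
+        ./sherpa-ncnn-conv-emformer-transducer-2022-12-06/encoder_jit_trace-pnnx.ncnn.param \
+        ./sherpa-ncnn-conv-emformer-transducer-2022-12-06/encoder_jit_trace-pnnx.ncnn.bin \
+        ./sherpa-ncnn-conv-emformer-transducer-2022-12-06/decoder_jit_trace-pnnx.ncnn.param \
+        ./sherpa-ncnn-conv-emformer-transducer-2022-12-06/decoder_jit_trace-pnnx.ncnn.bin \
+        ./sherpa-ncnn-conv-emformer-transducer-2022-12-06/joiner_jit_trace-pnnx.ncnn.param \
+        ./sherpa-ncnn-conv-emformer-transducer-2022-12-06/joiner_jit_trace-pnnx.ncnn.bin \
+        ./sherpa-ncnn-conv-emformer-transducer-2022-12-06/test_wavs/0.wav \
+        2 \
+        modified_beam_search
+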
+.. caution:: + + If you use Windows and get encoding issues, please run: + + .. code-block:: bash + + CHCP 65001 + + in your commandline. + +Real-time speech recognition from a microphone with build/bin/sherpa-ncnn-microphone +:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: + +.. code-block:: bash + + cd /path/to/sherpa-ncnn + ./build/bin/sherpa-ncnn-microphone \ + ./sherpa-ncnn-conv-emformer-transducer-2022-12-06/tokens.txt \ + ./sherpa-ncnn-conv-emformer-transducer-2022-12-06/encoder_jit_trace-pnnx.ncnn.param \ + ./sherpa-ncnn-conv-emformer-transducer-2022-12-06/encoder_jit_trace-pnnx.ncnn.bin \ + ./sherpa-ncnn-conv-emformer-transducer-2022-12-06/decoder_jit_trace-pnnx.ncnn.param \ + ./sherpa-ncnn-conv-emformer-transducer-2022-12-06/decoder_jit_trace-pnnx.ncnn.bin \ + ./sherpa-ncnn-conv-emformer-transducer-2022-12-06/joiner_jit_trace-pnnx.ncnn.param \ + ./sherpa-ncnn-conv-emformer-transducer-2022-12-06/joiner_jit_trace-pnnx.ncnn.bin + +.. note:: + + Please use ``./build/bin/Release/sherpa-ncnn-microphone.exe`` for Windows. + +It will print something like below: + +.. code-block:: + + Number of threads: 4 + num devices: 4 + Use default device: 2 + Name: MacBook Pro Microphone + Max input channels: 1 + Started + +Speak and it will show you the recognition result in real-time. + +.. caution:: + + If you use Windows and get encoding issues, please run: + + .. code-block:: bash + + CHCP 65001 + + in your commandline. + +csukuangfj/sherpa-ncnn-conv-emformer-transducer-2022-12-08 (Chinese) +-------------------------------------------------------------------- + +.. hint:: + + This is a very small model that can be run in real-time on embedded systems. + +This model is trained using the `WenetSpeech`_ dataset and supports only Chinese. + +In the following, we describe how to download and use it with `sherpa-ncnn`_. + +Please use the following commands to download it. + +.. code-block:: bash + + cd /path/to/sherpa-ncnn + + wget https://github.com/k2-fsa/sherpa-ncnn/releases/download/models/sherpa-ncnn-conv-emformer-transducer-2022-12-08.tar.bz2 + tar xvf sherpa-ncnn-conv-emformer-transducer-2022-12-08.tar.bz2 + +Decode a single wave file with ./build/bin/sherpa-ncnn +:::::::::::::::::::::::::::::::::::::::::::::::::::::: + +.. hint:: + + It supports decoding only wave files with a single channel and the sampling rate + should be 16 kHz. + +.. code-block:: bash + + cd /path/to/sherpa-ncnn + + ./build/bin/sherpa-ncnn \ + ./sherpa-ncnn-conv-emformer-transducer-2022-12-08/tokens.txt \ + ./sherpa-ncnn-conv-emformer-transducer-2022-12-08/encoder_jit_trace-pnnx.ncnn.param \ + ./sherpa-ncnn-conv-emformer-transducer-2022-12-08/encoder_jit_trace-pnnx.ncnn.bin \ + ./sherpa-ncnn-conv-emformer-transducer-2022-12-08/decoder_jit_trace-pnnx.ncnn.param \ + ./sherpa-ncnn-conv-emformer-transducer-2022-12-08/decoder_jit_trace-pnnx.ncnn.bin \ + ./sherpa-ncnn-conv-emformer-transducer-2022-12-08/joiner_jit_trace-pnnx.ncnn.param \ + ./sherpa-ncnn-conv-emformer-transducer-2022-12-08/joiner_jit_trace-pnnx.ncnn.bin \ + ./sherpa-ncnn-conv-emformer-transducer-2022-12-08/test_wavs/0.wav + +.. note:: + + Please use ``./build/bin/Release/sherpa-ncnn.exe`` for Windows. + +.. caution:: + + If you use Windows and get encoding issues, please run: + + .. code-block:: bash + + CHCP 65001 + + in your commandline. + +Real-time speech recognition from a microphone with build/bin/sherpa-ncnn-microphone +:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: + +.. 
code-block:: bash + + cd /path/to/sherpa-ncnn + ./build/bin/sherpa-ncnn-microphone \ + ./sherpa-ncnn-conv-emformer-transducer-2022-12-08/tokens.txt \ + ./sherpa-ncnn-conv-emformer-transducer-2022-12-08/encoder_jit_trace-pnnx.ncnn.param \ + ./sherpa-ncnn-conv-emformer-transducer-2022-12-08/encoder_jit_trace-pnnx.ncnn.bin \ + ./sherpa-ncnn-conv-emformer-transducer-2022-12-08/decoder_jit_trace-pnnx.ncnn.param \ + ./sherpa-ncnn-conv-emformer-transducer-2022-12-08/decoder_jit_trace-pnnx.ncnn.bin \ + ./sherpa-ncnn-conv-emformer-transducer-2022-12-08/joiner_jit_trace-pnnx.ncnn.param \ + ./sherpa-ncnn-conv-emformer-transducer-2022-12-08/joiner_jit_trace-pnnx.ncnn.bin + +.. note:: + + Please use ``./build/bin/Release/sherpa-ncnn-microphone.exe`` for Windows. + +It will print something like below: + +.. code-block:: + + Number of threads: 4 + num devices: 4 + Use default device: 2 + Name: MacBook Pro Microphone + Max input channels: 1 + Started + +Speak and it will show you the recognition result in real-time. + +.. caution:: + + If you use Windows and get encoding issues, please run: + + .. code-block:: bash + + CHCP 65001 + + in your commandline. + +csukuangfj/sherpa-ncnn-conv-emformer-transducer-2022-12-04 (English) +-------------------------------------------------------------------- + +This model is trained using `GigaSpeech`_ and `LibriSpeech`_. It supports only English. + +In the following, we describe how to download and use it with `sherpa-ncnn`_. + +Please use the following commands to download it. + +.. code-block:: bash + + cd /path/to/sherpa-ncnn + + wget https://github.com/k2-fsa/sherpa-ncnn/releases/download/models/sherpa-ncnn-conv-emformer-transducer-2022-12-04.tar.bz2 + tar xvf sherpa-ncnn-conv-emformer-transducer-2022-12-04.tar.bz2 + +Decode a single wave file with ./build/bin/sherpa-ncnn +:::::::::::::::::::::::::::::::::::::::::::::::::::::: + +.. hint:: + + It supports decoding only wave files with a single channel and the sampling rate + should be 16 kHz. + +.. code-block:: bash + + cd /path/to/sherpa-ncnn + + ./build/bin/sherpa-ncnn \ + ./sherpa-ncnn-conv-emformer-transducer-2022-12-04/tokens.txt \ + ./sherpa-ncnn-conv-emformer-transducer-2022-12-04/encoder_jit_trace-pnnx.ncnn.param \ + ./sherpa-ncnn-conv-emformer-transducer-2022-12-04/encoder_jit_trace-pnnx.ncnn.bin \ + ./sherpa-ncnn-conv-emformer-transducer-2022-12-04/decoder_jit_trace-pnnx.ncnn.param \ + ./sherpa-ncnn-conv-emformer-transducer-2022-12-04/decoder_jit_trace-pnnx.ncnn.bin \ + ./sherpa-ncnn-conv-emformer-transducer-2022-12-04/joiner_jit_trace-pnnx.ncnn.param \ + ./sherpa-ncnn-conv-emformer-transducer-2022-12-04/joiner_jit_trace-pnnx.ncnn.bin \ + ./sherpa-ncnn-conv-emformer-transducer-2022-12-04/test_wavs/1089-134686-0001.wav + +.. note:: + + Please use ``./build/bin/Release/sherpa-ncnn.exe`` for Windows. + +.. caution:: + + If you use Windows and get encoding issues, please run: + + .. code-block:: bash + + CHCP 65001 + + in your commandline. + +Real-time speech recognition from a microphone with build/bin/sherpa-ncnn-microphone +:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: + +.. 
code-block:: bash + + cd /path/to/sherpa-ncnn + ./build/bin/sherpa-ncnn-microphone \ + ./sherpa-ncnn-conv-emformer-transducer-2022-12-04/tokens.txt \ + ./sherpa-ncnn-conv-emformer-transducer-2022-12-04/encoder_jit_trace-pnnx.ncnn.param \ + ./sherpa-ncnn-conv-emformer-transducer-2022-12-04/encoder_jit_trace-pnnx.ncnn.bin \ + ./sherpa-ncnn-conv-emformer-transducer-2022-12-04/decoder_jit_trace-pnnx.ncnn.param \ + ./sherpa-ncnn-conv-emformer-transducer-2022-12-04/decoder_jit_trace-pnnx.ncnn.bin \ + ./sherpa-ncnn-conv-emformer-transducer-2022-12-04/joiner_jit_trace-pnnx.ncnn.param \ + ./sherpa-ncnn-conv-emformer-transducer-2022-12-04/joiner_jit_trace-pnnx.ncnn.bin + +.. note:: + + Please use ``./build/bin/Release/sherpa-ncnn-microphone.exe`` for Windows. + +It will print something like below: + +.. code-block:: + + Number of threads: 4 + num devices: 4 + Use default device: 2 + Name: MacBook Pro Microphone + Max input channels: 1 + Started + +Speak and it will show you the recognition result in real-time. + +.. caution:: + + If you use Windows and get encoding issues, please run: + + .. code-block:: bash + + CHCP 65001 + + in your commandline. diff --git a/docs/source/ncnn/pretrained_models/index.rst b/docs/source/ncnn/pretrained_models/index.rst new file mode 100644 index 000000000..303d4a87b --- /dev/null +++ b/docs/source/ncnn/pretrained_models/index.rst @@ -0,0 +1,20 @@ +.. _sherpa-ncnn-pre-trained-models: + +Pre-trained models +================== + +In this section, we describe how to download and use all +available pre-trained models. + +.. hint:: + + Please install `git-lfs `_ before you continue. + + +.. toctree:: + :maxdepth: 3 + + small-models.rst + zipformer-transucer-models.rst + lstm-transducer-models + conv-emformer-transducer-models diff --git a/docs/source/ncnn/pretrained_models/lstm-transducer-models.rst b/docs/source/ncnn/pretrained_models/lstm-transducer-models.rst new file mode 100644 index 000000000..fa2ee6fd0 --- /dev/null +++ b/docs/source/ncnn/pretrained_models/lstm-transducer-models.rst @@ -0,0 +1,279 @@ +LSTM-transducer-based Models +============================= + +.. hint:: + + Please refer to :ref:`install_sherpa_ncnn` to install `sherpa-ncnn`_ + before you read this section. + +.. _marcoyang_sherpa_ncnn_lstm_transducer_small_2023_02_13_bilingual: + +marcoyang/sherpa-ncnn-lstm-transducer-small-2023-02-13 (Bilingual, Chinese + English) +-------------------------------------------------------------------------------------- + +This model is a small version of `lstm-transducer `_ +trained in `icefall`_. + +It only has ``13.3 million parameters`` and can be deployed on ``embedded devices`` +for real-time speech recognition. You can find the models in ``fp16`` format +at ``_. + +The model is trained on a bilingual dataset ``tal_csasr`` (Chinese + English), so it can be used +for both Chinese and English. + +In the following, we show you how to download it and +deploy it with `sherpa-ncnn`_. + +Please use the following commands to download it. + +.. code-block:: bash + + cd /path/to/sherpa-ncnn + + wget https://github.com/k2-fsa/sherpa-ncnn/releases/download/models/sherpa-ncnn-lstm-transducer-small-2023-02-13.tar.bz2 + tar xvf sherpa-ncnn-lstm-transducer-small-2023-02-13.tar.bz2 + +.. note:: + + Please refer to :ref:`sherpa-ncnn-embedded-linux-arm-install` for how to + compile `sherpa-ncnn`_ for a 32-bit ARM platform. + +Decode a single wave file with ./build/bin/sherpa-ncnn +:::::::::::::::::::::::::::::::::::::::::::::::::::::: + +.. 
hint:: + + It supports decoding only wave files with a single channel and the sampling rate + should be 16 kHz. + +.. code-block:: bash + + cd /path/to/sherpa-ncnn + + ./build/bin/sherpa-ncnn \ + ./sherpa-ncnn-lstm-transducer-small-2023-02-13/tokens.txt \ + ./sherpa-ncnn-lstm-transducer-small-2023-02-13/encoder_jit_trace-pnnx.ncnn.param \ + ./sherpa-ncnn-lstm-transducer-small-2023-02-13/encoder_jit_trace-pnnx.ncnn.bin \ + ./sherpa-ncnn-lstm-transducer-small-2023-02-13/decoder_jit_trace-pnnx.ncnn.param \ + ./sherpa-ncnn-lstm-transducer-small-2023-02-13/decoder_jit_trace-pnnx.ncnn.bin \ + ./sherpa-ncnn-lstm-transducer-small-2023-02-13/joiner_jit_trace-pnnx.ncnn.param \ + ./sherpa-ncnn-lstm-transducer-small-2023-02-13/joiner_jit_trace-pnnx.ncnn.bin \ + ./sherpa-ncnn-lstm-transducer-small-2023-02-13/test_wavs/0.wav + +.. note:: + + The default option uses 4 threads and ``greedy_search`` for decoding. + +.. note:: + + Please use ``./build/bin/Release/sherpa-ncnn.exe`` for Windows. + +.. caution:: + + If you use Windows and get encoding issues, please run: + + .. code-block:: bash + + CHCP 65001 + + in your commandline. + +csukuangfj/sherpa-ncnn-2022-09-05 (English) +------------------------------------------- + +This is a model trained using the `GigaSpeech`_ and the `LibriSpeech`_ dataset. + +Please see ``_ for how the model +is trained. + +You can find the training code at + +``_ + +In the following, we describe how to download it and use it with `sherpa-ncnn`_. + +Download the model +~~~~~~~~~~~~~~~~~~ + +Please use the following commands to download it. + +.. code-block:: bash + + cd /path/to/sherpa-ncnn + + wget https://github.com/k2-fsa/sherpa-ncnn/releases/download/models/sherpa-ncnn-2022-09-05.tar.bz2 + tar xvf sherpa-ncnn-2022-09-05.tar.bz2 + +Decode a single wave file +~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. hint:: + + It supports decoding only wave files with a single channel and the sampling rate + should be 16 kHz. + +.. code-block:: bash + + cd /path/to/sherpa-ncnn + + for method in greedy_search modified_beam_search; do + ./build/bin/sherpa-ncnn \ + ./sherpa-ncnn-2022-09-05/tokens.txt \ + ./sherpa-ncnn-2022-09-05/encoder_jit_trace-pnnx.ncnn.param \ + ./sherpa-ncnn-2022-09-05/encoder_jit_trace-pnnx.ncnn.bin \ + ./sherpa-ncnn-2022-09-05/decoder_jit_trace-pnnx.ncnn.param \ + ./sherpa-ncnn-2022-09-05/decoder_jit_trace-pnnx.ncnn.bin \ + ./sherpa-ncnn-2022-09-05/joiner_jit_trace-pnnx.ncnn.param \ + ./sherpa-ncnn-2022-09-05/joiner_jit_trace-pnnx.ncnn.bin \ + ./sherpa-ncnn-2022-09-05/test_wavs/1089-134686-0001.wav \ + 2 \ + $method + done + +You should see the following output: + +.. literalinclude:: ./code-lstm/2022-09-05.txt + +.. note:: + + Please use ``./build/bin/Release/sherpa-ncnn.exe`` for Windows. + + +Real-time speech recognition from a microphone +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: bash + + cd /path/to/sherpa-ncnn + + ./build/bin/sherpa-ncnn-microphone \ + ./sherpa-ncnn-2022-09-05/tokens.txt \ + ./sherpa-ncnn-2022-09-05/encoder_jit_trace-pnnx.ncnn.param \ + ./sherpa-ncnn-2022-09-05/encoder_jit_trace-pnnx.ncnn.bin \ + ./sherpa-ncnn-2022-09-05/decoder_jit_trace-pnnx.ncnn.param \ + ./sherpa-ncnn-2022-09-05/decoder_jit_trace-pnnx.ncnn.bin \ + ./sherpa-ncnn-2022-09-05/joiner_jit_trace-pnnx.ncnn.param \ + ./sherpa-ncnn-2022-09-05/joiner_jit_trace-pnnx.ncnn.bin \ + 2 \ + greedy_search + +.. note:: + + Please use ``./build/bin/Release/sherpa-ncnn-microphone.exe`` for Windows. + +It will print something like below: + +.. 
code-block:: + + Number of threads: 4 + num devices: 4 + Use default device: 2 + Name: MacBook Pro Microphone + Max input channels: 1 + Started + +Speak and it will show you the recognition result in real-time. + +You can find a demo below: + +.. youtube:: m6ynSxycpX0 + :width: 120% + +csukuangfj/sherpa-ncnn-2022-09-30 (Chinese) +------------------------------------------- + +This is a model trained using the `WenetSpeech`_ dataset. + +Please see ``_ for how the model +is trained. + +In the following, we describe how to download it and use it with `sherpa-ncnn`_. + +Download the model +~~~~~~~~~~~~~~~~~~ + +Please use the following commands to download it. + +.. code-block:: bash + + cd /path/to/sherpa-ncnn + + wget https://github.com/k2-fsa/sherpa-ncnn/releases/download/models/sherpa-ncnn-2022-09-30.tar.bz2 + tar xvf sherpa-ncnn-2022-09-30.tar.bz2 + +Decode a single wave file +~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. hint:: + + It supports decoding only wave files with a single channel and the sampling rate + should be 16 kHz. + +.. code-block:: bash + + cd /path/to/sherpa-ncnn + + for method in greedy_search modified_beam_search; do + ./build/bin/sherpa-ncnn \ + ./sherpa-ncnn-2022-09-30/tokens.txt \ + ./sherpa-ncnn-2022-09-30/encoder_jit_trace-pnnx.ncnn.param \ + ./sherpa-ncnn-2022-09-30/encoder_jit_trace-pnnx.ncnn.bin \ + ./sherpa-ncnn-2022-09-30/decoder_jit_trace-pnnx.ncnn.param \ + ./sherpa-ncnn-2022-09-30/decoder_jit_trace-pnnx.ncnn.bin \ + ./sherpa-ncnn-2022-09-30/joiner_jit_trace-pnnx.ncnn.param \ + ./sherpa-ncnn-2022-09-30/joiner_jit_trace-pnnx.ncnn.bin \ + ./sherpa-ncnn-2022-09-30/test_wavs/0.wav \ + 2 \ + $method + done + +You should see the following output: + +.. literalinclude:: ./code-lstm/2022-09-30.txt + +.. caution:: + + If you use Windows and get encoding issues, please run: + + .. code-block:: bash + + CHCP 65001 + + in your commandline. + +Real-time speech recognition from a microphone +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: bash + + cd /path/to/sherpa-ncnn + + ./build/bin/sherpa-ncnn-microphone \ + ./sherpa-ncnn-2022-09-30/tokens.txt \ + ./sherpa-ncnn-2022-09-30/encoder_jit_trace-pnnx.ncnn.param \ + ./sherpa-ncnn-2022-09-30/encoder_jit_trace-pnnx.ncnn.bin \ + ./sherpa-ncnn-2022-09-30/decoder_jit_trace-pnnx.ncnn.param \ + ./sherpa-ncnn-2022-09-30/decoder_jit_trace-pnnx.ncnn.bin \ + ./sherpa-ncnn-2022-09-30/joiner_jit_trace-pnnx.ncnn.param \ + ./sherpa-ncnn-2022-09-30/joiner_jit_trace-pnnx.ncnn.bin \ + 2 \ + greedy_search + +.. note:: + + Please use ``./build/bin/Release/sherpa-ncnn-microphone.exe`` for Windows. + +.. caution:: + + If you use Windows and get encoding issues, please run: + + .. code-block:: bash + + CHCP 65001 + + in your commandline. + +You can find a demo below: + +.. 
youtube:: bbQfoRT75oM + :width: 120% diff --git a/docs/source/ncnn/pretrained_models/pic/2023-01-09-fp32-decoding.png b/docs/source/ncnn/pretrained_models/pic/2023-01-09-fp32-decoding.png new file mode 100644 index 000000000..f9882b718 Binary files /dev/null and b/docs/source/ncnn/pretrained_models/pic/2023-01-09-fp32-decoding.png differ diff --git a/docs/source/ncnn/pretrained_models/pic/2023-01-09-int8-decoding.png b/docs/source/ncnn/pretrained_models/pic/2023-01-09-int8-decoding.png new file mode 100644 index 000000000..504a02463 Binary files /dev/null and b/docs/source/ncnn/pretrained_models/pic/2023-01-09-int8-decoding.png differ diff --git a/docs/source/ncnn/pretrained_models/small-models.rst b/docs/source/ncnn/pretrained_models/small-models.rst new file mode 100644 index 000000000..220bd2b8b --- /dev/null +++ b/docs/source/ncnn/pretrained_models/small-models.rst @@ -0,0 +1,20 @@ +Small models +============ + +In this section, we list models with fewer parameters that are suitable for +resource-constrained embedded systems. + +.. hint:: + + If you are using a Raspberry Pi 4, this section is not so helpful for you, + since all models in `sherpa-ncnn`_ are able to run in real-time on it. + + This page is especially useful for systems with fewer resources than a + Raspberry Pi 4. + + +- :ref:`marcoyang_sherpa_ncnn_streaming_zipformer_small_14M_2023_02_23_chinese` +- :ref:`marcoyang_sherpa_ncnn_streaming_zipformer_small_20M_2023_02_17_english` +- :ref:`marcoyang_sherpa_ncnn_conv_emformer_transducer_small_2023_01_09_english` +- :ref:`sherpa_ncnn_streaming_zipformer_small_bilingual_zh_en_2023_02_16` +- :ref:`marcoyang_sherpa_ncnn_lstm_transducer_small_2023_02_13_bilingual` diff --git a/docs/source/ncnn/pretrained_models/zipformer-transucer-models.rst b/docs/source/ncnn/pretrained_models/zipformer-transucer-models.rst new file mode 100644 index 000000000..1f0d7e6a1 --- /dev/null +++ b/docs/source/ncnn/pretrained_models/zipformer-transucer-models.rst @@ -0,0 +1,573 @@ +Zipformer-transducer-based Models +================================= + +.. hint:: + + Please refer to :ref:`install_sherpa_ncnn` to install `sherpa-ncnn`_ + before you read this section. + +.. _marcoyang_sherpa_ncnn_streaming_zipformer_small_14M_2023_02_23_chinese: + +marcoyang/sherpa-ncnn-streaming-zipformer-zh-14M-2023-02-23 (Chinese) +--------------------------------------------------------------------- + +This model is a streaming Zipformer model which has around 14 million parameters. It is trained on the `WenetSpeech`_ corpus +so it supports only Chinese. + +You can find the training code at ``_ + +In the following, we describe how to download it and use it with `sherpa-ncnn`_. + +Download the model +~~~~~~~~~~~~~~~~~~ + +Please use the following commands to download it. + +.. code-block:: bash + + cd /path/to/sherpa-ncnn + + wget https://github.com/k2-fsa/sherpa-ncnn/releases/download/models/sherpa-ncnn-streaming-zipformer-zh-14M-2023-02-23.tar.bz2 + tar xvf sherpa-ncnn-streaming-zipformer-zh-14M-2023-02-23.tar.bz2 + +Decode a single wave file +~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. hint:: + + It supports decoding only wave files with a single channel and the sampling rate + should be 16 kHz. +
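+.. hint::
+
+   If your wave file has a different sampling rate or more than one channel, you can
+   convert it before decoding. The following is only a sketch using ``sox`` (assuming
+   ``sox`` is installed; ``input.wav`` is a placeholder for your own recording):
+
+   .. code-block:: bash
+
+      # resample to 16 kHz, downmix to a single channel, 16-bit samples
+      sox input.wav -r 16000 -b 16 -c 1 input-16k-mono.wav
+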
code-block:: bash
+
+   cd /path/to/sherpa-ncnn
+
+   for method in greedy_search modified_beam_search; do
+     ./build/bin/sherpa-ncnn \
+       ./sherpa-ncnn-streaming-zipformer-zh-14M-2023-02-23/tokens.txt \
+       ./sherpa-ncnn-streaming-zipformer-zh-14M-2023-02-23/encoder_jit_trace-pnnx.ncnn.param \
+       ./sherpa-ncnn-streaming-zipformer-zh-14M-2023-02-23/encoder_jit_trace-pnnx.ncnn.bin \
+       ./sherpa-ncnn-streaming-zipformer-zh-14M-2023-02-23/decoder_jit_trace-pnnx.ncnn.param \
+       ./sherpa-ncnn-streaming-zipformer-zh-14M-2023-02-23/decoder_jit_trace-pnnx.ncnn.bin \
+       ./sherpa-ncnn-streaming-zipformer-zh-14M-2023-02-23/joiner_jit_trace-pnnx.ncnn.param \
+       ./sherpa-ncnn-streaming-zipformer-zh-14M-2023-02-23/joiner_jit_trace-pnnx.ncnn.bin \
+       ./sherpa-ncnn-streaming-zipformer-zh-14M-2023-02-23/test_wavs/0.wav \
+       2 \
+       $method
+   done
+
+You should see the following output:
+
+.. literalinclude:: ./code-zipformer/sherpa-ncnn-streaming-zipformer-zh-small-14M-2023-02-23.txt
+
+.. note::
+
+   Please use ``./build/bin/Release/sherpa-ncnn.exe`` for Windows.
+
+.. caution::
+
+   If you use Windows and get encoding issues, please run:
+
+      .. code-block:: bash
+
+        CHCP 65001
+
+   in your commandline.
+
+Real-time speech recognition from a microphone
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. code-block:: bash
+
+   cd /path/to/sherpa-ncnn
+
+   ./build/bin/sherpa-ncnn-microphone \
+     ./sherpa-ncnn-streaming-zipformer-zh-14M-2023-02-23/tokens.txt \
+     ./sherpa-ncnn-streaming-zipformer-zh-14M-2023-02-23/encoder_jit_trace-pnnx.ncnn.param \
+     ./sherpa-ncnn-streaming-zipformer-zh-14M-2023-02-23/encoder_jit_trace-pnnx.ncnn.bin \
+     ./sherpa-ncnn-streaming-zipformer-zh-14M-2023-02-23/decoder_jit_trace-pnnx.ncnn.param \
+     ./sherpa-ncnn-streaming-zipformer-zh-14M-2023-02-23/decoder_jit_trace-pnnx.ncnn.bin \
+     ./sherpa-ncnn-streaming-zipformer-zh-14M-2023-02-23/joiner_jit_trace-pnnx.ncnn.param \
+     ./sherpa-ncnn-streaming-zipformer-zh-14M-2023-02-23/joiner_jit_trace-pnnx.ncnn.bin \
+     2 \
+     greedy_search
+
+.. hint::
+
+   If your system is Linux (including embedded Linux), you can also use
+   :ref:`sherpa-ncnn-alsa` to do real-time speech recognition with your
+   microphone if ``sherpa-ncnn-microphone`` does not work for you.
+
+.. _marcoyang_sherpa_ncnn_streaming_zipformer_small_20M_2023_02_17_english:
+
+marcoyang/sherpa-ncnn-streaming-zipformer-20M-2023-02-17 (English)
+------------------------------------------------------------------
+
+This model is a streaming Zipformer model converted from
+
+``_
+
+which has around 20 million parameters. It is trained on the `LibriSpeech`_ corpus, so it supports only English.
+The word error rate (WER) on ``test-clean`` is 3.88%.
+
+You can find the training code at ``_
+
+In the following, we describe how to download it and use it with `sherpa-ncnn`_.
+
+Download the model
+~~~~~~~~~~~~~~~~~~
+
+Please use the following commands to download it.
+
+.. code-block:: bash
+
+   cd /path/to/sherpa-ncnn
+
+   wget https://github.com/k2-fsa/sherpa-ncnn/releases/download/models/sherpa-ncnn-streaming-zipformer-20M-2023-02-17.tar.bz2
+   tar xvf sherpa-ncnn-streaming-zipformer-20M-2023-02-17.tar.bz2
+
+Decode a single wave file
+~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. hint::
+
+   It supports decoding only wave files with a single channel and the sampling rate
+   should be 16 kHz.
+
+.. 
code-block:: bash + + cd /path/to/sherpa-ncnn + + for method in greedy_search modified_beam_search; do + ./build/bin/sherpa-ncnn \ + ./sherpa-ncnn-streaming-zipformer-20M-2023-02-17/tokens.txt \ + ./sherpa-ncnn-streaming-zipformer-20M-2023-02-17/encoder_jit_trace-pnnx.ncnn.param \ + ./sherpa-ncnn-streaming-zipformer-20M-2023-02-17/encoder_jit_trace-pnnx.ncnn.bin \ + ./sherpa-ncnn-streaming-zipformer-20M-2023-02-17/decoder_jit_trace-pnnx.ncnn.param \ + ./sherpa-ncnn-streaming-zipformer-20M-2023-02-17/decoder_jit_trace-pnnx.ncnn.bin \ + ./sherpa-ncnn-streaming-zipformer-20M-2023-02-17/joiner_jit_trace-pnnx.ncnn.param \ + ./sherpa-ncnn-streaming-zipformer-20M-2023-02-17/joiner_jit_trace-pnnx.ncnn.bin \ + ./sherpa-ncnn-streaming-zipformer-20M-2023-02-17/test_wavs/0.wav \ + 2 \ + $method + done + +You should see the following output: + +.. literalinclude:: ./code-zipformer/sherpa-ncnn-streaming-zipformer-small-20M-en-2023-02-19.txt + +.. note:: + + Please use ``./build/bin/Release/sherpa-ncnn.exe`` for Windows. + +Real-time speech recognition from a microphone +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: bash + + cd /path/to/sherpa-ncnn + + ./build/bin/sherpa-ncnn-microphone \ + ./sherpa-ncnn-streaming-zipformer-20M-2023-02-17/tokens.txt \ + ./sherpa-ncnn-streaming-zipformer-20M-2023-02-17/encoder_jit_trace-pnnx.ncnn.param \ + ./sherpa-ncnn-streaming-zipformer-20M-2023-02-17/encoder_jit_trace-pnnx.ncnn.bin \ + ./sherpa-ncnn-streaming-zipformer-20M-2023-02-17/decoder_jit_trace-pnnx.ncnn.param \ + ./sherpa-ncnn-streaming-zipformer-20M-2023-02-17/decoder_jit_trace-pnnx.ncnn.bin \ + ./sherpa-ncnn-streaming-zipformer-20M-2023-02-17/joiner_jit_trace-pnnx.ncnn.param \ + ./sherpa-ncnn-streaming-zipformer-20M-2023-02-17/joiner_jit_trace-pnnx.ncnn.bin \ + 2 \ + greedy_search + +.. hint:: + + If your system is Linux (including embedded Linux), you can also use + :ref:`sherpa-ncnn-alsa` to do real-time speech recognition with your + microphone if ``sherpa-ncnn-microphone`` does not work for you. + +csukuangfj/sherpa-ncnn-streaming-zipformer-en-2023-02-13 (English) +------------------------------------------------------------------ + +This model is converted from + +``_ + +which supports only English as it is trained on the `LibriSpeech`_ corpus. + +You can find the training code at + +``_ + +In the following, we describe how to download it and use it with `sherpa-ncnn`_. + +Download the model +~~~~~~~~~~~~~~~~~~ + +Please use the following commands to download it. + +.. code-block:: bash + + cd /path/to/sherpa-ncnn + + wget https://github.com/k2-fsa/sherpa-ncnn/releases/download/models/sherpa-ncnn-streaming-zipformer-en-2023-02-13.tar.bz2 + tar xvf sherpa-ncnn-streaming-zipformer-en-2023-02-13.tar.bz2 + +Decode a single wave file +~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. hint:: + + It supports decoding only wave files with a single channel and the sampling rate + should be 16 kHz. + +.. 
code-block:: bash
+
+   cd /path/to/sherpa-ncnn
+
+   for method in greedy_search modified_beam_search; do
+     ./build/bin/sherpa-ncnn \
+       ./sherpa-ncnn-streaming-zipformer-en-2023-02-13/tokens.txt \
+       ./sherpa-ncnn-streaming-zipformer-en-2023-02-13/encoder_jit_trace-pnnx.ncnn.param \
+       ./sherpa-ncnn-streaming-zipformer-en-2023-02-13/encoder_jit_trace-pnnx.ncnn.bin \
+       ./sherpa-ncnn-streaming-zipformer-en-2023-02-13/decoder_jit_trace-pnnx.ncnn.param \
+       ./sherpa-ncnn-streaming-zipformer-en-2023-02-13/decoder_jit_trace-pnnx.ncnn.bin \
+       ./sherpa-ncnn-streaming-zipformer-en-2023-02-13/joiner_jit_trace-pnnx.ncnn.param \
+       ./sherpa-ncnn-streaming-zipformer-en-2023-02-13/joiner_jit_trace-pnnx.ncnn.bin \
+       ./sherpa-ncnn-streaming-zipformer-en-2023-02-13/test_wavs/1221-135766-0002.wav \
+       2 \
+       $method
+   done
+
+You should see the following output:
+
+.. literalinclude:: ./code-zipformer/sherpa-ncnn-streaming-zipformer-en-2023-02-13-sherpa-ncnn.txt
+
+.. note::
+
+   Please use ``./build/bin/Release/sherpa-ncnn.exe`` for Windows.
+
+Real-time speech recognition from a microphone
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. code-block:: bash
+
+   cd /path/to/sherpa-ncnn
+
+   ./build/bin/sherpa-ncnn-microphone \
+     ./sherpa-ncnn-streaming-zipformer-en-2023-02-13/tokens.txt \
+     ./sherpa-ncnn-streaming-zipformer-en-2023-02-13/encoder_jit_trace-pnnx.ncnn.param \
+     ./sherpa-ncnn-streaming-zipformer-en-2023-02-13/encoder_jit_trace-pnnx.ncnn.bin \
+     ./sherpa-ncnn-streaming-zipformer-en-2023-02-13/decoder_jit_trace-pnnx.ncnn.param \
+     ./sherpa-ncnn-streaming-zipformer-en-2023-02-13/decoder_jit_trace-pnnx.ncnn.bin \
+     ./sherpa-ncnn-streaming-zipformer-en-2023-02-13/joiner_jit_trace-pnnx.ncnn.param \
+     ./sherpa-ncnn-streaming-zipformer-en-2023-02-13/joiner_jit_trace-pnnx.ncnn.bin \
+     2 \
+     greedy_search
+
+.. hint::
+
+   If your system is Linux (including embedded Linux), you can also use
+   :ref:`sherpa-ncnn-alsa` to do real-time speech recognition with your
+   microphone if ``sherpa-ncnn-microphone`` does not work for you.
+
+.. _sherpa_ncnn_streaming_zipformer_bilingual_zh_en_2023_02_13:
+
+csukuangfj/sherpa-ncnn-streaming-zipformer-bilingual-zh-en-2023-02-13 (Bilingual, Chinese + English)
+----------------------------------------------------------------------------------------------------
+
+This model is converted from
+
+``_
+
+which supports both Chinese and English. The model is contributed by the community
+and is trained on tens of thousands of hours of internal data.
+
+In the following, we describe how to download it and use it with `sherpa-ncnn`_.
+
+Download the model
+~~~~~~~~~~~~~~~~~~
+
+Please use the following commands to download it.
+
+.. code-block:: bash
+
+   cd /path/to/sherpa-ncnn
+
+   wget https://github.com/k2-fsa/sherpa-ncnn/releases/download/models/sherpa-ncnn-streaming-zipformer-bilingual-zh-en-2023-02-13.tar.bz2
+   tar xvf sherpa-ncnn-streaming-zipformer-bilingual-zh-en-2023-02-13.tar.bz2
+
+Decode a single wave file
+~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. hint::
+
+   It supports decoding only wave files with a single channel and the sampling rate
+   should be 16 kHz.
+
+.. 
code-block:: bash
+
+   cd /path/to/sherpa-ncnn
+
+   for method in greedy_search modified_beam_search; do
+     ./build/bin/sherpa-ncnn \
+       ./sherpa-ncnn-streaming-zipformer-bilingual-zh-en-2023-02-13/tokens.txt \
+       ./sherpa-ncnn-streaming-zipformer-bilingual-zh-en-2023-02-13/encoder_jit_trace-pnnx.ncnn.param \
+       ./sherpa-ncnn-streaming-zipformer-bilingual-zh-en-2023-02-13/encoder_jit_trace-pnnx.ncnn.bin \
+       ./sherpa-ncnn-streaming-zipformer-bilingual-zh-en-2023-02-13/decoder_jit_trace-pnnx.ncnn.param \
+       ./sherpa-ncnn-streaming-zipformer-bilingual-zh-en-2023-02-13/decoder_jit_trace-pnnx.ncnn.bin \
+       ./sherpa-ncnn-streaming-zipformer-bilingual-zh-en-2023-02-13/joiner_jit_trace-pnnx.ncnn.param \
+       ./sherpa-ncnn-streaming-zipformer-bilingual-zh-en-2023-02-13/joiner_jit_trace-pnnx.ncnn.bin \
+       ./sherpa-ncnn-streaming-zipformer-bilingual-zh-en-2023-02-13/test_wavs/1.wav \
+       2 \
+       $method
+   done
+
+You should see the following output:
+
+.. literalinclude:: ./code-zipformer/sherpa-ncnn-streaming-zipformer-bilingual-zh-en-2023-02-13-sherpa-ncnn.txt
+
+.. note::
+
+   Please use ``./build/bin/Release/sherpa-ncnn.exe`` for Windows.
+
+.. caution::
+
+   If you use Windows and get encoding issues, please run:
+
+      .. code-block:: bash
+
+        CHCP 65001
+
+   in your commandline.
+
+Real-time speech recognition from a microphone
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. code-block:: bash
+
+   cd /path/to/sherpa-ncnn
+
+   ./build/bin/sherpa-ncnn-microphone \
+     ./sherpa-ncnn-streaming-zipformer-bilingual-zh-en-2023-02-13/tokens.txt \
+     ./sherpa-ncnn-streaming-zipformer-bilingual-zh-en-2023-02-13/encoder_jit_trace-pnnx.ncnn.param \
+     ./sherpa-ncnn-streaming-zipformer-bilingual-zh-en-2023-02-13/encoder_jit_trace-pnnx.ncnn.bin \
+     ./sherpa-ncnn-streaming-zipformer-bilingual-zh-en-2023-02-13/decoder_jit_trace-pnnx.ncnn.param \
+     ./sherpa-ncnn-streaming-zipformer-bilingual-zh-en-2023-02-13/decoder_jit_trace-pnnx.ncnn.bin \
+     ./sherpa-ncnn-streaming-zipformer-bilingual-zh-en-2023-02-13/joiner_jit_trace-pnnx.ncnn.param \
+     ./sherpa-ncnn-streaming-zipformer-bilingual-zh-en-2023-02-13/joiner_jit_trace-pnnx.ncnn.bin \
+     2 \
+     greedy_search
+
+.. hint::
+
+   If your system is Linux (including embedded Linux), you can also use
+   :ref:`sherpa-ncnn-alsa` to do real-time speech recognition with your
+   microphone if ``sherpa-ncnn-microphone`` does not work for you.
+
+
+.. _sherpa_ncnn_streaming_zipformer_small_bilingual_zh_en_2023_02_16:
+
+csukuangfj/sherpa-ncnn-streaming-zipformer-small-bilingual-zh-en-2023-02-16 (Bilingual, Chinese + English)
+----------------------------------------------------------------------------------------------------------
+
+This model is converted from
+
+``_
+
+which supports both Chinese and English. The model is contributed by the community
+and is trained on tens of thousands of hours of internal data.
+
+In the following, we describe how to download it and use it with `sherpa-ncnn`_.
+
+.. note::
+
+   Unlike :ref:`sherpa_ncnn_streaming_zipformer_bilingual_zh_en_2023_02_13`, this
+   model is much smaller.
+
+Download the model
+~~~~~~~~~~~~~~~~~~
+
+
+Please use the following commands to download it.
+
+.. code-block:: bash
+
+   cd /path/to/sherpa-ncnn
+
+   wget https://github.com/k2-fsa/sherpa-ncnn/releases/download/models/sherpa-ncnn-streaming-zipformer-small-bilingual-zh-en-2023-02-16.tar.bz2
+   tar xvf sherpa-ncnn-streaming-zipformer-small-bilingual-zh-en-2023-02-16.tar.bz2
+
+Decode a single wave file
+~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. 
hint::
+
+   It supports decoding only wave files with a single channel and the sampling rate
+   should be 16 kHz.
+
+.. code-block:: bash
+
+   cd /path/to/sherpa-ncnn
+
+   for method in greedy_search modified_beam_search; do
+     ./build/bin/sherpa-ncnn \
+       ./sherpa-ncnn-streaming-zipformer-small-bilingual-zh-en-2023-02-16/tokens.txt \
+       ./sherpa-ncnn-streaming-zipformer-small-bilingual-zh-en-2023-02-16/encoder_jit_trace-pnnx.ncnn.param \
+       ./sherpa-ncnn-streaming-zipformer-small-bilingual-zh-en-2023-02-16/encoder_jit_trace-pnnx.ncnn.bin \
+       ./sherpa-ncnn-streaming-zipformer-small-bilingual-zh-en-2023-02-16/decoder_jit_trace-pnnx.ncnn.param \
+       ./sherpa-ncnn-streaming-zipformer-small-bilingual-zh-en-2023-02-16/decoder_jit_trace-pnnx.ncnn.bin \
+       ./sherpa-ncnn-streaming-zipformer-small-bilingual-zh-en-2023-02-16/joiner_jit_trace-pnnx.ncnn.param \
+       ./sherpa-ncnn-streaming-zipformer-small-bilingual-zh-en-2023-02-16/joiner_jit_trace-pnnx.ncnn.bin \
+       ./sherpa-ncnn-streaming-zipformer-small-bilingual-zh-en-2023-02-16/test_wavs/1.wav \
+       2 \
+       $method
+   done
+
+You should see the following output:
+
+.. literalinclude:: ./code-zipformer/sherpa-ncnn-streaming-zipformer-small-bilingual-zh-en-2023-02-16.txt
+
+.. note::
+
+   Please use ``./build/bin/Release/sherpa-ncnn.exe`` for Windows.
+
+Real-time speech recognition from a microphone
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. code-block:: bash
+
+   cd /path/to/sherpa-ncnn
+
+   ./build/bin/sherpa-ncnn-microphone \
+     ./sherpa-ncnn-streaming-zipformer-small-bilingual-zh-en-2023-02-16/tokens.txt \
+     ./sherpa-ncnn-streaming-zipformer-small-bilingual-zh-en-2023-02-16/encoder_jit_trace-pnnx.ncnn.param \
+     ./sherpa-ncnn-streaming-zipformer-small-bilingual-zh-en-2023-02-16/encoder_jit_trace-pnnx.ncnn.bin \
+     ./sherpa-ncnn-streaming-zipformer-small-bilingual-zh-en-2023-02-16/decoder_jit_trace-pnnx.ncnn.param \
+     ./sherpa-ncnn-streaming-zipformer-small-bilingual-zh-en-2023-02-16/decoder_jit_trace-pnnx.ncnn.bin \
+     ./sherpa-ncnn-streaming-zipformer-small-bilingual-zh-en-2023-02-16/joiner_jit_trace-pnnx.ncnn.param \
+     ./sherpa-ncnn-streaming-zipformer-small-bilingual-zh-en-2023-02-16/joiner_jit_trace-pnnx.ncnn.bin \
+     2 \
+     greedy_search
+
+.. hint::
+
+   If your system is Linux (including embedded Linux), you can also use
+   :ref:`sherpa-ncnn-alsa` to do real-time speech recognition with your
+   microphone if ``sherpa-ncnn-microphone`` does not work for you.
+
+A faster model of sherpa-ncnn-streaming-zipformer-small-bilingual-zh-en-2023-02-16
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+We provide a second version of the model that is exported with
+``--decode-chunk-len=96`` instead of ``32``.
+
+.. hint::
+
+   Please see the model export script at
+
+   ``_
+
+   if you are interested.
+
+
+.. note::
+
+   You can also find a third version in the folder ``64``.
+
+The advantage of this model is that it runs much faster, while the downside
+is that there is a longer delay between when you speak and when the recognition result appears.
+
+To decode a file, please use:
+
+.. 
code-block:: bash + + cd /path/to/sherpa-ncnn + + for method in greedy_search modified_beam_search; do + ./build/bin/sherpa-ncnn \ + ./sherpa-ncnn-streaming-zipformer-small-bilingual-zh-en-2023-02-16/96/tokens.txt \ + ./sherpa-ncnn-streaming-zipformer-small-bilingual-zh-en-2023-02-16/96/encoder_jit_trace-pnnx.ncnn.param \ + ./sherpa-ncnn-streaming-zipformer-small-bilingual-zh-en-2023-02-16/96/encoder_jit_trace-pnnx.ncnn.bin \ + ./sherpa-ncnn-streaming-zipformer-small-bilingual-zh-en-2023-02-16/96/decoder_jit_trace-pnnx.ncnn.param \ + ./sherpa-ncnn-streaming-zipformer-small-bilingual-zh-en-2023-02-16/96/decoder_jit_trace-pnnx.ncnn.bin \ + ./sherpa-ncnn-streaming-zipformer-small-bilingual-zh-en-2023-02-16/96/joiner_jit_trace-pnnx.ncnn.param \ + ./sherpa-ncnn-streaming-zipformer-small-bilingual-zh-en-2023-02-16/96/joiner_jit_trace-pnnx.ncnn.bin \ + ./sherpa-ncnn-streaming-zipformer-small-bilingual-zh-en-2023-02-16/test_wavs/1.wav \ + 2 \ + $method + done + +.. _sherpa_ncnn_streaming_zipformer_fr_2023_04_14: + +shaojieli/sherpa-ncnn-streaming-zipformer-fr-2023-04-14 +------------------------------------------------------- + +This model is converted from + +``_ + +which supports only French as it is trained on the `CommonVoice`_ corpus. +In the following, we describe how to download it and use it with `sherpa-ncnn`_. + +Download the model +~~~~~~~~~~~~~~~~~~ + + +Please use the following commands to download it. + +.. code-block:: bash + + cd /path/to/sherpa-ncnn + + wget https://github.com/k2-fsa/sherpa-ncnn/releases/download/models/sherpa-ncnn-streaming-zipformer-fr-2023-04-14.tar.bz2 + tar xvf sherpa-ncnn-streaming-zipformer-fr-2023-04-14.tar.bz2 + +To decode a file, please use: + +.. code-block:: bash + + cd /path/to/sherpa-ncnn + for method in greedy_search modified_beam_search; do + ./build/bin/sherpa-ncnn \ + ./sherpa-ncnn-streaming-zipformer-fr-2023-04-14/tokens.txt \ + ./sherpa-ncnn-streaming-zipformer-fr-2023-04-14/encoder_jit_trace-pnnx.ncnn.param \ + ./sherpa-ncnn-streaming-zipformer-fr-2023-04-14/encoder_jit_trace-pnnx.ncnn.bin \ + ./sherpa-ncnn-streaming-zipformer-fr-2023-04-14/decoder_jit_trace-pnnx.ncnn.param \ + ./sherpa-ncnn-streaming-zipformer-fr-2023-04-14/decoder_jit_trace-pnnx.ncnn.bin \ + ./sherpa-ncnn-streaming-zipformer-fr-2023-04-14/joiner_jit_trace-pnnx.ncnn.param \ + ./sherpa-ncnn-streaming-zipformer-fr-2023-04-14/joiner_jit_trace-pnnx.ncnn.bin \ + ./sherpa-ncnn-streaming-zipformer-fr-2023-04-14/test_wavs/common_voice_fr_19364697.wav \ + 2 \ + $method + done + +You should see the following output: + +.. literalinclude:: ./code-zipformer/sherpa-ncnn-streaming-zipformer-fr-2023-04-14.txt + +.. note:: + + Please use ``./build/bin/Release/sherpa-ncnn.exe`` for Windows. + +Real-time speech recognition from a microphone +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. 
code-block:: bash + + cd /path/to/sherpa-ncnn + ./build/bin/sherpa-ncnn-microphone \ + ./sherpa-ncnn-streaming-zipformer-fr-2023-04-14/tokens.txt \ + ./sherpa-ncnn-streaming-zipformer-fr-2023-04-14/encoder_jit_trace-pnnx.ncnn.param \ + ./sherpa-ncnn-streaming-zipformer-fr-2023-04-14/encoder_jit_trace-pnnx.ncnn.bin \ + ./sherpa-ncnn-streaming-zipformer-fr-2023-04-14/decoder_jit_trace-pnnx.ncnn.param \ + ./sherpa-ncnn-streaming-zipformer-fr-2023-04-14/decoder_jit_trace-pnnx.ncnn.bin \ + ./sherpa-ncnn-streaming-zipformer-fr-2023-04-14/joiner_jit_trace-pnnx.ncnn.param \ + ./sherpa-ncnn-streaming-zipformer-fr-2023-04-14/joiner_jit_trace-pnnx.ncnn.bin \ + ./sherpa-ncnn-streaming-zipformer-fr-2023-04-14/test_wavs/common_voice_fr_19364697.wav \ + 2 \ + greedy_search + +.. hint:: + + If your system is Linux (including embedded Linux), you can also use + :ref:`sherpa-ncnn-alsa` to do real-time speech recognition with your + microphone if ``sherpa-ncnn-microphone`` does not work for you. diff --git a/docs/source/ncnn/python/code/decode-file.py b/docs/source/ncnn/python/code/decode-file.py new file mode 100755 index 000000000..62a8e565b --- /dev/null +++ b/docs/source/ncnn/python/code/decode-file.py @@ -0,0 +1,61 @@ +#!/usr/bin/env python3 + +""" +This file demonstrates how to use sherpa-ncnn Python API to recognize +a single file. + +Please refer to +https://k2-fsa.github.io/sherpa/ncnn/index.html +to install sherpa-ncnn and to download the pre-trained models +used in this file. +""" + +import wave + +import numpy as np +import sherpa_ncnn + + +def main(): + recognizer = sherpa_ncnn.Recognizer( + tokens="./sherpa-ncnn-conv-emformer-transducer-2022-12-06/tokens.txt", + encoder_param="./sherpa-ncnn-conv-emformer-transducer-2022-12-06/encoder_jit_trace-pnnx.ncnn.param", + encoder_bin="./sherpa-ncnn-conv-emformer-transducer-2022-12-06/encoder_jit_trace-pnnx.ncnn.bin", + decoder_param="./sherpa-ncnn-conv-emformer-transducer-2022-12-06/decoder_jit_trace-pnnx.ncnn.param", + decoder_bin="./sherpa-ncnn-conv-emformer-transducer-2022-12-06/decoder_jit_trace-pnnx.ncnn.bin", + joiner_param="./sherpa-ncnn-conv-emformer-transducer-2022-12-06/joiner_jit_trace-pnnx.ncnn.param", + joiner_bin="./sherpa-ncnn-conv-emformer-transducer-2022-12-06/joiner_jit_trace-pnnx.ncnn.bin", + num_threads=4, + ) + + filename = ( + "./sherpa-ncnn-conv-emformer-transducer-2022-12-06/test_wavs/1.wav" + ) + with wave.open(filename) as f: + assert f.getframerate() == recognizer.sample_rate, ( + f.getframerate(), + recognizer.sample_rate, + ) + assert f.getnchannels() == 1, f.getnchannels() + assert f.getsampwidth() == 2, f.getsampwidth() # it is in bytes + num_samples = f.getnframes() + samples = f.readframes(num_samples) + samples_int16 = np.frombuffer(samples, dtype=np.int16) + samples_float32 = samples_int16.astype(np.float32) + + samples_float32 = samples_float32 / 32768 + + recognizer.accept_waveform(recognizer.sample_rate, samples_float32) + + tail_paddings = np.zeros( + int(recognizer.sample_rate * 0.5), dtype=np.float32 + ) + recognizer.accept_waveform(recognizer.sample_rate, tail_paddings) + + recognizer.input_finished() + + print(recognizer.text) + + +if __name__ == "__main__": + main() diff --git a/docs/source/ncnn/python/code/speech-recognition-from-microphone.py b/docs/source/ncnn/python/code/speech-recognition-from-microphone.py new file mode 100755 index 000000000..922a06987 --- /dev/null +++ b/docs/source/ncnn/python/code/speech-recognition-from-microphone.py @@ -0,0 +1,69 @@ +#!/usr/bin/env python3 + +# Real-time speech 
recognition from a microphone with sherpa-ncnn Python API +# +# Please refer to +# https://k2-fsa.github.io/sherpa/ncnn/pretrained_models/index.html +# to download pre-trained models + +import sys + +try: + import sounddevice as sd +except ImportError as e: + print("Please install sounddevice first. You can use") + print() + print(" pip install sounddevice") + print() + print("to install it") + sys.exit(-1) + +import sherpa_ncnn + + +def create_recognizer(): + # Please replace the model files if needed. + # See https://k2-fsa.github.io/sherpa/ncnn/pretrained_models/index.html + # for download links. + recognizer = sherpa_ncnn.Recognizer( + tokens="./sherpa-ncnn-conv-emformer-transducer-2022-12-06/tokens.txt", + encoder_param="./sherpa-ncnn-conv-emformer-transducer-2022-12-06/encoder_jit_trace-pnnx.ncnn.param", + encoder_bin="./sherpa-ncnn-conv-emformer-transducer-2022-12-06/encoder_jit_trace-pnnx.ncnn.bin", + decoder_param="./sherpa-ncnn-conv-emformer-transducer-2022-12-06/decoder_jit_trace-pnnx.ncnn.param", + decoder_bin="./sherpa-ncnn-conv-emformer-transducer-2022-12-06/decoder_jit_trace-pnnx.ncnn.bin", + joiner_param="./sherpa-ncnn-conv-emformer-transducer-2022-12-06/joiner_jit_trace-pnnx.ncnn.param", + joiner_bin="./sherpa-ncnn-conv-emformer-transducer-2022-12-06/joiner_jit_trace-pnnx.ncnn.bin", + num_threads=4, + ) + return recognizer + + +def main(): + print("Started! Please speak") + recognizer = create_recognizer() + sample_rate = recognizer.sample_rate + samples_per_read = int(0.1 * sample_rate) # 0.1 second = 100 ms + last_result = "" + with sd.InputStream( + channels=1, dtype="float32", samplerate=sample_rate + ) as s: + while True: + samples, _ = s.read(samples_per_read) # a blocking read + samples = samples.reshape(-1) + recognizer.accept_waveform(sample_rate, samples) + result = recognizer.text + if last_result != result: + last_result = result + print(result) + + +if __name__ == "__main__": + devices = sd.query_devices() + print(devices) + default_input_device_idx = sd.default.device[0] + print(f'Use default device: {devices[default_input_device_idx]["name"]}') + + try: + main() + except KeyboardInterrupt: + print("\nCaught Ctrl + C. Exiting") diff --git a/docs/source/ncnn/python/index.rst b/docs/source/ncnn/python/index.rst new file mode 100644 index 000000000..95b75d357 --- /dev/null +++ b/docs/source/ncnn/python/index.rst @@ -0,0 +1,360 @@ +.. _sherpa-ncnn-python-api: + +Python API +========== + +.. hint:: + + It is known to work for ``Python >= 3.6`` on Linux, macOS, and Windows. + +In this section, we describe + + 1. How to install the Python package `sherpa-ncnn`_ + 2. How to use `sherpa-ncnn`_ Python API for real-time speech recognition with + a microphone + 3. How to use `sherpa-ncnn`_ Python API to recognize a single file + +Installation +------------ + +You can use ``1`` of the ``4`` methods below to install the Python package `sherpa-ncnn`_: + +Method 1 +^^^^^^^^ + +.. hint:: + + This method supports ``x86_64``, ``arm64`` (e.g., Mac M1, 64-bit Raspberry Pi), + and ``arm32`` (e.g., 32-bit Raspberry Pi). + +.. code-block:: bash + + pip install sherpa-ncnn + + +If you use ``Method 1``, it will install pre-compiled libraries. +The ``disadvantage`` is that it may ``not be optimized`` for your platform, +while the ``advantage`` is that you don't need to install ``cmake`` or a +C++ compiler. 
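+
+If you want to quickly confirm that the pre-compiled wheel installed by ``Method 1``
+works on your platform, a minimal sanity check such as the following is usually
+enough (the exact version number printed on your machine will differ):
+
+.. code-block:: bash
+
+   # Install the pre-compiled wheel and print its version
+   pip install sherpa-ncnn
+   python3 -c "import sherpa_ncnn; print(sherpa_ncnn.__version__)"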
+ +For the following methods, you have to first install: + +- ``cmake``, which can be installed using ``pip install cmake`` +- A C++ compiler, e.g., GCC on Linux and macOS, Visual Studio on Windows + +Method 2 +^^^^^^^^ + +.. code-block:: bash + + git clone https://github.com/k2-fsa/sherpa-ncnn + cd sherpa-ncnn + python3 setup.py install + +Method 3 +^^^^^^^^ + +.. code-block:: bash + + pip install git+https://github.com/k2-fsa/sherpa-ncnn + + +Method 4 (For developers and embedded boards) +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. tabs:: + + .. tab:: x86/x86_64 + + .. code-block:: bash + + git clone https://github.com/k2-fsa/sherpa-ncnn + cd sherpa-ncnn + mkdir build + cd build + + cmake \ + -D SHERPA_NCNN_ENABLE_PYTHON=ON \ + -D SHERPA_NCNN_ENABLE_PORTAUDIO=OFF \ + -D BUILD_SHARED_LIBS=ON \ + .. + + make -j6 + + export PYTHONPATH=$PWD/lib:$PWD/../sherpa-ncnn/python:$PYTHONPATH + + .. tab:: 32-bit ARM + + .. code-block:: bash + + git clone https://github.com/k2-fsa/sherpa-ncnn + cd sherpa-ncnn + mkdir build + cd build + + cmake \ + -D SHERPA_NCNN_ENABLE_PYTHON=ON \ + -D SHERPA_NCNN_ENABLE_PORTAUDIO=OFF \ + -D BUILD_SHARED_LIBS=ON \ + -DCMAKE_C_FLAGS="-march=armv7-a -mfloat-abi=hard -mfpu=neon" \ + -DCMAKE_CXX_FLAGS="-march=armv7-a -mfloat-abi=hard -mfpu=neon" \ + .. + + make -j6 + + export PYTHONPATH=$PWD/lib:$PWD/../sherpa-ncnn/python:$PYTHONPATH + + .. tab:: 64-bit ARM + + .. code-block:: bash + + git clone https://github.com/k2-fsa/sherpa-ncnn + cd sherpa-ncnn + mkdir build + cd build + + cmake \ + -D SHERPA_NCNN_ENABLE_PYTHON=ON \ + -D SHERPA_NCNN_ENABLE_PORTAUDIO=OFF \ + -D BUILD_SHARED_LIBS=ON \ + -DCMAKE_C_FLAGS="-march=armv8-a" \ + -DCMAKE_CXX_FLAGS="-march=armv8-a" \ + .. + + make -j6 + + export PYTHONPATH=$PWD/lib:$PWD/../sherpa-ncnn/python:$PYTHONPATH + +Let us check whether `sherpa-ncnn`_ was installed successfully: + +.. code-block:: bash + + python3 -c "import sherpa_ncnn; print(sherpa_ncnn.__file__)" + python3 -c "import _sherpa_ncnn; print(_sherpa_ncnn.__file__)" + +They should print the location of ``sherpa_ncnn`` and ``_sherpa_ncnn``. + +.. hint:: + + If you use ``Method 1``, ``Method 2``, and ``Method 3``, you can also use + + .. code-block:: bash + + python3 -c "import sherpa_ncnn; print(sherpa_ncnn.__version__)" + + It should print the version of `sherpa-ncnn`_, e.g., ``1.1``. + + +Next, we describe how to use `sherpa-ncnn`_ Python API for speech recognition: + + - (1) Real-time speech recognition with a microphone + - (2) Recognize a file + +Real-time recognition with a microphone +--------------------------------------- + +The following Python code shows how to use `sherpa-ncnn`_ Python API for +real-time speech recognition with a microphone. + +.. hint:: + + We use `sounddevice `_ + for recording. Please run ``pip install sounddevice`` before you run the + code below. + +.. note:: + + You can download the code from + + ``_ + +.. literalinclude:: ./code/speech-recognition-from-microphone.py + :language: python + :lines: 9-67 + :caption: Real-time speech recognition with a microphone using `sherpa-ncnn`_ Python API + +**Code explanation**: + +1. Import the required packages +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. literalinclude:: ./code/speech-recognition-from-microphone.py + :language: python + :lines: 11-21 + +Two packages are imported: + + - `sounddevice `_, for recording with a microphone + - `sherpa-ncnn`_, for real-time speech recognition + +2. Create the recognizer +^^^^^^^^^^^^^^^^^^^^^^^^ + +.. 
literalinclude:: ./code/speech-recognition-from-microphone.py + :language: python + :lines: 24-38,40-43 + +We use the model :ref:`sherpa-ncnn-mixed-english-chinese-conv-emformer-model` +as an example, which is able to recognize both English and Chinese. +You can replace it with other pre-trained models. + +Please refer to :ref:`sherpa-ncnn-pre-trained-models` for more models. + +.. hint:: + + The above example uses a ``float16`` encoder and joiner. You can also use + the following code to switch to ``8-bit`` (i.e., ``int8``) quantized encoder + and joiner. + + .. code-block:: python + + recognizer = sherpa_ncnn.Recognizer( + tokens="./sherpa-ncnn-conv-emformer-transducer-2022-12-06/tokens.txt", + encoder_param="./sherpa-ncnn-conv-emformer-transducer-2022-12-06/encoder_jit_trace-pnnx.ncnn.int8.param", + encoder_bin="./sherpa-ncnn-conv-emformer-transducer-2022-12-06/encoder_jit_trace-pnnx.ncnn.int8.bin", + decoder_param="./sherpa-ncnn-conv-emformer-transducer-2022-12-06/decoder_jit_trace-pnnx.ncnn.param", + decoder_bin="./sherpa-ncnn-conv-emformer-transducer-2022-12-06/decoder_jit_trace-pnnx.ncnn.bin", + joiner_param="./sherpa-ncnn-conv-emformer-transducer-2022-12-06/joiner_jit_trace-pnnx.ncnn.int8.param", + joiner_bin="./sherpa-ncnn-conv-emformer-transducer-2022-12-06/joiner_jit_trace-pnnx.ncnn.int8.bin", + num_threads=4, + ) + +3. Start recording +^^^^^^^^^^^^^^^^^^ + +.. literalinclude:: ./code/speech-recognition-from-microphone.py + :language: python + :lines: 44,47 + +**Note that**: + + - We set channel to 1 since the model supports only a single channel + - We use dtype ``float32`` so that the resulting audio samples are normalized + to the range ``[-1, 1]``. + - The sampling rate has to be ``recognizer.sample_rate``, which is 16 kHz for + all models at present. + +4. Read audio samples from the microphone +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. literalinclude:: ./code/speech-recognition-from-microphone.py + :language: python + :lines: 45,49-50 + +**Note that**: + + - It reads ``100 ms`` of audio samples at a time. You can choose a larger + value, e.g., ``200 ms``. + - No queue or callback is used. Instead, we use a blocking read here. + - The ``samples`` array is reshaped to a ``1-D`` array + +5. Invoke the recognizer with audio samples +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. literalinclude:: ./code/speech-recognition-from-microphone.py + :language: python + :lines: 51 + +**Note that**: + + - ``samples`` has to be a 1-D tensor and should be normalized to the range + ``[-1, 1]``. + - Upon accepting the audio samples, the recognizer starts the decoding + automatically. There is no separate call for decoding. + +6. Get the recognition result +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. literalinclude:: ./code/speech-recognition-from-microphone.py + :language: python + :lines: 52-55 + +We use ``recognizer.text`` to get the recognition result. To avoid +unnecessary output, we compare whether there is new result in ``recognizer.text`` +and don't print to the console if there is nothing new recognized. + +That's it! + +Summary +^^^^^^^ + +In summary, you need to: + + 1. Create the recognizer + 2. Start recording + 3. Read audio samples + 4. Call ``recognizer.accept_waveform(sample_rate, samples)`` + 5. Call ``recognizer.text`` to get the recognition result + +The following is a YouTube video for demonstration. + +.. youtube:: 74SxVueROok + :width: 120% + + +.. hint:: + + If you don't have access to YouTube, please see the following video from bilibili: + + .. 
raw:: html + + + + +.. note:: + + ``_ supports endpoint detection. + + Please see the following video for its usage: + + .. raw:: html + + + + +Recognize a file +---------------- + +The following Python code shows how to use `sherpa-ncnn`_ Python API to +recognize a wave file. + +.. caution:: + + The sampling rate of the wave file has to be 16 kHz. Also, it should + contain only a single channel and samples should be 16-bit (i.e., int16) + encoded. + +.. note:: + + You can download the code from + + ``_ + +.. literalinclude:: ./code/decode-file.py + :language: python + :lines: 13-61 + :caption: Decode a file with `sherpa-ncnn`_ Python API + +We use the model :ref:`sherpa-ncnn-mixed-english-chinese-conv-emformer-model` +as an example, which is able to recognize both English and Chinese. +You can replace it with other pre-trained models. + +Please refer to :ref:`sherpa-ncnn-pre-trained-models` for more models. + +.. hint:: + + The above example uses a ``float16`` encoder and joiner. You can also use + the following code to switch to ``8-bit`` (i.e., ``int8``) quantized encoder + and joiner. + + .. code-block:: python + + recognizer = sherpa_ncnn.Recognizer( + tokens="./sherpa-ncnn-conv-emformer-transducer-2022-12-06/tokens.txt", + encoder_param="./sherpa-ncnn-conv-emformer-transducer-2022-12-06/encoder_jit_trace-pnnx.ncnn.int8.param", + encoder_bin="./sherpa-ncnn-conv-emformer-transducer-2022-12-06/encoder_jit_trace-pnnx.ncnn.int8.bin", + decoder_param="./sherpa-ncnn-conv-emformer-transducer-2022-12-06/decoder_jit_trace-pnnx.ncnn.param", + decoder_bin="./sherpa-ncnn-conv-emformer-transducer-2022-12-06/decoder_jit_trace-pnnx.ncnn.bin", + joiner_param="./sherpa-ncnn-conv-emformer-transducer-2022-12-06/joiner_jit_trace-pnnx.ncnn.int8.param", + joiner_bin="./sherpa-ncnn-conv-emformer-transducer-2022-12-06/joiner_jit_trace-pnnx.ncnn.int8.bin", + num_threads=4, + ) diff --git a/docs/source/ncnn/tutorials/cn.rst b/docs/source/ncnn/tutorials/cn.rst new file mode 100644 index 000000000..3dcebc969 --- /dev/null +++ b/docs/source/ncnn/tutorials/cn.rst @@ -0,0 +1,65 @@ +中文资料 (Chinese tutorials) +============================ + +Sherpa-ncnn (Windows電腦抓取麥克風聲音即時轉錄文字) +--------------------------------------------------- + +详细地址为 ``_. + +注: 用的是繁体字. + + +2024 Android整合SherpaNcnn实现离线语音识别(支持中文,手把手带你从编译动态库开始) +----------------------------------------------------------------------------------------------------- + +详细地址为 ``_ + +描述了如何在 Android 上使用 `sherpa-ncnn`_. + + +2024 在Unity环境下,借助sherpa-ncnn框架,实现实时并准确的中英双语语音识别功能 +----------------------------------------------------------------------------- + +详细地址为 ``_ + +使用 unity. + +2024-02-22【RV1126】移植sherpa实时语音识别和TTS文字转语音功能 +------------------------------------------------------------- + +详细地址为 ``_. + +介绍了如何交叉编译 `sherpa-ncnn`_ 并在正点原子的rv1126开发板上测试实时的本地语音识别。 + +.. note:: + + 上述博客中,虽然标题有 TTS, 但是 `sherpa-ncnn`_ 目前只支持 ASR。请使用 `sherpa-onnx`_ + 去运行 TTS。 + +2023-12-31 离线语音识别 sherpa-ncnn 尝鲜体验 +-------------------------------------------- + +详细地址为 ``_. + +介绍了如何在 Ubuntu (x64) 以及树莓派4B 上安装和使用。 + +.. note:: + + 上述博客中,树莓派4B 安装了32位的操作系统。我们建议安装 64 位的操作系统。 + + +2023-04-26【RV1126】移植kaldi实时语音识别 +----------------------------------------- + +详细地址为 ``_. + +描述了如何交叉编译 `sherpa-ncnn`_ 以及如何在 RV1126 开发板上部署。 + +写的非常详细! + +2023-02-19 离线语音识别库sherpa-ncnn安装和简单测试笔记 +------------------------------------------------------------ + +详细地址为 ``_. 
+ +使用了树莓派3B+ 和 Windows 进行测试。 diff --git a/docs/source/ncnn/tutorials/index.rst b/docs/source/ncnn/tutorials/index.rst new file mode 100644 index 000000000..68a4b8278 --- /dev/null +++ b/docs/source/ncnn/tutorials/index.rst @@ -0,0 +1,14 @@ +Tutorials +========= + +This page contains links to tutorials written by our users. + +.. caution:: + + The tutorials are not necessarily written in English. + + +.. toctree:: + :maxdepth: 2 + + ./cn.rst diff --git a/docs/source/ncnn/wasm/build.rst b/docs/source/ncnn/wasm/build.rst new file mode 100644 index 000000000..75539873a --- /dev/null +++ b/docs/source/ncnn/wasm/build.rst @@ -0,0 +1,86 @@ +Build +===== + +After installing `emscripten`_, we can build `sherpa-ncnn`_ for `WebAssembly`_ now. + +Please use the following command to build it: + +.. code-block:: bash + + git clone https://github.com/k2-fsa/sherpa-ncnn + cd sherpa-ncnn + + cd wasm/assets + wget -q https://github.com/k2-fsa/sherpa-ncnn/releases/download/models/sherpa-ncnn-streaming-zipformer-bilingual-zh-en-2023-02-13.tar.bz2 + tar xvf sherpa-ncnn-streaming-zipformer-bilingual-zh-en-2023-02-13.tar.bz2 + mv -v sherpa-ncnn-streaming-zipformer-bilingual-zh-en-2023-02-13/*pnnx.ncnn.param . + mv -v sherpa-ncnn-streaming-zipformer-bilingual-zh-en-2023-02-13/*pnnx.ncnn.bin . + mv -v sherpa-ncnn-streaming-zipformer-bilingual-zh-en-2023-02-13/tokens.txt . + rm -rf sherpa-ncnn-streaming-zipformer-bilingual-zh-en-2023-02-13 + rm -v sherpa-ncnn-streaming-zipformer-bilingual-zh-en-2023-02-13.tar.bz2 + cd ../.. + + ./build-wasm-simd.sh + +.. hint:: + + You can visit ``_ + to download a different model. + +After building, you should see the following output: + +.. code-block:: bash + + Install the project... + -- Install configuration: "Release" + -- Installing: /Users/fangjun/open-source/sherpa-ncnn/build-wasm-simd/install/lib/libkaldi-native-fbank-core.a + -- Installing: /Users/fangjun/open-source/sherpa-ncnn/build-wasm-simd/install/lib/libncnn.a + -- Installing: /Users/fangjun/open-source/sherpa-ncnn/build-wasm-simd/install/./sherpa-ncnn.pc + -- Installing: /Users/fangjun/open-source/sherpa-ncnn/build-wasm-simd/install/lib/libsherpa-ncnn-core.a + -- Installing: /Users/fangjun/open-source/sherpa-ncnn/build-wasm-simd/install/lib/libsherpa-ncnn-c-api.a + -- Installing: /Users/fangjun/open-source/sherpa-ncnn/build-wasm-simd/install/include/sherpa-ncnn/c-api/c-api.h + -- Installing: /Users/fangjun/open-source/sherpa-ncnn/build-wasm-simd/install/bin/wasm/sherpa-ncnn-wasm-main.js + -- Installing: /Users/fangjun/open-source/sherpa-ncnn/build-wasm-simd/install/bin/wasm/sherpa-ncnn.js + -- Installing: /Users/fangjun/open-source/sherpa-ncnn/build-wasm-simd/install/bin/wasm/app.js + -- Installing: /Users/fangjun/open-source/sherpa-ncnn/build-wasm-simd/install/bin/wasm/index.html + -- Up-to-date: /Users/fangjun/open-source/sherpa-ncnn/build-wasm-simd/install/bin/wasm/sherpa-ncnn-wasm-main.js + -- Installing: /Users/fangjun/open-source/sherpa-ncnn/build-wasm-simd/install/bin/wasm/sherpa-ncnn-wasm-main.wasm + -- Installing: /Users/fangjun/open-source/sherpa-ncnn/build-wasm-simd/install/bin/wasm/sherpa-ncnn-wasm-main.data + + ls -lh install/bin/wasm + total 280152 + -rw-r--r-- 1 fangjun staff 9.0K Feb 6 15:42 app.js + -rw-r--r-- 1 fangjun staff 936B Feb 6 15:42 index.html + -rw-r--r-- 1 fangjun staff 135M Feb 6 17:06 sherpa-ncnn-wasm-main.data + -rw-r--r-- 1 fangjun staff 79K Feb 6 17:06 sherpa-ncnn-wasm-main.js + -rw-r--r-- 1 fangjun staff 1.7M Feb 6 17:06 sherpa-ncnn-wasm-main.wasm + -rw-r--r-- 1 fangjun staff 
6.9K Feb 6 15:42 sherpa-ncnn.js + +Now you can use the following command to run it: + +.. code-block:: bash + + cd build-wasm-simd/install/bin/wasm/ + python3 -m http.server 6006 + +Start your browser and visit ``_; you should see the following +page: + +.. figure:: ./pic/wasm-sherpa-ncnn-1.png + :alt: start page of wasm + :width: 800 + +Now click start and speak! You should see the recognition results in the text box. + +.. warning:: + + We are using a bilingual model (Chinese + English) in the above example, which means + you can only speak Chinese or English in this case. + +A screenshot is given below: + +.. figure:: ./pic/wasm-sherpa-ncnn-2.png + :alt: recognition result + :width: 800 + +Congratulations! You have successfully run real-time speech recognition with `WebAssembly`_ +in your browser. diff --git a/docs/source/ncnn/wasm/hf-spaces.rst b/docs/source/ncnn/wasm/hf-spaces.rst new file mode 100644 index 000000000..7525232e1 --- /dev/null +++ b/docs/source/ncnn/wasm/hf-spaces.rst @@ -0,0 +1,49 @@ +.. _try sherpa ncnn wasm with huggingface: + +Huggingface Spaces (WebAssembly) +================================ + +We provide two `Huggingface`_ spaces so that you can try real-time +speech recognition with `WebAssembly`_ in your browser. + +English only +------------ + +``_ + +.. hint:: + + If you don't have access to `Huggingface`_, please visit the following mirror: + + ``_ + +.. figure:: ./pic/wasm-hf-en.png + :alt: start page of wasm + :width: 800 + :target: https://huggingface.co/spaces/k2-fsa/web-assembly-asr-sherpa-ncnn-en + +.. note:: + + The script for building this space can be found at + ``_ + +Chinese + English +----------------- + +``_ + +.. hint:: + + If you don't have access to `Huggingface`_, please visit the following mirror: + + ``_ + +.. figure:: ./pic/wasm-hf-zh-en.png + :alt: start page of wasm + :width: 800 + :target: https://huggingface.co/spaces/k2-fsa/web-assembly-asr-sherpa-ncnn-zh-en + +.. note:: + + The script for building this space can be found at + ``_ diff --git a/docs/source/ncnn/wasm/index.rst b/docs/source/ncnn/wasm/index.rst new file mode 100644 index 000000000..532d78e35 --- /dev/null +++ b/docs/source/ncnn/wasm/index.rst @@ -0,0 +1,15 @@ +WebAssembly +=========== + +In this section, we describe how to build `sherpa-ncnn`_ for `WebAssembly`_ +so that you can run real-time speech recognition with `WebAssembly`_. + +Please follow the steps below to build and run `sherpa-ncnn`_ for `WebAssembly`_. + +.. toctree:: + :maxdepth: 3 + + ./install-emscripten.rst + ./build.rst + ./prebuilt.rst + ./hf-spaces.rst diff --git a/docs/source/ncnn/wasm/install-emscripten.rst b/docs/source/ncnn/wasm/install-emscripten.rst new file mode 100644 index 000000000..3fdbc0b4f --- /dev/null +++ b/docs/source/ncnn/wasm/install-emscripten.rst @@ -0,0 +1,40 @@ +Install Emscripten +================== + +We need to compile the C/C++ files in `sherpa-ncnn`_ with the help of +`emscripten`_. + +Please refer to ``_ +for detailed installation instructions. + +The following is an example to show you how to install it on Linux/macOS. + +.. code-block:: bash + + git clone https://github.com/emscripten-core/emsdk.git + cd emsdk + git pull + ./emsdk install latest + ./emsdk activate latest + source ./emsdk_env.sh + +To check that you have installed `emscripten`_ successfully, please run: + +.. code-block:: bash + + emcc -v + +The above command should print something like below: + +.. 
code-block:: + + emcc (Emscripten gcc/clang-like replacement + linker emulating GNU ld) 3.1.48 (e967e20b4727956a30592165a3c1cde5c67fa0a8) + shared:INFO: (Emscripten: Running sanity checks) + (py38) fangjuns-MacBook-Pro:open-source fangjun$ emcc -v + emcc (Emscripten gcc/clang-like replacement + linker emulating GNU ld) 3.1.48 (e967e20b4727956a30592165a3c1cde5c67fa0a8) + clang version 18.0.0 (https://github.com/llvm/llvm-project a54545ba6514802178cf7cf1c1dd9f7efbf3cde7) + Target: wasm32-unknown-emscripten + Thread model: posix + InstalledDir: /Users/fangjun/open-source/emsdk/upstream/bin + +Congratulations! You have successfully installed `emscripten`_. diff --git a/docs/source/ncnn/wasm/pic/wasm-hf-en.png b/docs/source/ncnn/wasm/pic/wasm-hf-en.png new file mode 100644 index 000000000..725281fc6 Binary files /dev/null and b/docs/source/ncnn/wasm/pic/wasm-hf-en.png differ diff --git a/docs/source/ncnn/wasm/pic/wasm-hf-zh-en.png b/docs/source/ncnn/wasm/pic/wasm-hf-zh-en.png new file mode 100644 index 000000000..e0ce14bb9 Binary files /dev/null and b/docs/source/ncnn/wasm/pic/wasm-hf-zh-en.png differ diff --git a/docs/source/ncnn/wasm/pic/wasm-sherpa-ncnn-1.png b/docs/source/ncnn/wasm/pic/wasm-sherpa-ncnn-1.png new file mode 100644 index 000000000..66450817a Binary files /dev/null and b/docs/source/ncnn/wasm/pic/wasm-sherpa-ncnn-1.png differ diff --git a/docs/source/ncnn/wasm/pic/wasm-sherpa-ncnn-2.png b/docs/source/ncnn/wasm/pic/wasm-sherpa-ncnn-2.png new file mode 100644 index 000000000..6bd4f644b Binary files /dev/null and b/docs/source/ncnn/wasm/pic/wasm-sherpa-ncnn-2.png differ diff --git a/docs/source/ncnn/wasm/prebuilt.rst b/docs/source/ncnn/wasm/prebuilt.rst new file mode 100644 index 000000000..fea8954d6 --- /dev/null +++ b/docs/source/ncnn/wasm/prebuilt.rst @@ -0,0 +1,54 @@ +Use pre-built WebAssembly library +================================= + +In this section, we describe how to use the pre-built `WebAssembly`_ library +of `sherpa-ncnn`_ for real-time speech recognition. + +.. note:: + + Note that the pre-built library used in this section + uses a bilingual model (Chinese + English), + which is from :ref:`sherpa_ncnn_streaming_zipformer_bilingual_zh_en_2023_02_13`. + +Download +-------- + +Please use the following command to download the pre-built library for version +``v2.1.7``, which is the latest release as of 2024.02.06. + +.. hint:: + + Please always use the latest release. You can visit + ``_ to find the latest release. + +.. code-block:: + + wget -q https://github.com/k2-fsa/sherpa-ncnn/releases/download/v2.1.7/sherpa-ncnn-wasm-simd-v2.1.7.tar.bz2 + tar xvf sherpa-ncnn-wasm-simd-v2.1.7.tar.bz2 + rm sherpa-ncnn-wasm-simd-v2.1.7.tar.bz2 + cd sherpa-ncnn-wasm-simd-v2.1.7 + + python3 -m http.server 6006 + +Start your browser and visit ``_; you should see the following +page: + +.. figure:: ./pic/wasm-sherpa-ncnn-1.png + :alt: start page of wasm + :width: 800 + +Now click start and speak! You should see the recognition results in the text box. + +.. warning:: + + We are using a bilingual model (Chinese + English) in the above example, which means + you can only speak Chinese or English in this case. + +A screenshot is given below: + +.. figure:: ./pic/wasm-sherpa-ncnn-2.png + :alt: recognition result + :width: 800 + +Congratulations! You have successfully run real-time speech recognition with `WebAssembly`_ +in your browser. 
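+
+.. hint::
+
+   The port number ``6006`` used above is arbitrary. If it is already taken on
+   your machine, you can serve the unpacked files on any other free port with the
+   same kind of static file server; for example (``8000`` below is just an
+   arbitrary choice):
+
+   .. code-block:: bash
+
+      cd sherpa-ncnn-wasm-simd-v2.1.7
+      python3 -m http.server 8000
+
+   Then visit the page in your browser using the port you chose.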
diff --git a/docs/source/onnx/FireRedAsr/code/2025-02-16.txt b/docs/source/onnx/FireRedAsr/code/2025-02-16.txt new file mode 100644 index 000000000..6df9e7044 --- /dev/null +++ b/docs/source/onnx/FireRedAsr/code/2025-02-16.txt @@ -0,0 +1,14 @@ +/star-fj/fangjun/open-source/sherpa-onnx/sherpa-onnx/csrc/parse-options.cc:Read:375 ./build/bin/sherpa-onnx-offline --tokens=./sherpa-onnx-fire-red-asr-large-zh_en-2025-02-16/tokens.txt --fire-red-asr-encoder=./sherpa-onnx-fire-red-asr-large-zh_en-2025-02-16/encoder.int8.onnx --fire-red-asr-decoder=./sherpa-onnx-fire-red-asr-large-zh_en-2025-02-16/decoder.int8.onnx --num-threads=1 ./sherpa-onnx-fire-red-asr-large-zh_en-2025-02-16/test_wavs/0.wav + +OfflineRecognizerConfig(feat_config=FeatureExtractorConfig(sampling_rate=16000, feature_dim=80, low_freq=20, high_freq=-400, dither=0), model_config=OfflineModelConfig(transducer=OfflineTransducerModelConfig(encoder_filename="", decoder_filename="", joiner_filename=""), paraformer=OfflineParaformerModelConfig(model=""), nemo_ctc=OfflineNemoEncDecCtcModelConfig(model=""), whisper=OfflineWhisperModelConfig(encoder="", decoder="", language="", task="transcribe", tail_paddings=-1), fire_red_asr=OfflineFireRedAsrModelConfig(encoder="./sherpa-onnx-fire-red-asr-large-zh_en-2025-02-16/encoder.int8.onnx", decoder="./sherpa-onnx-fire-red-asr-large-zh_en-2025-02-16/decoder.int8.onnx"), tdnn=OfflineTdnnModelConfig(model=""), zipformer_ctc=OfflineZipformerCtcModelConfig(model=""), wenet_ctc=OfflineWenetCtcModelConfig(model=""), sense_voice=OfflineSenseVoiceModelConfig(model="", language="auto", use_itn=False), moonshine=OfflineMoonshineModelConfig(preprocessor="", encoder="", uncached_decoder="", cached_decoder=""), telespeech_ctc="", tokens="./sherpa-onnx-fire-red-asr-large-zh_en-2025-02-16/tokens.txt", num_threads=1, debug=False, provider="cpu", model_type="", modeling_unit="cjkchar", bpe_vocab=""), lm_config=OfflineLMConfig(model="", scale=0.5), ctc_fst_decoder_config=OfflineCtcFstDecoderConfig(graph="", max_active=3000), decoding_method="greedy_search", max_active_paths=4, hotwords_file="", hotwords_score=1.5, blank_penalty=0, rule_fsts="", rule_fars="") +Creating recognizer ... +Started +Done! + +./sherpa-onnx-fire-red-asr-large-zh_en-2025-02-16/test_wavs/0.wav +{"lang": "", "emotion": "", "event": "", "text": "昨天是 MONDAY TODAY IS礼拜二 THE DAY AFTER TOMORROW是星期三", "timestamps": [], "tokens":["昨", "天", "是", " MO", "ND", "AY", " TO", "D", "AY", " IS", "礼", "拜", "二", " THE", " DAY", " AFTER", " TO", "M", "OR", "ROW", "是", "星", "期", "三"], "words": []} +---- +num threads: 1 +decoding method: greedy_search +Elapsed seconds: 19.555 s +Real time factor (RTF): 19.555 / 10.053 = 1.945 diff --git a/docs/source/onnx/FireRedAsr/huggingface-space.rst b/docs/source/onnx/FireRedAsr/huggingface-space.rst new file mode 100644 index 000000000..f3a12b3d9 --- /dev/null +++ b/docs/source/onnx/FireRedAsr/huggingface-space.rst @@ -0,0 +1,20 @@ +Huggingface space +================= + +You can try `FireRedAsr`_ with `sherpa-onnx`_ with the following huggingface space + + ``_ + + +.. hint:: + + You don't need to install anything. All you need is a browser. + + You can even run it on your phone or tablet. + +.. 
figure:: ./pic/fire-red-asr-hf-space.jpg + :alt: screenshot of hf space for FireRedAsr + :align: center + :width: 600 + + Try `FireRedAsr`_ in our Huggingface space with `sherpa-onnx`_ diff --git a/docs/source/onnx/FireRedAsr/index.rst b/docs/source/onnx/FireRedAsr/index.rst new file mode 100644 index 000000000..4146f66ee --- /dev/null +++ b/docs/source/onnx/FireRedAsr/index.rst @@ -0,0 +1,41 @@ +FireRedAsr +========== + +This section describes how to use models from ``_. + +Note that this model supports Chinese and English. + +.. hint:: + + 该模型支持普通话、及一些方言(四川话、河南话、天津话等). + +We have converted `FireRedASR`_ to onnx and provided APIs for the following programming languages + + - 1. C++ + - 2. C + - 3. Python + - 4. C# + - 5. Go + - 6. Kotlin + - 7. Java + - 8. JavaScript (Support `WebAssembly`_ and `Node`_) + - 9. Swift + - 10. `Dart`_ (Support `Flutter`_) + - 11. Object Pascal + +Note that you can use `FireRedASR`_ with `sherpa-onnx`_ on the following platforms: + + - Linux (x64, aarch64, arm, riscv64) + - macOS (x64, arm64) + - Windows (x64, x86, arm64) + - Android (arm64-v8a, armv7-eabi, x86, x86_64) + - iOS (arm64) + +In the following, we describe how to download pre-trained `FireRedASR`_ models +and use them in `sherpa-onnx`_. + +.. toctree:: + :maxdepth: 5 + + ./huggingface-space.rst + ./pretrained.rst diff --git a/docs/source/onnx/FireRedAsr/pic/fire-red-asr-hf-space.jpg b/docs/source/onnx/FireRedAsr/pic/fire-red-asr-hf-space.jpg new file mode 100644 index 000000000..bee78e2fc Binary files /dev/null and b/docs/source/onnx/FireRedAsr/pic/fire-red-asr-hf-space.jpg differ diff --git a/docs/source/onnx/FireRedAsr/pretrained.rst b/docs/source/onnx/FireRedAsr/pretrained.rst new file mode 100644 index 000000000..6e250d32b --- /dev/null +++ b/docs/source/onnx/FireRedAsr/pretrained.rst @@ -0,0 +1,66 @@ +Pre-trained Models +================== + +This page describes how to download pre-trained `FireRedAsr`_ models. + +sherpa-onnx-fire-red-asr-large-zh_en-2025-02-16 (Chinese + English, 普通话、四川话、河南话等) +------------------------------------------------------------------------------------------------ + +This model is converted from ``_ + +It supports the following 2 languages: + + - Chinese (普通话, 四川话、天津话、河南话等方言) + - English + +In the following, we describe how to download it. 
+
+Download
+^^^^^^^^
+
+Please use the following commands to download it::
+
+   cd /path/to/sherpa-onnx
+
+   wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-fire-red-asr-large-zh_en-2025-02-16.tar.bz2
+   tar xvf sherpa-onnx-fire-red-asr-large-zh_en-2025-02-16.tar.bz2
+   rm sherpa-onnx-fire-red-asr-large-zh_en-2025-02-16.tar.bz2
+
+After downloading, you should find the following files::
+
+   ls -lh sherpa-onnx-fire-red-asr-large-zh_en-2025-02-16/
+   total 1.7G
+   -rw-r--r-- 1 kuangfangjun root 188 Feb 16 16:22 README.md
+   -rw-r--r-- 1 kuangfangjun root 425M Feb 16 16:21 decoder.int8.onnx
+   -rw-r--r-- 1 kuangfangjun root 1.3G Feb 16 16:21 encoder.int8.onnx
+   drwxr-xr-x 10 kuangfangjun root 0 Feb 16 16:26 test_wavs
+   -rw-r--r-- 1 kuangfangjun root 70K Feb 16 16:21 tokens.txt
+
+   ls -lh sherpa-onnx-fire-red-asr-large-zh_en-2025-02-16/test_wavs/
+   total 1.9M
+   -rw-r--r-- 1 kuangfangjun root 315K Feb 16 16:24 0.wav
+   -rw-r--r-- 1 kuangfangjun root 160K Feb 16 16:24 1.wav
+   -rw-r--r-- 1 kuangfangjun root 147K Feb 16 16:24 2.wav
+   -rw-r--r-- 1 kuangfangjun root 245K Feb 16 16:25 3-sichuan.wav
+   -rw-r--r-- 1 kuangfangjun root 276K Feb 16 16:24 3.wav
+   -rw-r--r-- 1 kuangfangjun root 245K Feb 16 16:25 4-tianjin.wav
+   -rw-r--r-- 1 kuangfangjun root 250K Feb 16 16:26 5-henan.wav
+   -rw-r--r-- 1 kuangfangjun root 276K Feb 16 16:24 8k.wav
+
+Decode a file
+^^^^^^^^^^^^^
+
+Please use the following command to decode a wave file:
+
+.. code-block:: bash
+
+   ./build/bin/sherpa-onnx-offline \
+     --tokens=./sherpa-onnx-fire-red-asr-large-zh_en-2025-02-16/tokens.txt \
+     --fire-red-asr-encoder=./sherpa-onnx-fire-red-asr-large-zh_en-2025-02-16/encoder.int8.onnx \
+     --fire-red-asr-decoder=./sherpa-onnx-fire-red-asr-large-zh_en-2025-02-16/decoder.int8.onnx \
+     --num-threads=1 \
+     ./sherpa-onnx-fire-red-asr-large-zh_en-2025-02-16/test_wavs/0.wav
+
+You should see the following output:
+
+.. literalinclude:: ./code/2025-02-16.txt
diff --git a/docs/source/onnx/android/build-sherpa-onnx.rst b/docs/source/onnx/android/build-sherpa-onnx.rst
new file mode 100644
index 000000000..c4caea3fa
--- /dev/null
+++ b/docs/source/onnx/android/build-sherpa-onnx.rst
@@ -0,0 +1,442 @@
+.. _sherpa-onnx-install-android-studio:
+
+Build sherpa-onnx for Android
+=============================
+
+You can use this section for both ``speech-to-text`` (STT, ASR)
+and ``text-to-speech`` (TTS).
+
+.. hint::
+
+   The build scripts mentioned in this section run on both Linux and macOS.
+
+   If you are using Windows or if you don't want to build the shared libraries,
+   you can download pre-built shared libraries by visiting the release page
+   ``_
+
+   For instance, for the release ``v1.10.19``, you can visit
+   ``_
+   and download the file ``sherpa-onnx-v1.10.19-android.tar.bz2``
+   using the following command:
+
+   .. code-block:: bash
+
+      wget https://github.com/k2-fsa/sherpa-onnx/releases/download/v1.10.19/sherpa-onnx-v1.10.19-android.tar.bz2
+
+   Please always use the latest release.
+
+.. hint::
+
+   This section was originally written for speech-to-text. However, it is
+   also applicable to other folders in ``_. 
+ + For instance, you can replace ``SherpaOnnx`` in this section with + + - ``SherpaOnnx2Pass`` + - ``SherpaOnnxTts`` (this is for text-to-speech) + - ``SherpaOnnxTtsEngine`` (this is for text-to-speech) + - ``SherpaOnnxVad`` + - ``SherpaOnnxVadAsr`` + - ``SherpaOnnxSpeakerIdentification`` + - ``SherpaOnnxSpeakerDiarization`` + - ``SherpaOnnxAudioTagging`` + - ``SherpaOnnxAudioTaggingWearOs`` + + +Install Android Studio +---------------------- + +The first step is to download and install Android Studio. + +Please refer to ``_ for how to install +Android Studio. + +.. hint:: + + Any recent version of Android Studio should work fine. Also, you can use + the default settings of Android Studio during installation. + + For reference, we post the version we are using below: + + .. image:: ./pic/android-studio-version.png + :align: center + :alt: screenshot of my version of Android Studio + :width: 600 + + +Download sherpa-onnx +-------------------- + +Next, download the source code of `sherpa-onnx`_: + +.. code-block:: bash + + git clone https://github.com/k2-fsa/sherpa-onnx + +Install NDK +----------- + +Step 1, start Android Studio. + + .. figure:: ./pic/start-android-studio.png + :alt: Start Android Studio + :width: 600 + + Step 1: Click ``Open`` to select ``sherpa-onnx/android/SherpaOnnx`` + +Step 2, Open ``sherpa-onnx/android/SherpaOnnx``. + + .. figure:: ./pic/open-sherpa-onnx.png + :alt: Open SherpaOnnx + :width: 600 + + Step 2: Open ``SherpaOnnx``. + + +Step 3, Select ``Tools -> SDK Manager``. + + .. figure:: ./pic/select-sdk-manager.png + :alt: Select Tools -> SDK Manager + :width: 600 + + Step 3: Select ``Tools -> SDK Manager``. + +Step 4, ``Install NDK``. + + .. figure:: ./pic/ndk-tools.png + :alt: Install NDK + :width: 600 + + Step 4: Install NDK. + +In the following, we assume ``Android SDK location`` was set to +``/Users/fangjun/software/my-android``. You can change it accordingly below. + +After installing NDK, you can find it in + +.. code-block:: + + /Users/fangjun/software/my-android/ndk/22.1.7171670 + +.. warning:: + + If you selected a different version of NDK, please replace ``22.1.7171670`` + accordingly. + +Next, let us set the environment variable ``ANDROID_NDK`` for later use. + +.. code-block:: bash + + export ANDROID_NDK=/Users/fangjun/software/my-android/ndk/22.1.7171670 + +.. note:: + + Note from https://github.com/Tencent/ncnn/wiki/how-to-build#build-for-android + + (Important) remove the hardcoded debug flag in Android NDK to fix + the android-ndk issue: https://github.com/android/ndk/issues/243 + + 1. open ``$ANDROID_NDK/build/cmake/android.toolchain.cmake`` for ndk < r23 + or ``$ANDROID_NDK/build/cmake/android-legacy.toolchain.cmake`` for ndk >= r23 + + 2. delete the line containing "-g" + + .. code-block:: + + list(APPEND ANDROID_COMPILER_FLAGS + -g + -DANDROID + +Build sherpa-onnx (C++ code) +---------------------------- + +After installing ``NDK``, it is time to build the C++ code of `sherpa-onnx`_. + +In the following, we show how to build `sherpa-onnx`_ for the following +Android ABIs: + + - ``arm64-v8a`` + - ``armv7-eabi`` + - ``x86_64`` + - ``x86`` + +.. caution:: + + You only need to select one and only one ABI. ``arm64-v8a`` is probably the + most common one. + + If you want to test the app on an emulator, you probably need ``x86_64``. + +.. hint:: + + Building scripts for this section are for macOS and Linux. 
If you are + using Windows or if you don't want to build the shared libraries by yourself, + you can download pre-compiled shared libraries for this section by visiting + + ``_ + +.. hint:: + + We provide a colab notebook + |build sherpa-onnx for android colab notebook| + for you to try this section step by step. + + If you are using Windows or you don't want to setup your local environment + to build the C++ libraries, please use the above colab notebook. + +.. |build sherpa-onnx for android colab notebook| image:: https://colab.research.google.com/assets/colab-badge.svg + :target: https://github.com/k2-fsa/colab/blob/master/sherpa-onnx/build_sherpa_onnx_for_android.ipynb + +Build for arm64-v8a +^^^^^^^^^^^^^^^^^^^ + +.. code-block:: bash + + cd sherpa-onnx # Go to the root repo + ./build-android-arm64-v8a.sh + +After building, you will find the following shared libraries: + +.. code-block:: bash + + ls -lh build-android-arm64-v8a/install/lib/ + + -rw-r--r-- 1 fangjun staff 15M Jul 28 12:54 libonnxruntime.so + -rwxr-xr-x 1 fangjun staff 3.7M Jul 28 12:54 libsherpa-onnx-jni.so + +Please copy them to ``android/SherpaOnnx/app/src/main/jniLibs/arm64-v8a/``: + +.. code-block:: bash + + cp build-android-arm64-v8a/install/lib/lib*.so android/SherpaOnnx/app/src/main/jniLibs/arm64-v8a/ + +You should see the following screen shot after running the above copy ``cp`` command. + +.. figure:: ./pic/so-libs-for-arm64-v8a.jpg + :align: center + :alt: Generated shared libraries for arm64-v8a + :width: 600 + +Build for armv7-eabi +^^^^^^^^^^^^^^^^^^^^ + +.. code-block:: bash + + cd sherpa-onnx # Go to the root repo + ./build-android-armv7-eabi.sh + +After building, you will find the following shared libraries: + +.. code-block:: bash + + ls -lh build-android-armv7-eabi/install/lib + + -rw-r--r-- 1 fangjun staff 10M Jul 28 13:18 libonnxruntime.so + -rwxr-xr-x 1 fangjun staff 2.1M Jul 28 13:18 libsherpa-onnx-jni.so + +Please copy them to ``android/SherpaOnnx/app/src/main/jniLibs/armeabi-v7a``: + +.. code-block:: bash + + cp build-android-armv7-eabi/install/lib/lib*.so android/SherpaOnnx/app/src/main/jniLibs/armeabi-v7a/ + +You should see the following screen shot after running the above copy ``cp`` command. + +.. figure:: ./pic/so-libs-for-armv7a-eabi.jpg + :align: center + :alt: Generated shared libraries for armv7-eabi + :width: 600 + +Build for x86_64 +^^^^^^^^^^^^^^^^ + +.. code-block:: bash + + cd sherpa-onnx # Go to the root repo + ./build-android-x86-64.sh + +After building, you will find the following shared libraries: + +.. code-block:: bash + + ls -lh build-android-x86-64/install/lib/ + + -rw-r--r-- 1 fangjun staff 17M Jul 28 13:26 libonnxruntime.so + -rwxr-xr-x 1 fangjun staff 4.0M Jul 28 13:26 libsherpa-onnx-jni.so + +Please copy them to ``android/SherpaOnnx/app/src/main/jniLibs/x86_64/``: + +.. code-block:: bash + + cp build-android-x86-64/install/lib/lib*.so android/SherpaOnnx/app/src/main/jniLibs/x86_64/ + +You should see the following screen shot after running the above copy ``cp`` command. + +.. figure:: ./pic/so-libs-for-x86-64.jpg + :align: center + :alt: Generated shared libraries for x86_64 + :width: 600 + +Build for x86 +^^^^^^^^^^^^^ + +.. code-block:: bash + + cd sherpa-onnx # Go to the root repo + ./build-android-x86.sh + +After building, you will find the following shared libraries: + +.. 
code-block:: bash + + ls -lh build-android-x86/install/lib/ + + -rw-r--r-- 1 fangjun staff 17M Jul 28 13:28 libonnxruntime.so + -rwxr-xr-x 1 fangjun staff 3.9M Jul 28 13:28 libsherpa-onnx-jni.so + +Please copy them to ``android/SherpaOnnx/app/src/main/jniLibs/x86/``: + +.. code-block:: bash + + cp build-android-x86/install/lib/lib*.so android/SherpaOnnx/app/src/main/jniLibs/x86/ + +You should see the following screen shot after running the above copy ``cp`` command. + +.. figure:: ./pic/so-libs-for-x86.jpg + :align: center + :alt: Generated shared libraries for x86 + :width: 600 + +Download pre-trained models +--------------------------- + +Please read :ref:`sherpa-onnx-pre-trained-models` for all available pre-trained +models. + +In the following, we use a pre-trained model :ref:`sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20`, +which supports both Chinese and English. + +.. hint:: + + The model is trained using `icefall`_ and the original torchscript model + is from ``_. + +Use the following command to download the pre-trained model and place it into +``android/SherpaOnnx/app/src/main/assets/``: + +.. code-block:: bash + + cd android/SherpaOnnx/app/src/main/assets/ + + wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20.tar.bz2 + + tar xvf sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20.tar.bz2 + rm sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20.tar.bz2 + + cd sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20 + + # Now, remove extra files to reduce the file size of the generated apk + rm -rf test_wavs + rm -f *.sh README.md + rm -f bpe.model + + rm -f encoder-epoch-99-avg-1.int8.onnx + rm -f joiner-epoch-99-avg-1.int8.onnx + rm -f decoder-epoch-99-avg-1.int8.onnx + rm -f bpe.vocab + +In the end, you should have the following files: + +.. code-block:: bash + + ls -lh + + -rw-r--r--@ 1 fangjun staff 13M Jul 28 13:51 decoder-epoch-99-avg-1.onnx + -rw-r--r--@ 1 fangjun staff 315M Jul 28 13:51 encoder-epoch-99-avg-1.onnx + -rw-r--r--@ 1 fangjun staff 12M Jul 28 13:51 joiner-epoch-99-avg-1.onnx + -rw-r--r--@ 1 fangjun staff 55K Nov 21 2023 tokens.txt + +You should see the following screen shot after downloading the pre-trained model: + +.. figure:: ./pic/pre-trained-model-2023-02-20.png + :alt: Files after downloading the pre-trained model + :align: center + :width: 600 + +.. hint:: + + If you select a different pre-trained model, make sure that you also change the + corresponding code listed in the following screen shot: + + .. figure:: ./pic/type-for-pre-trained-model-2023-02-20.png + :alt: Change code if you select a different model + :width: 600 + +Generate APK +------------ + +Finally, it is time to build `sherpa-onnx`_ to generate an APK package. + +Select ``Build -> Make Project``, as shown in the following screen shot. + +.. figure:: ./pic/build-make-project.png + :align: center + :alt: Select ``Build -> Make Project`` + :width: 600 + +You can find the generated APK in ``android/SherpaOnnx/app/build/outputs/apk/debug/app-debug.apk``: + +.. code-block:: bash + + ls -lh android/SherpaOnnx/app/build/outputs/apk/debug/app-debug.apk + + -rw-r--r--@ 1 fangjun staff 329M Jul 28 13:56 android/SherpaOnnx/app/build/outputs/apk/debug/app-debug.apk + +Congratulations! You have successfully built an APK for Android. + +Read below to learn more. + +Analyze the APK +--------------- + +.. 
figure:: ./pic/analyze-apk.png
+   :align: center
+   :alt: Select ``Build -> Analyze APK ...``
+   :width: 600
+
+Select ``Build -> Analyze APK ...`` in the above screen shot. In the
+popped-up dialog, select the generated APK ``app-debug.apk``,
+and you will see the following screen shot:
+
+.. figure:: ./pic/analyze-apk-result.jpg
+   :align: center
+   :alt: Result of analyzing apk
+   :width: 700
+
+You can see from the above screen shot that most of the APK
+is occupied by the pre-trained model, while the runtime, including the shared
+libraries, is only ``7.2 MB``.
+
+.. caution::
+
+   You can see that ``libonnxruntime.so`` alone occupies ``5.8 MB`` out of ``7.2 MB``.
+
+   We use a so-called ``Full build`` instead of a ``Mobile build``, so the file
+   size of the library is somewhat larger.
+
+   ``libonnxruntime.so`` is downloaded from
+
+     ``_
+
+   Please refer to ``_ for a
+   custom build to reduce the file size of ``libonnxruntime.so``.
+
+   Note that we are constantly updating the version of ``onnxruntime``. By
+   the time you read this section, we may be using the latest version
+   of ``onnxruntime``.
+
+.. hint::
+
+   We recommend that you use `sherpa-ncnn`_. Please see
+   :ref:`sherpa-ncnn-analyze-apk-result` for `sherpa-ncnn`_. The total runtime of
+   `sherpa-ncnn`_ is only ``1.6 MB``, which is much smaller than that of `sherpa-onnx`_.
diff --git a/docs/source/onnx/android/index.rst b/docs/source/onnx/android/index.rst
new file mode 100644
index 000000000..9cdd9b1ff
--- /dev/null
+++ b/docs/source/onnx/android/index.rst
@@ -0,0 +1,18 @@
+.. _sherpa-onnx-android:
+
+Android
+=======
+
+
+In this section, we describe how to build an Android app with `sherpa-onnx`_.
+
+.. hint::
+
+   For real-time speech recognition, the app does not need to access the Internet.
+   Everything is processed locally on your phone.
+
+..
toctree:: + :maxdepth: 2 + + ./prebuilt-apk.rst + build-sherpa-onnx diff --git a/docs/source/onnx/android/pic/analyze-apk-result.jpg b/docs/source/onnx/android/pic/analyze-apk-result.jpg new file mode 100644 index 000000000..d3be13109 Binary files /dev/null and b/docs/source/onnx/android/pic/analyze-apk-result.jpg differ diff --git a/docs/source/onnx/android/pic/analyze-apk.png b/docs/source/onnx/android/pic/analyze-apk.png new file mode 100644 index 000000000..681ed52fd Binary files /dev/null and b/docs/source/onnx/android/pic/analyze-apk.png differ diff --git a/docs/source/onnx/android/pic/android-studio-version.png b/docs/source/onnx/android/pic/android-studio-version.png new file mode 100644 index 000000000..e682e4754 Binary files /dev/null and b/docs/source/onnx/android/pic/android-studio-version.png differ diff --git a/docs/source/onnx/android/pic/build-make-project.png b/docs/source/onnx/android/pic/build-make-project.png new file mode 100644 index 000000000..4072e0f2b Binary files /dev/null and b/docs/source/onnx/android/pic/build-make-project.png differ diff --git a/docs/source/onnx/android/pic/ndk-tools.png b/docs/source/onnx/android/pic/ndk-tools.png new file mode 120000 index 000000000..d6ac3961f --- /dev/null +++ b/docs/source/onnx/android/pic/ndk-tools.png @@ -0,0 +1 @@ +../../../ncnn/android/pic/ndk-tools.png \ No newline at end of file diff --git a/docs/source/onnx/android/pic/open-sherpa-onnx.png b/docs/source/onnx/android/pic/open-sherpa-onnx.png new file mode 100644 index 000000000..881208e15 Binary files /dev/null and b/docs/source/onnx/android/pic/open-sherpa-onnx.png differ diff --git a/docs/source/onnx/android/pic/pre-trained-model-2023-02-20.png b/docs/source/onnx/android/pic/pre-trained-model-2023-02-20.png new file mode 100644 index 000000000..b17c79497 Binary files /dev/null and b/docs/source/onnx/android/pic/pre-trained-model-2023-02-20.png differ diff --git a/docs/source/onnx/android/pic/select-sdk-manager.png b/docs/source/onnx/android/pic/select-sdk-manager.png new file mode 100644 index 000000000..0707b53f8 Binary files /dev/null and b/docs/source/onnx/android/pic/select-sdk-manager.png differ diff --git a/docs/source/onnx/android/pic/so-libs-for-arm64-v8a.jpg b/docs/source/onnx/android/pic/so-libs-for-arm64-v8a.jpg new file mode 100644 index 000000000..fb34f0c98 Binary files /dev/null and b/docs/source/onnx/android/pic/so-libs-for-arm64-v8a.jpg differ diff --git a/docs/source/onnx/android/pic/so-libs-for-armv7a-eabi.jpg b/docs/source/onnx/android/pic/so-libs-for-armv7a-eabi.jpg new file mode 100644 index 000000000..a82bf1264 Binary files /dev/null and b/docs/source/onnx/android/pic/so-libs-for-armv7a-eabi.jpg differ diff --git a/docs/source/onnx/android/pic/so-libs-for-x86-64.jpg b/docs/source/onnx/android/pic/so-libs-for-x86-64.jpg new file mode 100644 index 000000000..9600e4da4 Binary files /dev/null and b/docs/source/onnx/android/pic/so-libs-for-x86-64.jpg differ diff --git a/docs/source/onnx/android/pic/so-libs-for-x86.jpg b/docs/source/onnx/android/pic/so-libs-for-x86.jpg new file mode 100644 index 000000000..bd34203cc Binary files /dev/null and b/docs/source/onnx/android/pic/so-libs-for-x86.jpg differ diff --git a/docs/source/onnx/android/pic/start-android-studio.png b/docs/source/onnx/android/pic/start-android-studio.png new file mode 100644 index 000000000..1a42e13e8 Binary files /dev/null and b/docs/source/onnx/android/pic/start-android-studio.png differ diff --git a/docs/source/onnx/android/pic/type-for-pre-trained-model-2023-02-20.png 
b/docs/source/onnx/android/pic/type-for-pre-trained-model-2023-02-20.png new file mode 100644 index 000000000..514e86146 Binary files /dev/null and b/docs/source/onnx/android/pic/type-for-pre-trained-model-2023-02-20.png differ diff --git a/docs/source/onnx/android/prebuilt-apk.rst b/docs/source/onnx/android/prebuilt-apk.rst new file mode 100644 index 000000000..df03d3f6e --- /dev/null +++ b/docs/source/onnx/android/prebuilt-apk.rst @@ -0,0 +1,47 @@ +Pre-built APKs +============== + +Links for pre-built APKs can be found in the following table: + +.. hint:: + + It runs locally, without internet connection. + +.. list-table:: + + * - **** + - 中国用户 + - URL + * - Streaming speech recognition + - `点这里 `_ + - ``_ + * - Text-to-speech engine + - `点这里 `_ + - ``_ + * - Text-to-speech + - `点这里 `_ + - ``_ + * - Voice activity detection (VAD) + - `点这里 `_ + - ``_ + * - VAD + non-streaming speech recognition + - `点这里 `_ + - ``_ + * - Two-pass speech recognition + - `点这里 `_ + - ``_ + * - Audio tagging + - `点这里 `_ + - ``_ + * - Audio tagging (WearOS) + - `点这里 `_ + - ``_ + * - Speaker identification + - `点这里 `_ + - ``_ + * - Spoken language identification + - `点这里 `_ + - ``_ + * - Keyword spotting + - `点这里 `_ + - ``_ diff --git a/docs/source/onnx/audio-tagging/android.rst b/docs/source/onnx/audio-tagging/android.rst new file mode 100644 index 000000000..8f72e7346 --- /dev/null +++ b/docs/source/onnx/audio-tagging/android.rst @@ -0,0 +1,13 @@ +.. _audio-tagging-android: + +Android +======= + +You can find Android APKs for each model at the following page + + ``_ + +Please follow :ref:`sherpa-onnx-android` to build Android APKs from source. + +If you want to run audio tagging on your WearOS watches, please see +:ref:`audio-tagging-wearos`. diff --git a/docs/source/onnx/audio-tagging/index.rst b/docs/source/onnx/audio-tagging/index.rst new file mode 100644 index 000000000..f0eadbc34 --- /dev/null +++ b/docs/source/onnx/audio-tagging/index.rst @@ -0,0 +1,13 @@ +Audio tagging +============= + +This section introduces the models that `sherpa-onnx`_ supports for audio +tagging, which aims to recognize sound events within an audio clip without +its temporal localization. + +.. toctree:: + :maxdepth: 5 + + ./pretrained_models.rst + ./android.rst + ./wearos.rst diff --git a/docs/source/onnx/audio-tagging/pretrained_models.rst b/docs/source/onnx/audio-tagging/pretrained_models.rst new file mode 100644 index 000000000..56471f202 --- /dev/null +++ b/docs/source/onnx/audio-tagging/pretrained_models.rst @@ -0,0 +1,571 @@ +Pre-trained models +================== + +This section lists pre-trained models for audio tagging. + +You can find all models at the following URL: + + ``_ + +sherpa-onnx-zipformer-small-audio-tagging-2024-04-15 +---------------------------------------------------- + +This model is trained by ``_ +using the dataset `audioset`_. + +In the following, we describe how to download and use it with `sherpa-onnx`_. 
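+
+.. hint::
+
+   All audio tagging models are attached to the ``audio-tagging-models`` release of
+   `sherpa-onnx`_ on GitHub (the same release that the download command below uses).
+   If you want to list every file attached to that release before downloading, the
+   following optional sketch uses the public GitHub API; it only assumes ``curl``
+   is installed, and the walkthrough below does not depend on it.
+
+   .. code-block:: bash
+
+      # Query the GitHub API for the release tagged "audio-tagging-models"
+      # and print the download URL of every attached file.
+      curl -s https://api.github.com/repos/k2-fsa/sherpa-onnx/releases/tags/audio-tagging-models \
+        | grep browser_download_url
+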
+ +Download the model +^^^^^^^^^^^^^^^^^^ + +Please use the following commands to download it:: + + wget https://github.com/k2-fsa/sherpa-onnx/releases/download/audio-tagging-models/sherpa-onnx-zipformer-small-audio-tagging-2024-04-15.tar.bz2 + + tar xvf sherpa-onnx-zipformer-small-audio-tagging-2024-04-15.tar.bz2 + rm sherpa-onnx-zipformer-small-audio-tagging-2024-04-15.tar.bz2 + +You will find the following files after unzipping:: + + -rw-r--r-- 1 fangjun staff 243B Apr 15 16:14 README.md + -rw-r--r-- 1 fangjun staff 14K Apr 15 16:14 class_labels_indices.csv + -rw-r--r-- 1 fangjun staff 26M Apr 15 16:14 model.int8.onnx + -rw-r--r-- 1 fangjun staff 88M Apr 15 16:14 model.onnx + drwxr-xr-x 15 fangjun staff 480B Apr 15 16:14 test_wavs + +C++ binary examples +^^^^^^^^^^^^^^^^^^^ + +.. hint:: + + You can find the binary executable file ``sherpa-onnx-offline-audio-tagging`` + after installing `sherpa-onnx`_ either from source or using ``pip install sherpa-onnx``_. + +Cat +::: + +For the following test wave, + +.. raw:: html + + + + + + + + + + +
+   (table with an embedded audio player for ``test_wavs/1.wav``)
+ +the command:: + + ./bin/sherpa-onnx-offline-audio-tagging \ + --zipformer-model=./sherpa-onnx-zipformer-small-audio-tagging-2024-04-15/model.int8.onnx \ + --labels=./sherpa-onnx-zipformer-small-audio-tagging-2024-04-15/class_labels_indices.csv \ + ./sherpa-onnx-zipformer-small-audio-tagging-2024-04-15/test_wavs/1.wav + +prints the following:: + + 0: AudioEvent(name="Animal", index=72, prob=0.947886) + 1: AudioEvent(name="Cat", index=81, prob=0.938876) + 2: AudioEvent(name="Domestic animals, pets", index=73, prob=0.931975) + 3: AudioEvent(name="Caterwaul", index=85, prob=0.178876) + 4: AudioEvent(name="Meow", index=83, prob=0.176177) + Num threads: 1 + Wave duration: 10.000 + Elapsed seconds: 0.297 s + Real time factor (RTF): 0.297 / 10.000 = 0.030 + +.. hint:: + + By default, it outputs the top 5 events. The first event has the + largest probability. + +Whistle +::::::: + +For the following test wave, + +.. raw:: html + + + + + + + + + + +
+   (table with an embedded audio player for ``test_wavs/2.wav``)
+ +the command:: + + ./bin/sherpa-onnx-offline-audio-tagging \ + --zipformer-model=./sherpa-onnx-zipformer-small-audio-tagging-2024-04-15/model.int8.onnx \ + --labels=./sherpa-onnx-zipformer-small-audio-tagging-2024-04-15/class_labels_indices.csv \ + ./sherpa-onnx-zipformer-small-audio-tagging-2024-04-15/test_wavs/2.wav + +prints the following:: + + 0: AudioEvent(name="Whistling", index=40, prob=0.804928) + 1: AudioEvent(name="Music", index=137, prob=0.27548) + 2: AudioEvent(name="Piano", index=153, prob=0.135418) + 3: AudioEvent(name="Keyboard (musical)", index=152, prob=0.0580414) + 4: AudioEvent(name="Musical instrument", index=138, prob=0.0400399) + Num threads: 1 + Wave duration: 10.000 + Elapsed seconds: 0.289 s + Real time factor (RTF): 0.289 / 10.000 = 0.029 + +Music +::::: + +For the following test wave, + +.. raw:: html + + + + + + + + + + +
+   (table with an embedded audio player for ``test_wavs/3.wav``)
+ +the command:: + + ./bin/sherpa-onnx-offline-audio-tagging \ + --zipformer-model=./sherpa-onnx-zipformer-small-audio-tagging-2024-04-15/model.int8.onnx \ + --labels=./sherpa-onnx-zipformer-small-audio-tagging-2024-04-15/class_labels_indices.csv \ + ./sherpa-onnx-zipformer-small-audio-tagging-2024-04-15/test_wavs/3.wav + +prints the following:: + + 0: AudioEvent(name="Music", index=137, prob=0.79673) + 1: AudioEvent(name="A capella", index=255, prob=0.765521) + 2: AudioEvent(name="Singing", index=27, prob=0.473899) + 3: AudioEvent(name="Vocal music", index=254, prob=0.459337) + 4: AudioEvent(name="Choir", index=28, prob=0.458174) + Num threads: 1 + Wave duration: 10.000 + Elapsed seconds: 0.279 s + Real time factor (RTF): 0.279 / 10.000 = 0.028 + +Laughter +:::::::: + +For the following test wave, + +.. raw:: html + + + + + + + + + + +
+   (table with an embedded audio player for ``test_wavs/4.wav``)
+ +the command:: + + ./bin/sherpa-onnx-offline-audio-tagging \ + --zipformer-model=./sherpa-onnx-zipformer-small-audio-tagging-2024-04-15/model.int8.onnx \ + --labels=./sherpa-onnx-zipformer-small-audio-tagging-2024-04-15/class_labels_indices.csv \ + ./sherpa-onnx-zipformer-small-audio-tagging-2024-04-15/test_wavs/4.wav + +prints the following:: + + 0: AudioEvent(name="Laughter", index=16, prob=0.929239) + 1: AudioEvent(name="Snicker", index=19, prob=0.321969) + 2: AudioEvent(name="Giggle", index=18, prob=0.149667) + 3: AudioEvent(name="Inside, small room", index=506, prob=0.119332) + 4: AudioEvent(name="Belly laugh", index=20, prob=0.100728) + Num threads: 1 + Wave duration: 10.000 + Elapsed seconds: 0.314 s + Real time factor (RTF): 0.314 / 10.000 = 0.031 + +Finger snapping +::::::::::::::: + +For the following test wave, + +.. raw:: html + + + + + + + + + + +
+   (table with an embedded audio player for ``test_wavs/5.wav``)
+ +the command:: + + ./bin/sherpa-onnx-offline-audio-tagging \ + --zipformer-model=./sherpa-onnx-zipformer-small-audio-tagging-2024-04-15/model.int8.onnx \ + --labels=./sherpa-onnx-zipformer-small-audio-tagging-2024-04-15/class_labels_indices.csv \ + ./sherpa-onnx-zipformer-small-audio-tagging-2024-04-15/test_wavs/5.wav + +prints the following:: + + 0: AudioEvent(name="Finger snapping", index=62, prob=0.690543) + 1: AudioEvent(name="Slap, smack", index=467, prob=0.452133) + 2: AudioEvent(name="Clapping", index=63, prob=0.179213) + 3: AudioEvent(name="Sound effect", index=504, prob=0.101151) + 4: AudioEvent(name="Whack, thwack", index=468, prob=0.0294559) + Num threads: 1 + Wave duration: 8.284 + Elapsed seconds: 0.225 s + Real time factor (RTF): 0.225 / 8.284 = 0.027 + +Baby cry +:::::::: + +For the following test wave, + +.. raw:: html + + + + + + + + + + +
+   (table with an embedded audio player for ``test_wavs/6.wav``)
+ +the command:: + + ./bin/sherpa-onnx-offline-audio-tagging \ + --zipformer-model=./sherpa-onnx-zipformer-small-audio-tagging-2024-04-15/model.int8.onnx \ + --labels=./sherpa-onnx-zipformer-small-audio-tagging-2024-04-15/class_labels_indices.csv \ + ./sherpa-onnx-zipformer-small-audio-tagging-2024-04-15/test_wavs/6.wav + +prints the following:: + + 0: AudioEvent(name="Baby cry, infant cry", index=23, prob=0.912273) + 1: AudioEvent(name="Crying, sobbing", index=22, prob=0.670927) + 2: AudioEvent(name="Whimper", index=24, prob=0.187221) + 3: AudioEvent(name="Inside, small room", index=506, prob=0.0314955) + 4: AudioEvent(name="Sound effect", index=504, prob=0.0118726) + Num threads: 1 + Wave duration: 8.719 + Elapsed seconds: 0.232 s + Real time factor (RTF): 0.232 / 8.719 = 0.027 + +Smoke alarm +::::::::::: + +For the following test wave, + +.. raw:: html + + + + + + + + + + +
+   (table with an embedded audio player for ``test_wavs/7.wav``)
+ +the command:: + + ./bin/sherpa-onnx-offline-audio-tagging \ + --zipformer-model=./sherpa-onnx-zipformer-small-audio-tagging-2024-04-15/model.int8.onnx \ + --labels=./sherpa-onnx-zipformer-small-audio-tagging-2024-04-15/class_labels_indices.csv \ + ./sherpa-onnx-zipformer-small-audio-tagging-2024-04-15/test_wavs/7.wav + +prints the following:: + + 0: AudioEvent(name="Smoke detector, smoke alarm", index=399, prob=0.781478) + 1: AudioEvent(name="Beep, bleep", index=481, prob=0.641056) + 2: AudioEvent(name="Buzzer", index=398, prob=0.218576) + 3: AudioEvent(name="Fire alarm", index=400, prob=0.140145) + 4: AudioEvent(name="Alarm", index=388, prob=0.012525) + Num threads: 1 + Wave duration: 2.819 + Elapsed seconds: 0.080 s + Real time factor (RTF): 0.080 / 2.819 = 0.028 + +Siren +::::: + +For the following test wave, + +.. raw:: html + + + + + + + + + + +
+   (table with an embedded audio player for ``test_wavs/8.wav``)
+ +the command:: + + ./bin/sherpa-onnx-offline-audio-tagging \ + --zipformer-model=./sherpa-onnx-zipformer-small-audio-tagging-2024-04-15/model.int8.onnx \ + --labels=./sherpa-onnx-zipformer-small-audio-tagging-2024-04-15/class_labels_indices.csv \ + ./sherpa-onnx-zipformer-small-audio-tagging-2024-04-15/test_wavs/8.wav + +prints the following:: + + 0: AudioEvent(name="Siren", index=396, prob=0.877108) + 1: AudioEvent(name="Civil defense siren", index=397, prob=0.732789) + 2: AudioEvent(name="Vehicle", index=300, prob=0.0113797) + 3: AudioEvent(name="Inside, small room", index=506, prob=0.00537381) + 4: AudioEvent(name="Outside, urban or manmade", index=509, prob=0.00261939) + Num threads: 1 + Wave duration: 7.721 + Elapsed seconds: 0.220 s + Real time factor (RTF): 0.220 / 7.721 = 0.028 + +Stream water +:::::::::::: + +For the following test wave, + +.. raw:: html + + + + + + + + + + +
+   (table with an embedded audio player for ``test_wavs/10.wav``)
+ +the command:: + + ./bin/sherpa-onnx-offline-audio-tagging \ + --zipformer-model=./sherpa-onnx-zipformer-small-audio-tagging-2024-04-15/model.int8.onnx \ + --labels=./sherpa-onnx-zipformer-small-audio-tagging-2024-04-15/class_labels_indices.csv \ + ./sherpa-onnx-zipformer-small-audio-tagging-2024-04-15/test_wavs/10.wav + +prints the following:: + + 0: AudioEvent(name="Stream", index=292, prob=0.247785) + 1: AudioEvent(name="Water", index=288, prob=0.231587) + 2: AudioEvent(name="Gurgling", index=297, prob=0.170981) + 3: AudioEvent(name="Trickle, dribble", index=450, prob=0.108859) + 4: AudioEvent(name="Liquid", index=444, prob=0.0693812) + Num threads: 1 + Wave duration: 7.837 + Elapsed seconds: 0.212 s + Real time factor (RTF): 0.212 / 7.837 = 0.027 + +Meow +:::: + +For the following test wave, + +.. raw:: html + + + + + + + + + + +
+   (table with an embedded audio player for ``test_wavs/11.wav``)
+ +the command:: + + ./bin/sherpa-onnx-offline-audio-tagging \ + --zipformer-model=./sherpa-onnx-zipformer-small-audio-tagging-2024-04-15/model.int8.onnx \ + --labels=./sherpa-onnx-zipformer-small-audio-tagging-2024-04-15/class_labels_indices.csv \ + ./sherpa-onnx-zipformer-small-audio-tagging-2024-04-15/test_wavs/11.wav + +prints the following:: + + 0: AudioEvent(name="Meow", index=83, prob=0.814944) + 1: AudioEvent(name="Cat", index=81, prob=0.698858) + 2: AudioEvent(name="Domestic animals, pets", index=73, prob=0.564516) + 3: AudioEvent(name="Animal", index=72, prob=0.535303) + 4: AudioEvent(name="Music", index=137, prob=0.105332) + Num threads: 1 + Wave duration: 11.483 + Elapsed seconds: 0.361 s + Real time factor (RTF): 0.361 / 11.483 = 0.031 + +Dog bark +:::::::: + +For the following test wave, + +.. raw:: html + + + + + + + + + + +
+   (table with an embedded audio player for ``test_wavs/12.wav``)
+ +the command:: + + ./bin/sherpa-onnx-offline-audio-tagging \ + --zipformer-model=./sherpa-onnx-zipformer-small-audio-tagging-2024-04-15/model.int8.onnx \ + --labels=./sherpa-onnx-zipformer-small-audio-tagging-2024-04-15/class_labels_indices.csv \ + ./sherpa-onnx-zipformer-small-audio-tagging-2024-04-15/test_wavs/12.wav + +prints the following:: + + 0: AudioEvent(name="Animal", index=72, prob=0.688237) + 1: AudioEvent(name="Dog", index=74, prob=0.637803) + 2: AudioEvent(name="Bark", index=75, prob=0.608597) + 3: AudioEvent(name="Bow-wow", index=78, prob=0.515501) + 4: AudioEvent(name="Domestic animals, pets", index=73, prob=0.495074) + Num threads: 1 + Wave duration: 8.974 + Elapsed seconds: 0.261 s + Real time factor (RTF): 0.261 / 8.974 = 0.029 + +Oink (pig) +:::::::::: + +For the following test wave, + +.. raw:: html + + + + + + + + + + +
+   (table with an embedded audio player for ``test_wavs/13.wav``)
+ +the command:: + + ./bin/sherpa-onnx-offline-audio-tagging \ + --zipformer-model=./sherpa-onnx-zipformer-small-audio-tagging-2024-04-15/model.int8.onnx \ + --labels=./sherpa-onnx-zipformer-small-audio-tagging-2024-04-15/class_labels_indices.csv \ + ./sherpa-onnx-zipformer-small-audio-tagging-2024-04-15/test_wavs/13.wav + +prints the following:: + + 0: AudioEvent(name="Oink", index=94, prob=0.888416) + 1: AudioEvent(name="Pig", index=93, prob=0.164295) + 2: AudioEvent(name="Animal", index=72, prob=0.160802) + 3: AudioEvent(name="Speech", index=0, prob=0.0276513) + 4: AudioEvent(name="Snort", index=46, prob=0.0201952) + Num threads: 1 + Wave duration: 9.067 + Elapsed seconds: 0.261 s + Real time factor (RTF): 0.261 / 9.067 = 0.029 + +Python API examples +^^^^^^^^^^^^^^^^^^^ + +Please see + + ``_ + +Huggingface space +^^^^^^^^^^^^^^^^^ + +You can try audio tagging with `sherpa-onnx`_ from within you browser by visiting the following URL: + + ``_ + +.. note:: + + For Chinese users, please use + + ``_ diff --git a/docs/source/onnx/audio-tagging/wearos.rst b/docs/source/onnx/audio-tagging/wearos.rst new file mode 100644 index 000000000..690adeb7f --- /dev/null +++ b/docs/source/onnx/audio-tagging/wearos.rst @@ -0,0 +1,13 @@ +.. _audio-tagging-wearos: + +WearOS +====== + +You can find APKs for WearOS of each model at the following page + + ``_ + +Please follow :ref:`sherpa-onnx-android` to build APKs for WearOS from source. + +If you want to run audio tagging on your Android phones, please see +:ref:`audio-tagging-android`. diff --git a/docs/source/onnx/c-api/index.rst b/docs/source/onnx/c-api/index.rst new file mode 100644 index 000000000..553681120 --- /dev/null +++ b/docs/source/onnx/c-api/index.rst @@ -0,0 +1,315 @@ +.. _sherpa-onnx-c-api: + +C API +===== + +In this section, we describe how to use the C API of `sherpa-onnx`_. + + +Specifically, we will describe: + + - How to generate required files + - How to use ``pkg-config`` with `sherpa-onnx`_ + +You can find the implementation at + + - ``_ + - ``_ + +Generate required files +----------------------- + +Before using the C API of `sherpa-onnx`_, we need to first build required +libraries. You can choose either to build static libraries or shared libraries. + +Build shared libraries +^^^^^^^^^^^^^^^^^^^^^^ + +Assume that we want to put library files and header files in the directory +``/tmp/sherpa-onnx/shared``: + +.. code-block:: bash + + git clone https://github.com/k2-fsa/sherpa-onnx + cd sherpa-onnx + mkdir build-shared + cd build-shared + + cmake \ + -DSHERPA_ONNX_ENABLE_C_API=ON \ + -DCMAKE_BUILD_TYPE=Release \ + -DBUILD_SHARED_LIBS=ON \ + -DCMAKE_INSTALL_PREFIX=/tmp/sherpa-onnx/shared \ + .. + + make -j6 + make install + +You should find the following files inside ``/tmp/sherpa-onnx/shared``: + +.. tabs:: + + .. tab:: macOS + + .. 
code-block:: bash + + $ tree /tmp/sherpa-onnx/shared/ + + /tmp/sherpa-onnx/shared + ├── bin + │   ├── sherpa-onnx + │   ├── sherpa-onnx-keyword-spotter + │   ├── sherpa-onnx-keyword-spotter-microphone + │   ├── sherpa-onnx-microphone + │   ├── sherpa-onnx-microphone-offline + │   ├── sherpa-onnx-microphone-offline-audio-tagging + │   ├── sherpa-onnx-microphone-offline-speaker-identification + │   ├── sherpa-onnx-offline + │   ├── sherpa-onnx-offline-audio-tagging + │   ├── sherpa-onnx-offline-language-identification + │   ├── sherpa-onnx-offline-parallel + │   ├── sherpa-onnx-offline-punctuation + │   ├── sherpa-onnx-offline-tts + │   ├── sherpa-onnx-offline-tts-play + │   ├── sherpa-onnx-offline-websocket-server + │   ├── sherpa-onnx-online-punctuation + │   ├── sherpa-onnx-online-websocket-client + │   ├── sherpa-onnx-online-websocket-server + │   ├── sherpa-onnx-vad-microphone + │   └── sherpa-onnx-vad-microphone-offline-asr + ├── include + │   └── sherpa-onnx + │   └── c-api + │   └── c-api.h + ├── lib + │   ├── libonnxruntime.1.17.1.dylib + │   ├── libonnxruntime.dylib -> libonnxruntime.1.17.1.dylib + │   └── libsherpa-onnx-c-api.dylib + └── sherpa-onnx.pc + + 5 directories, 25 files + + .. tab:: Linux + + .. code-block:: bash + + $ tree /tmp/sherpa-onnx/shared/ + + /tmp/sherpa-onnx/shared + ├── bin + │   ├── sherpa-onnx + │   ├── sherpa-onnx-alsa + │   ├── sherpa-onnx-alsa-offline + │   ├── sherpa-onnx-alsa-offline-audio-tagging + │   ├── sherpa-onnx-alsa-offline-speaker-identification + │   ├── sherpa-onnx-keyword-spotter + │   ├── sherpa-onnx-keyword-spotter-alsa + │   ├── sherpa-onnx-offline + │   ├── sherpa-onnx-offline-audio-tagging + │   ├── sherpa-onnx-offline-language-identification + │   ├── sherpa-onnx-offline-parallel + │   ├── sherpa-onnx-offline-punctuation + │   ├── sherpa-onnx-offline-tts + │   ├── sherpa-onnx-offline-tts-play-alsa + │   ├── sherpa-onnx-offline-websocket-server + │   ├── sherpa-onnx-online-punctuation + │   ├── sherpa-onnx-online-websocket-client + │   ├── sherpa-onnx-online-websocket-server + │   └── sherpa-onnx-vad-alsa + ├── include + │   └── sherpa-onnx + │   └── c-api + │   └── c-api.h + ├── lib + │   ├── libonnxruntime.so + │   └── libsherpa-onnx-c-api.so + └── sherpa-onnx.pc + + 6 directories, 23 files + + +Build static libraries +^^^^^^^^^^^^^^^^^^^^^^ + +Assume that we want to put library files and header files in the directory +``/tmp/sherpa-onnx/static``: + +.. code-block:: bash + + git clone https://github.com/k2-fsa/sherpa-onnx + cd sherpa-onnx + mkdir build-static + cd build-static + + cmake \ + -DSHERPA_ONNX_ENABLE_C_API=ON \ + -DCMAKE_BUILD_TYPE=Release \ + -DBUILD_SHARED_LIBS=OFF \ + -DCMAKE_INSTALL_PREFIX=/tmp/sherpa-onnx/static \ + .. + + make -j6 + make install + +You should find the following files in ``/tmp/sherpa-onnx/static``: + +.. tabs:: + + .. tab:: macOS + + .. 
code-block:: bash + + $ tree /tmp/sherpa-onnx/static/ + + /tmp/sherpa-onnx/static + ├── bin + │   ├── sherpa-onnx + │   ├── sherpa-onnx-keyword-spotter + │   ├── sherpa-onnx-keyword-spotter-microphone + │   ├── sherpa-onnx-microphone + │   ├── sherpa-onnx-microphone-offline + │   ├── sherpa-onnx-microphone-offline-audio-tagging + │   ├── sherpa-onnx-microphone-offline-speaker-identification + │   ├── sherpa-onnx-offline + │   ├── sherpa-onnx-offline-audio-tagging + │   ├── sherpa-onnx-offline-language-identification + │   ├── sherpa-onnx-offline-parallel + │   ├── sherpa-onnx-offline-punctuation + │   ├── sherpa-onnx-offline-tts + │   ├── sherpa-onnx-offline-tts-play + │   ├── sherpa-onnx-offline-websocket-server + │   ├── sherpa-onnx-online-punctuation + │   ├── sherpa-onnx-online-websocket-client + │   ├── sherpa-onnx-online-websocket-server + │   ├── sherpa-onnx-vad-microphone + │   └── sherpa-onnx-vad-microphone-offline-asr + ├── include + │   └── sherpa-onnx + │   └── c-api + │   └── c-api.h + ├── lib + │   ├── libespeak-ng.a + │   ├── libkaldi-decoder-core.a + │   ├── libkaldi-native-fbank-core.a + │   ├── libonnxruntime.a + │   ├── libpiper_phonemize.a + │   ├── libsherpa-onnx-c-api.a + │   ├── libsherpa-onnx-core.a + │   ├── libsherpa-onnx-fst.a + │   ├── libsherpa-onnx-fstfar.a + │   ├── libsherpa-onnx-kaldifst-core.a + │   ├── libsherpa-onnx-portaudio_static.a + │   ├── libssentencepiece_core.a + │   └── libucd.a + └── sherpa-onnx.pc + + 5 directories, 35 files + + .. tab:: Linux + + .. code-block:: bash + + $ tree /tmp/sherpa-onnx/static/ + + /tmp/sherpa-onnx/static + ├── bin + │   ├── sherpa-onnx + │   ├── sherpa-onnx-alsa + │   ├── sherpa-onnx-alsa-offline + │   ├── sherpa-onnx-alsa-offline-audio-tagging + │   ├── sherpa-onnx-alsa-offline-speaker-identification + │   ├── sherpa-onnx-keyword-spotter + │   ├── sherpa-onnx-keyword-spotter-alsa + │   ├── sherpa-onnx-keyword-spotter-microphone + │   ├── sherpa-onnx-microphone + │   ├── sherpa-onnx-microphone-offline + │   ├── sherpa-onnx-microphone-offline-audio-tagging + │   ├── sherpa-onnx-microphone-offline-speaker-identification + │   ├── sherpa-onnx-offline + │   ├── sherpa-onnx-offline-audio-tagging + │   ├── sherpa-onnx-offline-language-identification + │   ├── sherpa-onnx-offline-parallel + │   ├── sherpa-onnx-offline-punctuation + │   ├── sherpa-onnx-offline-tts + │   ├── sherpa-onnx-offline-tts-play + │   ├── sherpa-onnx-offline-tts-play-alsa + │   ├── sherpa-onnx-offline-websocket-server + │   ├── sherpa-onnx-online-punctuation + │   ├── sherpa-onnx-online-websocket-client + │   ├── sherpa-onnx-online-websocket-server + │   ├── sherpa-onnx-vad-alsa + │   ├── sherpa-onnx-vad-microphone + │   └── sherpa-onnx-vad-microphone-offline-asr + ├── include + │   └── sherpa-onnx + │   └── c-api + │   └── c-api.h + ├── lib + │   ├── libespeak-ng.a + │   ├── libkaldi-decoder-core.a + │   ├── libkaldi-native-fbank-core.a + │   ├── libonnxruntime.a + │   ├── libpiper_phonemize.a + │   ├── libsherpa-onnx-c-api.a + │   ├── libsherpa-onnx-core.a + │   ├── libsherpa-onnx-fst.a + │   ├── libsherpa-onnx-fstfar.a + │   ├── libsherpa-onnx-kaldifst-core.a + │   ├── libsherpa-onnx-portaudio_static.a + │   ├── libssentencepiece_core.a + │   └── libucd.a + └── sherpa-onnx.pc + + 6 directories, 42 files + + +Build decode-file-c-api.c with generated files +---------------------------------------------- + +To build the following file: + + ``_ + +We can use: + +.. tabs:: + + .. tab:: static link + + .. 
code-block:: bash + + export PKG_CONFIG_PATH=/tmp/sherpa-onnx/static:$PKG_CONFIG_PATH + + cd ./c-api-examples + gcc -o decode-file-c-api $(pkg-config --cflags sherpa-onnx) ./decode-file-c-api.c $(pkg-config --libs sherpa-onnx) + + ./decode-file-c-api --help + + .. tab:: dynamic link + + .. code-block:: bash + + export PKG_CONFIG_PATH=/tmp/sherpa-onnx/shared:$PKG_CONFIG_PATH + + cd ./c-api-examples + gcc -o decode-file-c-api $(pkg-config --cflags sherpa-onnx) ./decode-file-c-api.c $(pkg-config --libs sherpa-onnx) + + ./decode-file-c-api --help + +.. warning:: + + The order of linking the libraries matters. Please see + + - Static link without TTS: ``_ + - Static link with TTS: ``_ + - Dynamic link: ``_ + +colab +----- + +We provide a colab notebook +|Sherpa-onnx c api example colab notebook| +for you to try the C API of `sherpa-onnx`_. + +.. |Sherpa-onnx c api example colab notebook| image:: https://colab.research.google.com/assets/colab-badge.svg + :target: https://github.com/k2-fsa/colab/blob/master/sherpa-onnx/sherpa_onnx_c_api_example.ipynb diff --git a/docs/source/onnx/csharp-api/index.rst b/docs/source/onnx/csharp-api/index.rst new file mode 100644 index 000000000..5af764a30 --- /dev/null +++ b/docs/source/onnx/csharp-api/index.rst @@ -0,0 +1,535 @@ +.. _sherpa-onnx-csharp-api: + +C# API +====== + +In this section, we describe how to use the ``C#`` +API examples of `sherpa-onnx`_. + +The ``C#`` API of `sherpa-onnx`_ supports both streaming and non-streaming speech recognition. + +The following table lists some ``C#`` API examples: + +.. list-table:: + + * - Description + - URL + * - Decode a file with **non-streaming** models + - ``_ + * - Decode a file with **streaming** models + - ``_ + * - **Real-time** speech recognition from a ``microphone`` + - ``_ + +You can find the implementation in the following files: + + - API for **streaming** speech recognition + + ``_ + + - API for **non-streaming** speech recognition + + ``_ + +We also provide a nuget package for `sherpa-onnx`_: + + ``_ + +You can use the following statement in your ``csproj`` file to introduce +the dependency on `sherpa-onnx`_: + +.. code-block:: bash + + + +One thing to note is that we have provided pre-built libraries for ``C#`` so that you don't need +to build `sherpa-onnx`_ by yourself when using the ``C#`` API. + +In the following, we describe how to run our provided ``C#`` API examples. + +.. note:: + + Before you continue, please make sure you have installed `.Net `_. + If not, please follow ``_ to install ``.Net``. + +.. hint:: + + ``.Net`` supports Windows, macOS, and Linux. + +Decode files with non-streaming models +-------------------------------------- + +First, let us build the example: + +.. code-block:: bash + + git clone https://github.com/k2-fsa/sherpa-onnx + cd sherpa-onnx/dotnet-examples/offline-decode-files/ + dotnet build -c Release + ./bin/Release/net6.0/offline-decode-files --help + +You will find the following output: + +.. 
code-block:: bash + + # Zipformer + + dotnet run \ + --tokens=./sherpa-onnx-zipformer-en-2023-04-01/tokens.txt \ + --encoder=./sherpa-onnx-zipformer-en-2023-04-01/encoder-epoch-99-avg-1.onnx \ + --decoder=./sherpa-onnx-zipformer-en-2023-04-01/decoder-epoch-99-avg-1.onnx \ + --joiner=./sherpa-onnx-zipformer-en-2023-04-01/joiner-epoch-99-avg-1.onnx \ + --files ./sherpa-onnx-zipformer-en-2023-04-01/test_wavs/0.wav \ + ./sherpa-onnx-zipformer-en-2023-04-01/test_wavs/1.wav \ + ./sherpa-onnx-zipformer-en-2023-04-01/test_wavs/8k.wav + + Please refer to + https://k2-fsa.github.io/sherpa/onnx/pretrained_models/offline-transducer/index.html + to download pre-trained non-streaming zipformer models. + + # Paraformer + + dotnet run \ + --tokens=./sherpa-onnx-paraformer-zh-2023-03-28/tokens.txt \ + --paraformer=./sherpa-onnx-paraformer-zh-2023-03-28/model.onnx \ + --files ./sherpa-onnx-zipformer-en-2023-04-01/test_wavs/0.wav \ + ./sherpa-onnx-paraformer-zh-2023-03-28/test_wavs/0.wav \ + ./sherpa-onnx-paraformer-zh-2023-03-28/test_wavs/1.wav \ + ./sherpa-onnx-paraformer-zh-2023-03-28/test_wavs/2.wav \ + ./sherpa-onnx-paraformer-zh-2023-03-28/test_wavs/8k.wav + Please refer to + https://k2-fsa.github.io/sherpa/onnx/pretrained_models/offline-paraformer/index.html + to download pre-trained paraformer models + + # NeMo CTC + + dotnet run \ + --tokens=./sherpa-onnx-nemo-ctc-en-conformer-medium/tokens.txt \ + --nemo-ctc=./sherpa-onnx-nemo-ctc-en-conformer-medium/model.onnx \ + --num-threads=1 \ + --files ./sherpa-onnx-nemo-ctc-en-conformer-medium/test_wavs/0.wav \ + ./sherpa-onnx-nemo-ctc-en-conformer-medium/test_wavs/1.wav \ + ./sherpa-onnx-nemo-ctc-en-conformer-medium/test_wavs/8k.wav + + Please refer to + https://k2-fsa.github.io/sherpa/onnx/pretrained_models/offline-ctc/index.html + to download pre-trained paraformer models + + Copyright (c) 2023 Xiaomi Corporation + + --tokens Path to tokens.txt + --encoder Path to encoder.onnx. Used only for transducer models + --decoder Path to decoder.onnx. Used only for transducer models + --joiner Path to joiner.onnx. Used only for transducer models + --paraformer Path to model.onnx. Used only for paraformer models + --nemo-ctc Path to model.onnx. Used only for NeMo CTC models + --num-threads (Default: 1) Number of threads for computation + --decoding-method (Default: greedy_search) Valid decoding methods are: + greedy_search, modified_beam_search + --max-active-paths (Default: 4) Used only when --decoding--method is + modified_beam_search. + It specifies number of active paths to keep during the + search + --files Required. Audio files for decoding + --help Display this help screen. + --version Display version information. + +Now let us refer to :ref:`sherpa-onnx-pre-trained-models` to download a non-streaming model. + +We give several examples below for demonstration. + +Non-streaming transducer +^^^^^^^^^^^^^^^^^^^^^^^^ + +We will use :ref:`sherpa-onnx-zipformer-en-2023-06-26-english` as an example. + +First, let us download it: + +.. code-block:: bash + + cd sherpa-onnx/dotnet-examples/offline-decode-files + wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-zipformer-en-2023-06-26.tar.bz2 + tar xvf sherpa-onnx-zipformer-en-2023-06-26.tar.bz2 + rm sherpa-onnx-zipformer-en-2023-06-26.tar.bz2 + +Now we can use: + +.. 
code-block:: bash + + dotnet run -c Release \ + --encoder ./sherpa-onnx-zipformer-en-2023-06-26/encoder-epoch-99-avg-1.onnx \ + --decoder ./sherpa-onnx-zipformer-en-2023-06-26/decoder-epoch-99-avg-1.onnx \ + --joiner ./sherpa-onnx-zipformer-en-2023-06-26/joiner-epoch-99-avg-1.onnx \ + --tokens ./sherpa-onnx-zipformer-en-2023-06-26/tokens.txt \ + --files ./sherpa-onnx-zipformer-en-2023-06-26/test_wavs/0.wav \ + ./sherpa-onnx-zipformer-en-2023-06-26/test_wavs/1.wav \ + ./sherpa-onnx-zipformer-en-2023-06-26/test_wavs/8k.wav + +It should give you the following output: + +.. code-block:: bash + + /Users/runner/work/sherpa-onnx/sherpa-onnx/sherpa-onnx/csrc/offline-stream.cc:AcceptWaveformImpl:117 Creating a resampler: + in_sample_rate: 8000 + output_sample_rate: 16000 + + -------------------- + ./sherpa-onnx-zipformer-en-2023-06-26/test_wavs/0.wav + AFTER EARLY NIGHTFALL THE YELLOW LAMPS WOULD LIGHT UP HERE AND THERE THE SQUALID QUARTER OF THE BROTHELS + -------------------- + ./sherpa-onnx-zipformer-en-2023-06-26/test_wavs/1.wav + GOD AS A DIRECT CONSEQUENCE OF THE SIN WHICH MAN THUS PUNISHED HAD GIVEN HER A LOVELY CHILD WHOSE PLACE WAS ON THAT SAME DISHONORED BOSOM TO CONNECT HER PARENT FOREVER WITH THE RACE AND DESCENT OF MORTALS AND TO BE FINALLY A BLESSED SOUL IN HEAVEN + -------------------- + ./sherpa-onnx-zipformer-en-2023-06-26/test_wavs/8k.wav + YET THESE THOUGHTS AFFECTED HESTER PRYNNE LESS WITH HOPE THAN APPREHENSION + -------------------- + +Non-streaming paraformer +^^^^^^^^^^^^^^^^^^^^^^^^ + +We will use :ref:`sherpa_onnx_offline_paraformer_zh_2023_03_28_chinese` as an example. + +First, let us download it: + +.. code-block:: bash + + cd sherpa-onnx/dotnet-examples/offline-decode-files + wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-paraformer-zh-2023-03-28.tar.bz2 + tar xvf sherpa-onnx-paraformer-zh-2023-03-28.tar.bz2 + rm sherpa-onnx-paraformer-zh-2023-03-28.tar.bz2 + +Now we can use: + +.. code-block:: bash + + dotnet run -c Release \ + --paraformer ./sherpa-onnx-paraformer-zh-2023-03-28/model.int8.onnx \ + --tokens ./sherpa-onnx-paraformer-zh-2023-03-28/tokens.txt \ + --files ./sherpa-onnx-paraformer-zh-2023-03-28/test_wavs/0.wav \ + ./sherpa-onnx-paraformer-zh-2023-03-28/test_wavs/1.wav \ + ./sherpa-onnx-paraformer-zh-2023-03-28/test_wavs/8k.wav + +It should give you the following output: + +.. code-block:: bash + + /Users/runner/work/sherpa-onnx/sherpa-onnx/sherpa-onnx/csrc/offline-stream.cc:AcceptWaveformImpl:117 Creating a resampler: + in_sample_rate: 8000 + output_sample_rate: 16000 + + -------------------- + ./sherpa-onnx-paraformer-zh-2023-03-28/test_wavs/0.wav + 对我做了介绍啊那么我想说的是呢大家如果对我的研究感兴趣呢你 + -------------------- + ./sherpa-onnx-paraformer-zh-2023-03-28/test_wavs/1.wav + 重点呢想谈三个问题首先呢就是这一轮全球金融动荡的表现 + -------------------- + ./sherpa-onnx-paraformer-zh-2023-03-28/test_wavs/8k.wav + 甚至出现交易几乎停滞的情况 + -------------------- + +Non-streaming CTC model from NeMo +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +We will use :ref:`stt-en-conformer-ctc-medium-nemo-sherpa-onnx` as an example. + +First, let us download it: + +.. code-block:: bash + + cd sherpa-onnx/dotnet-examples/offline-decode-files + wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-nemo-ctc-en-conformer-medium.tar.bz2 + tar xvf sherpa-onnx-nemo-ctc-en-conformer-medium.tar.bz2 + rm sherpa-onnx-nemo-ctc-en-conformer-medium.tar.bz2 + +Now we can use: + +.. 
code-block:: bash + + dotnet run -c Release \ + --nemo-ctc ./sherpa-onnx-nemo-ctc-en-conformer-medium/model.onnx \ + --tokens ./sherpa-onnx-nemo-ctc-en-conformer-medium/tokens.txt \ + --files ./sherpa-onnx-nemo-ctc-en-conformer-medium/test_wavs/0.wav \ + ./sherpa-onnx-nemo-ctc-en-conformer-medium/test_wavs/1.wav \ + ./sherpa-onnx-nemo-ctc-en-conformer-medium/test_wavs/8k.wav + +It should give you the following output: + +.. code-block:: bash + + /Users/runner/work/sherpa-onnx/sherpa-onnx/sherpa-onnx/csrc/offline-stream.cc:AcceptWaveformImpl:117 Creating a resampler: + in_sample_rate: 8000 + output_sample_rate: 16000 + + -------------------- + ./sherpa-onnx-nemo-ctc-en-conformer-medium/test_wavs/0.wav + after early nightfall the yellow lamps would light up here and there the squalid quarter of the brothels + -------------------- + ./sherpa-onnx-nemo-ctc-en-conformer-medium/test_wavs/1.wav + god as a direct consequence of the sin which man thus punished had given her a lovely child whose place was on that same dishonored bosom to connect her parent for ever with the race and descent of mortals and to be finally a blessed soul in heaven + -------------------- + ./sherpa-onnx-nemo-ctc-en-conformer-medium/test_wavs/8k.wav + yet these thoughts affected hester pryne less with hope than apprehension + -------------------- + +Decode files with streaming models +---------------------------------- + +First, let us build the example: + +.. code-block:: bash + + git clone https://github.com/k2-fsa/sherpa-onnx + cd sherpa-onnx/dotnet-examples/online-decode-files + dotnet build -c Release + ./bin/Release/net6.0/online-decode-files --help + +You will find the following output: + +.. code-block:: bash + + dotnet run \ + --tokens=./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/tokens.txt \ + --encoder=./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/encoder-epoch-99-avg-1.onnx \ + --decoder=./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/decoder-epoch-99-avg-1.onnx \ + --joiner=./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/joiner-epoch-99-avg-1.onnx \ + --num-threads=2 \ + --decoding-method=modified_beam_search \ + --debug=false \ + --files ./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/test_wavs/0.wav \ + ./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/test_wavs/1.wav + + Please refer to + https://k2-fsa.github.io/sherpa/onnx/pretrained_models/online-transducer/index.html + to download pre-trained streaming models. + + Copyright (c) 2023 Xiaomi Corporation + + --tokens Required. Path to tokens.txt + --provider (Default: cpu) Provider, e.g., cpu, coreml + --encoder Required. Path to encoder.onnx + --decoder Required. Path to decoder.onnx + --joiner Required. Path to joiner.onnx + --num-threads (Default: 1) Number of threads for computation + --decoding-method (Default: greedy_search) Valid decoding + methods are: greedy_search, + modified_beam_search + --debug (Default: false) True to show model info + during loading + --sample-rate (Default: 16000) Sample rate of the data used + to train the model + --max-active-paths (Default: 4) Used only when --decoding--method + is modified_beam_search. + It specifies number of active paths to keep + during the search + --enable-endpoint (Default: false) True to enable endpoint + detection. + --rule1-min-trailing-silence (Default: 2.4) An endpoint is detected if + trailing silence in seconds is + larger than this value even if nothing has + been decoded. 
Used only when --enable-endpoint + is true. + --rule2-min-trailing-silence (Default: 1.2) An endpoint is detected if + trailing silence in seconds is + larger than this value after something that is + not blank has been decoded. Used + only when --enable-endpoint is true. + --rule3-min-utterance-length (Default: 20) An endpoint is detected if the + utterance in seconds is + larger than this value. Used only when + --enable-endpoint is true. + --files Required. Audio files for decoding + --help Display this help screen. + --version Display version information. + +Now let us refer to :ref:`sherpa-onnx-pre-trained-models` to download a streaming model. + +We give one example below for demonstration. + +Streaming transducer +^^^^^^^^^^^^^^^^^^^^ + +We will use :ref:`sherpa-onnx-streaming-zipformer-en-2023-06-26-english` as an example. + +First, let us download it: + +.. code-block:: bash + + cd sherpa-onnx/dotnet-examples/online-decode-files/ + wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-streaming-zipformer-en-2023-06-26.tar.bz2 + tar xvf sherpa-onnx-streaming-zipformer-en-2023-06-26.tar.bz2 + rm sherpa-onnx-streaming-zipformer-en-2023-06-26.tar.bz2 + +Now we can use: + +.. code-block:: bash + + dotnet run -c Release \ + --encoder ./sherpa-onnx-streaming-zipformer-en-2023-06-26/encoder-epoch-99-avg-1-chunk-16-left-128.onnx \ + --decoder ./sherpa-onnx-streaming-zipformer-en-2023-06-26/decoder-epoch-99-avg-1-chunk-16-left-128.onnx \ + --joiner ./sherpa-onnx-streaming-zipformer-en-2023-06-26/joiner-epoch-99-avg-1-chunk-16-left-128.onnx \ + --tokens ./sherpa-onnx-streaming-zipformer-en-2023-06-26/tokens.txt \ + --files ./sherpa-onnx-streaming-zipformer-en-2023-06-26/test_wavs/0.wav \ + ./sherpa-onnx-streaming-zipformer-en-2023-06-26/test_wavs/1.wav \ + ./sherpa-onnx-streaming-zipformer-en-2023-06-26/test_wavs/8k.wav + +You will find the following output: + +.. code-block:: bash + + /Users/runner/work/sherpa-onnx/sherpa-onnx/sherpa-onnx/csrc/features.cc:AcceptWaveform:76 Creating a resampler: + in_sample_rate: 8000 + output_sample_rate: 16000 + + -------------------- + ./sherpa-onnx-streaming-zipformer-en-2023-06-26/test_wavs/0.wav + AFTER EARLY NIGHTFALL THE YELLOW LAMPS WOULD LIGHT UP HERE AND THERE THE SQUALID QUARTER OF THE BROTHELS + -------------------- + ./sherpa-onnx-streaming-zipformer-en-2023-06-26/test_wavs/1.wav + GOD AS A DIRECT CONSEQUENCE OF THE SIN WHICH MAN THUS PUNISHED HAD GIVEN HER A LOVELY CHILD WHOSE PLACE WAS ON THAT SAME DISHONOURED BOSOM TO CONNECT HER PARENT FOR EVER WITH THE RACE AND DESCENT OF MORTALS AND TO BE FINALLY A BLESSED SOUL IN HEAVEN + -------------------- + ./sherpa-onnx-streaming-zipformer-en-2023-06-26/test_wavs/8k.wav + YET THESE THOUGHTS AFFECTED HESTER PRYNNE LESS WITH HOPE THAN APPREHENSION + -------------------- + +Real-time speech recognition from microphone +-------------------------------------------- + +First, let us build the example: + +.. code-block:: bash + + git clone https://github.com/k2-fsa/sherpa-onnx + cd sherpa-onnx/dotnet-examples/speech-recognition-from-microphone + dotnet build -c Release + ./bin/Release/net6.0/speech-recognition-from-microphone --help + +You will find the following output: + +.. 
code-block:: bash + + dotnet run -c Release \ + --tokens ./icefall-asr-zipformer-streaming-wenetspeech-20230615/data/lang_char/tokens.txt \ + --encoder ./icefall-asr-zipformer-streaming-wenetspeech-20230615/exp/encoder-epoch-12-avg-4-chunk-16-left-128.onnx \ + --decoder ./icefall-asr-zipformer-streaming-wenetspeech-20230615/exp/decoder-epoch-12-avg-4-chunk-16-left-128.onnx \ + --joiner ./icefall-asr-zipformer-streaming-wenetspeech-20230615/exp/joiner-epoch-12-avg-4-chunk-16-left-128.onnx \ + + Please refer to + https://k2-fsa.github.io/sherpa/onnx/pretrained_models/online-transducer/index.html + to download pre-trained streaming models. + + Copyright (c) 2023 Xiaomi Corporation + + --tokens Required. Path to tokens.txt + --provider (Default: cpu) Provider, e.g., cpu, coreml + --encoder Required. Path to encoder.onnx + --decoder Required. Path to decoder.onnx + --joiner Required. Path to joiner.onnx + --num-threads (Default: 1) Number of threads for computation + --decoding-method (Default: greedy_search) Valid decoding + methods are: greedy_search, + modified_beam_search + --debug (Default: false) True to show model info + during loading + --sample-rate (Default: 16000) Sample rate of the data used + to train the model + --max-active-paths (Default: 4) Used only when --decoding--method + is modified_beam_search. + It specifies number of active paths to keep + during the search + --enable-endpoint (Default: true) True to enable endpoint + detection. + --rule1-min-trailing-silence (Default: 2.4) An endpoint is detected if + trailing silence in seconds is + larger than this value even if nothing has + been decoded. Used only when --enable-endpoint + is true. + --rule2-min-trailing-silence (Default: 0.8) An endpoint is detected if + trailing silence in seconds is + larger than this value after something that is + not blank has been decoded. Used + only when --enable-endpoint is true. + --rule3-min-utterance-length (Default: 20) An endpoint is detected if the + utterance in seconds is + larger than this value. Used only when + --enable-endpoint is true. + --help Display this help screen. + --version Display version information. + +Now let us refer to :ref:`sherpa-onnx-pre-trained-models` to download a streaming model. + +We give one example below for demonstration. + +Streaming transducer +^^^^^^^^^^^^^^^^^^^^ + +We will use :ref:`sherpa-onnx-streaming-zipformer-en-2023-06-26-english` as an example. + +First, let us download it: + +.. code-block:: bash + + cd sherpa-onnx/dotnet-examples/speech-recognition-from-microphone + wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-streaming-zipformer-en-2023-06-26.tar.bz2 + tar xvf sherpa-onnx-streaming-zipformer-en-2023-06-26.tar.bz2 + rm sherpa-onnx-streaming-zipformer-en-2023-06-26.tar.bz2 + +Now we can use: + +.. code-block:: bash + + dotnet run -c Release \ + --encoder ./sherpa-onnx-streaming-zipformer-en-2023-06-26/encoder-epoch-99-avg-1-chunk-16-left-128.onnx \ + --decoder ./sherpa-onnx-streaming-zipformer-en-2023-06-26/decoder-epoch-99-avg-1-chunk-16-left-128.onnx \ + --joiner ./sherpa-onnx-streaming-zipformer-en-2023-06-26/joiner-epoch-99-avg-1-chunk-16-left-128.onnx \ + --tokens ./sherpa-onnx-streaming-zipformer-en-2023-06-26/tokens.txt + +You will find the following output: + +.. 
code-block:: bash + + PortAudio V19.7.0-devel, revision 147dd722548358763a8b649b3e4b41dfffbcfbb6 + Number of devices: 5 + Device 0 + Name: Background Music + Max input channels: 2 + Default sample rate: 44100 + Device 1 + Name: Background Music (UI Sounds) + Max input channels: 2 + Default sample rate: 44100 + Device 2 + Name: MacBook Pro Microphone + Max input channels: 1 + Default sample rate: 48000 + Device 3 + Name: MacBook Pro Speakers + Max input channels: 0 + Default sample rate: 48000 + Device 4 + Name: WeMeet Audio Device + Max input channels: 2 + Default sample rate: 48000 + + Use default device 2 (MacBook Pro Microphone) + StreamParameters [ + device=2 + channelCount=1 + sampleFormat=Float32 + suggestedLatency=0.034520833333333334 + hostApiSpecificStreamInfo?=[False] + ] + Started! Please speak + + 0: THIS IS A TEST + 1: THIS IS A SECOND TEST + +colab +----- + +We provide a colab notebook +|Sherpa-onnx csharp api example colab notebook| +for you to try the ``C#`` API examples of `sherpa-onnx`_. + +.. |Sherpa-onnx csharp api example colab notebook| image:: https://colab.research.google.com/assets/colab-badge.svg + :target: https://github.com/k2-fsa/colab/blob/master/sherpa-onnx/sherpa_onnx_csharp_api_example.ipynb diff --git a/docs/source/onnx/faqs/change-kotlin-and-java-package-name.rst b/docs/source/onnx/faqs/change-kotlin-and-java-package-name.rst new file mode 100644 index 000000000..09c9cc36f --- /dev/null +++ b/docs/source/onnx/faqs/change-kotlin-and-java-package-name.rst @@ -0,0 +1,37 @@ +How to change the package name of our kotlin and/or java API +============================================================ + +By default, we use: + +.. code-block:: kotlin + + package com.k2fsa.sherpa.onnx + + +If you change our package name without changing the JNI C++ binding code, you would +get errors like: + + - ``_ changes our package + name from ``com.k2fsa.sherpa.onnx`` to ``stt`` and gets the following error: + + .. code-block:: + + No implementation found for + long stt.OfflineRecognizer.newFromAsset(android.content.res.AssetManager, stt.OfflineRecognizerConfig) + (tried Java_stt_OfflineRecognizer_newFromAsset and + Java_stt_OfflineRecognizer_newFromAsset__Landroid_content_res_AssetManager_2Lstt_OfflineRecognizerConfig_2) - + is the library loaded, e.g. System.loadLibrary? + +We suggest that you don't change our package name when using our code. You can use ``import`` +to use our Kotlin or Java API. + +If you are familiar with JNI and really want to change our package name, please have a look +at: + + ``_ + +It shows how to change the package name from ``com.k2fsa.sherpa.onnx`` to ``com.edgeai.chatappv2``. + +.. warning:: + + You need to change a lot of files. 
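+
+.. hint::
+
+   A quick way to see why the package name matters is to inspect the JNI symbols
+   exported by the JNI shared library. The sketch below assumes the library is at
+   the ``jniLibs`` path used elsewhere in this documentation and that ``llvm-nm``
+   from the NDK (or GNU ``nm``) on your build host can read Android ELF files;
+   the exact symbol list depends on your sherpa-onnx version.
+
+   .. code-block:: bash
+
+      # List the JNI entry points exported by libsherpa-onnx-jni.so.
+      # Every symbol embeds the Kotlin/Java package name, e.g.
+      #   Java_com_k2fsa_sherpa_onnx_OfflineRecognizer_newFromAsset
+      nm -D android/SherpaOnnx/app/src/main/jniLibs/arm64-v8a/libsherpa-onnx-jni.so | grep ' Java_'
+
+   If you rename the Kotlin package to ``stt`` without updating the C++ JNI binding,
+   the JVM instead looks for ``Java_stt_OfflineRecognizer_newFromAsset``, which is not
+   in that list, hence the ``No implementation found`` error shown above.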
diff --git a/docs/source/onnx/faqs/diff-online-offline.rst b/docs/source/onnx/faqs/diff-online-offline.rst new file mode 100644 index 000000000..933729865 --- /dev/null +++ b/docs/source/onnx/faqs/diff-online-offline.rst @@ -0,0 +1,13 @@ +在线、离线、流式、非流式的区别 +============================== + +此项目中,``在线`` 等同于流式,``离线`` 等同于非流式。 + +``在线`` 即流式,是边说边识别;响应速度快、延迟小。 + +``离线`` 即非流式,是把所有待识别的数据,一次性送给模型;特点是需要 +等待所有的数据都到齐, 然后才能开始识别。 + +不管是 ``离线`` 还是 ``在线``, 我们这个项目,都不需要访问网络,都可以在本地 +处理;即使断网,也能正常工作。 + diff --git a/docs/source/onnx/faqs/fix-libasound-module-conf-pulse.rst b/docs/source/onnx/faqs/fix-libasound-module-conf-pulse.rst new file mode 100644 index 000000000..82fc9c9f7 --- /dev/null +++ b/docs/source/onnx/faqs/fix-libasound-module-conf-pulse.rst @@ -0,0 +1,31 @@ +Cannot open shared library libasound_module_conf_pulse.so +========================================================= + +The detailed errors are given below: + +.. code-block:: + + Cannot open shared library libasound_module_conf_pulse.so + (/usr/lib64/alsa-lib/libasound_module_conf_pulse.so: cannot open shared object file: No such file or directory) + ALSA lib pcm.c:2722:(snd_pcm_open_noupdate) Unknown PCM + +If you use Linux and get the above error when trying to use the microphone, please do the following: + + 1. Locate where is the file ``libasound_module_conf_pulse.so`` on your system + + .. code-block:: bash + + find / -name libasound_module_conf_pulse.so 2>/dev/null + + 2. If the above search command prints:: + + /usr/lib/x86_64-linux-gnu/alsa-lib/libasound_module_conf_pulse.so + /usr/lib/i386-linux-gnu/alsa-lib/libasound_module_conf_pulse.so + + 3. Please run:: + + sudo mkdir -p /usr/lib64/alsa-lib + sudo ln -s /usr/lib/x86_64-linux-gnu/alsa-lib/libasound_module_conf_pulse.so /usr/lib64/alsa-lib + + 4. Now your issue should be fixed. + diff --git a/docs/source/onnx/faqs/fix-libtoolize.rst b/docs/source/onnx/faqs/fix-libtoolize.rst new file mode 100644 index 000000000..d47e546e5 --- /dev/null +++ b/docs/source/onnx/faqs/fix-libtoolize.rst @@ -0,0 +1,13 @@ +./gitcompile: line 89: libtoolize: command not found +==================================================== + +If you are using Linux and get the following error: + +.. code-block:: + + ./gitcompile: line 89: libtoolize: command not found + +Please run:: + + sudo apt-get install libtool + diff --git a/docs/source/onnx/faqs/fix-tts-encoding-for-chinese-models.rst b/docs/source/onnx/faqs/fix-tts-encoding-for-chinese-models.rst new file mode 100644 index 000000000..1057ee2fe --- /dev/null +++ b/docs/source/onnx/faqs/fix-tts-encoding-for-chinese-models.rst @@ -0,0 +1,6 @@ +TTS 中文模型没有声音 +==================== + +Please see :ref:`how_to_enable_utf8_on_windows`. +You need to use ``UTF-8`` encoding for your system. + diff --git a/docs/source/onnx/faqs/index.rst b/docs/source/onnx/faqs/index.rst new file mode 100644 index 000000000..895934547 --- /dev/null +++ b/docs/source/onnx/faqs/index.rst @@ -0,0 +1,90 @@ +Frequently Asked Question (FAQs) +================================ + +This page contains frequently asked questions for `sherpa-onnx`_. + +.. toctree:: + :maxdepth: 5 + + ./diff-online-offline.rst + ./change-kotlin-and-java-package-name.rst + ./fix-libasound-module-conf-pulse.rst + ./fix-tts-encoding-for-chinese-models.rst + ./fix-libtoolize.rst + + +OSError: PortAudio library not found +------------------------------------ + +If you have the following error on Linux (Ubuntu), + +.. 
code-block:: bash + + Traceback (most recent call last): + File "/mnt/sdb/shared/sherpa-onnx/./python-api-examples/vad-microphone.py", line 8, in + import sounddevice as sd + File "/mnt/sdb/shared/py311/lib/python3.11/site-packages/sounddevice.py", line 71, in + raise OSError('PortAudio library not found') + OSError: PortAudio library not found + +Then please run:: + + sudo apt-get install libportaudio2 + +and then re-try. + +imports github.com/k2-fsa/sherpa-onnx-go-linux: build constraints exclude all Go files +-------------------------------------------------------------------------------------- + +If you have the following output when running ``go build``:: + + [root@VM-0-3-centos non-streaming-decode-files]# go build + package non-streaming-decode-files + imports github.com/k2-fsa/sherpa-onnx-go/sherpa_onnx + imports github.com/k2-fsa/sherpa-onnx-go-linux: build constraints exclude all Go files in /root/go/pkg/mod/github.com/k2-fsa/sherpa-onnx-go-linux@v1.9.21 + +Please first run:: + + go env -w CGO_ENABLED=1 + +And then re-run ``go build``. + +External buffers are not allowed +-------------------------------- + +If you are using ``electron >= 21`` and get the following error: + +.. code-block:: + + External buffers are not allowed + +Then please set ``enableExternalBuffer`` to ``false``. + +Specifically, + + - For reading wave files, please use ``sherpa_onnx.readWave(filename, false);``, + where the second argument ``false`` means to not use external buffers + + - For VAD, please use ``vad.get(startIndex, n, false)`` and ``vad.front(false)`` + + - For speaker identification, please use ``extractor.compute(stream, false)`` + + - For TTS, please use: + + .. code-block:: javascript + + const audio = tts.generate({ + text: text, + sid: 0, + speed: 1.0, + enableExternalBuffer: false, + }); + +The given version [17] is not supported, only version 1 to 10 is supported in this build +---------------------------------------------------------------------------------------- + +If you have such an error, please find the file ``onnxruntime.dll`` in your ``C`` drive +and try to remove it. + +The reason is that you have two ``onnxruntime.dll`` on your computer and the one +in your ``C`` drive is outdated. diff --git a/docs/source/onnx/flutter/index.rst b/docs/source/onnx/flutter/index.rst new file mode 100644 index 000000000..f9217ac13 --- /dev/null +++ b/docs/source/onnx/flutter/index.rst @@ -0,0 +1,9 @@ +.. _sherpa-onnx-flutter: + +Flutter +======= + +.. toctree:: + :maxdepth: 2 + + ./pre-built-app.rst diff --git a/docs/source/onnx/flutter/pre-built-app.rst b/docs/source/onnx/flutter/pre-built-app.rst new file mode 100644 index 000000000..9afc34a8c --- /dev/null +++ b/docs/source/onnx/flutter/pre-built-app.rst @@ -0,0 +1,46 @@ +Pre-built Flutter Apps +====================== + +Links for pre-built Apps can be found in the following table: + +.. hint:: + + It runs locally, without internet connection. + +Text to speech (TTS, Speech synthesis) +-------------------------------------- + +.. list-table:: + + * - **** + - 中国用户 + - URL + * - Android (arm64-v8a, armeabi-v7a, x86_64) + - `点这里 `_ + - ``_ + + * - Linux (x64) + - `点这里 `_ + - ``_ + * - macOS (x64) + - `点这里 `_ + - ``_ + * - macOS (arm64) + - `点这里 `_ + - ``_ + * - Windows (x64) + - `点这里 `_ + - ``_ + +Streaming Speech recognition (STT, ASR) +--------------------------------------- + +.. 
list-table:: + + * - **** + - 中国用户 + - URL + * - Streaming speech recognition + - `点这里 `_ + - ``_ + diff --git a/docs/source/onnx/go-api/index.rst b/docs/source/onnx/go-api/index.rst new file mode 100644 index 000000000..fe67d2eee --- /dev/null +++ b/docs/source/onnx/go-api/index.rst @@ -0,0 +1,429 @@ +.. _sherpa-onnx-go-api: + +Go API +====== + +In this section, we describe how to use the `Go`_ +API of `sherpa-onnx`_. + +The `Go`_ API of `sherpa-onnx`_ supports both streaming and non-streaming speech recognition. + +The following table lists some `Go`_ API examples: + +.. list-table:: + + * - Description + - URL + * - Decode a file with **non-streaming** models + - ``_ + * - Decode a file with **streaming** models + - ``_ + * - **Real-time** speech recognition from a ``microphone`` + - ``_ + +One thing to note is that we have provided pre-built libraries for `Go`_ so that you don't need +to build `sherpa-onnx`_ by yourself when using the `Go`_ API. + +To make supporting multiple platforms easier, we split the `Go`_ API of `sherpa-onnx`_ into +multiple packages, as listed in the following table: + +.. list-table:: + + * - OS + - Package name + - Supported Arch + - Doc + * - Linux + - `sherpa-onnx-go-linux `_ + - ``x86_64``, ``aarch64``, ``arm`` + - ``_ + * - macOS + - `sherpa-onnx-go-macos `_ + - ``x86_64``, ``aarch64`` + - ``_ + * - Windows + - `sherpa-onnx-go-windows `_ + - ``x86_64``, ``x86`` + - ``_ + +To simplify the usage, we have provided a single `Go`_ package for `sherpa-onnx`_ that +supports multiple operating systems. It can be found at + + ``_ + +.. hint:: + + Such a design is insipred by the following article: + + `Cross platform Go modules for giants `_. + +You can use the following ``import`` to import `sherpa-onnx-go`_ +into your `Go`_ project: + +.. code-block:: go + + import ( + sherpa "github.com/k2-fsa/sherpa-onnx-go/sherpa_onnx" + ) + +In the following, we describe how to run our provided `Go`_ API examples. + +.. note:: + + Before you continue, please make sure you have installed `Go`_. + If not, please follow ``_ to install `Go`_. + +.. hint:: + + You need to enable `cgo `_ to build `sherpa-onnx-go`_. + +Decode files with non-streaming models +-------------------------------------- + +First, let us build the example: + +.. code-block:: bash + + git clone https://github.com/k2-fsa/sherpa-onnx + cd sherpa-onnx/go-api-examples/non-streaming-decode-files + go mod tidy + go build + ./non-streaming-decode-files --help + +You will find the following output: + +.. code-block:: bash + + Usage of ./non-streaming-decode-files: + --debug int Whether to show debug message + --decoder string Path to the decoder model + --decoding-method string Decoding method. Possible values: greedy_search, modified_beam_search (default "greedy_search") + --encoder string Path to the encoder model + --joiner string Path to the joiner model + --lm-model string Optional. Path to the LM model + --lm-scale float32 Optional. Scale for the LM model (default 1) + --max-active-paths int Used only when --decoding-method is modified_beam_search (default 4) + --model-type string Optional. Used for loading the model in a faster way + --nemo-ctc string Path to the NeMo CTC model + --num-threads int Number of threads for computing (default 1) + --paraformer string Path to the paraformer model + --provider string Provider to use (default "cpu") + --tokens string Path to the tokens file + pflag: help requested + +Congratulations! You have successfully built your first `Go`_ API example for speech recognition. 
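+
+To give you an idea of what the example program does internally, below is a
+condensed sketch of decoding a single wave file with a non-streaming transducer
+via the `Go`_ API. The type, field, and function names are assumptions based on
+the example program linked above and may not match the current code exactly;
+the model and wave file paths are placeholders. Please treat the example source
+as the authoritative reference.
+
+.. code-block:: go
+
+   package main
+
+   import (
+       "log"
+
+       sherpa "github.com/k2-fsa/sherpa-onnx-go/sherpa_onnx"
+   )
+
+   func main() {
+       // Configure a non-streaming (offline) transducer recognizer.
+       config := sherpa.OfflineRecognizerConfig{}
+       config.FeatConfig = sherpa.FeatureConfig{SampleRate: 16000, FeatureDim: 80}
+       config.ModelConfig.Transducer.Encoder = "./encoder.onnx"
+       config.ModelConfig.Transducer.Decoder = "./decoder.onnx"
+       config.ModelConfig.Transducer.Joiner = "./joiner.onnx"
+       config.ModelConfig.Tokens = "./tokens.txt"
+       config.ModelConfig.NumThreads = 1
+       config.ModelConfig.ModelType = "transducer"
+       config.DecodingMethod = "greedy_search"
+
+       recognizer := sherpa.NewOfflineRecognizer(&config)
+       defer sherpa.DeleteOfflineRecognizer(recognizer)
+
+       // Read the whole wave file and feed it to the recognizer in one go.
+       wave := sherpa.ReadWave("./test.wav")
+
+       stream := sherpa.NewOfflineStream(recognizer)
+       defer sherpa.DeleteOfflineStream(stream)
+
+       stream.AcceptWaveform(wave.SampleRate, wave.Samples)
+       recognizer.Decode(stream)
+
+       log.Println(stream.GetResult().Text)
+   }
+
+The command line flags shown in the help message above simply fill in the
+corresponding fields of this config struct.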
+ +.. note:: + + If you are using Windows and don't see any output after running ``./non-streaming-decode-files --help``, + please copy ``*.dll`` from ``_ (for Win64) + or ``_ (for Win32) + to the directory ``sherpa-onnx/go-api-examples/non-streaming-decode-files``. + +Now let us refer to :ref:`sherpa-onnx-pre-trained-models` to download a non-streaming model. + +We give several examples below for demonstration. + +Non-streaming transducer +^^^^^^^^^^^^^^^^^^^^^^^^ + +We will use :ref:`sherpa-onnx-zipformer-en-2023-06-26-english` as an example. + +First, let us download it: + +.. code-block:: bash + + cd sherpa-onnx/go-api-examples/non-streaming-decode-files + wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-zipformer-en-2023-06-26.tar.bz2 + tar xvf sherpa-onnx-zipformer-en-2023-06-26.tar.bz2 + rm sherpa-onnx-zipformer-en-2023-06-26.tar.bz2 + +Now we can use: + +.. code-block:: bash + + ./non-streaming-decode-files \ + --encoder ./sherpa-onnx-zipformer-en-2023-06-26/encoder-epoch-99-avg-1.onnx \ + --decoder ./sherpa-onnx-zipformer-en-2023-06-26/decoder-epoch-99-avg-1.onnx \ + --joiner ./sherpa-onnx-zipformer-en-2023-06-26/joiner-epoch-99-avg-1.onnx \ + --tokens ./sherpa-onnx-zipformer-en-2023-06-26/tokens.txt \ + --model-type transducer \ + ./sherpa-onnx-zipformer-en-2023-06-26/test_wavs/0.wav + +It should give you the following output: + +.. code-block:: bash + + 2023/08/10 14:52:48.723098 Reading ./sherpa-onnx-zipformer-en-2023-06-26/test_wavs/0.wav + 2023/08/10 14:52:48.741042 Initializing recognizer (may take several seconds) + 2023/08/10 14:52:51.998848 Recognizer created! + 2023/08/10 14:52:51.998870 Start decoding! + 2023/08/10 14:52:52.258818 Decoding done! + 2023/08/10 14:52:52.258847 after early nightfall the yellow lamps would light up here and there the squalid quarter of the brothels + 2023/08/10 14:52:52.258952 Wave duration: 6.625 seconds + +Non-streaming paraformer +^^^^^^^^^^^^^^^^^^^^^^^^ + +We will use :ref:`sherpa_onnx_offline_paraformer_zh_2023_03_28_chinese` as an example. + +First, let us download it: + +.. code-block:: bash + + cd sherpa-onnx/go-api-examples/non-streaming-decode-files + wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-paraformer-zh-2023-03-28.tar.bz2 + tar xvf sherpa-onnx-paraformer-zh-2023-03-28.tar.bz2 + rm sherpa-onnx-paraformer-zh-2023-03-28.tar.bz2 + +Now we can use: + +.. code-block:: bash + + ./non-streaming-decode-files \ + --paraformer ./sherpa-onnx-paraformer-zh-2023-03-28/model.int8.onnx \ + --tokens ./sherpa-onnx-paraformer-zh-2023-03-28/tokens.txt \ + --model-type paraformer \ + ./sherpa-onnx-paraformer-zh-2023-03-28/test_wavs/0.wav + +It should give you the following output: + +.. code-block:: bash + + 2023/08/10 15:07:10.745412 Reading ./sherpa-onnx-paraformer-zh-2023-03-28/test_wavs/0.wav + 2023/08/10 15:07:10.758414 Initializing recognizer (may take several seconds) + 2023/08/10 15:07:13.992424 Recognizer created! + 2023/08/10 15:07:13.992441 Start decoding! + 2023/08/10 15:07:14.382157 Decoding done! + 2023/08/10 15:07:14.382847 对我做了介绍啊那么我想说的是呢大家如果对我的研究感兴趣呢你 + 2023/08/10 15:07:14.382898 Wave duration: 5.614625 seconds + +Non-streaming CTC model from NeMo +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +We will use :ref:`stt-en-conformer-ctc-medium-nemo-sherpa-onnx` as an example. + +First, let us download it: + +.. 
code-block:: bash + + cd sherpa-onnx/go-api-examples/non-streaming-decode-files + wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-nemo-ctc-en-conformer-medium.tar.bz2 + tar xvf sherpa-onnx-nemo-ctc-en-conformer-medium.tar.bz2 + rm sherpa-onnx-nemo-ctc-en-conformer-medium.tar.bz2 + +Now we can use: + +.. code-block:: bash + + ./non-streaming-decode-files \ + --nemo-ctc ./sherpa-onnx-nemo-ctc-en-conformer-medium/model.onnx \ + --tokens ./sherpa-onnx-nemo-ctc-en-conformer-medium/tokens.txt \ + --model-type nemo_ctc \ + ./sherpa-onnx-nemo-ctc-en-conformer-medium/test_wavs/0.wav + +It should give you the following output: + +.. code-block:: bash + + 2023/08/10 15:11:48.667693 Reading ./sherpa-onnx-nemo-ctc-en-conformer-medium/test_wavs/0.wav + 2023/08/10 15:11:48.680855 Initializing recognizer (may take several seconds) + 2023/08/10 15:11:51.900852 Recognizer created! + 2023/08/10 15:11:51.900869 Start decoding! + 2023/08/10 15:11:52.125605 Decoding done! + 2023/08/10 15:11:52.125630 after early nightfall the yellow lamps would light up here and there the squalid quarter of the brothels + 2023/08/10 15:11:52.125645 Wave duration: 6.625 seconds + +Decode files with streaming models +---------------------------------- + +First, let us build the example: + +.. code-block:: bash + + git clone https://github.com/k2-fsa/sherpa-onnx + cd sherpa-onnx/go-api-examples/streaming-decode-files + go mod tidy + go build + ./streaming-decode-files --help + +You will find the following output: + +.. code-block:: bash + + Usage of ./streaming-decode-files: + --debug int Whether to show debug message + --decoder string Path to the decoder model + --decoding-method string Decoding method. Possible values: greedy_search, modified_beam_search (default "greedy_search") + --encoder string Path to the encoder model + --joiner string Path to the joiner model + --max-active-paths int Used only when --decoding-method is modified_beam_search (default 4) + --model-type string Optional. Used for loading the model in a faster way + --num-threads int Number of threads for computing (default 1) + --provider string Provider to use (default "cpu") + --tokens string Path to the tokens file + pflag: help requested + +.. note:: + + If you are using Windows and don't see any output after running ``./streaming-decode-files --help``, + please copy ``*.dll`` from ``_ (for Win64) + or ``_ (for Win32) + to the directory ``sherpa-onnx/go-api-examples/streaming-decode-files``. + +Now let us refer to :ref:`sherpa-onnx-pre-trained-models` to download a streaming model. + +We give one example below for demonstration. + +Streaming transducer +^^^^^^^^^^^^^^^^^^^^ + +We will use :ref:`sherpa-onnx-streaming-zipformer-en-2023-06-26-english` as an example. + +First, let us download it: + +.. code-block:: bash + + cd sherpa-onnx/go-api-examples/streaming-decode-files + wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-streaming-zipformer-en-2023-06-26.tar.bz2 + tar xvf sherpa-onnx-streaming-zipformer-en-2023-06-26.tar.bz2 + rm sherpa-onnx-streaming-zipformer-en-2023-06-26.tar.bz2 + +Now we can use: + +.. 
code-block:: bash + + ./streaming-decode-files \ + --encoder ./sherpa-onnx-streaming-zipformer-en-2023-06-26/encoder-epoch-99-avg-1-chunk-16-left-128.onnx \ + --decoder ./sherpa-onnx-streaming-zipformer-en-2023-06-26/decoder-epoch-99-avg-1-chunk-16-left-128.onnx \ + --joiner ./sherpa-onnx-streaming-zipformer-en-2023-06-26/joiner-epoch-99-avg-1-chunk-16-left-128.onnx \ + --tokens ./sherpa-onnx-streaming-zipformer-en-2023-06-26/tokens.txt \ + --model-type zipformer2 \ + ./sherpa-onnx-streaming-zipformer-en-2023-06-26/test_wavs/0.wav + +It should give you the following output: + +.. code-block:: bash + + 2023/08/10 15:17:00.226228 Reading ./sherpa-onnx-streaming-zipformer-en-2023-06-26/test_wavs/0.wav + 2023/08/10 15:17:00.241024 Initializing recognizer (may take several seconds) + 2023/08/10 15:17:03.352697 Recognizer created! + 2023/08/10 15:17:03.352711 Start decoding! + 2023/08/10 15:17:04.057130 Decoding done! + 2023/08/10 15:17:04.057215 after early nightfall the yellow lamps would light up here and there the squalid quarter of the brothels + 2023/08/10 15:17:04.057235 Wave duration: 6.625 seconds + +Real-time speech recognition from microphone +-------------------------------------------- + +.. hint:: + + You need to install ``portaudio`` for this example. + + .. code-block:: bash + + # for macOS + brew install portaudio + export PKG_CONFIG_PATH=/usr/local/Cellar/portaudio/19.7.0 + + # for Ubuntu + sudo apt-get install libasound-dev portaudio19-dev libportaudio2 libportaudiocpp0 + + To check that you have installed ``portaudio`` successfully, please run: + + .. code-block:: bash + + pkg-config --cflags --libs portaudio-2.0 + + It should give you something like below: + + .. code-block:: bash + + # for macOS + -I/usr/local/Cellar/portaudio/19.7.0/include -L/usr/local/Cellar/portaudio/19.7.0/lib -lportaudio -framework CoreAudio -framework AudioToolbox -framework AudioUnit -framework CoreFoundation -framework CoreServices + + # for Ubuntu + -pthread -lportaudio -lasound -lm -lpthread + + +First, let us build the example: + +.. code-block:: bash + + git clone https://github.com/k2-fsa/sherpa-onnx + cd sherpa-onnx/go-api-examples/real-time-speech-recognition-from-microphone + go mod tidy + go build + ./real-time-speech-recognition-from-microphone --help + +You will find the following output: + +.. code-block:: bash + + Select default input device: MacBook Pro Microphone + Usage of ./real-time-speech-recognition-from-microphone: + --debug int Whether to show debug message + --decoder string Path to the decoder model + --decoding-method string Decoding method. Possible values: greedy_search, modified_beam_search (default "greedy_search") + --enable-endpoint int Whether to enable endpoint (default 1) + --encoder string Path to the encoder model + --joiner string Path to the joiner model + --max-active-paths int Used only when --decoding-method is modified_beam_search (default 4) + --model-type string Optional. Used for loading the model in a faster way + --num-threads int Number of threads for computing (default 1) + --provider string Provider to use (default "cpu") + --rule1-min-trailing-silence float32 Threshold for rule1 (default 2.4) + --rule2-min-trailing-silence float32 Threshold for rule2 (default 1.2) + --rule3-min-utterance-length float32 Threshold for rule3 (default 20) + --tokens string Path to the tokens file + pflag: help requested + +Now let us refer to :ref:`sherpa-onnx-pre-trained-models` to download a streaming model. + +We give one example below for demonstration. 
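+
+Before running it, here is a condensed sketch of the loop that the example
+executes: chunks of 16 kHz samples are pushed into an online stream, decoded
+whenever enough frames are buffered, and the stream is reset when an endpoint
+is detected. In the microphone example the chunks come from ``portaudio``; the
+sketch below cuts a wave file into 100 ms pieces instead, which is essentially
+what the ``streaming-decode-files`` example does. As in the previous sketch,
+the names are assumptions based on the example programs and may differ slightly
+from the current code; see the example sources for the authoritative version.
+
+.. code-block:: go
+
+   package main
+
+   import (
+       "log"
+
+       sherpa "github.com/k2-fsa/sherpa-onnx-go/sherpa_onnx"
+   )
+
+   func main() {
+       // Configure a streaming (online) transducer recognizer.
+       // Model file names are placeholders.
+       config := sherpa.OnlineRecognizerConfig{}
+       config.FeatConfig = sherpa.FeatureConfig{SampleRate: 16000, FeatureDim: 80}
+       config.ModelConfig.Transducer.Encoder = "./encoder.onnx"
+       config.ModelConfig.Transducer.Decoder = "./decoder.onnx"
+       config.ModelConfig.Transducer.Joiner = "./joiner.onnx"
+       config.ModelConfig.Tokens = "./tokens.txt"
+       config.ModelConfig.NumThreads = 1
+       config.ModelConfig.ModelType = "zipformer2"
+       config.DecodingMethod = "greedy_search"
+       config.EnableEndpoint = 1
+
+       recognizer := sherpa.NewOnlineRecognizer(&config)
+       defer sherpa.DeleteOnlineRecognizer(recognizer)
+
+       stream := sherpa.NewOnlineStream(recognizer)
+       defer sherpa.DeleteOnlineStream(stream)
+
+       // Feed the audio in 100 ms chunks, as a microphone callback would.
+       wave := sherpa.ReadWave("./test.wav")
+       chunk := wave.SampleRate / 10
+
+       for start := 0; start < len(wave.Samples); start += chunk {
+           end := start + chunk
+           if end > len(wave.Samples) {
+               end = len(wave.Samples)
+           }
+           stream.AcceptWaveform(wave.SampleRate, wave.Samples[start:end])
+
+           // Decode as long as enough feature frames are buffered.
+           for recognizer.IsReady(stream) {
+               recognizer.Decode(stream)
+           }
+
+           // An endpoint means the current utterance has ended; print it and
+           // reset the stream so the next utterance starts from an empty state.
+           if recognizer.IsEndpoint(stream) {
+               log.Printf("utterance: %s", recognizer.GetResult(stream).Text)
+               recognizer.Reset(stream)
+           }
+       }
+
+       stream.InputFinished()
+       for recognizer.IsReady(stream) {
+           recognizer.Decode(stream)
+       }
+       log.Printf("final: %s", recognizer.GetResult(stream).Text)
+   }
+
+The ``--enable-endpoint`` and ``--rule*-min-trailing-silence`` flags in the help
+message above control when an endpoint is reported.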
+ +Streaming transducer +^^^^^^^^^^^^^^^^^^^^ + +We will use :ref:`sherpa-onnx-streaming-zipformer-en-2023-06-26-english` as an example. + +First, let us download it: + +.. code-block:: bash + + cd sherpa-onnx/go-api-examples/real-time-speech-recognition-from-microphone + wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-streaming-zipformer-en-2023-06-26.tar.bz2 + tar xvf sherpa-onnx-streaming-zipformer-en-2023-06-26.tar.bz2 + rm sherpa-onnx-streaming-zipformer-en-2023-06-26.tar.bz2 + +Now we can use: + +.. code-block:: bash + + ./real-time-speech-recognition-from-microphone \ + --encoder ./sherpa-onnx-streaming-zipformer-en-2023-06-26/encoder-epoch-99-avg-1-chunk-16-left-128.onnx \ + --decoder ./sherpa-onnx-streaming-zipformer-en-2023-06-26/decoder-epoch-99-avg-1-chunk-16-left-128.onnx \ + --joiner ./sherpa-onnx-streaming-zipformer-en-2023-06-26/joiner-epoch-99-avg-1-chunk-16-left-128.onnx \ + --tokens ./sherpa-onnx-streaming-zipformer-en-2023-06-26/tokens.txt \ + --model-type zipformer2 + +It should give you the following output: + +.. code-block:: bash + + Select default input device: MacBook Pro Microphone + 2023/08/10 15:22:00 Initializing recognizer (may take several seconds) + 2023/08/10 15:22:03 Recognizer created! + Started! Please speak + 0: this is the first test + 1: this is the second + +colab +----- + +We provide a colab notebook +|Sherpa-onnx go api example colab notebook| +for you to try the `Go`_ API examples of `sherpa-onnx`_. + +.. |Sherpa-onnx go api example colab notebook| image:: https://colab.research.google.com/assets/colab-badge.svg + :target: https://github.com/k2-fsa/colab/blob/master/sherpa-onnx/sherpa_onnx_go_api_example.ipynb diff --git a/docs/source/onnx/harmony-os/how-to-build-har.rst b/docs/source/onnx/harmony-os/how-to-build-har.rst new file mode 100644 index 000000000..f12a300cd --- /dev/null +++ b/docs/source/onnx/harmony-os/how-to-build-har.rst @@ -0,0 +1,181 @@ +How to build sherpa_onnx.har +============================ + +This page describes how to build ``sherpa_onnx.har`` from source. + +Note that we have already published `sherpa-onnx`_ at the following address: + + ``_ + +.. figure:: ./pic/ohpm-package.jpg + :alt: Screenshot of the ohpm package + :width: 600 + + The `sherpa_onnx `_ package. + +You can use it directly in your project by modifying ``oh-package.json5`` to add +the following lines: + +.. code-block:: + + "dependencies": { + // please always use the latest version + "sherpa_onnx": "1.10.33", + }, + +or use: + +.. code-block:: + + ohpm install sherpa_onnx + +.. hint:: + + If you don't want to change any C++ code of `sherpa-onnx`_, then please + use our provided ``sherpa_onnx.har`` package and you can safely ignore this + document. + +.. hint:: + + If you don't want to change any C++ code of `sherpa-onnx`_, then please + use our provided ``sherpa_onnx.har`` package and you can safely ignore this + document. + +.. hint:: + + If you don't want to change any C++ code of `sherpa-onnx`_, then please + use our provided ``sherpa_onnx.har`` package and you can safely ignore this + document. + +If you want to modify the source code of `sherpa-onnx`_, then you can follow +this document to build a new ``sherpa_onnx.har`` package. + +Download commandline-tools +-------------------------- + +The first step is to download commandline tools for building `sherpa-onnx`_. + +Please visit ``_ +to download it. Note that you need a Huawei account to download it. + + +.. 
figure:: ./pic/ohos-cmd-tools.jpg + :alt: Screenshot of the download the commandline-tools + :width: 600 + + Download commandline-tools. + +Alternatively, you can download it from the following huggingface repo + + ``_ + +with the following command: + +.. code-block:: bash + + # Please use any directory you like. + # The following one is just an example. + mkdir -p /Users/fangjun/software/ + + cd /Users/fangjun/software/ + + # If you use other systems, please change it accordingly. + # If you use macOS x64, please run the following + wget https://huggingface.co/csukuangfj/harmonyos-commandline-tools/resolve/main/commandline-tools-mac-x64-5.0.5.200.zip + + # For users that don't have access to huggingface, please use + # wget https://hf-mirror.com/csukuangfj/harmonyos-commandline-tools/resolve/main/commandline-tools-mac-x64-5.0.5.200.zip + + unzip -qq commandline-tools-mac-x64-5.0.5.200.zip + + +Build sherpa-onnx for HarmonyOS +------------------------------- + +Please use the following command: + +.. code-block:: + + cd /Users/fangjun/open-source + git clone https://github.com/k2-fsa/sherpa-onnx + cd sherpa-onnx + + export OHOS_SDK_NATIVE_DIR=/Users/fangjun/software/command-line-tools/sdk/default/openharmony/native/ + + ./build-ohos-arm64-v8a.sh + ./build-ohos-x86-64.sh + + +Build sherpa_onnx.har +--------------------- + +Finally, we can build ``sherpa_onnx.har``. + +We describe two methods below. + +From the command-line +^^^^^^^^^^^^^^^^^^^^^ + +You can build ``sherpa_onnx.har`` from the terminal by running: + +.. code-block:: + + cd /Users/fangjun/open-source/sherpa-onnx/harmony-os/SherpaOnnxHar/ + export PATH=/Users/fangjun/software/command-line-tools/bin:$PATH + + hvigorw clean --no-daemon + hvigorw --mode module -p product=default -p module=sherpa_onnx@default assembleHar --analyze=normal --parallel --incremental --no-daemon + + find . -name "*.har" + +After building, you should get:: + + (py38) fangjuns-MacBook-Pro:SherpaOnnxHar fangjun$ echo $PWD + /Users/fangjun/open-source/sherpa-onnx/harmony-os/SherpaOnnxHar + (py38) fangjuns-MacBook-Pro:SherpaOnnxHar fangjun$ find . -name "*.har" + ./sherpa_onnx/build/default/outputs/default/sherpa_onnx.har + + +Use DevEco Studio +^^^^^^^^^^^^^^^^^ + +Start DevEco Studio and open the project `SherpaOnnxHar `_ + +Follow the screenshot below to build ``sherpa_onnx.har``. + +.. figure:: ./pic/build-har-gui.jpg + :alt: Screenshot of building sherpa_onnx.har using DevEco Studio + :width: 600 + + Screenshot of building sherpa_onnx.har using DevEco Studio + +The following screenshot shows where you can find the generated ``sherpa_onnx.har``: + +.. figure:: ./pic/sherpa_onnx_har_gui.jpg + :alt: Screenshot of generated sherap_onnx.har + :width: 600 + + Screenshot of the generated ``sherpa_onnx.har``. + +Use sherpa_onnx.har in your project +----------------------------------- + +To use the generated ``sherpa_onnx.har`` in your project, please copy it to your +project and update the ``oh-package.json`` file. + +The following is an example: + +.. figure:: ./pic/sherpa_onnx_har_usage.jpg + :alt: Screenshot of using sherap_onnx.har in your project + :width: 600 + + Screenshot of using ``sherpa_onnx.har`` in your project. + +Colab demo +---------- + +We provide a colab notebook |build sherpa_onnx.har for HarmonyOS notebook| to show you how to build ``sherpa_onnx.har`` +from scratch. + +.. 
|build sherpa_onnx.har for HarmonyOS notebook| image:: https://colab.research.google.com/assets/colab-badge.svg + :target: https://github.com/k2-fsa/colab/blob/master/sherpa-onnx/harmony-os/sherpa_onnx_harmonyos_build_har.ipynb diff --git a/docs/source/onnx/harmony-os/index.rst b/docs/source/onnx/harmony-os/index.rst new file mode 100644 index 000000000..3402f75ef --- /dev/null +++ b/docs/source/onnx/harmony-os/index.rst @@ -0,0 +1,18 @@ +HarmonyOS +========= + +In this section, we describe how to build an HarmonyOS app with `sherpa-onnx`_. + +.. hint:: + + For real-time speech recognition, it does not need to access the Internet. + Everyting is processed locally on your phone. + +.. toctree:: + :maxdepth: 4 + + ./prebuilt-hap.rst + ./speaker-identification.rst + ./tts.rst + ./vad-asr.rst + ./how-to-build-har.rst diff --git a/docs/source/onnx/harmony-os/pic/build-har-gui.jpg b/docs/source/onnx/harmony-os/pic/build-har-gui.jpg new file mode 100644 index 000000000..ff9937ba8 Binary files /dev/null and b/docs/source/onnx/harmony-os/pic/build-har-gui.jpg differ diff --git a/docs/source/onnx/harmony-os/pic/ohos-cmd-tools.jpg b/docs/source/onnx/harmony-os/pic/ohos-cmd-tools.jpg new file mode 100644 index 000000000..1efe95c3c Binary files /dev/null and b/docs/source/onnx/harmony-os/pic/ohos-cmd-tools.jpg differ diff --git a/docs/source/onnx/harmony-os/pic/ohpm-package.jpg b/docs/source/onnx/harmony-os/pic/ohpm-package.jpg new file mode 100644 index 000000000..1b6414bd6 Binary files /dev/null and b/docs/source/onnx/harmony-os/pic/ohpm-package.jpg differ diff --git a/docs/source/onnx/harmony-os/pic/sherpa_onnx_har_gui.jpg b/docs/source/onnx/harmony-os/pic/sherpa_onnx_har_gui.jpg new file mode 100644 index 000000000..ad87bc06e Binary files /dev/null and b/docs/source/onnx/harmony-os/pic/sherpa_onnx_har_gui.jpg differ diff --git a/docs/source/onnx/harmony-os/pic/sherpa_onnx_har_usage.jpg b/docs/source/onnx/harmony-os/pic/sherpa_onnx_har_usage.jpg new file mode 100644 index 000000000..96270240e Binary files /dev/null and b/docs/source/onnx/harmony-os/pic/sherpa_onnx_har_usage.jpg differ diff --git a/docs/source/onnx/harmony-os/pic/sid/10-click-allow.jpg b/docs/source/onnx/harmony-os/pic/sid/10-click-allow.jpg new file mode 100644 index 000000000..3242dd409 Binary files /dev/null and b/docs/source/onnx/harmony-os/pic/sid/10-click-allow.jpg differ diff --git a/docs/source/onnx/harmony-os/pic/sid/11-home.jpg b/docs/source/onnx/harmony-os/pic/sid/11-home.jpg new file mode 100644 index 000000000..5fe2f45a0 Binary files /dev/null and b/docs/source/onnx/harmony-os/pic/sid/11-home.jpg differ diff --git a/docs/source/onnx/harmony-os/pic/sid/12-view.jpg b/docs/source/onnx/harmony-os/pic/sid/12-view.jpg new file mode 100644 index 000000000..8fdc58e3a Binary files /dev/null and b/docs/source/onnx/harmony-os/pic/sid/12-view.jpg differ diff --git a/docs/source/onnx/harmony-os/pic/sid/13-add.jpg b/docs/source/onnx/harmony-os/pic/sid/13-add.jpg new file mode 100644 index 000000000..c8b2f2407 Binary files /dev/null and b/docs/source/onnx/harmony-os/pic/sid/13-add.jpg differ diff --git a/docs/source/onnx/harmony-os/pic/sid/14-help.jpg b/docs/source/onnx/harmony-os/pic/sid/14-help.jpg new file mode 100644 index 000000000..54978610e Binary files /dev/null and b/docs/source/onnx/harmony-os/pic/sid/14-help.jpg differ diff --git a/docs/source/onnx/harmony-os/pic/sid/2-select-open.jpg b/docs/source/onnx/harmony-os/pic/sid/2-select-open.jpg new file mode 100644 index 000000000..0c0cddb99 Binary files /dev/null and 
b/docs/source/onnx/harmony-os/pic/sid/2-select-open.jpg differ diff --git a/docs/source/onnx/harmony-os/pic/sid/3-check-version.jpg b/docs/source/onnx/harmony-os/pic/sid/3-check-version.jpg new file mode 100644 index 000000000..1cc0a255c Binary files /dev/null and b/docs/source/onnx/harmony-os/pic/sid/3-check-version.jpg differ diff --git a/docs/source/onnx/harmony-os/pic/sid/4-check-model.jpg b/docs/source/onnx/harmony-os/pic/sid/4-check-model.jpg new file mode 100644 index 000000000..f1f8100b1 Binary files /dev/null and b/docs/source/onnx/harmony-os/pic/sid/4-check-model.jpg differ diff --git a/docs/source/onnx/harmony-os/pic/sid/5-change-the-code.jpg b/docs/source/onnx/harmony-os/pic/sid/5-change-the-code.jpg new file mode 100644 index 000000000..f62a0c7d2 Binary files /dev/null and b/docs/source/onnx/harmony-os/pic/sid/5-change-the-code.jpg differ diff --git a/docs/source/onnx/harmony-os/pic/sid/6-build-the-project.jpg b/docs/source/onnx/harmony-os/pic/sid/6-build-the-project.jpg new file mode 100644 index 000000000..7405e46d4 Binary files /dev/null and b/docs/source/onnx/harmony-os/pic/sid/6-build-the-project.jpg differ diff --git a/docs/source/onnx/harmony-os/pic/sid/7-select-device-manager.jpg b/docs/source/onnx/harmony-os/pic/sid/7-select-device-manager.jpg new file mode 100644 index 000000000..b7c7c7454 Binary files /dev/null and b/docs/source/onnx/harmony-os/pic/sid/7-select-device-manager.jpg differ diff --git a/docs/source/onnx/harmony-os/pic/sid/8-start-emulator.jpg b/docs/source/onnx/harmony-os/pic/sid/8-start-emulator.jpg new file mode 100644 index 000000000..32c7e21bc Binary files /dev/null and b/docs/source/onnx/harmony-os/pic/sid/8-start-emulator.jpg differ diff --git a/docs/source/onnx/harmony-os/pic/sid/9-start-app.jpg b/docs/source/onnx/harmony-os/pic/sid/9-start-app.jpg new file mode 100644 index 000000000..079f76b3f Binary files /dev/null and b/docs/source/onnx/harmony-os/pic/sid/9-start-app.jpg differ diff --git a/docs/source/onnx/harmony-os/pic/tts/1-open.jpg b/docs/source/onnx/harmony-os/pic/tts/1-open.jpg new file mode 100644 index 000000000..b2860d653 Binary files /dev/null and b/docs/source/onnx/harmony-os/pic/tts/1-open.jpg differ diff --git a/docs/source/onnx/harmony-os/pic/tts/10-started-libritts.jpg b/docs/source/onnx/harmony-os/pic/tts/10-started-libritts.jpg new file mode 100644 index 000000000..904613ac3 Binary files /dev/null and b/docs/source/onnx/harmony-os/pic/tts/10-started-libritts.jpg differ diff --git a/docs/source/onnx/harmony-os/pic/tts/10-started.jpg b/docs/source/onnx/harmony-os/pic/tts/10-started.jpg new file mode 100644 index 000000000..8a433801f Binary files /dev/null and b/docs/source/onnx/harmony-os/pic/tts/10-started.jpg differ diff --git a/docs/source/onnx/harmony-os/pic/tts/2-select-open.jpg b/docs/source/onnx/harmony-os/pic/tts/2-select-open.jpg new file mode 100644 index 000000000..f96af8f05 Binary files /dev/null and b/docs/source/onnx/harmony-os/pic/tts/2-select-open.jpg differ diff --git a/docs/source/onnx/harmony-os/pic/tts/3-check-version.jpg b/docs/source/onnx/harmony-os/pic/tts/3-check-version.jpg new file mode 100644 index 000000000..9042e5b88 Binary files /dev/null and b/docs/source/onnx/harmony-os/pic/tts/3-check-version.jpg differ diff --git a/docs/source/onnx/harmony-os/pic/tts/4-check-dir-libritts.jpg b/docs/source/onnx/harmony-os/pic/tts/4-check-dir-libritts.jpg new file mode 100644 index 000000000..544dd2f5c Binary files /dev/null and b/docs/source/onnx/harmony-os/pic/tts/4-check-dir-libritts.jpg differ diff --git 
a/docs/source/onnx/harmony-os/pic/tts/4-check-dir-melo-tts.jpg b/docs/source/onnx/harmony-os/pic/tts/4-check-dir-melo-tts.jpg new file mode 100644 index 000000000..6a155c1b6 Binary files /dev/null and b/docs/source/onnx/harmony-os/pic/tts/4-check-dir-melo-tts.jpg differ diff --git a/docs/source/onnx/harmony-os/pic/tts/5-change-code-for-libritts.jpg b/docs/source/onnx/harmony-os/pic/tts/5-change-code-for-libritts.jpg new file mode 100644 index 000000000..d1e46bcbe Binary files /dev/null and b/docs/source/onnx/harmony-os/pic/tts/5-change-code-for-libritts.jpg differ diff --git a/docs/source/onnx/harmony-os/pic/tts/5-change-code-for-melo-tts.jpg b/docs/source/onnx/harmony-os/pic/tts/5-change-code-for-melo-tts.jpg new file mode 100644 index 000000000..86d6cbc70 Binary files /dev/null and b/docs/source/onnx/harmony-os/pic/tts/5-change-code-for-melo-tts.jpg differ diff --git a/docs/source/onnx/harmony-os/pic/tts/6-build-melo-tts.jpg b/docs/source/onnx/harmony-os/pic/tts/6-build-melo-tts.jpg new file mode 100644 index 000000000..afab6583d Binary files /dev/null and b/docs/source/onnx/harmony-os/pic/tts/6-build-melo-tts.jpg differ diff --git a/docs/source/onnx/harmony-os/pic/tts/6-build-the-project-for-libritts.jpg b/docs/source/onnx/harmony-os/pic/tts/6-build-the-project-for-libritts.jpg new file mode 100644 index 000000000..2399e8f0c Binary files /dev/null and b/docs/source/onnx/harmony-os/pic/tts/6-build-the-project-for-libritts.jpg differ diff --git a/docs/source/onnx/harmony-os/pic/tts/7-click-device-manager-libritts.jpg b/docs/source/onnx/harmony-os/pic/tts/7-click-device-manager-libritts.jpg new file mode 100644 index 000000000..bc3409cf6 Binary files /dev/null and b/docs/source/onnx/harmony-os/pic/tts/7-click-device-manager-libritts.jpg differ diff --git a/docs/source/onnx/harmony-os/pic/tts/7-click-device-manager.jpg b/docs/source/onnx/harmony-os/pic/tts/7-click-device-manager.jpg new file mode 100644 index 000000000..91c338a88 Binary files /dev/null and b/docs/source/onnx/harmony-os/pic/tts/7-click-device-manager.jpg differ diff --git a/docs/source/onnx/harmony-os/pic/tts/8-start-emulator.jpg b/docs/source/onnx/harmony-os/pic/tts/8-start-emulator.jpg new file mode 100644 index 000000000..79c655c42 Binary files /dev/null and b/docs/source/onnx/harmony-os/pic/tts/8-start-emulator.jpg differ diff --git a/docs/source/onnx/harmony-os/pic/tts/9-start-the-app-libritts.jpg b/docs/source/onnx/harmony-os/pic/tts/9-start-the-app-libritts.jpg new file mode 100644 index 000000000..d538fbadf Binary files /dev/null and b/docs/source/onnx/harmony-os/pic/tts/9-start-the-app-libritts.jpg differ diff --git a/docs/source/onnx/harmony-os/pic/tts/9-start-the-app.jpg b/docs/source/onnx/harmony-os/pic/tts/9-start-the-app.jpg new file mode 100644 index 000000000..dba3f028e Binary files /dev/null and b/docs/source/onnx/harmony-os/pic/tts/9-start-the-app.jpg differ diff --git a/docs/source/onnx/harmony-os/pic/vad-asr/10-allow-mic-moonshine.jpg b/docs/source/onnx/harmony-os/pic/vad-asr/10-allow-mic-moonshine.jpg new file mode 100644 index 000000000..226af8efd Binary files /dev/null and b/docs/source/onnx/harmony-os/pic/vad-asr/10-allow-mic-moonshine.jpg differ diff --git a/docs/source/onnx/harmony-os/pic/vad-asr/10-click-allow-mic-sense-voice.jpg b/docs/source/onnx/harmony-os/pic/vad-asr/10-click-allow-mic-sense-voice.jpg new file mode 100644 index 000000000..b44558fb2 Binary files /dev/null and b/docs/source/onnx/harmony-os/pic/vad-asr/10-click-allow-mic-sense-voice.jpg differ diff --git 
a/docs/source/onnx/harmony-os/pic/vad-asr/11-select-a-file-sense-voice.jpg b/docs/source/onnx/harmony-os/pic/vad-asr/11-select-a-file-sense-voice.jpg new file mode 100644 index 000000000..01d1e70e5 Binary files /dev/null and b/docs/source/onnx/harmony-os/pic/vad-asr/11-select-a-file-sense-voice.jpg differ diff --git a/docs/source/onnx/harmony-os/pic/vad-asr/11-select-file-moonshine.jpg b/docs/source/onnx/harmony-os/pic/vad-asr/11-select-file-moonshine.jpg new file mode 100644 index 000000000..fa95b3ca5 Binary files /dev/null and b/docs/source/onnx/harmony-os/pic/vad-asr/11-select-file-moonshine.jpg differ diff --git a/docs/source/onnx/harmony-os/pic/vad-asr/12-start-mic-moonshine.jpg b/docs/source/onnx/harmony-os/pic/vad-asr/12-start-mic-moonshine.jpg new file mode 100644 index 000000000..68301b35e Binary files /dev/null and b/docs/source/onnx/harmony-os/pic/vad-asr/12-start-mic-moonshine.jpg differ diff --git a/docs/source/onnx/harmony-os/pic/vad-asr/12-start-mic-sense-voice.jpg b/docs/source/onnx/harmony-os/pic/vad-asr/12-start-mic-sense-voice.jpg new file mode 100644 index 000000000..9b71927de Binary files /dev/null and b/docs/source/onnx/harmony-os/pic/vad-asr/12-start-mic-sense-voice.jpg differ diff --git a/docs/source/onnx/harmony-os/pic/vad-asr/2-select-and-open.jpg b/docs/source/onnx/harmony-os/pic/vad-asr/2-select-and-open.jpg new file mode 100644 index 000000000..32e7bb864 Binary files /dev/null and b/docs/source/onnx/harmony-os/pic/vad-asr/2-select-and-open.jpg differ diff --git a/docs/source/onnx/harmony-os/pic/vad-asr/3-check-version.jpg b/docs/source/onnx/harmony-os/pic/vad-asr/3-check-version.jpg new file mode 100644 index 000000000..36daea035 Binary files /dev/null and b/docs/source/onnx/harmony-os/pic/vad-asr/3-check-version.jpg differ diff --git a/docs/source/onnx/harmony-os/pic/vad-asr/4-check-dir-moonshine.jpg b/docs/source/onnx/harmony-os/pic/vad-asr/4-check-dir-moonshine.jpg new file mode 100644 index 000000000..e56912ac9 Binary files /dev/null and b/docs/source/onnx/harmony-os/pic/vad-asr/4-check-dir-moonshine.jpg differ diff --git a/docs/source/onnx/harmony-os/pic/vad-asr/4-check-dir-sense-voice.jpg b/docs/source/onnx/harmony-os/pic/vad-asr/4-check-dir-sense-voice.jpg new file mode 100644 index 000000000..b2bbbc6e2 Binary files /dev/null and b/docs/source/onnx/harmony-os/pic/vad-asr/4-check-dir-sense-voice.jpg differ diff --git a/docs/source/onnx/harmony-os/pic/vad-asr/5-change-code-for-moonshine.jpg b/docs/source/onnx/harmony-os/pic/vad-asr/5-change-code-for-moonshine.jpg new file mode 100644 index 000000000..916b2277b Binary files /dev/null and b/docs/source/onnx/harmony-os/pic/vad-asr/5-change-code-for-moonshine.jpg differ diff --git a/docs/source/onnx/harmony-os/pic/vad-asr/5-change-code-for-sense-voice-2.jpg b/docs/source/onnx/harmony-os/pic/vad-asr/5-change-code-for-sense-voice-2.jpg new file mode 100644 index 000000000..c0af7643c Binary files /dev/null and b/docs/source/onnx/harmony-os/pic/vad-asr/5-change-code-for-sense-voice-2.jpg differ diff --git a/docs/source/onnx/harmony-os/pic/vad-asr/5-change-code-for-sense-voice.jpg b/docs/source/onnx/harmony-os/pic/vad-asr/5-change-code-for-sense-voice.jpg new file mode 100644 index 000000000..f36d2fd56 Binary files /dev/null and b/docs/source/onnx/harmony-os/pic/vad-asr/5-change-code-for-sense-voice.jpg differ diff --git a/docs/source/onnx/harmony-os/pic/vad-asr/6-build-moonshine.jpg b/docs/source/onnx/harmony-os/pic/vad-asr/6-build-moonshine.jpg new file mode 100644 index 000000000..257c7acc3 Binary files 
/dev/null and b/docs/source/onnx/harmony-os/pic/vad-asr/6-build-moonshine.jpg differ diff --git a/docs/source/onnx/harmony-os/pic/vad-asr/7-select-device-manager-sense-voice.jpg b/docs/source/onnx/harmony-os/pic/vad-asr/7-select-device-manager-sense-voice.jpg new file mode 100644 index 000000000..4e116d754 Binary files /dev/null and b/docs/source/onnx/harmony-os/pic/vad-asr/7-select-device-manager-sense-voice.jpg differ diff --git a/docs/source/onnx/harmony-os/pic/vad-asr/7-select-device-manager.jpg b/docs/source/onnx/harmony-os/pic/vad-asr/7-select-device-manager.jpg new file mode 100644 index 000000000..cae8b6fa7 Binary files /dev/null and b/docs/source/onnx/harmony-os/pic/vad-asr/7-select-device-manager.jpg differ diff --git a/docs/source/onnx/harmony-os/pic/vad-asr/8-start-emulator-for-moonshine.jpg b/docs/source/onnx/harmony-os/pic/vad-asr/8-start-emulator-for-moonshine.jpg new file mode 100644 index 000000000..cb9dead4c Binary files /dev/null and b/docs/source/onnx/harmony-os/pic/vad-asr/8-start-emulator-for-moonshine.jpg differ diff --git a/docs/source/onnx/harmony-os/pic/vad-asr/9-start-app-emulator.jpg b/docs/source/onnx/harmony-os/pic/vad-asr/9-start-app-emulator.jpg new file mode 100644 index 000000000..4e1f75b72 Binary files /dev/null and b/docs/source/onnx/harmony-os/pic/vad-asr/9-start-app-emulator.jpg differ diff --git a/docs/source/onnx/harmony-os/pic/vad-asr/9-start-app-sense-voice.jpg b/docs/source/onnx/harmony-os/pic/vad-asr/9-start-app-sense-voice.jpg new file mode 100644 index 000000000..aeda7a1f7 Binary files /dev/null and b/docs/source/onnx/harmony-os/pic/vad-asr/9-start-app-sense-voice.jpg differ diff --git a/docs/source/onnx/harmony-os/prebuilt-hap.rst b/docs/source/onnx/harmony-os/prebuilt-hap.rst new file mode 100644 index 000000000..a5ab06bf3 --- /dev/null +++ b/docs/source/onnx/harmony-os/prebuilt-hap.rst @@ -0,0 +1,17 @@ +Pre-built HAPs +============== + +Links for pre-built HAPs can be found in the following table: + +.. hint:: + + It runs locally, without internet connection. + +.. list-table:: + + * - **** + - 中国用户 + - URL + * - VAD + non-streaming speech recognition + - `点这里 `_ + - ``_ diff --git a/docs/source/onnx/harmony-os/speaker-identification.rst b/docs/source/onnx/harmony-os/speaker-identification.rst new file mode 100644 index 000000000..600f3ef40 --- /dev/null +++ b/docs/source/onnx/harmony-os/speaker-identification.rst @@ -0,0 +1,163 @@ +On-device speaker identification (本地说话人识别) +======================================================= + +This page describes how to +build `SherpaOnnxSpeakerIdentification `_ +for on-device speaker identification that runs on HarmonyOS. + +Open the project with DevEco Studio +----------------------------------- + +You need to first download the code:: + + # Assume we place it inside /Users/fangjun/open-source + # You can place it anywhere you like. + + cd /Users/fangjun/open-source/ + + git clone https://github.com/k2-fsa/sherpa-onnx + +Then start DevEco Studio and follow the screenshots below: + + +.. figure:: ./pic/tts/1-open.jpg + :alt: Screenshot of starting DevEco + :width: 600 + + Step 1: Click Open + +.. figure:: ./pic/sid/2-select-open.jpg + :alt: Screenshot of selecting SherpaOnnxSpeakerIdentification to open + :width: 600 + + Step 2: Select SherpaOnnxSpeakerIdentification inside the harmony-os folder and click Open + +.. figure:: ./pic/sid/3-check-version.jpg + :alt: Screenshot of check version + :width: 600 + + Step 3: Check that it is using the latest version. 
You can visit `sherpa_onnx `_ to check available versions. + +Select a model +-------------- + +The code supports many models for extracting speaker embeddings and you have to select +one. + +You can find all supported models at + + ``_ + +We use the following model + + ``_ + +as an example in this document. + +Use 3dspeaker_speech_eres2net_base_200k_sv_zh-cn_16k-common.onnx +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +First, we download it to the `rawfile `_ directory. + +**Caution**: You MUST place the file inside the `rawfile `_ directory. Otherwise, you would be ``SAD`` later. + +.. code-block:: bash + + cd /Users/fangjun/open-source/sherpa-onnx/harmony-os/SherpaOnnxSpeakerIdentification/entry/src/main/resources/rawfile + + wget https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-recongition-models/3dspeaker_speech_eres2net_base_200k_sv_zh-cn_16k-common.onnx + +Please check that your directory looks ``exactly`` like the following: + +.. code-block:: + + (py38) fangjuns-MacBook-Pro:rawfile fangjun$ pwd + /Users/fangjun/open-source/sherpa-onnx/harmony-os/SherpaOnnxSpeakerIdentification/entry/src/main/resources/rawfile + + (py38) fangjuns-MacBook-Pro:rawfile fangjun$ ls -lh + total 77888 + -rw-r--r-- 1 fangjun staff 38M Oct 14 11:41 3dspeaker_speech_eres2net_base_200k_sv_zh-cn_16k-common.onnx + +.. figure:: ./pic/sid/4-check-model.jpg + :alt: Screenshot of model file inside rawfile directory + :width: 400 + + Step 4: Check that you have placed the model file inside the ``rawfile`` directory. + +Now we need to change the code to use our selected model. + +We have to change `SpeakerIdentificationWorker.ets `_. Please see the following +screenshot. + +.. figure:: ./pic/sid/5-change-the-code.jpg + :alt: Screenshot of changing code for the selected model. + :width: 600 + + Step 5: Change the code to use our selected model + +Finally, we can build the project. See the screenshot below: + +.. figure:: ./pic/sid/6-build-the-project.jpg + :alt: Screenshot of building the project + :width: 600 + + Step 6: Build the project + +If you have an emulator, you can now start it. + +.. figure:: ./pic/sid/7-select-device-manager.jpg + :alt: Screenshot of selecting device manager + :width: 600 + + Step 7: Select the device manager + +.. figure:: ./pic/sid/8-start-emulator.jpg + :alt: Screenshot of starting the emulator + :width: 600 + + Step 8: Start the emulator + +After the emulator is started, follow the screenshot below to run the app on the +emulator: + +.. figure:: ./pic/sid/9-start-app.jpg + :alt: Screenshot of starting the app on the emulator + :width: 600 + + Step 9: Start the app on the emulator + +You should see something like below: + +.. figure:: ./pic/sid/10-click-allow.jpg + :alt: Screenshot of app running on the emulator + :width: 600 + + Step 10: Click Allow to allow the app to access the microphone + +.. figure:: ./pic/sid/11-home.jpg + :alt: Screenshot of app running on the emulator + :width: 300 + + Step 11: The home screen + +.. figure:: ./pic/sid/12-view.jpg + :alt: Screenshot of app running on the emulator + :width: 300 + + Step 12: View and manage registered speakers + +.. figure:: ./pic/sid/13-add.jpg + :alt: Screenshot of app running on the emulator + :width: 300 + + Step 13: Add new speakers + +.. figure:: ./pic/sid/14-help.jpg + :alt: Screenshot of app running on the emulator + :width: 300 + + Step 14: View help information + +Congratulations! + +You have successfully run a on-device speaker identification APP on HarmonyOS! 
diff --git a/docs/source/onnx/harmony-os/tts.rst b/docs/source/onnx/harmony-os/tts.rst new file mode 100644 index 000000000..2a70a6540 --- /dev/null +++ b/docs/source/onnx/harmony-os/tts.rst @@ -0,0 +1,254 @@ +On-device text-to-speech (TTS) +============================== + +This page describes how to build `SherpaOnnxTts `_ +for on-device text-to-speech that runs on HarmonyOS. + +Open the project with DevEco Studio +----------------------------------- + +You need to first download the code:: + + # Assume we place it inside /Users/fangjun/open-source + # You can place it anywhere you like. + + cd /Users/fangjun/open-source/ + + git clone https://github.com/k2-fsa/sherpa-onnx + +Then start DevEco Studio and follow the screenshots below: + + +.. figure:: ./pic/tts/1-open.jpg + :alt: Screenshot of starting DevEco + :width: 600 + + Step 1: Click Open + + +.. figure:: ./pic/tts/2-select-open.jpg + :alt: Screenshot of selecting SherpaOnnxTts to open + :width: 600 + + Step 2: Select SherpaOnnxTts inside the harmony-os folder and click Open + +.. figure:: ./pic/tts/3-check-version.jpg + :alt: Screenshot of check version + :width: 600 + + Step 3: Check that it is using the latest version. You can visit `sherpa_onnx `_ to check available versions. + +Select a text-to-speech model +----------------------------- + +The code supports hundreds of text-to-speech models from + + ``_ + +and we have to modify the code to use the model that we choose. + +.. hint:: + + You can try all of the above models at the following huggingface space: + + ``_ + +We give two examples below about how to use the following two models: + + - :ref:`vits-melo-tts-zh_en` + - :ref:`vits-piper-en_US-libritts_r-medium` + +Use vits-melo-tts-zh_en +^^^^^^^^^^^^^^^^^^^^^^^ + +First, we download and unzip the model. + +``Caution``: The model MUST be placed inside the directory `rawfile `_. + + +.. code-block:: bash + + cd /Users/fangjun/open-source/sherpa-onnx/harmony-os/SherpaOnnxTts/entry/src/main/resources/rawfile + wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-melo-tts-zh_en.tar.bz2 + tar xvf vits-melo-tts-zh_en.tar.bz2 + rm vits-melo-tts-zh_en.tar.bz2 + + # Now remove extra files to save space + rm vits-melo-tts-zh_en/model.int8.onnx + rm vits-melo-tts-zh_en/new_heteronym.fst + +Please check that your directory looks ``exactly`` like the following: + +.. code-block:: bash + + (py38) fangjuns-MacBook-Pro:rawfile fangjun$ pwd + /Users/fangjun/open-source/sherpa-onnx/harmony-os/SherpaOnnxTts/entry/src/main/resources/rawfile + (py38) fangjuns-MacBook-Pro:rawfile fangjun$ ls + vits-melo-tts-zh_en + (py38) fangjuns-MacBook-Pro:rawfile fangjun$ ls -lh vits-melo-tts-zh_en/ + total 346848 + -rw-r--r-- 1 fangjun staff 1.0K Aug 3 11:11 LICENSE + -rw-r--r-- 1 fangjun staff 156B Aug 3 11:11 README.md + -rw-r--r-- 1 fangjun staff 58K Aug 3 11:11 date.fst + drwxr-xr-x 9 fangjun staff 288B Apr 19 2024 dict + -rw-r--r-- 1 fangjun staff 6.5M Sep 27 14:19 lexicon.txt + -rw-r--r-- 1 fangjun staff 163M Aug 3 11:11 model.onnx + -rw-r--r-- 1 fangjun staff 63K Aug 3 11:11 number.fst + -rw-r--r-- 1 fangjun staff 87K Aug 3 11:11 phone.fst + -rw-r--r-- 1 fangjun staff 655B Aug 3 11:11 tokens.txt + +Now you should see the following inside DevEco Studio: + +.. figure:: ./pic/tts/4-check-dir-melo-tts.jpg + :alt: Screenshot of vits-melo-tts-zh_en inside rawfile + :width: 600 + + Step 4: Check the model directory inside the ``rawfile`` directory. + +Now it is time to modify the code to use our model. 
+ +We need to change `NonStreamingTtsWorker.ets `_. + +.. figure:: ./pic/tts/5-change-code-for-melo-tts.jpg + :alt: Screenshot of changing code for vits-melo-tts-zh_en + :width: 600 + + Step 5: Change the code to use our selected model + +Finally, we can build the project. See the screenshot below: + +.. figure:: ./pic/tts/6-build-melo-tts.jpg + :alt: Screenshot of building the project + :width: 600 + + Step 6: Build the project + +If you have an emulator, you can now start it. + +.. figure:: ./pic/tts/7-click-device-manager.jpg + :alt: Screenshot of selecting device manager + :width: 600 + + Step 7: Select the device manager + + +.. figure:: ./pic/tts/8-start-emulator.jpg + :alt: Screenshot of starting the emulator + :width: 600 + + Step 8: Start the emulator + +After the emulator is started, follow the screenshot below to run the app on the +emulator: + +.. figure:: ./pic/tts/9-start-the-app.jpg + :alt: Screenshot of starting the app on the emulator + :width: 600 + + Step 9: Start the app on the emulator + +You should see something like below: + +.. figure:: ./pic/tts/10-started.jpg + :alt: Screenshot of app running on the emulator + :width: 600 + + Step 10: The app is running on the emulator + +Congratulations! + +You have successfully run a on-device text-to-speech APP on HarmonyOS! + +Use vits-piper-en_US-libritts_r-medium +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +First, we download and unzip the model. + +``Caution``: The model MUST be placed inside the directory `rawfile `_. + + +.. code-block:: bash + + cd /Users/fangjun/open-source/sherpa-onnx/harmony-os/SherpaOnnxTts/entry/src/main/resources/rawfile + wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_US-libritts_r-medium.tar.bz2 + tar xvf vits-piper-en_US-libritts_r-medium.tar.bz2 + rm xvf vits-piper-en_US-libritts_r-medium.tar.bz2 + +Please check that your directory looks ``exactly`` like the following: + +.. code-block:: bash + + (py38) fangjuns-MacBook-Pro:rawfile fangjun$ pwd + /Users/fangjun/open-source/sherpa-onnx/harmony-os/SherpaOnnxTts/entry/src/main/resources/rawfile + (py38) fangjuns-MacBook-Pro:rawfile fangjun$ ls + vits-piper-en_US-libritts_r-medium + (py38) fangjuns-MacBook-Pro:rawfile fangjun$ ls -lh vits-piper-en_US-libritts_r-medium/ + total 153552 + -rw-r--r-- 1 fangjun staff 279B Nov 29 2023 MODEL_CARD + -rw-r--r-- 1 fangjun staff 75M Nov 29 2023 en_US-libritts_r-medium.onnx + -rw-r--r-- 1 fangjun staff 20K Nov 29 2023 en_US-libritts_r-medium.onnx.json + drwxr-xr-x 122 fangjun staff 3.8K Nov 28 2023 espeak-ng-data + -rw-r--r-- 1 fangjun staff 954B Nov 29 2023 tokens.txt + -rwxr-xr-x 1 fangjun staff 1.8K Nov 29 2023 vits-piper-en_US.py + -rwxr-xr-x 1 fangjun staff 730B Nov 29 2023 vits-piper-en_US.sh + +Now you should see the following inside DevEco Studio: + +.. figure:: ./pic/tts/4-check-dir-libritts.jpg + :alt: Screenshot of vits-piper-en_US-libritts_r-medium inside rawfile + :width: 600 + + Step 4: Check the model directory inside the ``rawfile`` directory. + +Now it is time to modify the code to use our model. + +We need to change `NonStreamingTtsWorker.ets `_. + +.. figure:: ./pic/tts/5-change-code-for-libritts.jpg + :alt: Screenshot of changing code for vits-piper-en_US-libritts_r-medium + :width: 600 + + Step 5: Change the code to use our selected model + +Finally, we can build the project. See the screenshot below: + +.. 
figure:: ./pic/tts/6-build-the-project-for-libritts.jpg + :alt: Screenshot of changing code for vits-piper-en_US-libritts_r-medium + :width: 600 + + Step 6: Build the project + +If you have an emulator, you can now start it. + +.. figure:: ./pic/tts/7-click-device-manager-libritts.jpg + :alt: Screenshot of selecting device manager + :width: 600 + + Step 7: Select the device manager + +.. figure:: ./pic/tts/8-start-emulator.jpg + :alt: Screenshot of starting the emulator + :width: 600 + + Step 8: Start the emulator + +After the emulator is started, follow the screenshot below to run the app on the +emulator: + +.. figure:: ./pic/tts/9-start-the-app-libritts.jpg + :alt: Screenshot of starting the app on the emulator + :width: 600 + + Step 9: Start the app on the emulator + +You should see something like below: + +.. figure:: ./pic/tts/10-started-libritts.jpg + :alt: Screenshot of app running on the emulator + :width: 600 + + Step 10: The app is running on the emulator + +Congratulations! + +You have successfully run a on-device text-to-speech APP on HarmonyOS! diff --git a/docs/source/onnx/harmony-os/vad-asr.rst b/docs/source/onnx/harmony-os/vad-asr.rst new file mode 100644 index 000000000..57659673c --- /dev/null +++ b/docs/source/onnx/harmony-os/vad-asr.rst @@ -0,0 +1,320 @@ +On-device VAD + ASR +=================== + +This page describes how to build `SherpaOnnxVadAsr `_ +for on-device non-streaming speech recognition that runs on HarmonyOS. + +.. hint:: + + This page is for non-streaming models. + + This page is NOT for streaming models. + +Open the project with DevEco Studio +----------------------------------- + +You need to first download the code:: + + # Assume we place it inside /Users/fangjun/open-source + # You can place it anywhere you like. + + cd /Users/fangjun/open-source/ + + git clone https://github.com/k2-fsa/sherpa-onnx + +Then start DevEco Studio and follow the screenshots below: + + +.. figure:: ./pic/tts/1-open.jpg + :alt: Screenshot of starting DevEco + :width: 600 + + Step 1: Click Open + +.. figure:: ./pic/vad-asr/2-select-and-open.jpg + :alt: Screenshot of selecting SherpaOnnxVadAsr to open + :width: 600 + + Step 2: Select SherpaOnnxVadAsr inside the harmony-os folder and click Open + +.. figure:: ./pic/vad-asr/3-check-version.jpg + :alt: Screenshot of check version + :width: 600 + + Step 3: Check that it is using the latest version. You can visit `sherpa_onnx `_ to check available versions. + +Download a VAD model +-------------------- + +The first thing we have to do is to download the VAD model and put it inside +the directory `rawfile `_. + +``Caution``: The model MUST be placed inside the directory `rawfile `_. + +.. code-block:: bash + + cd /Users/fangjun/open-source/sherpa-onnx/harmony-os/SherpaOnnxVadAsr/entry/src/main/resources/rawfile + wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx + +Select a non-streaming ASR model +-------------------------------- + +The code supports many non-streaming models from + + ``_ + +and we have to modify the code to use the model that we choose. + +.. hint:: + + You can try the above models at the following huggingface space: + + ``_ + +We give two examples below about how to use the following two models: + + - :ref:`sherpa-onnx-moonshine-tiny-en-int8` + - :ref:`sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17` + +Use sherpa-onnx-moonshine-tiny-en-int8 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +First, we download and unzip the model. 
+ +``Caution``: The model MUST be placed inside the directory `rawfile `_. + +.. code-block:: bash + + cd /Users/fangjun/open-source/sherpa-onnx/harmony-os/SherpaOnnxVadAsr/entry/src/main/resources/rawfile + wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-moonshine-tiny-en-int8.tar.bz2 + tar xvf sherpa-onnx-moonshine-tiny-en-int8.tar.bz2 + rm sherpa-onnx-moonshine-tiny-en-int8.tar.bz2 + + # Remove unused files + rm -rf sherpa-onnx-moonshine-tiny-en-int8/test_wavs + +Please check that your directory looks ``exactly`` like the following at this point: + +.. code-block:: + + (py38) fangjuns-MacBook-Pro:rawfile fangjun$ pwd + /Users/fangjun/open-source/sherpa-onnx/harmony-os/SherpaOnnxVadAsr/entry/src/main/resources/rawfile + + (py38) fangjuns-MacBook-Pro:rawfile fangjun$ ls -lh + total 3536 + drwxr-xr-x 9 fangjun staff 288B Dec 6 15:42 sherpa-onnx-moonshine-tiny-en-int8 + -rw-r--r-- 1 fangjun staff 1.7M Nov 28 18:13 silero_vad.onnx + + (py38) fangjuns-MacBook-Pro:rawfile fangjun$ tree . + . + ├── sherpa-onnx-moonshine-tiny-en-int8 + │   ├── LICENSE + │   ├── README.md + │   ├── cached_decode.int8.onnx + │   ├── encode.int8.onnx + │   ├── preprocess.onnx + │   ├── tokens.txt + │   └── uncached_decode.int8.onnx + └── silero_vad.onnx + + 1 directory, 8 files + +Now you should see the following inside DevEco Studio: + +.. figure:: ./pic/vad-asr/4-check-dir-moonshine.jpg + :alt: Screenshot of sherpa-onnx-moonshine-tiny-en-int8 inside rawfile + :width: 600 + + Step 4: Check the model directory inside the ``rawfile`` directory. + +Now it is time to modify the code to use our model. + +We need to change `NonStreamingAsrWithVadWorker.ets `_. + +.. figure:: ./pic/vad-asr/5-change-code-for-moonshine.jpg + :alt: Screenshot of changing code for moonshine + :width: 600 + + Step 5: Change the code to use our selected model + +Finally, we can build the project. See the screenshot below: + +.. figure:: ./pic/vad-asr/6-build-moonshine.jpg + :alt: Screenshot of changing code for moonshine + :width: 600 + + Step 6: Build the project + +If you have an emulator, you can now start it. + +.. figure:: ./pic/vad-asr/7-select-device-manager.jpg + :alt: Screenshot of selecting device manager + :width: 600 + + Step 7: Select the device manager + + +.. figure:: ./pic/vad-asr/8-start-emulator-for-moonshine.jpg + :alt: Screenshot of starting the emulator + :width: 600 + + Step 8: Start the emulator + +After the emulator is started, follow the screenshot below to run the app on the +emulator: + +.. figure:: ./pic/vad-asr/9-start-app-emulator.jpg + :alt: Screenshot of starting the app on the emulator + :width: 600 + + Step 9: Start the app on the emulator + +You should see something like below: + +.. figure:: ./pic/vad-asr/10-allow-mic-moonshine.jpg + :alt: Screenshot of app running on the emulator + :width: 600 + + Step 10: Click Allow to allow the app to access the microphone + +.. figure:: ./pic/vad-asr/11-select-file-moonshine.jpg + :alt: Screenshot of selecting a file for recognition + :width: 600 + + Step 11: Select a .wav file for recognition + +.. figure:: ./pic/vad-asr/12-start-mic-moonshine.jpg + :alt: Screenshot of starting the microphone + :width: 600 + + Step 12: Start the microphone to record speech for recognition + +Congratulations! + +You have successfully run a on-device non-streaming speech recognition APP on HarmonyOS! 
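+
+.. hint::
+
+   Everything inside ``rawfile`` is bundled into the final application package, so the
+   size of the model files directly affects the size of your app. If you want to check
+   how much data will be packaged before building, the following plain shell commands
+   are one way to do it (the path is the ``rawfile`` directory used above; this step is
+   optional and not required by the build):
+
+   .. code-block:: bash
+
+      # Total size of the assets that will be bundled into the app
+      du -sh /Users/fangjun/open-source/sherpa-onnx/harmony-os/SherpaOnnxVadAsr/entry/src/main/resources/rawfile
+
+      # Per-entry breakdown, useful for spotting large files you forgot to remove
+      du -sh /Users/fangjun/open-source/sherpa-onnx/harmony-os/SherpaOnnxVadAsr/entry/src/main/resources/rawfile/*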
+ +Use sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +First, we download and unzip the model. + +``Caution``: The model MUST be placed inside the directory `rawfile `_. + +.. code-block:: bash + + cd /Users/fangjun/open-source/sherpa-onnx/harmony-os/SherpaOnnxVadAsr/entry/src/main/resources/rawfile + wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2 + tar xvf sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2 + rm sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2 + + # Remove unused files + rm -rf sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/test_wavs + rm sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/model.onnx + +Please check that your directory looks ``exactly`` like the following at this point: + +.. code-block:: bash + + (py38) fangjuns-MacBook-Pro:rawfile fangjun$ pwd + /Users/fangjun/open-source/sherpa-onnx/harmony-os/SherpaOnnxVadAsr/entry/src/main/resources/rawfile + + (py38) fangjuns-MacBook-Pro:rawfile fangjun$ ls + sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17 silero_vad.onnx + + (py38) fangjuns-MacBook-Pro:rawfile fangjun$ ls -lh sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/ + total 493616 + -rw-r--r-- 1 fangjun staff 71B Jul 18 21:06 LICENSE + -rw-r--r-- 1 fangjun staff 104B Jul 18 21:06 README.md + -rwxr-xr-x 1 fangjun staff 5.8K Jul 18 21:06 export-onnx.py + -rw-r--r-- 1 fangjun staff 228M Jul 18 21:06 model.int8.onnx + -rw-r--r-- 1 fangjun staff 308K Jul 18 21:06 tokens.txt + + (py38) fangjuns-MacBook-Pro:rawfile fangjun$ tree . + . + ├── sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17 + │   ├── LICENSE + │   ├── README.md + │   ├── export-onnx.py + │   ├── model.int8.onnx + │   └── tokens.txt + └── silero_vad.onnx + + 1 directory, 6 files + +Now you should see the following inside DevEco Studio: + +.. figure:: ./pic/vad-asr/4-check-dir-sense-voice.jpg + :alt: Screenshot of sense voice inside rawfile + :width: 600 + + Step 4: Check the model directory inside the ``rawfile`` directory. + +Now it is time to modify the code to use our model. + +We need to change `NonStreamingAsrWithVadWorker.ets `_. + +.. figure:: ./pic/vad-asr/5-change-code-for-sense-voice.jpg + :alt: Screenshot of changing code for sense voice + :width: 600 + + Step 5-1: Change the code to use our selected model + +.. figure:: ./pic/vad-asr/5-change-code-for-sense-voice-2.jpg + :alt: Screenshot of changing code for sense voice + :width: 600 + + Step 5-2: Change the code to use our selected model + +Finally, we can build the project. See the screenshot below: + +.. figure:: ./pic/vad-asr/6-build-moonshine.jpg + :alt: Screenshot of changing code for moonshine + :width: 600 + + Step 6: Build the project + +If you have an emulator, you can now start it. + +.. figure:: ./pic/vad-asr/7-select-device-manager-sense-voice.jpg + :alt: Screenshot of selecting device manager + :width: 600 + + Step 7: Select the device manager + + +.. figure:: ./pic/vad-asr/8-start-emulator-for-moonshine.jpg + :alt: Screenshot of starting the emulator + :width: 600 + + Step 8: Start the emulator + +After the emulator is started, follow the screenshot below to run the app on the +emulator: + +.. figure:: ./pic/vad-asr/9-start-app-sense-voice.jpg + :alt: Screenshot of starting the app on the emulator + :width: 600 + + Step 9: Start the app on the emulator + +.. 
figure:: ./pic/vad-asr/10-click-allow-mic-sense-voice.jpg
+   :alt: Screenshot of app running on the emulator
+   :width: 600
+
+   Step 10: Click Allow to allow the app to access the microphone
+
+.. figure:: ./pic/vad-asr/11-select-a-file-sense-voice.jpg
+   :alt: Screenshot of selecting a file for recognition
+   :width: 600
+
+   Step 11: Select a .wav file for recognition
+
+.. figure:: ./pic/vad-asr/12-start-mic-sense-voice.jpg
+   :alt: Screenshot of starting the microphone
+   :width: 600
+
+   Step 12: Start the microphone to record speech for recognition
+
+Congratulations!
+
+You have successfully run an on-device non-streaming speech recognition app on HarmonyOS!
diff --git a/docs/source/onnx/hotwords/index.rst b/docs/source/onnx/hotwords/index.rst
new file mode 100644
index 000000000..a4cb41220
--- /dev/null
+++ b/docs/source/onnx/hotwords/index.rst
@@ -0,0 +1,900 @@
+.. _sherpa-onnx-hotwords:
+
+Hotwords (Contextual biasing)
+=============================
+
+In this section, we describe how we implement the hotwords (aka contextual biasing)
+feature with an Aho-Corasick automaton and how to use it in `sherpa-onnx`_.
+
+.. caution::
+
+   Only transducer models support hotwords in `sherpa-onnx`_.
+   That is, only models from :ref:`sherpa-onnx-offline-transducer-models`
+   and :ref:`onnx_online_transducer_models` support hotwords.
+
+   All other models don't support hotwords.
+
+   Also, you have to change the decoding method to ``modified_beam_search``
+   to use hotwords. The default decoding method ``greedy_search`` does not
+   support hotwords.
+
+What are hotwords
+-----------------
+
+Current ASR systems work very well for general cases, but they sometimes fail to
+recognize special words/phrases (aka hotwords) such as rare words, personalized
+information, etc. Usually, those words/phrases are recognized as words/phrases
+that sound similar to them (for example, ``LOUIS FOURTEEN`` may be recognized as
+``LEWIS FOURTEEN``). So we have to provide some kind of context information
+(for example, the phrase ``LOUIS FOURTEEN``) to the ASR system to boost those
+words/phrases. Normally, we call this kind of boosting task contextual biasing
+(aka hotwords recognition).
+
+How do we implement it with an Aho-Corasick
+-------------------------------------------
+
+We first construct an Aho-Corasick automaton from the given hotwords (after tokenizing
+them into tokens). Please refer to ``_
+for the construction details of Aho-Corasick.
+
+The figure below shows the Aho-Corasick automaton for "HE/SHE/SHELL/HIS/THIS" with ``hotwords-score==1``.
+
+.. figure:: ./pic/context_graph.png
+   :alt: The Aho-Corasick for "HE/SHE/SHELL/HIS/THIS"
+   :width: 600
+
+   The Aho-Corasick for "HE/SHE/SHELL/HIS/THIS"
+
+The ``black`` arrows in the graph are the goto arcs, the ``red`` arrows are the
+failure arcs, and the ``green`` arrows are the output arcs. Each goto arc carries
+a token and a boosting score (**Note: we boost the path as soon as any partial
+sequence is matched; if the path finally fails to fully match any hotword, the
+boosted score is canceled**). Currently, the boosting score is distributed evenly
+over the arcs along the path. Each state has two scores: the first one is the node
+score (mainly used to cancel the boosted score), and the second one is the output
+score, which is the total score of the hotwords fully matched at this state.
+
+The following are several matching examples of the graph above.
+
+.. note::
+
+   For simplicity, we assume that the system emits one token per frame.
+
+..
hint:: + + We have an extra ``finalize`` step to force the graph state to go back to + the root state. + +**The path is "SHELF"** + +.. list-table:: + + * - Frame + - Boost score + - Total boost score + - Graph state + - Matched hotwords + * - init + - 0 + - 0 + - 0 + - + * - 1 + - 1 + - 1 + - 3 + - + * - 2 + - 1 + - 2 + - 4 + - + * - 3 + - 1 + 5 + - 8 + - 5 + - HE, SHE + * - 4 + - 1 + - 9 + - 6 + - + * - 5 + - -4 + - 5 + - 0 + - + * - finalize + - 0 + - 5 + - 0 + - + +At ``frame 3`` we reach ``state 5`` and match ``HE, SHE``, so we get a boosting +score ``1 + 5``, the score ``1`` here because the ``SHEL`` still might be the prefix +of other hotwords. +At ``frame 5`` ``F`` can not match any tokens and fail back to root, so we cancel +the score for ``SHEL`` which is ``4`` (the node score of ``state 6``). + + +**The path is "HI"** + +.. list-table:: + + * - Frame + - Boost score + - Total boost score + - Graph state + - Matched hotwords + * - init + - 0 + - 0 + - 0 + - + * - 1 + - 1 + - 1 + - 1 + - + * - 2 + - 1 + - 2 + - 8 + - + * - finalize + - -2 + - 0 + - 0 + - + +``H`` and ``I`` all match the tokens in the graph, unfortunately, we have to go +back to root state when finishing matching a path, so we cancel the boosting score +of ``HI`` which is ``2`` (the node score of ``state 8``). + + +**The path is "THE"** + +.. list-table:: + + * - Frame + - Boost score + - Total boost score + - Graph state + - Matched hotwords + * - init + - 0 + - 0 + - 0 + - + * - 1 + - 1 + - 1 + - 10 + - + * - 2 + - 1 + - 2 + - 11 + - + * - 3 + - 0 + 2 + - 4 + - 2 + - HE + * - finalize + - -2 + - 3 + - 0 + - + +At ``frame 3`` we jump from ``state 11`` to ``state 2`` and get a boosting score +of ``0 + 2``, ``0`` because the node score of ``state 2`` is the same as ``state 11`` +so we don't get score by partial match (the prefix of ``state 11`` is ``TH`` has +the same length of the prefix of ``state 2`` which is ``HE``), but we do get the +output score (at ``state 2`` it outputs ``HE``). + + +.. note:: + + We implement the hotwords feature during inference time, you don't have to + re-train the models to use this feature. + + +How to use hotwords in sherpa-onnx +---------------------------------- + +.. caution:: + + Currentlly, the hotwords feature is only supported in the + ``modified_beam_search`` decoding method of the **transducer models** + (both streaming and non-streaming). + +The use of the hotwords is no different for streaming and non-streaming models, +and in fact it is even no different for all the API supported by sherpa onnx. +We add **FOUR** extra arguments for hotwords: + + - ``hotwords-file`` + + The file path of the hotwords, one hotwords per line. They could be Chinese + words, English words or both according to the modeling units used to train + the model. Here are some examples: + + For Chinese models trained on ``cjkchar`` it looks like: + + .. code-block:: + + 语音识别 + 深度学习 + + For English like language models trained on ``bpe`` it looks like: + + .. code-block:: + + SPEECH RECOGNITION + DEEP LEARNING + + For multilingual models trained on ``cjkchar+bpe`` (Chinese + English) it looks like: + + .. code-block:: + + SPEECH 识别 + SPEECH RECOGNITION + 深度学习 + + You can also specify the boosting score for each hotword, the score should follow the + predefined character `:`, for example: + + .. code-block:: + + 语音识别 :3.5 + 深度学习 :2.0 + + It means, hotword `语音识别` will have a boosting score of 3.5, hotword `深度学习` will have a boosting score of 2.0. 
+ For those hotwords that don't have specific scores, they will use the global score provided by `hotword-score` below. + + .. caution:: + + The specific score MUST BE the last item of each hotword (i.e You shouldn't break the hotword into two parts by the score). + SPEECH :2.0 识别 # This is invalid + + - ``hotwords-score`` + + The boosting score for each matched token. + + .. note:: + + We match the hotwords at token level, so the ``hotwords-score`` is applied + at token level. + + - ``modeling-unit`` + + The modeling unit of the used model, currently support `cjkchar` (for Chinese), `bpe` (for English like languages) + and `cjkchar+bpe` (for multilingual models). We need this modeling-unit to select tokenizer to encode words/phrases + into tokens, so **do provide correct modeling-unit according to your model**. + + - ``bpe-vocab`` + + The bpe vocabulary generated by sentencepiece toolkit, it also can be exported from `bpe.model` (see `script/export_bpe_vocab.py` for details). + This vocabulary is used to tokenize words/phrases into bpe units. It is only used when `modeling-unit` is `bpe` or `cjkchar+bpe`. + + .. hint:: + + We need `bpe.vocab` rather than `bpe.model`, because we don't introduce sentencepiece c++ codebase into `sherpa-onnx` (which + has a depandancy issue of protobuf) , we implement a simple sentencepiece encoder and decoder which takes `bpe.vocab` as input. + + +The main difference of using hotwords feature is about the modeling units. +The following shows how to use it for different modeling units. + +.. hint:: + + You can use any transducer models here ``_, + we just choose three of them randomly for the following examples. + + +.. note:: + + In the following example, we use a non-streaming model, if you are using a + streaming model, you should use ``sherpa-onnx``. ``sherpa-onnx-alsa``, + ``sherpa-onnx-microphone``, ``sherpa-onnx-microphone-offline``, + ``sherpa-onnx-online-websocket-server`` and ``sherpa-onnx-offline-websocket-server`` + all support hotwords. + + +Modeling unit is bpe +^^^^^^^^^^^^^^^^^^^^ + +**Download the model** + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-zipformer-en-2023-04-01.tar.bz2 + tar xvf sherpa-onnx-zipformer-en-2023-04-01.tar.bz2 + rm sherpa-onnx-zipformer-en-2023-04-01.tar.bz2 + ln -s sherpa-onnx-zipformer-en-2023-04-01 exp + +Export bpe.vocab if you can't find bpe.vocab in the model directory. + +.. code-block:: + + python script/export_bpe_vocab.py --bpe-model exp/bpe.model + + +The ``hotwords_en.txt`` contains: + +.. code-block:: + + QUARTERS + FOREVER + +C++ api +******* + +**Decoding without hotwords** + +.. code-block:: + + ./build/bin/sherpa-onnx-offline \ + --encoder=exp/encoder-epoch-99-avg-1.onnx \ + --decoder=exp/decoder-epoch-99-avg-1.onnx \ + --joiner=exp/joiner-epoch-99-avg-1.onnx \ + --decoding-method=modified_beam_search \ + --tokens=exp/tokens.txt \ + exp/test_wavs/0.wav exp/test_wavs/1.wav + +The output is: + +.. 
code-block:: + + /star-kw/kangwei/code/sherpa-onnx/sherpa-onnx/csrc/parse-options.cc:Read:361 ./build/bin/sherpa-onnx-offline --encoder=exp/encoder-epoch-99-avg-1.onnx --decoder=exp/decoder-epoch-99-avg-1.onnx --joiner=exp/joiner-epoch-99-avg-1.onnx --decoding-method=modified_beam_search --tokens=exp/tokens.txt exp/test_wavs/0.wav exp/test_wavs/1.wav + + OfflineRecognizerConfig(feat_config=OfflineFeatureExtractorConfig(sampling_rate=16000, feature_dim=80), model_config=OfflineModelConfig(transducer=OfflineTran$ducerModelConfig(encoder_filename="exp/encoder-epoch-99-avg-1.onnx", decoder_filename="exp/decoder-epoch-99-avg-1.onnx", joiner_filename="exp/joiner-epoch-99-$vg-1.onnx"), paraformer=OfflineParaformerModelConfig(model=""), nemo_ctc=OfflineNemoEncDecCtcModelConfig(model=""), whisper=OfflineWhisperModelConfig(encoder=$", decoder="", language="", task="transcribe"), tdnn=OfflineTdnnModelConfig(model=""), tokens="exp/tokens.txt", num_threads$2, debug=False, provider="cpu", model_type=""), lm_config=OfflineLMConfig(model="", scale=0.5), decoding_method="modified_beam_search", max_active_paths=4, ho$words_file=, hotwords_score=1.5) + Creating recognizer ... + Started + Done! + + exp/test_wavs/0.wav + {"text":"ALL THE YELLOW LAMPS WOULD LIGHT UP HERE AND THERE THE SQUALID QUARTER OF THE BROTHELS","timestamps":"[1.44, 1.48, 1.56, 1.72, 1.88, 1.96, 2.16, 2.28$ 2.36, 2.48, 2.60, 2.80, 3.08, 3.28, 3.40, 3.60, 3.80, 4.08, 4.24, 4.32, 4.48, 4.64, 4.84, 4.88, 5.00, 5.08, 5.32, 5.48, 5.60, 5.68, 5.84, 6.04, 6.24]","token$":["A","LL"," THE"," YE","LL","OW"," LA","M","P","S"," WOULD"," LIGHT"," UP"," HE","RE"," AND"," THERE"," THE"," S","QUA","LI","D"," ","QUA","R","TER"," OF","THE"," B","RO","TH","EL","S"]} + ---- + exp/test_wavs/1.wav + {"text":"IN WHICH MAN THUS PUNISHED HAD GIVEN HER A LOVELY CHILD WHOSE PLACE WAS ON THAT SAME DISHONOURED BOSOM TO CONNECT HER PARENT FOR EVER WITH THE RACE AN + D DESCENT OF MORTALS AND TO BE FINALLY A BLESSED SOUL IN HEAVEN","timestamps":"[2.44, 2.64, 2.88, 3.16, 3.28, 3.48, 3.60, 3.80, 3.96, 4.12, 4.36, 4.52, 4.72, 4 + .92, 5.16, 5.44, 5.68, 6.04, 6.24, 6.48, 6.84, 7.08, 7.32, 7.56, 7.84, 8.12, 8.24, 8.32, 8.44, 8.60, 8.76, 8.88, 9.08, 9.28, 9.44, 9.56, 9.64, 9.76, 9.96, 10.0 + 4, 10.20, 10.40, 10.64, 10.76, 11.04, 11.20, 11.36, 11.60, 11.80, 12.00, 12.12, 12.28, 12.32, 12.52, 12.72, 12.84, 12.96, 13.04, 13.24, 13.40, 13.60, 13.76, 13 + .96, 14.12, 14.24, 14.36, 14.52, 14.68, 14.76, 15.04, 15.28, 15.52, 15.76, 16.00, 16.16, 16.24, 16.32]","tokens":["IN"," WHICH"," MAN"," TH","US"," P","UN","IS + H","ED"," HAD"," GIVE","N"," HER"," A"," LOVE","LY"," CHILD"," WHO","SE"," PLACE"," WAS"," ON"," THAT"," SAME"," DIS","HO","N","OUR","ED"," BO","S","OM"," TO", + " CON","NE","C","T"," HER"," P","AR","ENT"," FOR"," E","VER"," WITH"," THE"," RA","CE"," AND"," DE","S","C","ENT"," OF"," MO","R","T","AL","S"," AND"," TO"," B + E"," FI","N","AL","LY"," A"," B","LESS","ED"," SO","UL"," IN"," HE","A","VE","N"]} + ---- + num threads: 2 + decoding method: modified_beam_search + max active paths: 4 + Elapsed seconds: 1.775 s + Real time factor (RTF): 1.775 / 23.340 = 0.076 + + +**Decoding with hotwords** + +.. 
code-block:: + + ./build/bin/sherpa-onnx-offline \ + --encoder=exp/encoder-epoch-99-avg-1.onnx \ + --decoder=exp/decoder-epoch-99-avg-1.onnx \ + --joiner=exp/joiner-epoch-99-avg-1.onnx \ + --decoding-method=modified_beam_search \ + --tokens=exp/tokens.txt \ + --modeling-unit=bpe \ + --bpe-vocab=exp/bpe.vocab \ + --hotwords-file=hotwords_en.txt \ + --hotwords-score=2.0 \ + exp/test_wavs/0.wav exp/test_wavs/1.wav + +The output is: + +.. code-block:: + + /star-kw/kangwei/code/sherpa-onnx/sherpa-onnx/csrc/parse-options.cc:Read:361 ./build/bin/sherpa-onnx-offline --encoder=exp/encoder-epoch-99-avg-1.onnx --decoder=exp/decoder-epoch-99-avg-1.onnx --joiner=exp/joiner-epoch-99-avg-1.onnx --decoding-method=modified_beam_search --tokens=exp/tokens.txt --hotwords-file=hotwords_en.txt --hotwords-score=2.0 exp/test_wavs/0.wav exp/test_wavs/1.wav + + OfflineRecognizerConfig(feat_config=OfflineFeatureExtractorConfig(sampling_rate=16000, feature_dim=80), model_config=OfflineModelConfig(transducer=OfflineTransducerModelConfig(encoder_filename="exp/encoder-epoch-99-avg-1.onnx", decoder_filename="exp/decoder-epoch-99-avg-1.onnx", joiner_filename="exp/joiner-epoch-99-avg-1.onnx"), paraformer=OfflineParaformerModelConfig(model=""), nemo_ctc=OfflineNemoEncDecCtcModelConfig(model=""), whisper=OfflineWhisperModelConfig(encoder=" + ", decoder="", language="", task="transcribe"), tdnn=OfflineTdnnModelConfig(model=""), tokens="exp/tokens.txt", num_threads=2, debug=False, provider="cpu", model_type=""), lm_config=OfflineLMConfig(model="", scale=0.5), decoding_method="modified_beam_search", max_active_paths=4, hotwords_file=hotwords_en.txt, hotwords_score=2) + Creating recognizer ... + Started + Done! + + exp/test_wavs/0.wav + {"text":"ALL THE YELLOW LAMPS WOULD LIGHT UP HERE AND THERE THE SQUALID QUARTERS OF THE BROTHELS","timestamps":"[1.44, 1.48, 1.56, 1.72, 1.88, 1.96, 2.16, 2.28 + , 2.36, 2.48, 2.60, 2.80, 3.08, 3.28, 3.40, 3.60, 3.80, 4.08, 4.24, 4.32, 4.48, 4.64, 4.84, 4.88, 5.00, 5.08, 5.12, 5.36, 5.48, 5.60, 5.68, 5.84, 6.04, 6.24]", + "tokens":["A","LL"," THE"," YE","LL","OW"," LA","M","P","S"," WOULD"," LIGHT"," UP"," HE","RE"," AND"," THERE"," THE"," S","QUA","LI","D"," ","QUA","R","TER",$ + S"," OF"," THE"," B","RO","TH","EL","S"]} + ---- + exp/test_wavs/1.wav + {"text":"IN WHICH MAN THUS PUNISHED HAD GIVEN HER A LOVELY CHILD WHOSE PLACE WAS ON THAT SAME DISHONOURED BOSOM TO CONNECT HER PARENT FOREVER WITH THE RACE AN$ + DESCENT OF MORTALS AND TO BE FINALLY A BLESSED SOUL IN HEAVEN","timestamps":"[2.44, 2.64, 2.88, 3.16, 3.28, 3.48, 3.60, 3.80, 3.96, 4.12, 4.36, 4.52, 4.72, 4$ + 92, 5.16, 5.44, 5.68, 6.04, 6.24, 6.48, 6.84, 7.08, 7.32, 7.56, 7.84, 8.12, 8.24, 8.32, 8.44, 8.60, 8.76, 8.88, 9.08, 9.28, 9.44, 9.56, 9.64, 9.76, 9.96, 10.0$ + , 10.20, 10.40, 10.68, 10.76, 11.04, 11.20, 11.36, 11.60, 11.80, 12.00, 12.12, 12.28, 12.32, 12.52, 12.72, 12.84, 12.96, 13.04, 13.24, 13.40, 13.60, 13.76, 13$ + 96, 14.12, 14.24, 14.36, 14.52, 14.68, 14.76, 15.04, 15.28, 15.52, 15.76, 16.00, 16.16, 16.24, 16.32]","tokens":["IN"," WHICH"," MAN"," TH","US"," P","UN","IS$ + ","ED"," HAD"," GIVE","N"," HER"," A"," LOVE","LY"," CHILD"," WHO","SE"," PLACE"," WAS"," ON"," THAT"," SAME"," DIS","HO","N","OUR","ED"," BO","S","OM"," TO",$ + CON","NE","C","T"," HER"," P","AR","ENT"," FOR","E","VER"," WITH"," THE"," RA","CE"," AND"," DE","S","C","ENT"," OF"," MO","R","T","AL","S"," AND"," TO"," BE$ + ," FI","N","AL","LY"," A"," B","LESS","ED"," SO","UL"," IN"," HE","A","VE","N"]} + ---- + num threads: 2 + decoding method: 
modified_beam_search + max active paths: 4 + Elapsed seconds: 1.522 s + Real time factor (RTF): 1.522 / 23.340 = 0.065 + +.. hint:: + + ``QUARTER`` -> ``QUARTERS`` + + ``FOR EVER`` -> ``FOREVER`` + + +Python api +********** + +**Decoding without hotwords** + +.. code-block:: + + python python-api-examples/offline-decode-files.py \ + --encoder exp/encoder-epoch-99-avg-1.onnx \ + --decoder exp/decoder-epoch-99-avg-1.onnx \ + --joiner exp/joiner-epoch-99-avg-1.onnx \ + --decoding modified_beam_search \ + --tokens exp/tokens.txt \ + exp/test_wavs/0.wav exp/test_wavs/1.wav + +The output is: + +.. code-block:: + + Started! + Done! + exp/test_wavs/0.wav + ALL THE YELLOW LAMPS WOULD LIGHT UP HERE AND THERE THE SQUALID QUARTER OF THE BROTHELS + ---------- + exp/test_wavs/1.wav + IN WHICH MAN THUS PUNISHED HAD GIVEN HER A LOVELY CHILD WHOSE PLACE WAS ON THAT SAME DISHONOURED BOSOM TO CONNECT HER PARENT FOR EVER WITH THE RACE AND DESCENT OF MORTALS AND TO BE FINALLY A BLESSED SOUL IN HEAVEN + ---------- + num_threads: 1 + decoding_method: modified_beam_search + Wave duration: 23.340 s + Elapsed time: 2.546 s + Real time factor (RTF): 2.546/23.340 = 0.109 + + +**Decoding with hotwords** + +.. code-block:: + + python python-api-examples/offline-decode-files.py \ + --encoder exp/encoder-epoch-99-avg-1.onnx \ + --decoder exp/decoder-epoch-99-avg-1.onnx \ + --joiner exp/joiner-epoch-99-avg-1.onnx \ + --decoding modified_beam_search \ + --tokens exp/tokens.txt \ + --modeling-unit bpe \ + --bpe-vocab exp/bpe.vocab \ + --hotwords-file hotwords_en.txt \ + --hotwords-score 2.0 \ + exp/test_wavs/0.wav exp/test_wavs/1.wav + +The output is: + +.. code-block:: + + Started! + Done! + exp/test_wavs/0.wav + ALL THE YELLOW LAMPS WOULD LIGHT UP HERE AND THERE THE SQUALID QUARTERS OF THE BROTHELS + ---------- + exp/test_wavs/1.wav + IN WHICH MAN THUS PUNISHED HAD GIVEN HER A LOVELY CHILD WHOSE PLACE WAS ON THAT SAME DISHONOURED BOSOM TO CONNECT HER PARENT FOREVER WITH THE RACE AND DESCENTOF MORTALS AND TO BE FINALLY A BLESSED SOUL IN HEAVEN + ---------- + num_threads: 1 + decoding_method: modified_beam_search + Wave duration: 23.340 s + Elapsed time: 2.463 s + Real time factor (RTF): 2.463/23.340 = 0.106 + +.. hint:: + + ``QUARTER`` -> ``QUARTERS`` + + ``FOR EVER`` -> ``FOREVER`` + + +Modeling unit is cjkchar +^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +**Download the model** + +.. code-block:: bash + + cd /path/to/sherpa-onnx + wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-conformer-zh-stateless2-2023-05-23.tar.bz2 + tar xvf sherpa-onnx-conformer-zh-stateless2-2023-05-23.tar.bz2 + rm sherpa-onnx-conformer-zh-stateless2-2023-05-23.tar.bz2 + + ln -s sherpa-onnx-conformer-zh-stateless2-2023-05-23 exp-zh + +The ``hotwords_cn.txt`` contains: + +.. code-block:: + + 文森特卡索 + 周望君 + 朱丽楠 + 蒋有伯 + +C++ api +******* + +**Decoding without hotwords** + +.. code-block:: + + ./build/bin/sherpa-onnx-offline \ + --encoder=exp-zh/encoder-epoch-99-avg-1.onnx \ + --decoder=exp-zh/decoder-epoch-99-avg-1.onnx \ + --joiner=exp-zh/joiner-epoch-99-avg-1.onnx \ + --tokens=exp-zh/tokens.txt \ + --decoding-method=modified_beam_search \ + exp-zh/test_wavs/3.wav exp-zh/test_wavs/4.wav exp-zh/test_wavs/5.wav exp-zh/test_wavs/6.wav + +The output is: + +.. 
code-block:: + + /star-kw/kangwei/code/sherpa-onnx/sherpa-onnx/csrc/parse-options.cc:Read:361 ./build/bin/sherpa-onnx-offline --encoder=exp-zh/encoder-epoch-99-avg-1.onnx --decoder=exp-zh/decoder-epoch-99-avg-1.onnx --joiner=exp-zh/joiner-epoch-99-avg-1.onnx --tokens=exp-zh/tokens.txt --decoding-method=modified_beam_search exp-zh/test_wavs/3.wav exp-zh/test_wavs/4.wav exp-zh/test_wavs/5.wav exp-zh/test_wavs/6.wav + + OfflineRecognizerConfig(feat_config=OfflineFeatureExtractorConfig(sampling_rate=16000, feature_dim=80), model_config=OfflineModelConfig(transducer=OfflineTransducerModelConfig(encoder_filename="exp-zh/encoder-epoch-99-avg-1.onnx", decoder_filename="exp-zh/decoder-epoch-99-avg-1.onnx", joiner_filename="exp-zh/joiner-$poch-99-avg-1.onnx"), paraformer=OfflineParaformerModelConfig(model=""), nemo_ctc=OfflineNemoEncDecCtcModelConfig(model=""), whisper=OfflineWhisperModelConfig$encoder="", decoder="", language="", task="transcribe"), tdnn=OfflineTdnnModelConfig(model=""), tokens="exp-zh/tokens.txt", num_threads=2, debug=False, provider="cpu", model_type=""), lm_config=OfflineLMConfig(model="", scale=0.5), decoding_method="modified_beam_search", max_active$paths=4, hotwords_file=, hotwords_score=1.5) + Creating recognizer ... + Started + Done! + + exp-zh/test_wavs/3.wav + {"text":"文森特卡所是全球知名的法国性格派演员","timestamps":"[0.00, 0.16, 0.68, 1.32, 1.72, 2.08, 2.60, 2.88, 3.20, 3.52, 3.92, 4.40, 4.68, 5.12, 5.44, 6.36, $.96, 7.32]","tokens":["文","森","特","卡","所","是","全","球","知","名","的","法","国","性","格","派","演","员"]} + ---- + exp-zh/test_wavs/4.wav + {"text":"蒋友伯被拍到带着女儿出游","timestamps":"[0.00, 0.20, 0.88, 1.36, 1.76, 2.08, 2.28, 2.68, 2.92, 3.16, 3.44, 3.80]","tokens":["蒋","友","伯","被","拍",$ + 到","带","着","女","儿","出","游"]} + ---- + exp-zh/test_wavs/5.wav + {"text":"周望军就落实控物价","timestamps":"[0.00, 0.16, 0.88, 1.24, 1.64, 1.96, 2.76, 3.04, 3.32]","tokens":["周","望","军","就","落","实","控","物","价"]} + ---- + exp-zh/test_wavs/6.wav + {"text":"朱立南在上市见面会上表示","timestamps":"[0.00, 0.16, 0.80, 1.12, 1.44, 1.68, 1.92, 2.16, 2.36, 2.60, 2.84, 3.12]","tokens":["朱","立","南","在","上",$ + 市","见","面","会","上","表","示"]} + ---- + num threads: 2 + decoding method: modified_beam_search + max active paths: 4 + Elapsed seconds: 1.883 s + Real time factor (RTF): 1.883 / 20.328 = 0.093 + + +**Decoding with hotwords** + +.. 
code-block:: + + ./build/bin/sherpa-onnx-offline \ + --encoder=exp-zh/encoder-epoch-99-avg-1.onnx \ + --decoder=exp-zh/decoder-epoch-99-avg-1.onnx \ + --joiner=exp-zh/joiner-epoch-99-avg-1.onnx \ + --tokens=exp-zh/tokens.txt \ + --decoding-method=modified_beam_search \ + --modeling-unit=cjkchar \ + --hotwords-file=hotwords_cn.txt \ + --hotwords-score=2.0 \ + exp-zh/test_wavs/3.wav exp-zh/test_wavs/4.wav exp-zh/test_wavs/5.wav exp-zh/test_wavs/6.wav + + OfflineRecognizerConfig(feat_config=OfflineFeatureExtractorConfig(sampling_rate=16000, feature_dim=80), model_config=OfflineModelConfig(transducer=OfflineTransducerModelConfig(encoder_filename="exp-zh/encoder-epoch-99-avg-1.onnx", decoder_filename="exp-zh/decoder-epoch-99-avg-1.onnx", joiner_filename="exp-zh/joiner-$poch-99-avg-1.onnx"), paraformer=OfflineParaformerModelConfig(model=""), nemo_ctc=OfflineNemoEncDecCtcModelConfig(model=""), whisper=OfflineWhisperModelConfig$encoder="", decoder="", language="", task="transcribe"), tdnn=OfflineTdnnModelConfig(model=""), tokens="exp-zh/tokens.txt", num_threads=2, debug=False, provider="cpu", model_type=""), lm_config=OfflineLMConfig(model="", scale=0.5), decoding_method="modified_beam_search", max_active$paths=4, hotwords_file=hotwords_cn.txt, hotwords_score=2) + Creating recognizer ... + Started + Done! + + exp-zh/test_wavs/3.wav + {"text":"文森特卡索是全球知名的法国性格派演员","timestamps":"[0.00, 0.16, 0.64, 1.28, 1.64, 2.04, 2.60, 2.88, 3.20, 3.52, 3.92, 4.40, 4.68, 5.12, 5.44, 6.36, $.96, 7.32]","tokens":["文","森","特","卡","索","是","全","球","知","名","的","法","国","性","格","派","演","员"]} + ---- + exp-zh/test_wavs/4.wav + {"text":"蒋有伯被拍到带着女儿出游","timestamps":"[0.00, 0.12, 0.80, 1.36, 1.76, 2.08, 2.28, 2.68, 2.92, 3.16, 3.44, 3.80]","tokens":["蒋","有","伯","被","拍",$ + 到","带","着","女","儿","出","游"]} + ---- + exp-zh/test_wavs/5.wav + {"text":"周望君就落实空物价","timestamps":"[0.00, 0.12, 0.80, 1.24, 1.56, 1.96, 2.68, 3.08, 3.32]","tokens":["周","望","君","就","落","实","空","物","价"]} + ---- + exp-zh/test_wavs/6.wav + {"text":"朱丽楠在上市见面会上表示","timestamps":"[0.00, 0.12, 0.80, 1.12, 1.44, 1.68, 1.92, 2.16, 2.36, 2.60, 2.84, 3.12]","tokens":["朱","丽","楠","在","上",$ + 市","见","面","会","上","表","示"]} + ---- + num threads: 2 + decoding method: modified_beam_search + max active paths: 4 + Elapsed seconds: 1.810 s + Real time factor (RTF): 1.810 / 20.328 = 0.089 + +.. hint:: + + ``文森特卡所`` -> ``文森特卡索`` + + ``周望军`` -> ``周望君`` + + ``朱立南`` -> ``朱丽楠`` + + ``蒋友伯`` -> ``蒋有伯`` + + +Python api +********** + +**Decoding without hotwords** + +.. code-block:: + + python python-api-examples/offline-decode-files.py \ + --encoder exp-zh/encoder-epoch-99-avg-1.onnx \ + --decoder exp-zh/decoder-epoch-99-avg-1.onnx \ + --joiner exp-zh/joiner-epoch-99-avg-1.onnx \ + --tokens exp-zh/tokens.txt \ + --decoding-method modified_beam_search \ + exp-zh/test_wavs/3.wav exp-zh/test_wavs/4.wav exp-zh/test_wavs/5.wav exp-zh/test_wavs/6.wav + +The output is: + +.. code-block:: + + Started! + Done! + exp-zh/test_wavs/3.wav + 文森特卡所是全球知名的法国性格派演员 + ---------- + exp-zh/test_wavs/4.wav + 蒋友伯被拍到带着女儿出游 + ---------- + exp-zh/test_wavs/5.wav + 周望军就落实控物价 + ---------- + exp-zh/test_wavs/6.wav + 朱立南在上市见面会上表示 + ---------- + num_threads: 1 + decoding_method: modified_beam_search + Wave duration: 20.328 s + Elapsed time: 2.653 s + Real time factor (RTF): 2.653/20.328 = 0.131 + + +**Decoding with hotwords** + +.. 
code-block:: + + python python-api-examples/offline-decode-files.py \ + --encoder exp-zh/encoder-epoch-99-avg-1.onnx \ + --decoder exp-zh/decoder-epoch-99-avg-1.onnx \ + --joiner exp-zh/joiner-epoch-99-avg-1.onnx \ + --tokens exp-zh/tokens.txt \ + --decoding-method modified_beam_search \ + --modeling-unit=cjkchar \ + --hotwords-file hotwords_cn.txt \ + --hotwords-score 2.0 \ + exp-zh/test_wavs/3.wav exp-zh/test_wavs/4.wav exp-zh/test_wavs/5.wav exp-zh/test_wavs/6.wav + +The output is: + +.. code-block:: + + Started! + Done! + exp-zh/test_wavs/3.wav + 文森特卡索是全球知名的法国性格派演员 + ---------- + exp-zh/test_wavs/4.wav + 蒋有伯被拍到带着女儿出游 + ---------- + exp-zh/test_wavs/5.wav + 周望君就落实空物价 + ---------- + exp-zh/test_wavs/6.wav + 朱丽楠在上市见面会上表示 + ---------- + num_threads: 1 + decoding_method: modified_beam_search + Wave duration: 20.328 s + Elapsed time: 2.636 s + Real time factor (RTF): 2.636/20.328 = 0.130 + + +.. hint:: + + ``文森特卡所`` -> ``文森特卡索`` + + ``周望军`` -> ``周望君`` + + ``朱立南`` -> ``朱丽楠`` + + ``蒋友伯`` -> ``蒋有伯`` + + +Modeling unit is cjkchar+bpe +^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +**Download the model** + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20.tar.bz2 + tar xvf sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20.tar.bz2 + rm sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20.tar.bz2 + + ln -s sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20 exp-mixed + + +Export bpe.vocab if you can't find bpe.vocab in the model directory. + +.. code-block:: + + python script/export_bpe_vocab.py --bpe-model exp/bpe.model + +The ``hotwords_mix.txt`` contains: + +.. code-block:: + + 礼拜二 + 频繁 + +C++ api +******* + +**Decoding without hotwords** + +.. code-block:: + + ./build/bin/sherpa-onnx \ + --encoder=exp-mixed/encoder-epoch-99-avg-1.onnx \ + --decoder=exp-mixed/decoder-epoch-99-avg-1.onnx \ + --joiner=exp-mixed/joiner-epoch-99-avg-1.onnx \ + --decoding-method=modified_beam_search \ + --tokens=exp-mixed/tokens.txt \ + exp-mixed/test_wavs/0.wav exp-mixed/test_wavs/2.wav + +The output is: + +.. 
code-block:: + + /star-kw/kangwei/code/sherpa-onnx/sherpa-onnx/csrc/parse-options.cc:Read:361 ./build/bin/sherpa-onnx --encoder=exp-mixed/encoder-epoch-99-avg-1.onnx --decoder=exp-mixed/decoder-epoch-99-avg-1.onnx --joiner=exp-mixed/joiner-epoch-99-avg-1.onnx --decoding-method=modified_beam_search --tokens=exp-mixed/tokens.txt exp-mixed/test_wavs/0.wav exp-mixed/test_wavs/2.wav + OnlineRecognizerConfig(feat_config=FeatureExtractorConfig(sampling_rate=16000, feature_dim=80), model_config=OnlineModelConfig(transducer=OnlineTransducerModelConfig(encoder="exp-mixed/encoder-epoch-99-avg-1.onnx", decoder="exp-mixed/decoder-epoch-99-avg-1.onnx", joiner="exp-mixed/joiner-epoch-99-avg-1.onnx"), paraformer=OnlineParaformerModelConfig(encoder="", decoder=""), tokens="exp-mixed/tokens.txt", num_threads=1, debug=False, provider="cpu", model_type=""), lm_config=OnlineLMConfig(model="", scale=0.5), endpoint_config=EndpointConfig(rule1=EndpointRule(must_contain_nonsilence=False, min_trailing_silence=2.4, min_utterance_length=0), rule2=EndpointRule(must_contain_nonsilence=True, min_trailing_silence=1.2, min_utterance_length=0), rule3=EndpointRule(must_contain_nonsilence=False, min_trailing_silence=0, min_utterance_length=20)), enable_endpoint=True, max_active_paths=4, hotwords_score=1.5, hotwords_file="", decoding_method="modified_beam_search") + + exp-mixed/test_wavs/0.wav + Elapsed seconds: 3, Real time factor (RTF): 0.3 + 昨天是 MONDAY TODAY IS LIBR THE DAY AFTER TOMORROW是星期三 + {"is_final":false,"segment":0,"start_time":0.0,"text":"昨天是 MONDAY TODAY IS LIBR THE DAY AFTER TOMORROW是星期三","timestamps":"[0.64, 1.04, 1.60, 2.08, 2.20, 2.40, 4.16, 4.40, 4.88, 5.56, 5.80, 6.16, 6.84, 7.12, 7.44, 8.04, 8.16, 8.24, 8.28, 9.04, 9.40, 9.64, 9.88]","tokens":["昨","天","是"," MO","N","DAY"," TO","DAY"," IS"," LI","B","R"," THE"," DAY"," AFTER"," TO","M","OR","ROW","是","星","期","三"]} + + exp-mixed/test_wavs/2.wav + Elapsed seconds: 1.7, Real time factor (RTF): 0.37 + 是不是平凡的啊不认识记下来 FREQUENTLY频繁的 + {"is_final":false,"segment":0,"start_time":0.0,"text":"是不是平凡的啊不认识记下来 FREQUENTLY频繁的","timestamps":"[0.00, 0.40, 0.52, 0.96, 1.08, 1.28, 1.48, 1.68, 1.84, 2.00, 2.24, 2.36, 2.52, 2.68, 2.92, 3.00, 3.12, 3.32, 3.64, 3.96, 4.36]","tokens":["是","不","是","平","凡","的","啊","不","认","识","记","下","来"," F","RE","QU","ENT","LY","频","繁","的"]} + + +**Decoding with hotwords** + +.. code-block:: bash + + ./build/bin/sherpa-onnx \ + --encoder=exp-mixed/encoder-epoch-99-avg-1.onnx \ + --decoder=exp-mixed/decoder-epoch-99-avg-1.onnx \ + --joiner=exp-mixed/joiner-epoch-99-avg-1.onnx \ + --decoding-method=modified_beam_search \ + --tokens=exp-mixed/tokens.txt \ + --modeling-unit=cjkchar+bpe \ + --bpe-vocab=exp/bpe.vocab \ + --hotwords-file=hotwords_mix.txt \ + --hotwords-score=2.0 \ + exp-mixed/test_wavs/0.wav exp-mixed/test_wavs/2.wav + +The output is: + +.. 
code-block:: + + /star-kw/kangwei/code/sherpa-onnx/sherpa-onnx/csrc/parse-options.cc:Read:361 ./build/bin/sherpa-onnx --encoder=exp-mixed/encoder-epoch-99-avg-1.onnx --decoder=exp-mixed/decoder-epoch-99-avg-1.onnx --joiner=exp-mixed/joiner-epoch-99-avg-1.onnx --decoding-method=modified_beam_search --tokens=exp-mixed/tokens.txt --tokens-type=cjkchar+bpe --bpe-model=exp-mixed/bpe.model --hotwords-file=hotwords_mix.txt --hotwords-score=2.0 exp-mixed/test_wavs/0.wav exp-mixed/test_wavs/2.wav + + OnlineRecognizerConfig(feat_config=FeatureExtractorConfig(sampling_rate=16000, feature_dim=80), model_config=OnlineModelConfig(transducer=OnlineTransducerModelConfig(encoder="exp-mixed/encoder-epoch-99-avg-1.onnx", decoder="exp-mixed/decoder-epoch-99-avg-1.onnx", joiner="exp-mixed/joiner-epoch-99-avg-1.onnx"), paraformer=OnlineParaformerModelConfig(encoder="", decoder=""), tokens="exp-mixed/tokens.txt", num_threads=1, debug=False, provider="cpu", model_type=""), lm_config=OnlineLMConfig(model="", scale=0.5), endpoint_config=EndpointConfig(rule1=EndpointRule(must_contain_nonsilence=False, min_trailing_silence=2.4, min_utterance_length=0), rule2=EndpointRule(must_contain_nonsilence=True, min_trailing_silence=1.2, min_utterance_length=0), rule3=EndpointRule(must_contain_nonsilence=False, min_trailing_silence=0, min_utterance_length=20)), enable_endpoint=True, max_active_paths=4, hotwords_score=2, hotwords_file="hotwords_mix.txt", decoding_method="modified_beam_search") + + exp-mixed/test_wavs/0.wav + Elapsed seconds: 3.2, Real time factor (RTF): 0.32 + 昨天是 MONDAY TODAY IS礼拜二 THE DAY AFTER TOMORROW是星期三 + {"is_final":false,"segment":0,"start_time":0.0,"text":"昨天是 MONDAY TODAY IS礼拜二 THE DAY AFTER TOMORROW是星期三","timestamps":"[0.64, 1.04, 1.60, 2.08, 2.20, 2.40, 4.16, 4.40, 4.88, 5.56, 5.68, 6.00, 6.84, 7.12, 7.44, 8.04, 8.16, 8.24, 8.28, 9.04, 9.40, 9.64, 9.88]","tokens":["昨","天","是"," MO","N","DAY"," TO","DAY"," IS","礼","拜","二"," THE"," DAY"," AFTER"," TO","M","OR","ROW","是","星","期","三"]} + + exp-mixed/test_wavs/2.wav + Elapsed seconds: 1.9, Real time factor (RTF): 0.4 + 是不是频繁的啊不认识记下来 FREQUENTLY频繁的 + {"is_final":false,"segment":0,"start_time":0.0,"text":"是不是频繁的啊不认识记下来 FREQUENTLY频繁的","timestamps":"[0.00, 0.40, 0.52, 0.96, 1.08, 1.28, 1.48, 1.68, 1.84, 2.00, 2.24, 2.36, 2.52, 2.68, 2.92, 3.00, 3.12, 3.32, 3.64, 3.96, 4.36]","tokens":["是","不","是","频","繁","的","啊","不","认","识","记","下","来"," F","RE","QU","ENT","LY","频","繁","的"]} + + +.. hint:: + + ``LIBR`` -> ``礼拜二`` + + ``平凡`` -> ``频繁`` + + +Python api +********** + +**Decoding without hotwords** + +.. code-block:: + + python python-api-examples/online-decode-files.py \ + --encoderexp-mixed/encoder-epoch-99-avg-1.onnx \ + --decoder exp-mixed/decoder-epoch-99-avg-1.onnx \ + --joiner exp-mixed/joiner-epoch-99-avg-1.onnx \ + --decoding-method modified_beam_search \ + --tokens exp-mixed/tokens.txt + exp-mixed/test_wavs/0.wav exp-mixed/test_wavs/2.wav + +The output is: + +.. code-block:: + + Started! + Done! + exp-mixed/test_wavs/0.wav + 昨天是 MONDAY TODAY IS LIBR THE DAY AFTER TOMORROW是星期三 + ---------- + exp-mixed/test_wavs/2.wav + 是不是平凡的啊不认识记下来 FREQUENTLY频繁的 + ---------- + num_threads: 1 + decoding_method: modified_beam_search + Wave duration: 14.743 s + Elapsed time: 3.052 s + Real time factor (RTF): 3.052/14.743 = 0.207 + + +**Decoding with hotwords** + +.. 
code-block:: + + python python-api-examples/online-decode-files.py \ + --encoder exp-mixed/encoder-epoch-99-avg-1.onnx \ + --decoder exp-mixed/decoder-epoch-99-avg-1.onnx \ + --joiner exp-mixed/joiner-epoch-99-avg-1.onnx \ + --decoding-method modified_beam_search \ + --tokens exp-mixed/tokens.txt \ + --modeling-unit cjkchar+bpe \ + --bpe-vocab exp-mixed/bpe.vocab \ + --hotwords-file hotwords_mix.txt \ + --hotwords-score 2.0 \ + exp-mixed/test_wavs/0.wav exp-mixed/test_wavs/2.wav + +The output is: + +.. code-block:: + + Started! + Done! + exp-mixed/test_wavs/0.wav + 昨天是 MONDAY TODAY IS礼拜二 THE DAY AFTER TOMORROW是星期三 + ---------- + exp-mixed/test_wavs/2.wav + 是不是频繁的啊不认识记下来 FREQUENTLY频繁的 + ---------- + num_threads: 1 + decoding_method: modified_beam_search + Wave duration: 14.743 s + Elapsed time: 3.060 s + Real time factor (RTF): 3.060/14.743 = 0.208 + + +.. hint:: + + ``LIBR`` -> ``礼拜二`` + + ``平凡`` -> ``频繁`` diff --git a/docs/source/onnx/hotwords/pic/context_graph.png b/docs/source/onnx/hotwords/pic/context_graph.png new file mode 100644 index 000000000..7b00ab9f6 Binary files /dev/null and b/docs/source/onnx/hotwords/pic/context_graph.png differ diff --git a/docs/source/onnx/index.rst b/docs/source/onnx/index.rst new file mode 100644 index 000000000..f87ee8703 --- /dev/null +++ b/docs/source/onnx/index.rst @@ -0,0 +1,84 @@ +sherpa-onnx +=========== + +.. hint:: + + During speech recognition, it does not need to access the Internet. + Everyting is processed locally on your device. + +We support using `onnx`_ with `onnxruntime`_ to replace `PyTorch`_ for neural +network computation. The code is put in a separate repository `sherpa-onnx`_. + +`sherpa-onnx`_ is self-contained and everything can be compiled from source. + +Please refer to +``_ +for how to export models to `onnx`_ format. + +In the following, we describe how to build `sherpa-onnx`_ for Linux, macOS, +Windows, embedded systems, Android, and iOS. + +Also, we show how to use it for speech recognition with pre-trained models. + +.. toctree:: + :maxdepth: 5 + + ./tutorials/index + ./install/index + ./faqs/index + ./python/index + ./c-api/index + ./java-api/index + ./javascript-api/index + ./kotlin-api/index + ./swift-api/index + ./go-api/index + ./csharp-api/index + ./pascal-api/index + ./lazarus/index + ./wasm/index + ./android/index + ./harmony-os/index.rst + ./ios/index + ./flutter/index + ./websocket/index + ./hotwords/index + ./kws/index + ./punctuation/index + ./audio-tagging/index + ./spoken-language-identification/index + ./vad/index + ./pretrained_models/index + ./moonshine/index + ./sense-voice/index + ./FireRedAsr/index + +.. toctree:: + :maxdepth: 5 + :caption: Speaker diarization + + ./speaker-diarization/index + +.. toctree:: + :maxdepth: 5 + :caption: Speaker Identification + + ./speaker-identification/index + +.. toctree:: + :maxdepth: 5 + :caption: Speech enhancement + + ./speech-enhancment/index + +.. toctree:: + :maxdepth: 5 + :caption: RKNN + + ./rknn/index + +.. toctree:: + :maxdepth: 5 + :caption: tts + + ./tts/index diff --git a/docs/source/onnx/install/aarch64-embedded-linux.rst b/docs/source/onnx/install/aarch64-embedded-linux.rst new file mode 100644 index 000000000..7276523f1 --- /dev/null +++ b/docs/source/onnx/install/aarch64-embedded-linux.rst @@ -0,0 +1,316 @@ + +.. _sherpa-onnx-linux-aarch64-cross-compiling: + +Embedded Linux (aarch64) +======================== + +This page describes how to build `sherpa-onnx`_ for embedded Linux (aarch64, 64-bit) +with cross-compiling on an x64 machine with Ubuntu OS. 
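+
+.. hint::
+
+   A quick way to confirm that your build host is indeed an ``x86_64`` machine is shown
+   below. This is just a generic shell check and is not specific to `sherpa-onnx`_:
+
+   .. code-block:: bash
+
+      # Should print x86_64 on the build host used for cross-compiling
+      uname -m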
+ +.. warning:: + + By cross-compiling we mean that you do the compilation on a ``x86_64`` machine. + And you copy the generated binaries from a ``x86_64`` machine and run them on + an ``aarch64`` machine. + + If you want to compile `sherpa-onnx`_ on an ``aarch64`` machine directly, + please see :ref:`install_sherpa_onnx_on_linux`. + +.. note:: + + You can download pre-compiled binaries for ``aarch64`` from the following URL + ``_ + + Please always download the latest version. + + Example command to download the version ``1.9.12``: + + .. code-block:: bash + + # binaries built with shared libraries + wget https://huggingface.co/csukuangfj/sherpa-onnx-libs/resolve/main/aarch64/sherpa-onnx-v1.9.12-linux-aarch64-shared.tar.bz2 + + # binaries built with static link + wget https://huggingface.co/csukuangfj/sherpa-onnx-libs/resolve/main/aarch64/sherpa-onnx-v1.9.12-linux-aarch64-static.tar.bz2 + + # For users from China + # 中国国内用户,如果访问不了 huggingface, 请使用 + + # binaries built with shared libraries + wget https://hf-mirror.com/csukuangfj/sherpa-onnx-libs/resolve/main/aarch64/sherpa-onnx-v1.9.12-linux-aarch64-shared.tar.bz2 + + # binaries built with static link + wget https://hf-mirror.com/csukuangfj/sherpa-onnx-libs/resolve/main/aarch64/sherpa-onnx-v1.9.12-linux-aarch64-static.tar.bz2 + +.. hint:: + + We provide two colab notebooks + for you to try this section step by step. + + .. list-table:: + + * - Build with ``shared`` libraries + - Build with ``static`` libraries + * - |build sherpa-onnx for aarch64 shared colab notebook| + - |build sherpa-onnx for aarch64 static colab notebook| + + If you are using Windows/macOS or you don't want to setup your local environment + for cross-compiling, please use the above colab notebooks. + +.. |build sherpa-onnx for aarch64 shared colab notebook| image:: https://colab.research.google.com/assets/colab-badge.svg + :target: https://github.com/k2-fsa/colab/blob/master/sherpa-onnx/sherpa_onnx_aarch64_cross_compiling_shared_libs.ipynb + +.. |build sherpa-onnx for aarch64 static colab notebook| image:: https://colab.research.google.com/assets/colab-badge.svg + :target: https://github.com/k2-fsa/colab/blob/master/sherpa-onnx/sherpa_onnx_aarch64_cross_compiling_static_libs.ipynb + +.. _sherpa_onnx_install_for_aarch64_embedded_linux: + +Install toolchain +----------------- + +.. warning:: + + The toolchains for dynamic linking and static linking are different. + +.. warning:: + + The toolchains for dynamic linking and static linking are different. + +.. warning:: + + The toolchains for dynamic linking and static linking are different. + +The first step is to install a toolchain for cross-compiling. + +.. warning:: + + You can use any toolchain that is suitable for your platform. The toolchain + we use below is just an example. + +Visit ``_ + +We are going to download ``gcc-linaro-7.5.0-2019.12-x86_64_aarch64-linux-gnu.tar.xz``, +which has been uploaded to ``_. + +Assume you want to install it in the folder ``$HOME/software``: + +.. code-block:: bash + + mkdir -p $HOME/software + cd $HOME/software + + # Note: the following toolchain gcc 7.5 is for building shared libraries. + # Please see below to use gcc 10 to build static libaries. + # + # You would get link errors if you use gcc 7.5 to build static libraries. 
+ # + wget https://huggingface.co/csukuangfj/sherpa-ncnn-toolchains/resolve/main/gcc-linaro-7.5.0-2019.12-x86_64_aarch64-linux-gnu.tar.xz + + # For users from China + # 中国国内用户,如果访问不了 huggingface, 请使用 + # wget https://hf-mirror.com/csukuangfj/sherpa-ncnn-toolchains/resolve/main/gcc-linaro-7.5.0-2019.12-x86_64_aarch64-linux-gnu.tar.xz + + tar xvf gcc-linaro-7.5.0-2019.12-x86_64_aarch64-linux-gnu.tar.xz + +Next, we need to set the following environment variable: + +.. code-block:: bash + + export PATH=$HOME/software/gcc-linaro-7.5.0-2019.12-x86_64_aarch64-linux-gnu/bin:$PATH + +To check that we have installed the cross-compiling toolchain successfully, please +run: + +.. code-block:: bash + + aarch64-linux-gnu-gcc --version + +which should print the following log: + +.. code-block:: + + aarch64-linux-gnu-gcc (Linaro GCC 7.5-2019.12) 7.5.0 + Copyright (C) 2017 Free Software Foundation, Inc. + This is free software; see the source for copying conditions. There is NO + warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + +Congratulations! You have successfully installed a toolchain for cross-compiling +`sherpa-onnx`_. + +Build sherpa-onnx +----------------- + +Finally, let us build `sherpa-onnx`_. + +.. code-block:: bash + + git clone https://github.com/k2-fsa/sherpa-onnx + cd sherpa-onnx + export BUILD_SHARED_LIBS=ON + ./build-aarch64-linux-gnu.sh + +After building, you will get two binaries: + +.. code-block:: bash + + sherpa-onnx$ ls -lh build-aarch64-linux-gnu/install/bin/ + total 378K + -rwxr-xr-x 1 kuangfangjun root 187K Feb 21 21:55 sherpa-onnx + -rwxr-xr-x 1 kuangfangjun root 191K Feb 21 21:55 sherpa-onnx-alsa + +.. note:: + + Please also copy the ``onnxruntime`` lib to your embedded systems and put it + into the same directory as ``sherpa-onnx`` and ``sherpa-onnx-alsa``. + + + .. code-block:: bash + + sherpa-onnx$ ls -lh build-aarch64-linux-gnu/install/lib/*onnxruntime* + lrw-r--r-- 1 kuangfangjun root 24 Feb 21 21:38 build-aarch64-linux-gnu/install/lib/libonnxruntime.so -> libonnxruntime.so.1.14.0 + -rw-r--r-- 1 kuangfangjun root 15M Feb 21 21:38 build-aarch64-linux-gnu/install/lib/libonnxruntime.so.1.14.0 + + +That's it! + +.. hint:: + + - ``sherpa-onnx`` is for decoding a single file + - ``sherpa-onnx-alsa`` is for real-time speech recongition by reading + the microphone with `ALSA `_ + +.. _sherpa-onnx-alsa: + +sherpa-onnx-alsa +---------------- + +.. caution:: + + We recommend that you use ``sherpa-onnx-alsa`` on embedded systems such + as Raspberry pi. + + You need to provide a ``device_name`` when invoking ``sherpa-onnx-alsa``. + We describe below how to find the device name for your microphone. + + Run the following command: + + .. code-block:: bash + + arecord -l + + to list all avaliable microphones for recording. If it complains that + ``arecord: command not found``, please use ``sudo apt-get install alsa-utils`` + to install it. + + If the above command gives the following output: + + .. code-block:: bash + + **** List of CAPTURE Hardware Devices **** + card 3: UACDemoV10 [UACDemoV1.0], device 0: USB Audio [USB Audio] + Subdevices: 1/1 + Subdevice #0: subdevice #0 + + In this case, I only have 1 microphone. It is ``card 3`` and that card + has only ``device 0``. To select ``card 3`` and ``device 0`` on that card, + we need to pass ``plughw:3,0`` to ``sherpa-onnx-alsa``. (Note: It has the format + ``plughw:card_number,device_index``.) + + For instance, you have to use + + .. 
code-block:: bash + + ./sherpa-onnx-alsa \ + --encoder=./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/encoder-epoch-99-avg-1.onnx \ + --decoder=./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/decoder-epoch-99-avg-1.onnx \ + --joiner=./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/joiner-epoch-99-avg-1.onnx \ + --tokens=./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/tokens.txt \ + plughw:3,0 + + Please change the card number and also the device index on the selected card + accordingly in your own situation. Otherwise, you won't be able to record + with your microphone. + +Please read :ref:`sherpa-onnx-pre-trained-models` for usages about +the generated binaries. + +.. hint:: + + If you want to select a pre-trained model for Raspberry that can be + run on real-time, we recommend you to + use :ref:`sherpa_onnx_zipformer_transducer_models`. + + +Please create an issue at ``_ +if you have any problems. + +How to build static libraries and static linked binaries +-------------------------------------------------------- + +If you want to build static libraries and static linked binaries, please first +download a cross compile toolchain with GCC >= 9.0. The following is an example: + +.. code-block:: bash + + mkdir -p $HOME/software + cd $HOME/software + wget -q https://huggingface.co/csukuangfj/sherpa-ncnn-toolchains/resolve/main/gcc-arm-10.3-2021.07-x86_64-aarch64-none-linux-gnu.tar.xz + + # For users from China + # 中国国内用户,如果访问不了 huggingface, 请使用 + # wget -q https://hf-mirror.com/csukuangfj/sherpa-ncnn-toolchains/resolve/main/gcc-arm-10.3-2021.07-x86_64-aarch64-none-linux-gnu.tar.xz + + tar xf gcc-arm-10.3-2021.07-x86_64-aarch64-none-linux-gnu.tar.xz + +Next, we need to set the following environment variable: + +.. code-block:: bash + + export PATH=$HOME/software/gcc-arm-10.3-2021.07-x86_64-aarch64-none-linux-gnu/bin:$PATH + +To check that we have installed the cross-compiling toolchain successfully, please +run: + +.. code-block:: bash + + aarch64-none-linux-gnu-gcc --version + +which should print the following log: + +.. code-block:: + + aarch64-none-linux-gnu-gcc (GNU Toolchain for the A-profile Architecture 10.3-2021.07 (arm-10.29)) 10.3.1 20210621 + Copyright (C) 2020 Free Software Foundation, Inc. + This is free software; see the source for copying conditions. There is NO + warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + +Now you can build static libraries and static linked binaries with the following commands: + +.. code-block:: bash + + git clone https://github.com/k2-fsa/sherpa-onnx + cd sherpa-onnx + export BUILD_SHARED_LIBS=OFF + ./build-aarch64-linux-gnu.sh + +You can use the following commands to check that the generated binaries are indeed static linked: + +.. 
code-block:: bash + + $ cd build-aarch64-linux-gnu/bin + + $ ldd sherpa-onnx-alsa + not a dynamic executable + + $ readelf -d sherpa-onnx-alsa + + Dynamic section at offset 0xed9950 contains 30 entries: + Tag Type Name/Value + 0x0000000000000001 (NEEDED) Shared library: [libasound.so.2] + 0x0000000000000001 (NEEDED) Shared library: [libdl.so.2] + 0x0000000000000001 (NEEDED) Shared library: [libm.so.6] + 0x0000000000000001 (NEEDED) Shared library: [libpthread.so.0] + 0x0000000000000001 (NEEDED) Shared library: [libc.so.6] + 0x000000000000000f (RPATH) Library rpath: [$ORIGIN:/star-fj/fangjun/open-source/sherpa-onnx/build-aarch64-linux-gnu/_deps/onnxruntime-sr + c/lib:] + 0x000000000000000c (INIT) 0x404218 diff --git a/docs/source/onnx/install/arm-embedded-linux.rst b/docs/source/onnx/install/arm-embedded-linux.rst new file mode 100644 index 000000000..38691731e --- /dev/null +++ b/docs/source/onnx/install/arm-embedded-linux.rst @@ -0,0 +1,322 @@ +.. _sherpa-onnx-embedded-linux-arm-install: + +Embedded Linux (arm) +==================== + +This page describes how to build `sherpa-onnx`_ for embedded Linux (arm, 32-bit) +with ``cross-compiling`` on an x86 machine with Ubuntu OS. + +.. caution:: + + If you want to build `sherpa-onnx`_ directly on your board, please don't + use this document. Refer to :ref:`install_sherpa_onnx_on_linux` instead. + +.. caution:: + + If you want to build `sherpa-onnx`_ directly on your board, please don't + use this document. Refer to :ref:`install_sherpa_onnx_on_linux` instead. + +.. caution:: + + If you want to build `sherpa-onnx`_ directly on your board, please don't + use this document. Refer to :ref:`install_sherpa_onnx_on_linux` instead. + +.. hint:: + + This page is for cross-compiling. + +.. note:: + + You can download pre-compiled binaries for 32-bit ``ARM`` from the following URL + ``_ + + Please always download the latest version. + + Example command to download the version ``1.9.12``: + + .. code-block:: bash + + # binaries built with shared libraries + wget https://huggingface.co/csukuangfj/sherpa-onnx-libs/resolve/main/arm32/sherpa-onnx-v1.9.12-linux-arm-gnueabihf-shared.tar.bz2 + + # binaries built with static link + wget https://huggingface.co/csukuangfj/sherpa-onnx-libs/resolve/main/arm32/sherpa-onnx-v1.9.12-linux-arm-gnueabihf-static.tar.bz2 + + # For users from China + # 中国国内用户,如果访问不了 huggingface, 请使用 + + # binaries built with shared libraries + wget https://hf-mirror.com/csukuangfj/sherpa-onnx-libs/resolve/main/arm32/sherpa-onnx-v1.9.12-linux-arm-gnueabihf-shared.tar.bz2 + + # binaries built with static link + wget https://hf-mirror.com/csukuangfj/sherpa-onnx-libs/resolve/main/arm32/sherpa-onnx-v1.9.12-linux-arm-gnueabihf-static.tar.bz2 + +.. hint:: + + We provide two colab notebooks + for you to try this section step by step. + + .. list-table:: + + * - Build with ``shared`` libraries + - Build with ``static`` libraries + * - |build sherpa-onnx for arm shared colab notebook| + - |build sherpa-onnx for arm static colab notebook| + + If you are using Windows/macOS or you don't want to setup your local environment + for cross-compiling, please use the above colab notebooks. + +.. |build sherpa-onnx for arm shared colab notebook| image:: https://colab.research.google.com/assets/colab-badge.svg + :target: https://github.com/k2-fsa/colab/blob/master/sherpa-onnx/sherpa_onnx_arm_cross_compiling_shared_libs.ipynb + +.. 
|build sherpa-onnx for arm static colab notebook| image:: https://colab.research.google.com/assets/colab-badge.svg + :target: https://github.com/k2-fsa/colab/blob/master/sherpa-onnx/sherpa_onnx_arm_cross_compiling_static_libs.ipynb + +Install toolchain +----------------- + +The first step is to install a toolchain for cross-compiling. + +.. warning:: + + You can use any toolchain that is suitable for your platform. The toolchain + we use below is just an example. + +Visit ``_ to download the toolchain: + +We are going to download ``gcc-arm-10.3-2021.07-x86_64-arm-none-linux-gnueabihf.tar.xz``, +which has been uploaded to ``_. + +Assume you want to install it in the folder ``$HOME/software``: + +.. code-block:: bash + + mkdir -p $HOME/software + cd $HOME/software + wget -q https://huggingface.co/csukuangfj/sherpa-ncnn-toolchains/resolve/main/gcc-arm-10.3-2021.07-x86_64-arm-none-linux-gnueabihf.tar.xz + + # For users from China + # 中国国内用户,如果访问不了 huggingface, 请使用 + # wget -q https://hf-mirror.com/csukuangfj/sherpa-ncnn-toolchains/resolve/main/gcc-arm-10.3-2021.07-x86_64-arm-none-linux-gnueabihf.tar.xz + + tar xf gcc-arm-10.3-2021.07-x86_64-arm-none-linux-gnueabihf.tar.xz + +Next, we need to set the following environment variable: + +.. code-block:: bash + + export PATH=$HOME/software/gcc-arm-10.3-2021.07-x86_64-arm-none-linux-gnueabihf/bin:$PATH + +To check that we have installed the cross-compiling toolchain successfully, please +run: + +.. code-block:: bash + + arm-none-linux-gnueabihf-gcc --version + +which should print the following log: + +.. code-block:: + + arm-none-linux-gnueabihf-gcc (GNU Toolchain for the A-profile Architecture 10.3-2021.07 (arm-10.29)) 10.3.1 20210621 + Copyright (C) 2020 Free Software Foundation, Inc. + This is free software; see the source for copying conditions. There is NO + warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + +Congratulations! You have successfully installed a toolchain for cross-compiling +`sherpa-onnx`_. + +Build sherpa-onnx +----------------- + +Finally, let us build `sherpa-onnx`_. + +.. code-block:: bash + + git clone https://github.com/k2-fsa/sherpa-onnx + cd sherpa-onnx + export BUILD_SHARED_LIBS=ON + ./build-arm-linux-gnueabihf.sh + +After building, you will get the following binaries: + +.. code-block:: bash + + $ ls -lh build-arm-linux-gnueabihf/install/bin/ + + total 1.2M + -rwxr-xr-x 1 kuangfangjun root 395K Jul 7 16:28 sherpa-onnx + -rwxr-xr-x 1 kuangfangjun root 391K Jul 7 16:28 sherpa-onnx-alsa + -rwxr-xr-x 1 kuangfangjun root 351K Jul 7 16:28 sherpa-onnx-offline + +That's it! + +.. hint:: + + - ``sherpa-onnx`` is for decoding a single file using a streaming model + - ``sherpa-onnx-offline`` is for decoding a single file using a non-streaming model + - ``sherpa-onnx-alsa`` is for real-time speech recongition using a streaming model by reading + the microphone with `ALSA `_ + +.. caution:: + + We recommend that you use ``sherpa-onnx-alsa`` on embedded systems such + as Raspberry pi. + + You need to provide a ``device_name`` when invoking ``sherpa-onnx-alsa``. + We describe below how to find the device name for your microphone. + + Run the following command: + + .. code-block:: bash + + arecord -l + + to list all avaliable microphones for recording. If it complains that + ``arecord: command not found``, please use ``sudo apt-get install alsa-utils`` + to install it. + + If the above command gives the following output: + + .. 
code-block:: bash + + **** List of CAPTURE Hardware Devices **** + card 0: Audio [Axera Audio], device 0: 49ac000.i2s_mst-es8328-hifi-analog es8328-hifi-analog-0 [] + Subdevices: 1/1 + Subdevice #0: subdevice #0 + + In this case, I only have 1 microphone. It is ``card 0`` and that card + has only ``device 0``. To select ``card 0`` and ``device 0`` on that card, + we need to pass ``plughw:0,0`` to ``sherpa-onnx-alsa``. (Note: It has the format + ``plughw:card_number,device_index``.) + + For instance, you have to use + + .. code-block:: bash + + # Note: We use int8 models below. + ./bin/sherpa-onnx-alsa \ + ./sherpa-onnx-streaming-zipformer-en-2023-06-26/tokens.txt \ + ./sherpa-onnx-streaming-zipformer-en-2023-06-26/encoder-epoch-99-avg-1-chunk-16-left-64.int8.onnx \ + ./sherpa-onnx-streaming-zipformer-en-2023-06-26/decoder-epoch-99-avg-1-chunk-16-left-64.int8.onnx \ + ./sherpa-onnx-streaming-zipformer-en-2023-06-26/joiner-epoch-99-avg-1-chunk-16-left-64.int8.onnx \ + "plughw:0,0" + + Please change the card number and also the device index on the selected card + accordingly in your own situation. Otherwise, you won't be able to record + with your microphone. + +Please read :ref:`sherpa-onnx-pre-trained-models` for usages about +the generated binaries. + +Read below if you want to learn more. + +.. hint:: + + By default, all external dependencies are statically linked. That means, + the generated binaries are self-contained (except that it requires the + onnxruntime shared library at runtime). + + You can use the following commands to check that and you will find + they depend only on system libraries. + + .. code-block:: bash + + $ readelf -d build-arm-linux-gnueabihf/install/bin/sherpa-onnx + + Dynamic section at offset 0x61ee8 contains 30 entries: + Tag Type Name/Value + 0x00000001 (NEEDED) Shared library: [libonnxruntime.so.1.14.0] + 0x00000001 (NEEDED) Shared library: [libstdc++.so.6] + 0x00000001 (NEEDED) Shared library: [libm.so.6] + 0x00000001 (NEEDED) Shared library: [libgcc_s.so.1] + 0x00000001 (NEEDED) Shared library: [libc.so.6] + 0x0000000f (RPATH) Library rpath: [$ORIGIN:$ORIGIN/../lib:$ORIGIN/../../../sherpa_onnx/lib] + + $ readelf -d build-arm-linux-gnueabihf/install/bin/sherpa-onnx-alsa + + Dynamic section at offset 0x60ee0 contains 31 entries: + Tag Type Name/Value + 0x00000001 (NEEDED) Shared library: [libasound.so.2] + 0x00000001 (NEEDED) Shared library: [libonnxruntime.so.1.14.0] + 0x00000001 (NEEDED) Shared library: [libstdc++.so.6] + 0x00000001 (NEEDED) Shared library: [libm.so.6] + 0x00000001 (NEEDED) Shared library: [libgcc_s.so.1] + 0x00000001 (NEEDED) Shared library: [libc.so.6] + 0x0000000f (RPATH) Library rpath: [$ORIGIN] + + +Please create an issue at ``_ +if you have any problems. + +How to build static libraries and static linked binaries +-------------------------------------------------------- + +If you want to build static libraries and static linked binaries, please first +download a cross compile toolchain with GCC >= 9.0. The following is an example: + +.. 
code-block:: bash + + mkdir -p $HOME/software + cd $HOME/software + wget -q https://huggingface.co/csukuangfj/sherpa-ncnn-toolchains/resolve/main/gcc-arm-10.3-2021.07-x86_64-arm-none-linux-gnueabihf.tar.xz + + # For users from China + # 中国国内用户,如果访问不了 huggingface, 请使用 + wget -q https://hf-mirror.com/csukuangfj/sherpa-ncnn-toolchains/resolve/main/gcc-arm-10.3-2021.07-x86_64-arm-none-linux-gnueabihf.tar.xz + + tar xf gcc-arm-10.3-2021.07-x86_64-arm-none-linux-gnueabihf.tar.xz + +Next, we need to set the following environment variable: + +.. code-block:: bash + + export PATH=$HOME/software/gcc-arm-10.3-2021.07-x86_64-arm-none-linux-gnueabihf/bin:$PATH + + +To check that we have installed the cross-compiling toolchain successfully, please +run: + +.. code-block:: bash + + arm-none-linux-gnueabihf-gcc --version + +which should print the following log: + +.. code-block:: + + arm-none-linux-gnueabihf-gcc (GNU Toolchain for the A-profile Architecture 10.3-2021.07 (arm-10.29)) 10.3.1 20210621 + Copyright (C) 2020 Free Software Foundation, Inc. + This is free software; see the source for copying conditions. There is NO + warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + +Now you can build static libraries and static linked binaries with the following commands: + +.. code-block:: bash + + git clone https://github.com/k2-fsa/sherpa-onnx + cd sherpa-onnx + export BUILD_SHARED_LIBS=OFF + ./build-arm-linux-gnueabihf.sh + +You can use the following commands to check that the generated binaries are indeed static linked: + +.. code-block:: bash + + $ cd build-arm-linux-gnueabihf/bin + + $ ldd sherpa-onnx-alsa + not a dynamic executable + + $ readelf -d sherpa-onnx-alsa + + Dynamic section at offset 0xa68eb4 contains 31 entries: + Tag Type Name/Value + 0x00000001 (NEEDED) Shared library: [libasound.so.2] + 0x00000001 (NEEDED) Shared library: [libdl.so.2] + 0x00000001 (NEEDED) Shared library: [libm.so.6] + 0x00000001 (NEEDED) Shared library: [libpthread.so.0] + 0x00000001 (NEEDED) Shared library: [libc.so.6] + 0x00000001 (NEEDED) Shared library: [ld-linux-armhf.so.3] + 0x0000000f (RPATH) Library rpath: [$ORIGIN:/star-fj/fangjun/open-source/sherpa-onnx/build-arm-linux-gnueabihf/_deps/espeak_ng-src/lib:/star-fj/fangjun/open-source/sherpa-onnx/build-arm-linux-gnueabihf/_deps/onnxruntime-src/lib:] + 0x0000000c (INIT) 0x13550 diff --git a/docs/source/onnx/install/index.rst b/docs/source/onnx/install/index.rst new file mode 100644 index 000000000..ccdfd12f8 --- /dev/null +++ b/docs/source/onnx/install/index.rst @@ -0,0 +1,28 @@ +.. _install_sherpa_onnx: + +Installation +============ + +In this section, we describe how to install `sherpa-onnx`_ on various platforms. + +**Requirements**: + + - ``CMake >= 3.13`` + - A compiler that supports at least ``C++14`` + +.. hint:: + + You can use ``pip install cmake`` to install the latest cmake. + +.. toctree:: + :maxdepth: 2 + + ./linux.rst + ./macos.rst + ./windows.rst + ./aarch64-embedded-linux.rst + ./arm-embedded-linux.rst + ./riscv64-embedded-linux.rst + +If you want to build an Android app, please refer to :ref:`sherpa-onnx-android`. +If you want to build an iOS app, please refer to :ref:`sherpa-onnx-ios`. diff --git a/docs/source/onnx/install/linux.rst b/docs/source/onnx/install/linux.rst new file mode 100644 index 000000000..c666152d8 --- /dev/null +++ b/docs/source/onnx/install/linux.rst @@ -0,0 +1,93 @@ +.. _install_sherpa_onnx_on_linux: + +Linux +===== + +This page describes how to build `sherpa-onnx`_ on Linux. 
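Before you start, you may want to double-check that your environment satisfies
the requirements listed in :ref:`install_sherpa_onnx`, i.e., ``CMake >= 3.13``
and a compiler supporting at least ``C++14``. The following commands are only
a sketch of such a check; the exact version strings will differ on your machine:

.. code-block:: bash

   # Check the CMake version; 3.13 or later is required.
   cmake --version

   # Check the compiler version; any reasonably recent GCC or Clang
   # that supports C++14 is sufficient.
   gcc --version
   g++ --version

   # If your cmake is too old, you can install the latest one with pip.
   pip install cmake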
+ +All you need is to run: + +.. tabs:: + + .. tab:: CPU + + .. code-block:: bash + + git clone https://github.com/k2-fsa/sherpa-onnx + cd sherpa-onnx + mkdir build + cd build + + # By default, it builds static libaries and uses static link. + cmake -DCMAKE_BUILD_TYPE=Release .. + + # If you have GCC<=10, e.g., use Ubuntu <= 18.04 or use CentOS<=7, please + # use the following command to build shared libs; otherwise, you would + # get link errors from libonnxruntime.a + # + # cmake -DCMAKE_BUILD_TYPE=Release -DBUILD_SHARED_LIBS=ON .. + # + # + make -j6 + + .. tab:: Nvidia GPU (CUDA, x64) + + .. code-block:: bash + + git clone https://github.com/k2-fsa/sherpa-onnx + cd sherpa-onnx + mkdir build + cd build + cmake -DCMAKE_BUILD_TYPE=Release -DBUILD_SHARED_LIBS=ON -DSHERPA_ONNX_ENABLE_GPU=ON .. + make -j6 + + .. hint:: + + You need to install CUDA toolkit 11.8. Otherwise, you would get + errors at runtime. + + You can refer to ``_ + to install CUDA toolkit. + + .. tab:: Nvidia GPU (CUDA 10.2, arm64, e.g., Jetson Nano B01) + + .. code-block:: bash + + git clone https://github.com/k2-fsa/sherpa-onnx + cd sherpa-onnx + mkdir build + cd build + + cmake \ + -DSHERPA_ONNX_LINUX_ARM64_GPU_ONNXRUNTIME_VERSION=1.11.0 \ + -DCMAKE_BUILD_TYPE=Release \ + -DBUILD_SHARED_LIBS=ON \ + -DSHERPA_ONNX_ENABLE_GPU=ON \ + .. + + make + + .. tab:: Nvidia GPU (CUDA 11.4, arm64, e.g., Jetson Orin NX) + + .. code-block:: bash + + git clone https://github.com/k2-fsa/sherpa-onnx + cd sherpa-onnx + mkdir build + cd build + + cmake \ + -DSHERPA_ONNX_LINUX_ARM64_GPU_ONNXRUNTIME_VERSION=1.16.0 \ + -DCMAKE_BUILD_TYPE=Release \ + -DBUILD_SHARED_LIBS=ON \ + -DSHERPA_ONNX_ENABLE_GPU=ON \ + .. + + make + +After building, you will find an executable ``sherpa-onnx`` inside the ``bin`` directory. + +That's it! + +Please refer to :ref:`sherpa-onnx-pre-trained-models` for a list of pre-trained +models. diff --git a/docs/source/onnx/install/macos.rst b/docs/source/onnx/install/macos.rst new file mode 100644 index 000000000..a16e06dcb --- /dev/null +++ b/docs/source/onnx/install/macos.rst @@ -0,0 +1,26 @@ +macOS +===== + +This page describes how to build `sherpa-onnx`_ on macOS. + +.. hint:: + + It supports both Intel and Apple Silicon (e.g., M1). + +All you need is to run: + +.. code-block:: bash + + git clone https://github.com/k2-fsa/sherpa-onnx + cd sherpa-onnx + mkdir build + cd build + cmake -DCMAKE_BUILD_TYPE=Release .. + make -j6 + +After building, you will find an executable ``sherpa-onnx`` inside the ``bin`` directory. + +That's it! + +Please refer to :ref:`sherpa-onnx-pre-trained-models` for a list of pre-trained +models. diff --git a/docs/source/onnx/install/riscv64-embedded-linux.rst b/docs/source/onnx/install/riscv64-embedded-linux.rst new file mode 100644 index 000000000..7ab00efbb --- /dev/null +++ b/docs/source/onnx/install/riscv64-embedded-linux.rst @@ -0,0 +1,429 @@ +Embedded Linux (riscv64) +======================== + +This page describes how to build `sherpa-onnx`_ for embedded Linux (RISC-V, 64-bit) +with cross-compiling on an x64 machine with Ubuntu OS. It also demonstrates +how to use ``qemu`` to run the compiled binaries. + +.. hint:: + + We provide a colab notebook + |build sherpa-onnx for risc-v colab notebook| + for you to try this section step by step. + + If you are using Windows/macOS or you don't want to setup your local environment + for cross-compiling, please use the above colab notebook. + +.. 
|build sherpa-onnx for risc-v colab notebook| image:: https://colab.research.google.com/assets/colab-badge.svg + :target: https://github.com/k2-fsa/colab/blob/master/sherpa-onnx/sherpa_onnx_RISC_V.ipynb + +.. note:: + + You can download pre-compiled binaries for ``riscv64`` from the following URL + ``_ + + Please always download the latest version. + + Example command to download the version ``1.9.12``: + + .. code-block:: bash + + # binaries built with shared libraries + wget https://huggingface.co/csukuangfj/sherpa-onnx-libs/resolve/main/riscv64/sherpa-onnx-v1.9.12-linux-riscv64-shared.tar.bz2 + + # For users from China + # 中国国内用户,如果访问不了 huggingface, 请使用 + + # binaries built with shared libraries + # wget https://hf-mirror.com/csukuangfj/sherpa-onnx-libs/resolve/main/riscv64/sherpa-onnx-v1.9.12-linux-riscv64-shared.tar.bz2 + +Install toolchain +----------------- + +The first step is to install a toolchain for cross-compiling. + +.. code-block:: bash + + mkdir -p $HOME/toolchain + + wget -q https://occ-oss-prod.oss-cn-hangzhou.aliyuncs.com/resource//1663142514282/Xuantie-900-gcc-linux-5.10.4-glibc-x86_64-V2.6.1-20220906.tar.gz + + tar xf ./Xuantie-900-gcc-linux-5.10.4-glibc-x86_64-V2.6.1-20220906.tar.gz --strip-components 1 -C $HOME/toolchain + +Next, we need to set the following environment variable: + +.. code-block:: bash + + export PATH=$HOME/toolchain/bin:$PATH + +To check that you have installed the toolchain successfully, please run + +.. code-block:: bash + + $ riscv64-unknown-linux-gnu-gcc --version + + riscv64-unknown-linux-gnu-gcc (Xuantie-900 linux-5.10.4 glibc gcc Toolchain V2.6.1 B-20220906) 10.2.0 + Copyright (C) 2020 Free Software Foundation, Inc. + This is free software; see the source for copying conditions. There is NO + warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + + $ riscv64-unknown-linux-gnu-g++ --version + + riscv64-unknown-linux-gnu-g++ (Xuantie-900 linux-5.10.4 glibc gcc Toolchain V2.6.1 B-20220906) 10.2.0 + Copyright (C) 2020 Free Software Foundation, Inc. + This is free software; see the source for copying conditions. There is NO + warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + +Build sherpa-onnx +----------------- + +Next, let us build `sherpa-onnx`_. + +.. hint:: + + Currently, only shared libraries are supported. We ``will`` support + static linking in the future. + +.. code-block:: bash + + git clone https://github.com/k2-fsa/sherpa-onnx + cd sherpa-onnx + ./build-riscv64-linux-gnu.sh + +After building, you will get the following files + +.. 
code-block:: bash + + $ ls -lh build-riscv64-linux-gnu/install/bin + $ echo "---" + $ ls -lh build-riscv64-linux-gnu/install/lib + + total 292K + -rwxr-xr-x 1 root root 23K Mar 20 09:41 sherpa-onnx + -rwxr-xr-x 1 root root 27K Mar 20 09:41 sherpa-onnx-alsa + -rwxr-xr-x 1 root root 31K Mar 20 09:41 sherpa-onnx-alsa-offline + -rwxr-xr-x 1 root root 40K Mar 20 09:41 sherpa-onnx-alsa-offline-speaker-identification + -rwxr-xr-x 1 root root 23K Mar 20 09:41 sherpa-onnx-keyword-spotter + -rwxr-xr-x 1 root root 27K Mar 20 09:41 sherpa-onnx-keyword-spotter-alsa + -rwxr-xr-x 1 root root 23K Mar 20 09:41 sherpa-onnx-offline + -rwxr-xr-x 1 root root 39K Mar 20 09:41 sherpa-onnx-offline-parallel + -rwxr-xr-x 1 root root 19K Mar 20 09:41 sherpa-onnx-offline-tts + -rwxr-xr-x 1 root root 31K Mar 20 09:41 sherpa-onnx-offline-tts-play-alsa + --- + total 30M + -rw-r--r-- 1 root root 256K Mar 20 09:41 libespeak-ng.so + -rw-r--r-- 1 root root 71K Mar 20 09:41 libkaldi-decoder-core.so + -rw-r--r-- 1 root root 67K Mar 20 09:41 libkaldi-native-fbank-core.so + -rw-r--r-- 1 root root 13M Mar 20 09:35 libonnxruntime.so + -rw-r--r-- 1 root root 13M Mar 20 09:35 libonnxruntime.so.1.14.1 + lrwxrwxrwx 1 root root 23 Mar 20 09:41 libpiper_phonemize.so -> libpiper_phonemize.so.1 + lrwxrwxrwx 1 root root 27 Mar 20 09:41 libpiper_phonemize.so.1 -> libpiper_phonemize.so.1.2.0 + -rw-r--r-- 1 root root 395K Mar 20 09:41 libpiper_phonemize.so.1.2.0 + -rw-r--r-- 1 root root 1.3M Mar 20 09:41 libsherpa-onnx-core.so + lrwxrwxrwx 1 root root 23 Mar 20 09:41 libsherpa-onnx-fst.so -> libsherpa-onnx-fst.so.6 + -rw-r--r-- 1 root root 1.4M Mar 20 09:41 libsherpa-onnx-fst.so.6 + -rw-r--r-- 1 root root 752K Mar 20 09:41 libsherpa-onnx-kaldifst-core.so + -rw-r--r-- 1 root root 202K Mar 20 09:41 libucd.so + drwxr-xr-x 2 root root 4.0K Mar 20 09:41 pkgconfig + +.. code-block:: bash + + $ file build-riscv64-linux-gnu/install/bin/sherpa-onnx + + build-riscv64-linux-gnu/install/bin/sherpa-onnx: ELF 64-bit LSB executable, UCB RISC-V, RVC, double-float ABI, version 1 (GNU/Linux), dynamically linked, interpreter /lib/ld-linux-riscv64-lp64d.so.1, for GNU/Linux 4.15.0, stripped + +.. code-block:: bash + + $ readelf -d build-riscv64-linux-gnu/install/bin/sherpa-onnx + +.. 
code-block:: bash + + $ find $HOME/toolchain/ -name ld-linux-riscv64-lp64d.so.1 + + Dynamic section at offset 0x4d40 contains 39 entries: + Tag Type Name/Value + 0x0000000000000001 (NEEDED) Shared library: [libsherpa-onnx-core.so] + 0x0000000000000001 (NEEDED) Shared library: [libkaldi-native-fbank-core.so] + 0x0000000000000001 (NEEDED) Shared library: [libkaldi-decoder-core.so] + 0x0000000000000001 (NEEDED) Shared library: [libsherpa-onnx-kaldifst-core.so] + 0x0000000000000001 (NEEDED) Shared library: [libsherpa-onnx-fst.so.6] + 0x0000000000000001 (NEEDED) Shared library: [libpiper_phonemize.so.1] + 0x0000000000000001 (NEEDED) Shared library: [libonnxruntime.so.1.14.1] + 0x0000000000000001 (NEEDED) Shared library: [libespeak-ng.so] + 0x0000000000000001 (NEEDED) Shared library: [libucd.so] + 0x0000000000000001 (NEEDED) Shared library: [libstdc++.so.6] + 0x0000000000000001 (NEEDED) Shared library: [libm.so.6] + 0x0000000000000001 (NEEDED) Shared library: [libgcc_s.so.1] + 0x0000000000000001 (NEEDED) Shared library: [libpthread.so.0] + 0x0000000000000001 (NEEDED) Shared library: [libc.so.6] + 0x000000000000000f (RPATH) Library rpath: [$ORIGIN:$ORIGIN/../lib:$ORIGIN/../../../sherpa_onnx/lib] + 0x0000000000000020 (PREINIT_ARRAY) 0x15d20 + 0x0000000000000021 (PREINIT_ARRAYSZ) 8 (bytes) + 0x0000000000000019 (INIT_ARRAY) 0x15d28 + 0x000000000000001b (INIT_ARRAYSZ) 16 (bytes) + 0x000000000000001a (FINI_ARRAY) 0x15d38 + 0x000000000000001c (FINI_ARRAYSZ) 8 (bytes) + 0x0000000000000004 (HASH) 0x10280 + 0x000000006ffffef5 (GNU_HASH) 0x10418 + 0x0000000000000005 (STRTAB) 0x10bd8 + 0x0000000000000006 (SYMTAB) 0x105f0 + 0x000000000000000a (STRSZ) 3652 (bytes) + 0x000000000000000b (SYMENT) 24 (bytes) + 0x0000000000000015 (DEBUG) 0x0 + 0x0000000000000003 (PLTGOT) 0x16000 + 0x0000000000000002 (PLTRELSZ) 1056 (bytes) + 0x0000000000000014 (PLTREL) RELA + 0x0000000000000017 (JMPREL) 0x11bb0 + 0x0000000000000007 (RELA) 0x11b80 + 0x0000000000000008 (RELASZ) 1104 (bytes) + 0x0000000000000009 (RELAENT) 24 (bytes) + 0x000000006ffffffe (VERNEED) 0x11aa0 + 0x000000006fffffff (VERNEEDNUM) 4 + 0x000000006ffffff0 (VERSYM) 0x11a1c + 0x0000000000000000 (NULL) 0x0 + + /root/toolchain/sysroot/lib/ld-linux-riscv64-lp64d.so.1 + + +That's it! + +Please create an issue at ``_ +if you have any problems. + +Read more if you want to run the binaries with ``qemu``. + +qemu +---- + +.. hint:: + + This subsection works only on x64 Linux. + +.. caution:: + + Please don't use any other methods to install ``qemu-riscv64``. Only the + method listed in this subsection is known to work. + +Please use the following command to download the ``qemu-riscv64`` binary. + +.. code-block:: bash + + mkdir -p $HOME/qemu + + mkdir -p /tmp + cd /tmp + wget -q https://files.pythonhosted.org/packages/21/f4/733f29c435987e8bb264a6504c7a4ea4c04d0d431b38a818ab63eef082b9/xuantie_qemu-20230825-py3-none-manylinux1_x86_64.whl + + unzip xuantie_qemu-20230825-py3-none-manylinux1_x86_64.whl + cp -v ./qemu/qemu-riscv64 $HOME/qemu + + export PATH=$HOME/qemu:$PATH + +To check that we have installed ``qemu-riscv64`` successfully, please run: + +.. code-block:: bash + + qemu-riscv64 -h + +which should give the following output:: + + usage: qemu-riscv64 [options] program [arguments...] 
+ Linux CPU emulator (compiled for riscv64 emulation) + + Options and associated environment variables: + + Argument Env-variable Description + -h print this help + -help + -g port QEMU_GDB wait gdb connection to 'port' + -L path QEMU_LD_PREFIX set the elf interpreter prefix to 'path' + -s size QEMU_STACK_SIZE set the stack size to 'size' bytes + -cpu model QEMU_CPU select CPU (-cpu help for list) + -E var=value QEMU_SET_ENV sets targets environment variable (see below) + -U var QEMU_UNSET_ENV unsets targets environment variable (see below) + -0 argv0 QEMU_ARGV0 forces target process argv[0] to be 'argv0' + -r uname QEMU_UNAME set qemu uname release string to 'uname' + -B address QEMU_GUEST_BASE set guest_base address to 'address' + -R size QEMU_RESERVED_VA reserve 'size' bytes for guest virtual address space + -d item[,...] QEMU_LOG enable logging of specified items (use '-d help' for a list of items) + -dfilter range[,...] QEMU_DFILTER filter logging based on address range + -D logfile QEMU_LOG_FILENAME write logs to 'logfile' (default stderr) + -p pagesize QEMU_PAGESIZE set the host page size to 'pagesize' + -singlestep QEMU_SINGLESTEP run in singlestep mode + -strace QEMU_STRACE log system calls + -pctrace QEMU_PCTRACE log pctrace + -seed QEMU_RAND_SEED Seed for pseudo-random number generator + -trace QEMU_TRACE [[enable=]][,events=][,file=] + -csky-extend CSKY_EXTEND [tb_trace=][,jcount_start=][,jcount_end=][vdsp=][exit_addr=][denormal=] + -CPF CSKY_PROFILING + -csky-trace CSKY_TRACE [port=][,tb_trace=][,mem_trace=][,auto_trace=][,start=addr][,exit=addr] + -plugin QEMU_PLUGIN [file=][,arg=] + -version QEMU_VERSION display version information and exit + + Defaults: + QEMU_LD_PREFIX = /usr/gnemul/qemu-riscv64 + QEMU_STACK_SIZE = 8388608 byte + + You can use -E and -U options or the QEMU_SET_ENV and + QEMU_UNSET_ENV environment variables to set and unset + environment variables for the target process. + It is possible to provide several variables by separating them + by commas in getsubopt(3) style. Additionally it is possible to + provide the -E and -U options multiple times. + The following lines are equivalent: + -E var1=val2 -E var2=val2 -U LD_PRELOAD -U LD_DEBUG + -E var1=val2,var2=val2 -U LD_PRELOAD,LD_DEBUG + QEMU_SET_ENV=var1=val2,var2=val2 QEMU_UNSET_ENV=LD_PRELOAD,LD_DEBUG + Note that if you provide several changes to a single variable + the last change will stay in effect. + + See for how to report bugs. + More information on the QEMU project at . + +We describe below how to use ``qemu-riscv64`` to run speech-to-text and text-to-speech. + + +Run speech-to-text with qemu +^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +We use :ref:`sherpa_onnx_streaming_zipformer_en_20M_2023_02_17` as the test model. + +.. note:: + + You can select any model from :ref:`sherpa-onnx-pre-trained-models`. + + +Please use the following command to download the model: + +.. 
code-block:: bash + + cd /path/to/sherpa-onnx + + wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-streaming-zipformer-en-20M-2023-02-17.tar.bz2 + tar xvf sherpa-onnx-streaming-zipformer-en-20M-2023-02-17.tar.bz2 + rm sherpa-onnx-streaming-zipformer-en-20M-2023-02-17.tar.bz2 + +Now you can use the following command to run it with ``qemu-riscv64``:: + + cd /path/to/sherpa-onnx + + export PATH=$HOME/qemu:$PATH + + qemu-riscv64 build-riscv64-linux-gnu/install/bin/sherpa-onnx \ + --tokens=./sherpa-onnx-streaming-zipformer-en-20M-2023-02-17/tokens.txt \ + --encoder=./sherpa-onnx-streaming-zipformer-en-20M-2023-02-17/encoder-epoch-99-avg-1.onnx \ + --decoder=./sherpa-onnx-streaming-zipformer-en-20M-2023-02-17/decoder-epoch-99-avg-1.onnx \ + --joiner=./sherpa-onnx-streaming-zipformer-en-20M-2023-02-17/joiner-epoch-99-avg-1.onnx \ + ./sherpa-onnx-streaming-zipformer-en-20M-2023-02-17/test_wavs/0.wav + +It will throw the following error:: + + qemu-riscv64: Could not open '/lib/ld-linux-riscv64-lp64d.so.1': No such file or directory + +Please use the following command instead:: + + cd /path/to/sherpa-onnx + + export PATH=$HOME/qemu:$PATH + export QEMU_LD_PREFIX=$HOME/toolchain/sysroot + + qemu-riscv64 build-riscv64-linux-gnu/install/bin/sherpa-onnx \ + --tokens=./sherpa-onnx-streaming-zipformer-en-20M-2023-02-17/tokens.txt \ + --encoder=./sherpa-onnx-streaming-zipformer-en-20M-2023-02-17/encoder-epoch-99-avg-1.onnx \ + --decoder=./sherpa-onnx-streaming-zipformer-en-20M-2023-02-17/decoder-epoch-99-avg-1.onnx \ + --joiner=./sherpa-onnx-streaming-zipformer-en-20M-2023-02-17/joiner-epoch-99-avg-1.onnx \ + ./sherpa-onnx-streaming-zipformer-en-20M-2023-02-17/test_wavs/0.wav + +It will throw a second error:: + + build-riscv64-linux-gnu/install/bin/sherpa-onnx: error while loading shared libraries: ld-linux-riscv64xthead-lp64d.so.1: cannot open shared object file: No such file or directory + +Please use the following command instead:: + + cd /path/to/sherpa-onnx + + export PATH=$HOME/qemu:$PATH + export QEMU_LD_PREFIX=$HOME/toolchain/sysroot + export LD_LIBRARY_PATH=$HOME/toolchain/sysroot/lib:$LD_LIBRARY_PATH + + qemu-riscv64 build-riscv64-linux-gnu/install/bin/sherpa-onnx \ + --tokens=./sherpa-onnx-streaming-zipformer-en-20M-2023-02-17/tokens.txt \ + --encoder=./sherpa-onnx-streaming-zipformer-en-20M-2023-02-17/encoder-epoch-99-avg-1.onnx \ + --decoder=./sherpa-onnx-streaming-zipformer-en-20M-2023-02-17/decoder-epoch-99-avg-1.onnx \ + --joiner=./sherpa-onnx-streaming-zipformer-en-20M-2023-02-17/joiner-epoch-99-avg-1.onnx \ + ./sherpa-onnx-streaming-zipformer-en-20M-2023-02-17/test_wavs/0.wav + +Finally, it prints the following output:: + + /content/sherpa-onnx/sherpa-onnx/csrc/parse-options.cc:Read:361 build-riscv64-linux-gnu/install/bin/sherpa-onnx --tokens=./sherpa-onnx-streaming-zipformer-en-20M-2023-02-17/tokens.txt --encoder=./sherpa-onnx-streaming-zipformer-en-20M-2023-02-17/encoder-epoch-99-avg-1.onnx --decoder=./sherpa-onnx-streaming-zipformer-en-20M-2023-02-17/decoder-epoch-99-avg-1.onnx --joiner=./sherpa-onnx-streaming-zipformer-en-20M-2023-02-17/joiner-epoch-99-avg-1.onnx ./sherpa-onnx-streaming-zipformer-en-20M-2023-02-17/test_wavs/0.wav + + OnlineRecognizerConfig(feat_config=FeatureExtractorConfig(sampling_rate=16000, feature_dim=80), model_config=OnlineModelConfig(transducer=OnlineTransducerModelConfig(encoder="./sherpa-onnx-streaming-zipformer-en-20M-2023-02-17/encoder-epoch-99-avg-1.onnx", 
decoder="./sherpa-onnx-streaming-zipformer-en-20M-2023-02-17/decoder-epoch-99-avg-1.onnx", joiner="./sherpa-onnx-streaming-zipformer-en-20M-2023-02-17/joiner-epoch-99-avg-1.onnx"), paraformer=OnlineParaformerModelConfig(encoder="", decoder=""), wenet_ctc=OnlineWenetCtcModelConfig(model="", chunk_size=16, num_left_chunks=4), zipformer2_ctc=OnlineZipformer2CtcModelConfig(model=""), tokens="./sherpa-onnx-streaming-zipformer-en-20M-2023-02-17/tokens.txt", num_threads=1, debug=False, provider="cpu", model_type=""), lm_config=OnlineLMConfig(model="", scale=0.5), endpoint_config=EndpointConfig(rule1=EndpointRule(must_contain_nonsilence=False, min_trailing_silence=2.4, min_utterance_length=0), rule2=EndpointRule(must_contain_nonsilence=True, min_trailing_silence=1.2, min_utterance_length=0), rule3=EndpointRule(must_contain_nonsilence=False, min_trailing_silence=0, min_utterance_length=20)), enable_endpoint=True, max_active_paths=4, hotwords_score=1.5, hotwords_file="", decoding_method="greedy_search", blank_penalty=0) + ./sherpa-onnx-streaming-zipformer-en-20M-2023-02-17/test_wavs/0.wav + Elapsed seconds: 70, Real time factor (RTF): 11 + THE YELLOW LAMPS WOULD LIGHT UP HERE AND THERE THE SQUALID QUARTER OF THE BRAFFLELS + { "text": " THE YELLOW LAMPS WOULD LIGHT UP HERE AND THERE THE SQUALID QUARTER OF THE BRAFFLELS", "tokens": [ " THE", " YE", "LL", "OW", " LA", "M", "P", "S", " WOULD", " LIGHT", " UP", " HE", "RE", " AND", " THERE", " THE", " S", "QUA", "LI", "D", " ", "QUA", "R", "TER", " OF", " THE", " B", "RA", "FF", "L", "EL", "S" ], "timestamps": [ 2.04, 2.16, 2.28, 2.36, 2.52, 2.64, 2.68, 2.76, 2.92, 3.08, 3.40, 3.60, 3.72, 3.88, 4.12, 4.48, 4.64, 4.68, 4.84, 4.96, 5.16, 5.20, 5.32, 5.36, 5.60, 5.72, 5.92, 5.96, 6.08, 6.24, 6.36, 6.60 ], "ys_probs": [ -0.454799, -0.521409, -0.345871, -0.001244, -0.240359, -0.013972, -0.010445, -0.051701, -0.000371, -0.171570, -0.002205, -0.026703, -0.006903, -0.021168, -0.011662, -0.001059, -0.005089, -0.000273, -0.575480, -0.024973, -0.159344, -0.000042, -0.011082, -0.187136, -0.004002, -0.292751, -0.084873, -0.241302, -0.543844, -0.428164, -0.853198, -0.093776 ], "lm_probs": [ ], "context_scores": [ ], "segment": 0, "start_time": 0.00, "is_final": false} + +.. hint:: + + As you can see, the RTF is 11, indicating that it is very slow to run the model + with the ``qemu`` simulator. Running on a real RISC-V board should be much faster. + +Run text-to-speech with qemu +^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Please visit ``_ +to download a text-to-speech model. We use the following model +``vits-piper-en_US-amy-low.tar.bz2``:: + + cd /path/to/sherpa-onnx + + wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_US-amy-low.tar.bz2 + tar xf vits-piper-en_US-amy-low.tar.bz2 + rm vits-piper-en_US-amy-low.tar.bz2 + +After downloading the model, we can use the following command to run it:: + + cd /path/to/sherpa-onnx + + export PATH=$HOME/qemu:$PATH + export QEMU_LD_PREFIX=$HOME/toolchain/sysroot + export LD_LIBRARY_PATH=$HOME/toolchain/sysroot/lib:$LD_LIBRARY_PATH + + qemu-riscv64 build-riscv64-linux-gnu/install/bin/sherpa-onnx-offline-tts \ + --vits-model=./vits-piper-en_US-amy-low/en_US-amy-low.onnx \ + --vits-tokens=./vits-piper-en_US-amy-low/tokens.txt \ + --vits-data-dir=./vits-piper-en_US-amy-low/espeak-ng-data \ + --output-filename=./a-test.wav \ + "Friends fell out often because life was changing so fast. The easiest thing in the world was to lose touch with someone." 
The log of the above command is given below::

    /content/sherpa-onnx/sherpa-onnx/csrc/parse-options.cc:Read:361 build-riscv64-linux-gnu/install/bin/sherpa-onnx-offline-tts --vits-model=./vits-piper-en_US-amy-low/en_US-amy-low.onnx --vits-tokens=./vits-piper-en_US-amy-low/tokens.txt --vits-data-dir=./vits-piper-en_US-amy-low/espeak-ng-data --output-filename=./a-test.wav 'Friends fell out often because life was changing so fast. The easiest thing in the world was to lose touch with someone.'

    Elapsed seconds: 270.745 s
    Audio duration: 7.904 s
    Real-time factor (RTF): 270.745/7.904 = 34.254
    The text is: Friends fell out often because life was changing so fast. The easiest thing in the world was to lose touch with someone.. Speaker ID: 0
    Saved to ./a-test.wav successfully!

The original page embeds an HTML audio player for the generated file; its
filename and text are:

.. list-table::

   * - Wave filename
     - Text
   * - ``a-test.wav``
     - Friends fell out often because life was changing so fast. The easiest thing in the world was to lose touch with someone.
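If you want to inspect or play the generated ``./a-test.wav`` on the host
machine, any common audio tool can be used. The commands below are only an
illustration and assume that ``sox`` (which provides ``soxi`` and ``play``)
is installed on your x64 host:

.. code-block:: bash

   # Print duration, sample rate, and other metadata of the generated file.
   soxi ./a-test.wav

   # Play the file (requires a working sound device on the host).
   play ./a-test.wav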
diff --git a/docs/source/onnx/install/windows.rst b/docs/source/onnx/install/windows.rst new file mode 100644 index 000000000..0d0da3c05 --- /dev/null +++ b/docs/source/onnx/install/windows.rst @@ -0,0 +1,131 @@ +Windows +======= + +This page describes how to build `sherpa-onnx`_ on Windows. + + +.. hint:: + + MinGW is known not to work. + Please install ``Visual Studio`` before you continue. + +.. note:: + + You can download pre-compiled binaries for both 32-bit and 64-bit Windows + from the following URL ``_. + + Please always download the latest version. + + URLs to download the version ``1.9.12`` is given below. + + .. list-table:: + + * - 64-bit Windows (static lib) + - ``_ + * - 64-bit Windows (shared lib) + - ``_ + * - 32-bit Windows (static lib) + - ``_ + * - 32-bit Windows (shared lib) + - ``_ + + If you cannot access ``huggingface.co``, then please replace ``huggingface.co`` with + ``hf-mirror.com``. + + + +64-bit Windows (x64) +-------------------- + +All you need is to run: + +.. tabs:: + + .. tab:: CPU + + .. code-block:: bash + + git clone https://github.com/k2-fsa/sherpa-onnx + cd sherpa-onnx + mkdir build + cd build + cmake -DCMAKE_BUILD_TYPE=Release .. + cmake --build . --config Release + + .. tab:: Nvidia GPU (CUDA) + + .. code-block:: bash + + git clone https://github.com/k2-fsa/sherpa-onnx + cd sherpa-onnx + mkdir build + cd build + cmake -DCMAKE_BUILD_TYPE=Release -DBUILD_SHARED_LIBS=ON -DSHERPA_ONNX_ENABLE_GPU=ON .. + cmake --build . --config Release + + .. hint:: + + You need to install CUDA toolkit 11.8. Otherwise, you would get + errors at runtime. + + Caution: Please install cuda toolkit 11.8. Other versions do ``NOT`` work! + + Caution: Please install cuda toolkit 11.8. Other versions do ``NOT`` work! + + Caution: Please install cuda toolkit 11.8. Other versions do ``NOT`` work! + +After building, you will find an executable ``sherpa-onnx.exe`` inside the ``bin/Release`` directory. + +That's it! + +Please refer to :ref:`sherpa-onnx-pre-trained-models` for a list of pre-trained +models. + +32-bit Windows (x86) +-------------------- + +.. hint:: + + It does not support NVIDIA GPU for ``Win32/x86``. + +All you need is to run: + +.. code-block:: bash + + git clone https://github.com/k2-fsa/sherpa-onnx + cd sherpa-onnx + mkdir build + cd build + + # Please select one toolset among VS 2015, 2017, 2019, and 2022 below + # We use VS 2022 as an example. + + # For Visual Studio 2015 + # cmake -T v140,host=x64 -A Win32 -D CMAKE_BUILD_TYPE=Release .. + + # For Visual Studio 2017 + # cmake -T v141,host=x64 -A Win32 -D CMAKE_BUILD_TYPE=Release .. + + # For Visual Studio 2019 + # cmake -T v142,host=x64 -A Win32 -D CMAKE_BUILD_TYPE=Release .. + + # For Visual Studio 2022 + cmake -T v143,host=x64 -A Win32 -D CMAKE_BUILD_TYPE=Release .. + + cmake --build . --config Release + +After building, you will find an executable ``sherpa-onnx.exe`` inside the ``bin/Release`` directory. + +That's it! + +Please refer to :ref:`sherpa-onnx-pre-trained-models` for a list of pre-trained +models. + +.. hint:: + + By default, it builds static libraries of `sherpa-onnx`_. To get dynamic/shared + libraries, please pass ``-DBUILD_SHARED_LIBS=ON`` to ``cmake``. That is, use + + .. code-block:: bash + + cmake -DCMAKE_BUILD_TYPE=Release -DBUILD_SHARED_LIBS=ON .. 
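Regardless of which configuration you choose, you can quickly verify the build
by running the generated executable from the build directory. This is only a
sketch; invoking ``sherpa-onnx.exe`` without the required arguments is expected
to print its usage information and the list of supported options:

.. code-block:: bash

   # Run from the build directory.
   # For a 64-bit build, the executable is in bin/Release.
   ./bin/Release/sherpa-onnx.exe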
diff --git a/docs/source/onnx/ios/build-sherpa-onnx-swift.rst b/docs/source/onnx/ios/build-sherpa-onnx-swift.rst new file mode 100644 index 000000000..a0a39cc41 --- /dev/null +++ b/docs/source/onnx/ios/build-sherpa-onnx-swift.rst @@ -0,0 +1,321 @@ +Build sherpa-onnx for iOS +========================= + +This section describes how to build `sherpa-onnx`_ for ``iPhone`` and ``iPad``. + +Requirement +----------- + +.. warning:: + + The minimum deployment requires the iOS version ``>= 13.0``. + +Before we continue, please make sure the following requirements are satisfied: + +- macOS. It won't work on Windows or Linux. +- Xcode. The version ``14.2 (14C18)`` is known to work. Other versions may also work. +- CMake. CMake 3.25.1 is known to work. Other versions may also work. +- (Optional) iPhone or iPad. This is for testing the app on your device. + If you don't have a device, you can still run the app within a simulator on your Mac. + +.. caution:: + + If you get the following error:: + + CMake Error at toolchains/ios.toolchain.cmake:544 (get_filename_component): + get_filename_component called with incorrect number of arguments + Call Stack (most recent call first): + /usr/local/Cellar/cmake/3.29.0/share/cmake/Modules/CMakeDetermineSystem.cmake:146 (include) + CMakeLists.txt:2 (project) + + please run:: + + sudo xcode-select --install + sudo xcodebuild -license + + And then delete the build directory ``./build-ios`` and re-build. + + Please see also ``_. + +Download sherpa-onnx +-------------------- + +First, let us download the source code of `sherpa-onnx`_. + +.. note:: + + In the following, I will download `sherpa-onnx`_ to + ``$HOME/open-source``, i.e., ``/Users/fangjun/open-source``, on my Mac. + + You can put it anywhere as you like. + +.. code-block:: bash + + mkdir -p $HOME/open-source + cd $HOME/open-source + git clone https://github.com/k2-fsa/sherpa-onnx + +Build sherpa-onnx (in commandline, C++ Part) +-------------------------------------------- + +After downloading `sherpa-onnx`_, let us build the C++ part of `sherpa-onnx`_. + +.. code-block:: bash + + cd $HOME/open-source/sherpa-onnx/ + ./build-ios.sh + +It will generate a directory +``$HOME/open-source/sherpa-onnx/build-ios``, which we have already pre-configured +for you in Xcode. + +Build sherpa-onnx (in Xcode) +---------------------------- + +Use the following command to open `sherpa-onnx`_ in Xcode: + +.. code-block:: bash + + cd $HOME/open-source/sherpa-onnx/ios-swift/SherpaOnnx + open SherpaOnnx.xcodeproj + +It will start Xcode and you will see the following screenshot: + + .. figure:: ./pic/start-xcode-for-sherpa-onnx.png + :alt: Screenshot after running the command ``open SherpaOnnx.xcodeproj`` + :width: 600 + :align: center + + Screenshot after running the command ``open SherpaOnnx.xcodeproj`` + +Please select ``Product -> Build`` to build the project. See the screenshot +below: + + .. figure:: ./pic/select-product-build.png + :alt: Screenshot for selecting ``Product -> Build`` + :width: 600 + :align: center + + Screenshot for selecting ``Product -> Build`` + +After finishing the build, you should see the following screenshot: + + + .. figure:: ./pic/after-finishing-build.png + :alt: Screenshot after finishing the build. + :width: 100 + :align: center + + Screenshot after finishing the build. + +Congratulations! You have successfully built the project. Let us run the +project by selecting ``Product -> Run``, which is shown in the following +screenshot: + + .. 
figure:: ./pic/run-the-project.png + :alt: Screenshot for ``Product -> Run``. + :width: 600 + :align: center + + Screenshot for ``Product -> Run``. + +Please wait for a few seconds before Xcode starts the simulator. + +Unfortunately, it will throw the following error: + + .. figure:: ./pic/error-no-model.png + :alt: Screenshot for the error + :width: 600 + :align: center + + Screenshot for the error + +The reason for the above error is that we have not provided the pre-trained +model yet. + +The file `ViewController.swift `_ +pre-selects the pre-trained model to be :ref:`sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20`, +shown in the screenshot below: + + .. figure:: ./pic/pre-trained-model-1.png + :alt: Screenshot for the pre-selected pre-trained model + :width: 600 + :align: center + + Screenshot for the pre-selected pre-trained model + +Let us add the pre-trained model :ref:`sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20` +to Xcode. Please follow :ref:`sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20` +to download it from `huggingface `_. +You can download it to any directory as you like. + +Please right click the project ``SherpaOnnx`` and select ``Add Files to "SherpaOnnx"...`` +in the popup menu, as is shown in the screenshot below: + + .. figure:: ./pic/step-to-add-pre-trained-model-1.png + :alt: Screenshot for adding files to SherpaOnnx + :width: 600 + :align: center + + Screenshot for adding files to SherpaOnnx + +In the popup dialog, switch to the folder where you just downloaded the pre-trained +model. + +In the screenshot below, it is the folder +``/Users/fangjun/open-source/icefall-models/sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20``: + + .. figure:: ./pic/step-to-add-pre-trained-model-2.png + :alt: Screenshot for navigating to the folder containing the downloaded pre-trained + :width: 600 + :align: center + + Screenshot for navigating to the folder containing the downloaded pre-trained + +Select required files and click the button ``Add``: + + .. figure:: ./pic/step-to-add-pre-trained-model-3.png + :alt: Screenshot for selecting required files + :width: 600 + :align: center + + Screenshot for selecting required files + +After adding pre-trained model files to Xcode, you should see the following +screenshot: + + .. figure:: ./pic/step-to-add-pre-trained-model-4.png + :alt: Screenshot after add pre-trained model files + :width: 600 + :align: center + + Screenshot after add pre-trained model files + +At this point, you should be able to select the menu ``Product -> Run`` +to run the project and you should finally see the following screenshot: + + .. figure:: ./pic/run.png + :alt: Screenshot for a successful run. + :width: 600 + :align: center + + Screenshot for a successful run. + +Click the button to start recording! A screenshot is given below: + + .. figure:: ./pic/run-2.png + :alt: Screenshot for recording and recognition. + :width: 600 + :align: center + + Screenshot for recording and recognition. + +Congratulations! You have finally succeeded in running `sherpa-onnx`_ with iOS, +though it is in a simulator. + +Please read below if you want to run `sherpa-onnx`_ on your iPhone or iPad. + +Run sherpa-onnx on your iPhone/iPad +----------------------------------- + +First, please make sure the iOS version of your iPhone/iPad is ``>= 13.0``. + +Click the menu ``Xcode -> Settings...``, as is shown in the following screenshot: + + .. 
figure:: ./pic/xcode-settings.png + :alt: Screenshot for ``Xcode -> Settings...`` + :width: 600 + :align: center + + Screenshot for ``Xcode -> Settings...`` + +In the popup dialog, please select ``Account`` and click ``+`` to add +your Apple ID, as is shown in the following ``screenshots``. + + .. figure:: ./pic/add-an-account.png + :alt: Screenshot for selecting ``Account`` and click ``+``. + :width: 600 + :align: center + + Screenshot for selecting ``Account`` and click ``+``. + + .. figure:: ./pic/add-an-account-2.png + :alt: Screenshot for selecting ``Apple ID`` and click ``Continue`` + :width: 600 + :align: center + + Screenshot for selecting ``Apple ID`` and click ``Continue`` + + .. figure:: ./pic/add-an-account-3.png + :alt: Screenshot for adding your Apple ID and click ``Next`` + :width: 600 + :align: center + + Screenshot for adding your Apple ID and click ``Next`` + + .. figure:: ./pic/add-an-account-4.png + :alt: Screenshot for entering your password and click ``Next`` + :width: 600 + :align: center + + Screenshot for entering your password and click ``Next`` + + .. figure:: ./pic/add-an-account-5.png + :alt: Screenshot after adding your Apple ID + :width: 600 + :align: center + + Screenshot after adding your Apple ID + +After adding your Apple ID, please connect your iPhone or iPad to your Mac +and select your device in Xcode. The following screenshot is an example +to select my iPhone. + + .. figure:: ./pic/select-device.png + :alt: Screenshot for selecting your device + :width: 600 + :align: center + + Screenshot for selecting your device + +Now your Xcode should look like below after selecting a device: + + .. figure:: ./pic/select-device-2.png + :alt: Screenshot after selecting your device + :width: 600 + :align: center + + Screenshot after selecting your device + +Please select ``Product -> Run`` again to run `sherpa-onnx`_ on your selected +device, as is shown in the following screenshot: + + .. figure:: ./pic/run-3.png + :alt: Screenshot for selecting ``Product -> Run`` + :width: 600 + :align: center + + Screenshot for selecting ``Product -> Run`` + +After a successful build, check your iPhone/iPad and you should see the following +screenshot: + + .. figure:: ./pic/run-4.png + :alt: Screenshot for running sherpa-onnx on your device + :width: 300 + :align: center + + Screenshot for running sherpa-onnx on your device + +At this point, you should be able to run the app on your device. The following is a screenshot +about running it on my iPhone: + + .. figure:: ./pic/run-5.png + :alt: Screenshot for running `sherpa-onnx`_ on iPhone + :width: 300 + :align: center + + Screenshot for running `sherpa-onnx`_ on iPhone + + +Congratulations! You have successfully run `sherpa-onnx`_ on your device! diff --git a/docs/source/onnx/ios/index.rst b/docs/source/onnx/ios/index.rst new file mode 100644 index 000000000..923364f38 --- /dev/null +++ b/docs/source/onnx/ios/index.rst @@ -0,0 +1,19 @@ +.. _sherpa-onnx-ios: + +iOS +=== + +In this section, we describe how to build an iOS app for ``real-time`` speech +recognition with `sherpa-onnx`_ and run it within a simulator on your Mac, +run it on you iPhone or iPad. + +.. hint:: + + During speech recognition, it does not need to access the Internet. + Everyting is processed locally on your device. + +.. 
toctree:: + :maxdepth: 3 + + build-sherpa-onnx-swift + diff --git a/docs/source/onnx/ios/pic/add-an-account-2.png b/docs/source/onnx/ios/pic/add-an-account-2.png new file mode 120000 index 000000000..b960d99a0 --- /dev/null +++ b/docs/source/onnx/ios/pic/add-an-account-2.png @@ -0,0 +1 @@ +../../../ncnn/ios/pic/add-an-account-2.png \ No newline at end of file diff --git a/docs/source/onnx/ios/pic/add-an-account-3.png b/docs/source/onnx/ios/pic/add-an-account-3.png new file mode 120000 index 000000000..a22e69a90 --- /dev/null +++ b/docs/source/onnx/ios/pic/add-an-account-3.png @@ -0,0 +1 @@ +../../../ncnn/ios/pic/add-an-account-3.png \ No newline at end of file diff --git a/docs/source/onnx/ios/pic/add-an-account-4.png b/docs/source/onnx/ios/pic/add-an-account-4.png new file mode 120000 index 000000000..cb388dd1b --- /dev/null +++ b/docs/source/onnx/ios/pic/add-an-account-4.png @@ -0,0 +1 @@ +../../../ncnn/ios/pic/add-an-account-4.png \ No newline at end of file diff --git a/docs/source/onnx/ios/pic/add-an-account-5.png b/docs/source/onnx/ios/pic/add-an-account-5.png new file mode 120000 index 000000000..16db24bcf --- /dev/null +++ b/docs/source/onnx/ios/pic/add-an-account-5.png @@ -0,0 +1 @@ +../../../ncnn/ios/pic/add-an-account-5.png \ No newline at end of file diff --git a/docs/source/onnx/ios/pic/add-an-account.png b/docs/source/onnx/ios/pic/add-an-account.png new file mode 120000 index 000000000..10efdb045 --- /dev/null +++ b/docs/source/onnx/ios/pic/add-an-account.png @@ -0,0 +1 @@ +../../../ncnn/ios/pic/add-an-account.png \ No newline at end of file diff --git a/docs/source/onnx/ios/pic/after-finishing-build.png b/docs/source/onnx/ios/pic/after-finishing-build.png new file mode 120000 index 000000000..d6a93af42 --- /dev/null +++ b/docs/source/onnx/ios/pic/after-finishing-build.png @@ -0,0 +1 @@ +../../../ncnn/ios/pic/after-finishing-build.png \ No newline at end of file diff --git a/docs/source/onnx/ios/pic/error-no-model.png b/docs/source/onnx/ios/pic/error-no-model.png new file mode 100644 index 000000000..72c50e4e8 Binary files /dev/null and b/docs/source/onnx/ios/pic/error-no-model.png differ diff --git a/docs/source/onnx/ios/pic/pre-trained-model-1.png b/docs/source/onnx/ios/pic/pre-trained-model-1.png new file mode 100644 index 000000000..6ed0c33b5 Binary files /dev/null and b/docs/source/onnx/ios/pic/pre-trained-model-1.png differ diff --git a/docs/source/onnx/ios/pic/run-2.png b/docs/source/onnx/ios/pic/run-2.png new file mode 100644 index 000000000..09484879d Binary files /dev/null and b/docs/source/onnx/ios/pic/run-2.png differ diff --git a/docs/source/onnx/ios/pic/run-3.png b/docs/source/onnx/ios/pic/run-3.png new file mode 100644 index 000000000..8a8813c3c Binary files /dev/null and b/docs/source/onnx/ios/pic/run-3.png differ diff --git a/docs/source/onnx/ios/pic/run-4.png b/docs/source/onnx/ios/pic/run-4.png new file mode 100644 index 000000000..f8ab9f76f Binary files /dev/null and b/docs/source/onnx/ios/pic/run-4.png differ diff --git a/docs/source/onnx/ios/pic/run-5.png b/docs/source/onnx/ios/pic/run-5.png new file mode 100644 index 000000000..b53fc71a4 Binary files /dev/null and b/docs/source/onnx/ios/pic/run-5.png differ diff --git a/docs/source/onnx/ios/pic/run-the-project.png b/docs/source/onnx/ios/pic/run-the-project.png new file mode 100644 index 000000000..6d97656d8 Binary files /dev/null and b/docs/source/onnx/ios/pic/run-the-project.png differ diff --git a/docs/source/onnx/ios/pic/run.png b/docs/source/onnx/ios/pic/run.png new file mode 100644 index 
000000000..6d675309e Binary files /dev/null and b/docs/source/onnx/ios/pic/run.png differ diff --git a/docs/source/onnx/ios/pic/select-device-2.png b/docs/source/onnx/ios/pic/select-device-2.png new file mode 100644 index 000000000..fffbbc6e4 Binary files /dev/null and b/docs/source/onnx/ios/pic/select-device-2.png differ diff --git a/docs/source/onnx/ios/pic/select-device.png b/docs/source/onnx/ios/pic/select-device.png new file mode 100644 index 000000000..035d00796 Binary files /dev/null and b/docs/source/onnx/ios/pic/select-device.png differ diff --git a/docs/source/onnx/ios/pic/select-product-build.png b/docs/source/onnx/ios/pic/select-product-build.png new file mode 100644 index 000000000..d9146e50e Binary files /dev/null and b/docs/source/onnx/ios/pic/select-product-build.png differ diff --git a/docs/source/onnx/ios/pic/start-xcode-for-sherpa-onnx.png b/docs/source/onnx/ios/pic/start-xcode-for-sherpa-onnx.png new file mode 100644 index 000000000..021d11381 Binary files /dev/null and b/docs/source/onnx/ios/pic/start-xcode-for-sherpa-onnx.png differ diff --git a/docs/source/onnx/ios/pic/step-to-add-pre-trained-model-1.png b/docs/source/onnx/ios/pic/step-to-add-pre-trained-model-1.png new file mode 100644 index 000000000..a5ec28ee3 Binary files /dev/null and b/docs/source/onnx/ios/pic/step-to-add-pre-trained-model-1.png differ diff --git a/docs/source/onnx/ios/pic/step-to-add-pre-trained-model-2.png b/docs/source/onnx/ios/pic/step-to-add-pre-trained-model-2.png new file mode 100644 index 000000000..0b6a119af Binary files /dev/null and b/docs/source/onnx/ios/pic/step-to-add-pre-trained-model-2.png differ diff --git a/docs/source/onnx/ios/pic/step-to-add-pre-trained-model-3.png b/docs/source/onnx/ios/pic/step-to-add-pre-trained-model-3.png new file mode 100644 index 000000000..ca1fac5d1 Binary files /dev/null and b/docs/source/onnx/ios/pic/step-to-add-pre-trained-model-3.png differ diff --git a/docs/source/onnx/ios/pic/step-to-add-pre-trained-model-4.png b/docs/source/onnx/ios/pic/step-to-add-pre-trained-model-4.png new file mode 100644 index 000000000..84b12c375 Binary files /dev/null and b/docs/source/onnx/ios/pic/step-to-add-pre-trained-model-4.png differ diff --git a/docs/source/onnx/ios/pic/xcode-settings.png b/docs/source/onnx/ios/pic/xcode-settings.png new file mode 100644 index 000000000..e9f868005 Binary files /dev/null and b/docs/source/onnx/ios/pic/xcode-settings.png differ diff --git a/docs/source/onnx/java-api/build-jar.rst b/docs/source/onnx/java-api/build-jar.rst new file mode 100644 index 000000000..f758a8dba --- /dev/null +++ b/docs/source/onnx/java-api/build-jar.rst @@ -0,0 +1,68 @@ +Build the jar package +===================== + +.. note:: + + Please see the end of this page for how to download pre-built ``jar``. + +.. code-block:: bash + + cd sherpa-onnx/sherpa-onnx/java-api + ls -lh + +You should see the following output:: + + (py311) fangjun@ubuntu23-04:/mnt/sdb/shared/sherpa-onnx/sherpa-onnx/java-api$ ls -lh + + total 8.0K + -rw-rw-r-- 1 fangjun fangjun 2.5K May 8 06:17 Makefile + drwxrwxr-x 3 fangjun fangjun 4.0K Mar 1 04:29 src + +Please run the following command in the directory ``sherpa-onnx/java-api``: + +.. 
code-block:: bash + + make + +You should see the following output after running ``make``:: + + (py311) fangjun@ubuntu23-04:/mnt/sdb/shared/sherpa-onnx/sherpa-onnx/java-api$ ls -lh + total 12K + drwxrwxr-x 3 fangjun fangjun 4.0K May 15 03:59 build + -rw-rw-r-- 1 fangjun fangjun 2.5K May 8 06:17 Makefile + drwxrwxr-x 3 fangjun fangjun 4.0K Mar 1 04:29 src + (py311) fangjun@ubuntu23-04:/mnt/sdb/shared/sherpa-onnx/sherpa-onnx/java-api$ ls -lh build/ + total 60K + drwxrwxr-x 3 fangjun fangjun 4.0K May 15 03:58 com + -rw-rw-r-- 1 fangjun fangjun 53K May 15 03:59 sherpa-onnx.jar + +Congratulations! You have generated ``sherpa-onnx.jar`` successfully. + +.. hint:: + + You can find the Java API source files at + + ``_ + +Download pre-built jar +---------------------- + +If you don't want to build ``jar`` by yourself, you can download pre-built ``jar`` from +from + + ``_ + +For Chinese users, please use + + ``_ + +Please always use the latest version. In the following, we describe how to download +the version ``1.10.2``. + +.. code-block:: bash + + wget https://huggingface.co/csukuangfj/sherpa-onnx-libs/resolve/main/jni/sherpa-onnx-v1.10.2.jar + + # For Chinese users + # wget https://hf-mirror.com/csukuangfj/sherpa-onnx-libs/resolve/main/jni/sherpa-onnx-v1.10.2.jar + diff --git a/docs/source/onnx/java-api/build-jni-linux.rst b/docs/source/onnx/java-api/build-jni-linux.rst new file mode 100644 index 000000000..fee44c03e --- /dev/null +++ b/docs/source/onnx/java-api/build-jni-linux.rst @@ -0,0 +1,146 @@ +.. _sherpa-onnx-jni-linux-build: + +Build JNI interface (Linux) +=========================== + +In the following, we describe how to build the JNI interface for Linux. +It is applicable for both Linux x64 and arm64. + +For macOS users, please refer to :ref:`sherpa-onnx-jni-macos-build` + +.. hint:: + + For Windows users, you have to modify the commands by yourself. + +Setup the environment +--------------------- + +Make sure you have the following two items ready: + + - a working C/C++ compiler that supports C++17 + - you are able to run ``java`` and ``javac`` commands in your terminal. + +To check your environment, please run: + +.. code-block:: bash + + gcc --version + java -version + javac -version + +The above three commands print the following output on my computer. You don't need +to use the exact versions as I am using. + +.. code-block:: + + # output of gcc --version + + gcc (Ubuntu 12.3.0-1ubuntu1~23.04) 12.3.0 + Copyright (C) 2022 Free Software Foundation, Inc. + This is free software; see the source for copying conditions. There is NO + warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + + # output of java -version + + java version "17.0.11" 2024-04-16 LTS + Java(TM) SE Runtime Environment (build 17.0.11+7-LTS-207) + Java HotSpot(TM) 64-Bit Server VM (build 17.0.11+7-LTS-207, mixed mode, sharing) + + # output of javac -version + + javac 17.0.11 + +Build sherpa-onnx +----------------- + +Please use the following commands to build `sherpa-onnx`_: + +.. code-block:: + + git clone https://github.com/k2-fsa/sherpa-onnx + + cd sherpa-onnx + + mkdir build + + cd build + + # If you want to enable GPU support, please + # set OFF to ON + SHERPA_ONNX_ENABLE_GPU=OFF + + cmake \ + -DSHERPA_ONNX_ENABLE_GPU=$SHERPA_ONNX_ENABLE_GPU \ + -DSHERPA_ONNX_ENABLE_PYTHON=OFF \ + -DSHERPA_ONNX_ENABLE_TESTS=OFF \ + -DSHERPA_ONNX_ENABLE_CHECK=OFF \ + -DBUILD_SHARED_LIBS=ON \ + -DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \ + -DSHERPA_ONNX_ENABLE_JNI=ON \ + .. 
+ + make -j4 + + # Remove unused libs + rm lib/lib*.a + rm lib/libcargs.so + + # You don't need it for jni + rm lib/libsherpa-onnx-c-api.so + + ls -lh lib + +You should see the following output for ``ls -lh lib``:: + + total 4.0M + -rwxrwxr-x 1 fangjun fangjun 4.0M Aug 29 00:56 libsherpa-onnx-jni.so + +``libsherpa-onnx-jni.so`` contains the JNI interface for `sherpa-onnx`_. + +.. hint:: + + You can find ``libonnxruntime.so`` by running:: + + fangjun@ubuntu23-04:~/sherpa-onnx/build$ ls _deps/onnxruntime-src/lib/ + libonnxruntime.so + +Download pre-built JNI libs +--------------------------- + +If you don't want to build ``JNI`` libs by yourself, please download pre-built ``JNI`` +libs from + + ``_ + +For Chinese users, please use + + ``_ + +Please always use the latest version. In the following, we describe how to download +the version ``1.10.23``. + +.. code-block:: bash + + wget https://huggingface.co/csukuangfj/sherpa-onnx-libs/resolve/main/jni/sherpa-onnx-v1.10.23-linux-x64-jni.tar.bz2 + + # For Chinese users + # wget https://hf-mirror.com/csukuangfj/sherpa-onnx-libs/resolve/main/jni/sherpa-onnx-v1.10.23-linux-x64-jni.tar.bz2 + + tar xf sherpa-onnx-v1.10.23-linux-x64-jni.tar.bz2 + rm sherpa-onnx-v1.10.23-linux-x64-jni.tar.bz2 + +.. note:: + + You can also download it from + + ``_ + +You should find the following files: + +.. code-block:: bash + + ls -lh sherpa-onnx-v1.10.23-linux-x64-jni/lib/ + + total 19M + -rw-r--r-- 1 fangjun fangjun 15M Aug 24 22:18 libonnxruntime.so + -rwxr-xr-x 1 fangjun fangjun 4.2M Aug 24 22:25 libsherpa-onnx-jni.so diff --git a/docs/source/onnx/java-api/build-jni-macos.rst b/docs/source/onnx/java-api/build-jni-macos.rst new file mode 100644 index 000000000..788f5096f --- /dev/null +++ b/docs/source/onnx/java-api/build-jni-macos.rst @@ -0,0 +1,170 @@ +.. _sherpa-onnx-jni-macos-build: + +Build JNI interface (macOS) +=========================== + +In the following, we describe how to build the JNI interface for macOS. +It is applicable for both macOS x64 and arm64. + +For Linux users, please refer to :ref:`sherpa-onnx-jni-linux-build` + +.. hint:: + + For Windows users, you have to modify the commands by yourself. + +Setup the environment +--------------------- + +Make sure you have the following two items ready: + + - a working C/C++ compiler that supports C++17 + - you are able to run ``java`` and ``javac`` commands in your terminal. + +To check your environment, please run: + +.. code-block:: bash + + gcc --version + java -version + javac -version + +The above three commands print the following output on my computer. You don't need +to use the exact versions as I am using. + +.. code-block:: + + # output of gcc --version + + Apple clang version 14.0.0 (clang-1400.0.29.202) + Target: x86_64-apple-darwin22.2.0 + Thread model: posix + InstalledDir: /Applications/Xcode.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin + + # output of java -version + + openjdk version "19.0.1" 2022-10-18 + OpenJDK Runtime Environment (build 19.0.1+10-21) + OpenJDK 64-Bit Server VM (build 19.0.1+10-21, mixed mode, sharing) + + # output of javac -version + + javac 19.0.1 + +Build sherpa-onnx +----------------- + +Please use the following commands to build `sherpa-onnx`_: + +.. 
code-block:: + + git clone https://github.com/k2-fsa/sherpa-onnx + + cd sherpa-onnx + + mkdir build + + cd build + + cmake \ + -DSHERPA_ONNX_ENABLE_PYTHON=OFF \ + -DSHERPA_ONNX_ENABLE_TESTS=OFF \ + -DSHERPA_ONNX_ENABLE_CHECK=OFF \ + -DBUILD_SHARED_LIBS=ON \ + -DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \ + -DSHERPA_ONNX_ENABLE_JNI=ON \ + .. + + make -j4 + + # Remove unused files + rm lib/lib*.a + rm lib/libcargs.dylib + rm lib/libsherpa-onnx-c-api.dylib + + ls -lh lib + +You should see the following output for ``ls -lh lib``:: + + total 8024 + -rwxr-xr-x 1 fangjun staff 3.9M Aug 18 19:34 libsherpa-onnx-jni.dylib + +``libsherpa-onnx-jni.dylib`` contains the JNI interface for `sherpa-onnx`_. + +.. hint:: + + You can find ``libonnxruntime.dylib`` by running:: + + fangjuns-MacBook-Pro:build fangjun$ pwd + /Users/fangjun/open-source/sherpa-onnx/build + + fangjuns-MacBook-Pro:build fangjun$ ls -lh _deps/onnxruntime-src/lib/ + total 51664 + -rwxr-xr-x 1 fangjun staff 25M Aug 14 14:09 libonnxruntime.1.17.1.dylib + drwxr-xr-x 3 fangjun staff 96B Aug 14 14:09 libonnxruntime.1.17.1.dylib.dSYM + lrwxr-xr-x 1 fangjun staff 27B Aug 14 14:09 libonnxruntime.dylib -> libonnxruntime.1.17.1.dylib + + +Download pre-built JNI libs +--------------------------- + +If you don't want to build ``JNI`` libs by yourself, please download pre-built ``JNI`` +libs from + + ``_ + +For Chinese users, please use + + ``_ + +Please always use the latest version. In the following, we describe how to download +the version ``1.10.23``. + +.. tabs:: + + .. tab:: Intel CPU (x86_64) + + .. code-block:: bash + + wget https://huggingface.co/csukuangfj/sherpa-onnx-libs/resolve/main/jni/sherpa-onnx-v1.10.23-osx-x86_64-jni.tar.bz2 + + # For Chinese users + # wget https://hf-mirror.com/csukuangfj/sherpa-onnx-libs/resolve/main/jni/sherpa-onnx-v1.10.23-osx-x86_64-jni.tar.bz2 + + tar xf sherpa-onnx-v1.10.23-osx-x86_64-jni.tar.bz2 + rm sherpa-onnx-v1.10.23-osx-x86_64-jni.tar.bz2 + + .. tab:: Apple Silicon (arm64) + + .. code-block:: bash + + wget https://huggingface.co/csukuangfj/sherpa-onnx-libs/resolve/main/jni/sherpa-onnx-v1.10.23-osx-arm64-jni.tar.bz2 + + # For Chinese users + # wget https://hf-mirror.com/csukuangfj/sherpa-onnx-libs/resolve/main/jni/sherpa-onnx-v1.10.23-osx-arm64-jni.tar.bz2 + + tar xf sherpa-onnx-v1.10.23-osx-arm64-jni.tar.bz2 + rm sherpa-onnx-v1.10.23-osx-arm64-jni.tar.bz2 + +.. note:: + + You can also download it from + + ``_ + +After downloading, you should see the following files: + +.. code-block:: bash + + # For x86_64 + ls -lh sherpa-onnx-v1.10.23-osx-x86_64-jni/lib + total 30M + -rw-r--r-- 1 fangjun fangjun 26M Aug 25 00:31 libonnxruntime.1.17.1.dylib + lrwxrwxrwx 1 fangjun fangjun 27 Aug 25 00:35 libonnxruntime.dylib -> libonnxruntime.1.17.1.dylib + -rwxr-xr-x 1 fangjun fangjun 3.9M Aug 25 00:35 libsherpa-onnx-jni.dylib + + # For arm64 + ls -lh sherpa-onnx-v1.10.23-osx-arm64-jni/lib/ + total 27M + -rw-r--r-- 1 fangjun fangjun 23M Aug 24 23:56 libonnxruntime.1.17.1.dylib + lrwxrwxrwx 1 fangjun fangjun 27 Aug 24 23:59 libonnxruntime.dylib -> libonnxruntime.1.17.1.dylib + -rwxr-xr-x 1 fangjun fangjun 3.6M Aug 24 23:59 libsherpa-onnx-jni.dylib diff --git a/docs/source/onnx/java-api/build-jni-windows.rst b/docs/source/onnx/java-api/build-jni-windows.rst new file mode 100644 index 000000000..5e5e12395 --- /dev/null +++ b/docs/source/onnx/java-api/build-jni-windows.rst @@ -0,0 +1,58 @@ +.. 
_sherpa-onnx-jni-windows-build: + +Build JNI interface (Windows) +============================= + +If you want to build ``JNI`` libs by yourself, please see ``_. + +.. hint:: + + The PDFs in the above link are in Chinese. + +If you want to download pre-built ``JNI`` libs, please see below. + +Download pre-built JNI libs +--------------------------- + +If you don't want to build ``JNI`` libs by yourself, please download pre-built ``JNI`` +libs from + + ``_ + +For Chinese users, please use + + ``_ + +Please always use the latest version. In the following, we describe how to download +the version ``1.10.23``. + +.. code-block:: bash + + wget https://huggingface.co/csukuangfj/sherpa-onnx-libs/resolve/main/jni/sherpa-onnx-v1.10.23-win-x64-jni.tar.bz2 + + # For Chinese users + # wget https://hf-mirror.com/csukuangfj/sherpa-onnx-libs/resolve/main/jni/sherpa-onnx-v1.10.23-win-x64-jni.tar.bz2 + + tar xf sherpa-onnx-v1.10.23-win-x64-jni.tar.bz2 + rm sherpa-onnx-v1.10.23-win-x64-jni.tar.bz2 + +You should find the following files: + +.. code-block:: bash + + ls -lh sherpa-onnx-v1.10.23-win-x64-jni/lib/ + total 14M + -rwxr-xr-x 1 fangjun fangjun 11M Aug 24 15:41 onnxruntime.dll + -rwxr-xr-x 1 fangjun fangjun 23K Aug 24 15:41 onnxruntime_providers_shared.dll + -rwxr-xr-x 1 fangjun fangjun 3.1M Aug 24 15:48 sherpa-onnx-jni.dll + -rw-r--r-- 1 fangjun fangjun 51K Aug 24 15:47 sherpa-onnx-jni.lib + +.. hint:: + + Only ``*.dll`` files are needed during runtime. + +.. note:: + + You can also download it from + + ``_ diff --git a/docs/source/onnx/java-api/examples.rst b/docs/source/onnx/java-api/examples.rst new file mode 100644 index 000000000..66e1a1a0d --- /dev/null +++ b/docs/source/onnx/java-api/examples.rst @@ -0,0 +1,8 @@ +Examples +======== + +Please see ``_ + +You can find detailed instructions at + + ``_ diff --git a/docs/source/onnx/java-api/index.rst b/docs/source/onnx/java-api/index.rst new file mode 100644 index 000000000..d51cd5880 --- /dev/null +++ b/docs/source/onnx/java-api/index.rst @@ -0,0 +1,31 @@ +.. _sherpa-onnx-java-api: + +Java API +======== + +In this section, we describe how to use the ``Java`` API of `sherpa-onnx`_. + +The core part of `sherpa-onnx`_ is written in C++. We have provided +`JNI `_ +interface for `sherpa-onnx`_ so that you can use it in Java. + +Before using the Java API of `sherpa-onnx`_, you have to build the ``JNI`` interface. + +.. hint:: + + We provide pre-built ``JNI`` libs. Please see + + ``_ + + For Chinese users, please use + + ``_ + +.. toctree:: + :maxdepth: 5 + + ./build-jni-macos.rst + ./build-jni-linux.rst + ./build-jni-windows.rst + ./build-jar.rst + ./examples.rst diff --git a/docs/source/onnx/javascript-api/examples.rst b/docs/source/onnx/javascript-api/examples.rst new file mode 100644 index 000000000..28865798a --- /dev/null +++ b/docs/source/onnx/javascript-api/examples.rst @@ -0,0 +1,9 @@ +Examples +======== + + +Please see ``_ + +You can find detailed instructions at + + ``_ diff --git a/docs/source/onnx/javascript-api/index.rst b/docs/source/onnx/javascript-api/index.rst new file mode 100644 index 000000000..a04b22bef --- /dev/null +++ b/docs/source/onnx/javascript-api/index.rst @@ -0,0 +1,14 @@ +.. _sherpa-onnx-javascript-api: + +Javascript API +============== + +For using Javascript in the browser, please see our :ref:`sherpa-onnx-wasm` doc. + +This section describes how to use `sherpa-onnx`_ in Node with Javascript API. + +.. 
toctree:: + :maxdepth: 5 + + ./install.rst + ./examples.rst diff --git a/docs/source/onnx/javascript-api/install.rst b/docs/source/onnx/javascript-api/install.rst new file mode 100644 index 000000000..cce00cc0c --- /dev/null +++ b/docs/source/onnx/javascript-api/install.rst @@ -0,0 +1,33 @@ +Install +======= + +We provide npm packages for `sherpa-onnx`_. + +It can be found at + + ``_ + +.. hint:: + + It requires ``Node>=v16``. + +Please always use the latest version. + +To install it, please run:: + + npm install sherpa-onnx-node + +It supports the following platforms: + + - Linux x64 + - Linux arm64 + - macOS x64 + - macOS arm64 + - Windows x64 + +.. hint:: + + You don't need to pre-install anything in order to install ``sherpa-onnx-node``. + + That is, you don't need to install a C/C++ compiler. You don't need to install Python. + You don't need to install CMake, etc. diff --git a/docs/source/onnx/kotlin-api/build-jni.rst b/docs/source/onnx/kotlin-api/build-jni.rst new file mode 100644 index 000000000..6c27d84e1 --- /dev/null +++ b/docs/source/onnx/kotlin-api/build-jni.rst @@ -0,0 +1,12 @@ +.. _sherpa-onnx-jni-build-kotlin: + +Build JNI interface +=================== + +For macOS users, please refer to :ref:`sherpa-onnx-jni-macos-build`. + +For Linux users, please refer to :ref:`sherpa-onnx-jni-linux-build`. + +.. hint:: + + For Windows users, you have to modify the commands by yourself. diff --git a/docs/source/onnx/kotlin-api/examples.rst b/docs/source/onnx/kotlin-api/examples.rst new file mode 100644 index 000000000..fb28d3049 --- /dev/null +++ b/docs/source/onnx/kotlin-api/examples.rst @@ -0,0 +1,8 @@ +Examples +======== + +Please see ``_ + +You can find detailed instructions at + + ``_ diff --git a/docs/source/onnx/kotlin-api/index.rst b/docs/source/onnx/kotlin-api/index.rst new file mode 100644 index 000000000..d710ec32a --- /dev/null +++ b/docs/source/onnx/kotlin-api/index.rst @@ -0,0 +1,18 @@ +.. _sherpa-onnx-kotlin-api: + +Kotlin API +========== + +In this section, we describe how to use the ``Kotlin`` API of `sherpa-onnx`_. + +The core part of `sherpa-onnx`_ is written in C++. We have provided +`JNI `_ +interface for `sherpa-onnx`_ so that you can use it in Kotlin. + +Before using the Kotlin API of `sherpa-onnx`_, you have to build the ``JNI`` interface. + +.. toctree:: + :maxdepth: 5 + + ./build-jni.rst + ./examples.rst diff --git a/docs/source/onnx/kws/index.rst b/docs/source/onnx/kws/index.rst new file mode 100644 index 000000000..fe610b7b2 --- /dev/null +++ b/docs/source/onnx/kws/index.rst @@ -0,0 +1,135 @@ +.. _sherpa-onnx-keyword-spotting: + +Keyword spotting +================ + +In this section, we describe how we implement the open vocabulary keyword spotting (aka customized keyword spotting) +feature and how to use it in `sherpa-onnx`_. + +What is open vocabulary keyword spotting +---------------------------------------- + +Basically, an open vocabulary keyword spotting system is just like a tiny ASR system, but it can only decode words/phrases +in the given keywords. For example, if the given keyword is ``HELLO WORLD``, then the decoded result should be either +``HELLO WORLD`` or empty. As for open vocabulary (or customized), it means you can specify any keywords without re-training +the model. For building a conventional keyword spotting systems, people need to prepare a lot of audio-text pairs for the selected keywords +and the trained model can only be used to detect those selected keywords. 
+While an open vocabulary keyword spotting system allows people using one system to detect different keywords, even the keywords +might not be in the training data. + + +Decoder for open vocabulary keyword spotting +-------------------------------------------- + +For now, we only implement a beam search decoder to make the system only trigger the given keywords (i.e. the model itself is actually a tiny ASR). +To make it is able to balance between the ``trigged rate`` and ``false alarm``, we introduce two parameters for each keyword, ``boosting score`` +and ``trigger threshold``. The ``boosting score`` works like the hotwords recognition, it help the paths containing keywords to survive beam +search, the larger this score is the easier the corresponding keyword will be triggered, read :ref:`sherpa-onnx-hotwords` for more details. +The ``trigger threshold`` defines the minimum acoustic probability of decoded sequences (token sequences) that can be triggered, it is a float +value between 0 to 1, the lower this threshold is the easier the corresponding keyword will be triggered. + +Keywords file +------------- + +The input keywords looks like (the keywords are ``HELLO WORLD``, ``HI GOOGLE`` and ``HEY SIRI``): + +.. code-block:: + + ▁HE LL O ▁WORLD :1.5 #0.35 + ▁HI ▁GO O G LE :1.0 #0.25 + ▁HE Y ▁S I RI + +Each line contains a keyword, the first several tokens (separated by spaces) are encoded tokens of the keyword, the item starts with ``:`` is the ``boosting score`` and the item starts with ``#`` is the ``trigger threshold``. Note: No spaces between ``:`` (or ``#``) and the float value. + +To get the tokens you need to use the command line tool in `sherpa-onnx`_ to convert the original keywords, you can see the +usage as follows: + +.. code-block:: + + # Note: You need to run pip install sherpa-onnx to get the commandline tool: sherpa-onnx-cli + + + sherpa-onnx-cli text2token --help + Usage: sherpa-onnx-cli text2token [OPTIONS] INPUT OUTPUT + + Options: + + --text TEXT Path to the input texts. Each line in the texts contains the original phrase, it might also contain some extra items, + for example, the boosting score (startting with :), the triggering threshold + (startting with #, only used in keyword spotting task) and the original phrase (startting with @). + Note: extra items will be kept in the output. + + example input 1 (tokens_type = ppinyin): + 小爱同学 :2.0 #0.6 @小爱同学 + 你好问问 :3.5 @你好问问 + 小艺小艺 #0.6 @小艺小艺 + example output 1: + x iǎo ài tóng x ué :2.0 #0.6 @小爱同学 + n ǐ h ǎo w èn w èn :3.5 @你好问问 + x iǎo y ì x iǎo y ì #0.6 @小艺小艺 + + example input 2 (tokens_type = bpe): + HELLO WORLD :1.5 #0.4 + HI GOOGLE :2.0 #0.8 + HEY SIRI #0.35 + example output 2: + ▁HE LL O ▁WORLD :1.5 #0.4 + ▁HI ▁GO O G LE :2.0 #0.8 + ▁HE Y ▁S I RI #0.35 + + --tokens TEXT The path to tokens.txt. + --tokens-type TEXT The type of modeling units, should be cjkchar, bpe, cjkchar+bpe, fpinyin or ppinyin. + fpinyin means full pinyin, each cjkchar has a pinyin(with tone). ppinyin + means partial pinyin, it splits pinyin into initial and final, + --bpe-model TEXT The path to bpe.model. Only required when tokens-type is bpe or cjkchar+bpe. + --help Show this message and exit. + +.. note:: + + If the tokens-type is ``fpinyin`` or ``ppinyin``, you MUST provide the original keyword (starting with ``@``). + +.. note:: + + If you install sherpa-onnx from sources (i.e. not by pip), you can use the + `alternative script `_ + in `scripts `_, + the usage is almost the same as the command + line tool, read the help information by: + + .. 
code-block:: + + python3 scripts/text2token.py --help + + +How to use keyword spotting in sherpa-onnx +------------------------------------------ + +Currently, we provide command-line tool and android app for keyword spotting. + + +command-line tool +~~~~~~~~~~~~~~~~~ + +After installing `sherpa-onnx`_, type ``sherpa-onnx-keyword-spotter --help`` for the help message. + + + +Android application +------------------- + +You can find pre-built Android APKs for keyword spotting at + + ``_ + +Here is a demo video (Note: It is in Chinese). + +.. raw:: html + + + + + +Pretrained models +----------------- + +You can find the pre-trained models in :ref:`sherpa-onnx-kws-pre-trained-models`. diff --git a/docs/source/onnx/kws/pretrained_models/index.rst b/docs/source/onnx/kws/pretrained_models/index.rst new file mode 100644 index 000000000..ad0ba43bc --- /dev/null +++ b/docs/source/onnx/kws/pretrained_models/index.rst @@ -0,0 +1,416 @@ +.. _sherpa-onnx-kws-pre-trained-models: + +Pre-trained models +================== + +In this section, we describe how to download and use all +available keyword spotting pre-trained models. + +.. hint:: + + Please install `git-lfs `_ before you continue. + + Otherwise, you will be ``SAD`` later. + + +.. hint:: + + Please refer to :ref:`install_sherpa_onnx` to install `sherpa-onnx`_ + before you read this section. + + +sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01 (Chinese) +------------------------------------------------------------------ + +Training code for this model can be found at ``_. +The model is trained on WenetSpeech L subset (10000 hours), it supports only Chinese. + +In the following, we describe how to download it and use it with `sherpa-onnx`_. + +Download the model +~~~~~~~~~~~~~~~~~~ + +Please use the following commands to download it. + +.. tabs:: + + .. tab:: Github + + .. code-block:: bash + + cd /path/to/sherpa-onnx + wget https://github.com/k2-fsa/sherpa-onnx/releases/download/kws-models/sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01.tar.bz2 + tar xf sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01.tar.bz2 + rm sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01.tar.bz2 + ls -lh sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01 + + .. tab:: ModelScope + + .. code-block:: bash + + cd /path/to/sherpa-onnx + git lfs install + git clone https://www.modelscope.cn/pkufool/sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01.git + ls -lh sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01 + +The output is given below: + +.. 
code-block:: + + $ ls -lh sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01 + total 18M + -rw-r--r-- 1 kangwei root 48 Jan 1 21:45 configuration.json + -rw-r--r-- 1 kangwei root 177K Jan 17 11:38 decoder-epoch-12-avg-2-chunk-16-left-64.int8.onnx + -rw-r--r-- 1 kangwei root 660K Jan 1 21:45 decoder-epoch-12-avg-2-chunk-16-left-64.onnx + -rw-r--r-- 1 kangwei root 4.6M Jan 17 11:38 encoder-epoch-12-avg-2-chunk-16-left-64.int8.onnx + -rw-r--r-- 1 kangwei root 12M Jan 1 21:45 encoder-epoch-12-avg-2-chunk-16-left-64.onnx + -rw-r--r-- 1 kangwei root 64K Jan 17 11:38 joiner-epoch-12-avg-2-chunk-16-left-64.int8.onnx + -rw-r--r-- 1 kangwei root 248K Jan 1 21:45 joiner-epoch-12-avg-2-chunk-16-left-64.onnx + -rw-r--r-- 1 kangwei root 101 Jan 8 17:14 keywords_raw.txt + -rw-r--r-- 1 kangwei root 286 Jan 8 17:14 keywords.txt + -rw-r--r-- 1 kangwei root 750 Jan 8 17:14 README.md + drwxr-xr-x 10 kangwei root 0 Jan 15 22:52 test_wavs + -rw-r--r-- 1 kangwei root 1.6K Jan 1 21:45 tokens.txt + +Test the model +~~~~~~~~~~~~~~ + +.. hint:: + + It supports decoding only wave files of a single channel with 16-bit + encoded samples, while the sampling rate does not need to be 16 kHz. + +fp32 +^^^^ + +The following code shows how to use ``fp32`` models: + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + ./build/bin/sherpa-onnx-keyword-spotter \ + --encoder=sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01/encoder-epoch-12-avg-2-chunk-16-left-64.onnx \ + --decoder=sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01/decoder-epoch-12-avg-2-chunk-16-left-64.onnx \ + --joiner=sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01/joiner-epoch-12-avg-2-chunk-16-left-64.onnx \ + --tokens=sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01/tokens.txt \ + --keywords-file=sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01/test_wavs/test_keywords.txt \ + sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01/test_wavs/3.wav \ + sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01/test_wavs/4.wav \ + sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01/test_wavs/5.wav + +.. note:: + + Please use ``./build/bin/Release/sherpa-onnx-keyword-spotter.exe`` for Windows. + +.. caution:: + + If you use Windows and get encoding issues, please run: + + .. code-block:: bash + + CHCP 65001 + + in your commandline. + +You should see the following output: + +.. 
code-block:: + + KeywordSpotterConfig(feat_config=FeatureExtractorConfig(sampling_rate=16000, feature_dim=80), model_config=OnlineModelConfig(transducer=OnlineTransducerModelConfig(encoder="sherpa-on$x-kws-zipformer-wenetspeech-3.3M-2024-01-01/encoder-epoch-12-avg-2-chunk-16-left-64.onnx", decoder="sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01/decoder-epoch-12-avg-2-chunk$16-left-64.onnx", joiner="sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01/joiner-epoch-12-avg-2-chunk-16-left-64.onnx"), paraformer=OnlineParaformerModelConfig(encoder="", deco$er=""), wenet_ctc=OnlineWenetCtcModelConfig(model="", chunk_size=16, num_left_chunks=4), zipformer2_ctc=OnlineZipformer2CtcModelConfig(model=""), tokens="sherpa-onnx-kws-zipformer-we$etspeech-3.3M-2024-01-01/tokens.txt", num_threads=1, debug=False, provider="cpu", model_type=""), max_active_paths=4, num_trailing_blanks=1, keywords_score=1, keywords_threshold=0.25 keywords_file="sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01/test_wavs/test_keywords.txt") + + 2024-01-19 12:32:29.983790275 [E:onnxruntime:, env.cc:254 ThreadMain] pthread_setaffinity_np failed for thread: 3385848, index: 15, mask: {16, 52, }, error code: 22 error msg: Invali$ + argument. Specify the number of threads explicitly so the affinity is not set. + 2024-01-19 12:32:29.983792055 [E:onnxruntime:, env.cc:254 ThreadMain] pthread_setaffinity_np failed for thread: 3385849, index: 16, mask: {17, 53, }, error code: 22 error msg: Invali$ + argument. Specify the number of threads explicitly so the affinity is not set. + sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01/test_wavs/4.wav + {"start_time":0.00, "keyword": "蒋友伯", "timestamps": [0.64, 0.68, 0.84, 0.96, 1.12, 1.16], "tokens":["j", "iǎng", "y", "ǒu", "b", "ó"]} + + sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01/test_wavs/5.wav + {"start_time":0.00, "keyword": "周望军", "timestamps": [0.64, 0.68, 0.76, 0.84, 1.00, 1.04], "tokens":["zh", "ōu", "w", "àng", "j", "ūn"]} + + sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01/test_wavs/3.wav + {"start_time":0.00, "keyword": "文森特卡索", "timestamps": [0.32, 0.72, 0.96, 1.00, 1.20, 1.32, 1.48, 1.60, 1.88, 1.92], "tokens":["w", "én", "s", "ēn", "t", "è", "k", "ǎ", "s", "uǒ"$ + } + + sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01/test_wavs/5.wav + {"start_time":0.00, "keyword": "落实", "timestamps": [1.76, 1.92, 2.12, 2.20], "tokens":["l", "uò", "sh", "í"]} + + sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01/test_wavs/4.wav + {"start_time":0.00, "keyword": "女儿", "timestamps": [3.08, 3.20, 3.24], "tokens":["n", "ǚ", "ér"]} + + sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01/test_wavs/3.wav + {"start_time":0.00, "keyword": "法国", "timestamps": [4.56, 4.64, 4.80, 4.88], "tokens":["f", "ǎ", "g", "uó"]} + + +int8 +^^^^ + +The following code shows how to use ``int8`` models: + +.. 
code-block:: bash + + cd /path/to/sherpa-onnx + + ./build/bin/sherpa-onnx-keyword-spotter \ + --encoder=sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01/encoder-epoch-12-avg-2-chunk-16-left-64.int8.onnx \ + --decoder=sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01/decoder-epoch-12-avg-2-chunk-16-left-64.int8.onnx \ + --joiner=sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01/joiner-epoch-12-avg-2-chunk-16-left-64.int8.onnx \ + --tokens=sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01/tokens.txt \ + --keywords-file=sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01/test_wavs/test_keywords.txt \ + sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01/test_wavs/3.wav \ + sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01/test_wavs/4.wav \ + sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01/test_wavs/5.wav + + +.. code-block:: + + KeywordSpotterConfig(feat_config=FeatureExtractorConfig(sampling_rate=16000, feature_dim=80), model_config=OnlineModelConfig(transducer=OnlineTransducerModelConfig(encoder="sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01/encoder-epoch-12-avg-2-chunk-16-left-64.int8.onnx", decoder="sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01/decoder-epoch-12-avg-2-chunk-16-left-64.int8.onnx", joiner="sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01/joiner-epoch-12-avg-2-chunk-16-left-64.int8.onnx"), paraformer=OnlineParaformerModelConfig(encoder="", decoder=""), wenet_ctc=OnlineWenetCtcModelConfig(model="", chunk_size=16, num_left_chunks=4), zipformer2_ctc=OnlineZipformer2CtcModelConfig(model=""), tokens="sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01/tokens.txt", num_threads=1, debug=False, provider="cpu", model_type=""), max_active_paths=4, num_trailing_blanks=1, keywords_score=1, keywords_threshold=0.25, keywords_file="sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01/test_wavs/test_keywords.txt") + + 2024-01-19 12:36:44.635979490 [E:onnxruntime:, env.cc:254 ThreadMain] pthread_setaffinity_np failed for thread: 3391918, index: 15, mask: {16, 52, }, error code: 22 error msg: Invalid argument. Specify the number of threads explicitly so the affinity is not set. + 2024-01-19 12:36:44.635981379 [E:onnxruntime:, env.cc:254 ThreadMain] pthread_setaffinity_np failed for thread: 3391919, index: 16, mask: {17, 53, }, error code: 22 error msg: Invalid argument. Specify the number of threads explicitly so the affinity is not set. 
+ sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01/test_wavs/4.wav + {"start_time":0.00, "keyword": "蒋友伯", "timestamps": [0.64, 0.68, 0.84, 0.96, 1.12, 1.16], "tokens":["j", "iǎng", "y", "ǒu", "b", "ó"]} + + sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01/test_wavs/5.wav + {"start_time":0.00, "keyword": "周望军", "timestamps": [0.64, 0.68, 0.76, 0.84, 1.00, 1.08], "tokens":["zh", "ōu", "w", "àng", "j", "ūn"]} + + sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01/test_wavs/3.wav + {"start_time":0.00, "keyword": "文森特卡索", "timestamps": [0.32, 0.72, 0.96, 1.04, 1.28, 1.32, 1.52, 1.60, 1.92, 1.96], "tokens":["w", "én", "s", "ēn", "t", "è", "k", "ǎ", "s", "uǒ"]} + + sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01/test_wavs/5.wav + {"start_time":0.00, "keyword": "落实", "timestamps": [1.80, 1.92, 2.12, 2.20], "tokens":["l", "uò", "sh", "í"]} + + sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01/test_wavs/4.wav + {"start_time":0.00, "keyword": "女儿", "timestamps": [3.08, 3.20, 3.24], "tokens":["n", "ǚ", "ér"]} + + sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01/test_wavs/3.wav + {"start_time":0.00, "keyword": "法国", "timestamps": [4.56, 4.64, 4.80, 4.88], "tokens":["f", "ǎ", "g", "uó"]} + + +Customize your own keywords +~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +To customize your own keywords, the only thing you need to do is replacing the ``--keywords-file``. The keywords file is generated as follows: + +For example your keywords are (keywords_raw.txt): + +.. code-block:: + + 你好军哥 @你好军哥 + 你好问问 @你好问问 + 小爱同学 @小爱同学 + +Run the following command: + +.. code-block:: + + sherpa-onnx-cli text2token \ + --tokens sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01/tokens.txt \ + --tokens-type ppinyin \ + keywords_raw.txt keywords.txt + +The ``keywords.txt`` looks like: + +.. code-block:: + + n ǐ h ǎo j ūn g ē @你好军哥 + n ǐ h ǎo w èn w èn @你好问问 + x iǎo ài t óng x ué @小爱同学 + +.. note:: + + If you install sherpa-onnx from sources (i.e. not by pip), you can use the + alternative script in `scripts`, the usage is almost the same as the command + line tool, read the help information by: + + .. code-block:: + + python3 scripts/text2token.py --help + + +sherpa-onnx-kws-zipformer-gigaspeech-3.3M-2024-01-01 (English) +------------------------------------------------------------------ + +Training code for this model can be found at ``_. +The model is trained on GigaSpeech XL subset (10000 hours), it supports only English. + +In the following, we describe how to download it and use it with `sherpa-onnx`_. + +Download the model +~~~~~~~~~~~~~~~~~~ + +Please use the following commands to download it. + +.. tabs:: + + .. tab:: Github + + .. code-block:: bash + + cd /path/to/sherpa-onnx + wget https://github.com/k2-fsa/sherpa-onnx/releases/download/kws-models/sherpa-onnx-kws-zipformer-gigaspeech-3.3M-2024-01-01.tar.bz2 + tar xvf sherpa-onnx-kws-zipformer-gigaspeech-3.3M-2024-01-01.tar.bz2 + rm sherpa-onnx-kws-zipformer-gigaspeech-3.3M-2024-01-01.tar.bz2 + ls -lh sherpa-onnx-kws-zipformer-gigaspeech-3.3M-2024-01-01 + + .. tab:: ModelScope + + .. code-block:: bash + + cd /path/to/sherpa-onnx + git lfs install + git clone https://www.modelscope.cn/pkufool/sherpa-onnx-kws-zipformer-gigaspeech-3.3M-2024-01-01.git + ls -lh sherpa-onnx-kws-zipformer-gigaspeech-3.3M-2024-01-01 + +The output is given below: + +.. 
code-block:: + + $ ls -lh sherpa-onnx-kws-zipformer-gigaspeech-3.3M-2024-01-01 + total 19M + -rw-r--r-- 1 kangwei root 240K Jan 19 15:25 bpe.model + -rw-r--r-- 1 kangwei root 48 Jan 19 15:25 configuration.json + -rw-r--r-- 1 kangwei root 272K Jan 19 15:25 decoder-epoch-12-avg-2-chunk-16-left-64.int8.onnx + -rw-r--r-- 1 kangwei root 1.1M Jan 19 15:25 decoder-epoch-12-avg-2-chunk-16-left-64.onnx + -rw-r--r-- 1 kangwei root 4.6M Jan 19 15:25 encoder-epoch-12-avg-2-chunk-16-left-64.int8.onnx + -rw-r--r-- 1 kangwei root 12M Jan 19 15:25 encoder-epoch-12-avg-2-chunk-16-left-64.onnx + -rw-r--r-- 1 kangwei root 160K Jan 19 15:25 joiner-epoch-12-avg-2-chunk-16-left-64.int8.onnx + -rw-r--r-- 1 kangwei root 628K Jan 19 15:25 joiner-epoch-12-avg-2-chunk-16-left-64.onnx + -rw-r--r-- 1 kangwei root 102 Jan 19 15:25 keywords_raw.txt + -rw-r--r-- 1 kangwei root 184 Jan 19 15:25 keywords.txt + -rw-r--r-- 1 kangwei root 743 Jan 19 15:25 README.md + drwxr-xr-x 6 kangwei root 0 Jan 19 15:25 test_wavs + -rw-r--r-- 1 kangwei root 4.9K Jan 19 15:25 tokens.txt + +Test the model +~~~~~~~~~~~~~~ + +.. hint:: + + It supports decoding only wave files of a single channel with 16-bit + encoded samples, while the sampling rate does not need to be 16 kHz. + +fp32 +^^^^ + +The following code shows how to use ``fp32`` models: + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + ./build/bin/sherpa-onnx-keyword-spotter \ + --encoder=sherpa-onnx-kws-zipformer-gigaspeech-3.3M-2024-01-01/encoder-epoch-12-avg-2-chunk-16-left-64.onnx \ + --decoder=sherpa-onnx-kws-zipformer-gigaspeech-3.3M-2024-01-01/decoder-epoch-12-avg-2-chunk-16-left-64.onnx \ + --joiner=sherpa-onnx-kws-zipformer-gigaspeech-3.3M-2024-01-01/joiner-epoch-12-avg-2-chunk-16-left-64.onnx \ + --tokens=sherpa-onnx-kws-zipformer-gigaspeech-3.3M-2024-01-01/tokens.txt \ + --keywords-file=sherpa-onnx-kws-zipformer-gigaspeech-3.3M-2024-01-01/test_wavs/test_keywords.txt \ + sherpa-onnx-kws-zipformer-gigaspeech-3.3M-2024-01-01/test_wavs/0.wav \ + sherpa-onnx-kws-zipformer-gigaspeech-3.3M-2024-01-01/test_wavs/1.wav + +.. note:: + + Please use ``./build/bin/Release/sherpa-onnx-keyword-spotter.exe`` for Windows. + +.. caution:: + + If you use Windows and get encoding issues, please run: + + .. code-block:: bash + + CHCP 65001 + + in your commandline. + +You should see the following output: + +.. 
code-block:: + + KeywordSpotterConfig(feat_config=FeatureExtractorConfig(sampling_rate=16000, feature_dim=80), model_config=OnlineModelConfig(transducer=OnlineTransducerModelConfig(encoder="sherpa-onnx-kws-zipformer-gigaspeech-3.3M-2024-01-01/encoder-epoch-12-avg-2-chunk-16-left-64.onnx", decoder="sherpa-onnx-kws-zipformer-gigaspeech-3.3M-2024-01-01/decoder-epoch-12-avg-2-chunk-16-left-64.onnx", joiner="sherpa-onnx-kws-zipformer-gigaspeech-3.3M-2024-01-01/joiner-epoch-12-avg-2-chunk-16-left-64.onnx"), paraformer=OnlineParaformerModelConfig(encoder="", decoder=""), wenet_ctc=OnlineWenetCtcModelConfig(model="", chunk_size=16, num_left_chunks=4), zipformer2_ctc=OnlineZipformer2CtcModelConfig(model=""), tokens="sherpa-onnx-kws-zipformer-gigaspeech-3.3M-2024-01-01/tokens.txt", num_threads=1, debug=False, provider="cpu", model_type=""), max_active_paths=4, num_trailing_blanks=1, keywords_score=1, keywords_threshold=0.25, keywords_file="sherpa-onnx-kws-zipformer-gigaspeech-3.3M-2024-01-01/test_wavs/test_keywords.txt") + 2024-01-19 15:32:46.420331393 [E:onnxruntime:, env.cc:254 ThreadMain] pthread_setaffinity_np failed for thread: 3492733, index: 16, mask: {17, 53, }, error code: 22 error msg: Invalid argument. Specify the number of threads explicitly so the affinity is not set. + 2024-01-19 15:32:46.420332978 [E:onnxruntime:, env.cc:254 ThreadMain] pthread_setaffinity_np failed for thread: 3492732, index: 15, mask: {16, 52, }, error code: 22 error msg: Invalid argument. Specify the number of threads explicitly so the affinity is not set. + sherpa-onnx-kws-zipformer-gigaspeech-3.3M-2024-01-01/test_wavs/0.wav + {"start_time":0.00, "keyword": "LIGHT UP", "timestamps": [3.04, 3.08, 3.12, 3.20], "tokens":[" ", "L", "IGHT", " UP"]} + + sherpa-onnx-kws-zipformer-gigaspeech-3.3M-2024-01-01/test_wavs/1.wav + {"start_time":0.00, "keyword": "LOVELY CHILD", "timestamps": [5.44, 5.56, 5.84, 6.00, 6.04], "tokens":[" LOVE", "LY", " CHI", "L", "D"]} + + sherpa-onnx-kws-zipformer-gigaspeech-3.3M-2024-01-01/test_wavs/1.wav + {"start_time":0.00, "keyword": "FOREVER", "timestamps": [10.88, 11.04, 11.08], "tokens":[" FOR", "E", "VER"]} + + +int8 +^^^^ + +The following code shows how to use ``int8`` models: + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + ./build/bin/sherpa-onnx-keyword-spotter \ + --encoder=sherpa-onnx-kws-zipformer-gigaspeech-3.3M-2024-01-01/encoder-epoch-12-avg-2-chunk-16-left-64.int8.onnx \ + --decoder=sherpa-onnx-kws-zipformer-gigaspeech-3.3M-2024-01-01/decoder-epoch-12-avg-2-chunk-16-left-64.int8.onnx \ + --joiner=sherpa-onnx-kws-zipformer-gigaspeech-3.3M-2024-01-01/joiner-epoch-12-avg-2-chunk-16-left-64.int8.onnx \ + --tokens=sherpa-onnx-kws-zipformer-gigaspeech-3.3M-2024-01-01/tokens.txt \ + --keywords-file=sherpa-onnx-kws-zipformer-gigaspeech-3.3M-2024-01-01/test_wavs/test_keywords.txt \ + sherpa-onnx-kws-zipformer-gigaspeech-3.3M-2024-01-01/test_wavs/0.wav \ + sherpa-onnx-kws-zipformer-gigaspeech-3.3M-2024-01-01/test_wavs/1.wav + + +.. 
code-block:: + + KeywordSpotterConfig(feat_config=FeatureExtractorConfig(sampling_rate=16000, feature_dim=80), model_config=OnlineModelConfig(transducer=OnlineTransducerModelConfig(encoder="sherpa-onnx-kws-zipformer-gigaspeech-3.3M-2024-01-01/encoder-epoch-12-avg-2-chunk-16-left-64.int8.onnx", decoder="sherpa-onnx-kws-zipformer-gigaspeech-3.3M-2024-01-01/decoder-epoch-12-avg-2-chunk-16-left-64.int8.onnx", joiner="sherpa-onnx-kws-zipformer-gigaspeech-3.3M-2024-01-01/joiner-epoch-12-avg-2-chunk-16-left-64.int8.onnx"), paraformer=OnlineParaformerModelConfig(encoder="", decoder=""), wenet_ctc=OnlineWenetCtcModelConfig(model="", chunk_size=16, num_left_chunks=4), zipformer2_ctc=OnlineZipformer2CtcModelConfig(model=""), tokens="sherpa-onnx-kws-zipformer-gigaspeech-3.3M-2024-01-01/tokens.txt", num_threads=1, debug=False, provider="cpu", model_type=""), max_active_paths=4, num_trailing_blanks=1, keywords_score=1, keywords_threshold=0.25, keywords_file="sherpa-onnx-kws-zipformer-gigaspeech-3.3M-2024-01-01/test_wavs/test_keywords.txt") + 2024-01-19 15:31:39.743344642 [E:onnxruntime:, env.cc:254 ThreadMain] pthread_setaffinity_np failed for thread: 3492115, index: 15, mask: {16, 52, }, error code: 22 error msg: Invalid argument. Specify the number of threads explicitly so the affinity is not set. + 2024-01-19 15:31:39.743346583 [E:onnxruntime:, env.cc:254 ThreadMain] pthread_setaffinity_np failed for thread: 3492116, index: 16, mask: {17, 53, }, error code: 22 error msg: Invalid argument. Specify the number of threads explicitly so the affinity is not set. + sherpa-onnx-kws-zipformer-gigaspeech-3.3M-2024-01-01/test_wavs/0.wav + {"start_time":0.00, "keyword": "LIGHT UP", "timestamps": [3.04, 3.08, 3.12, 3.16], "tokens":[" ", "L", "IGHT", " UP"]} + + sherpa-onnx-kws-zipformer-gigaspeech-3.3M-2024-01-01/test_wavs/1.wav + {"start_time":0.00, "keyword": "LOVELY CHILD", "timestamps": [5.36, 5.60, 5.84, 6.00, 6.04], "tokens":[" LOVE", "LY", " CHI", "L", "D"]} + + sherpa-onnx-kws-zipformer-gigaspeech-3.3M-2024-01-01/test_wavs/1.wav + {"start_time":0.00, "keyword": "FOREVER", "timestamps": [10.88, 11.04, 11.08], "tokens":[" FOR", "E", "VER"]} + + +Customize your own keywords +~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +To customize your own keywords, the only thing you need to do is replacing the ``--keywords-file``. The keywords file is generated as follows: + +For example your keywords are (keywords_raw.txt): + +.. code-block:: + + HELLO WORLD + HI GOOGLE + HEY SIRI + +Run the following command: + +.. code-block:: + + sherpa-onnx-cli text2token \ + --tokens sherpa-onnx-kws-zipformer-gigaspeech-3.3M-2024-01-01/tokens.txt \ + --tokens-type bpe \ + --bpe-model sherpa-onnx-kws-zipformer-gigaspeech-3.3M-2024-01-01/bpe.model \ + keywords_raw.txt keywords.txt + +The ``keywords.txt`` looks like: + +.. code-block:: + + ▁HE LL O ▁WORLD + ▁HI ▁GO O G LE + ▁HE Y ▁S I RI + +.. note:: + + If you install sherpa-onnx from sources (i.e. not by pip), you can use the + alternative script in `scripts`, the usage is almost the same as the command + line tool, read the help information by: + + .. code-block:: + + python3 scripts/text2token.py --help diff --git a/docs/source/onnx/lazarus/generate-subtitles.rst b/docs/source/onnx/lazarus/generate-subtitles.rst new file mode 100644 index 000000000..7c3d23c07 --- /dev/null +++ b/docs/source/onnx/lazarus/generate-subtitles.rst @@ -0,0 +1,507 @@ +Generate subtitles +================== + +This page describes how to run the code in the following directory: + + ``_ + +.. 
hint::
+
+   Before you continue, we assume you have installed `Lazarus`_.
+
+Screenshots on different platforms
+----------------------------------
+
+The same code can be compiled without any modifications for different operating systems
+and architectures.
+
+That is `WOCA `_,
+
+   Write once, compile anywhere.
+
+The following screenshots give an example of that.
+
+.. tabs::
+
+   .. tab:: Linux x64 screenshot
+
+      .. figure:: ./pic/generate-subtitles/linux-x64.jpg
+         :alt: Linux-x64
+         :width: 90%
+
+         Linux-x64 screenshot
+
+
+   .. tab:: Windows x64 screenshot
+
+      .. figure:: ./pic/generate-subtitles/windows-x64.jpg
+         :alt: Windows-x64
+         :width: 90%
+
+         Windows-x64 screenshot
+
+   .. tab:: macOS x64 screenshot
+
+      .. figure:: ./pic/generate-subtitles/macos-x64.jpg
+         :alt: macos-x64
+         :width: 90%
+
+         macOS-x64 screenshot
+
+Get sherpa-onnx libraries
+-------------------------
+
+`sherpa-onnx`_ is implemented in C++. To use it with Object Pascal, we have to
+get either the static library or the dynamic library for `sherpa-onnx`_.
+
+To achieve that, you can either build `sherpa-onnx`_ from source ``or`` download
+pre-built libraries from
+
+  ``_
+
+1. Build sherpa-onnx from source
+::::::::::::::::::::::::::::::::
+
+The following code builds shared libraries for `sherpa-onnx`_:
+
+.. code-block:: bash
+
+   mkdir -p $HOME/open-source/
+   cd $HOME/open-source/
+   git clone https://github.com/k2-fsa/sherpa-onnx
+   cd sherpa-onnx
+
+   # The build directory must be named "build"
+   # for shared libraries
+
+   mkdir build
+   cd build
+
+   cmake \
+     -DBUILD_SHARED_LIBS=ON \
+     -DCMAKE_BUILD_TYPE=Release \
+     -DCMAKE_INSTALL_PREFIX=./install \
+     ..
+   cmake --build . --target install --config Release
+
+The following code builds static libraries for `sherpa-onnx`_:
+
+.. code-block:: bash
+
+   mkdir -p $HOME/open-source/
+   cd $HOME/open-source/
+   git clone https://github.com/k2-fsa/sherpa-onnx
+   cd sherpa-onnx
+
+   # The build directory must be named "build-static"
+   # for static libraries
+
+   mkdir build-static
+   cd build-static
+
+   cmake \
+     -DBUILD_SHARED_LIBS=OFF \
+     -DCMAKE_BUILD_TYPE=Release \
+     -DCMAKE_INSTALL_PREFIX=./install \
+     ..
+   cmake --build . --target install --config Release
+
+.. caution::
+
+   - For building shared libraries, the build directory must be ``build``.
+
+   - For building static libraries, the build directory must be ``build-static``.
+
+   If you want to learn why there are such constraints, please search for
+   ``build-static`` in the file `generate_subtitles.lpi `_
+
+2. Download pre-built libraries
+:::::::::::::::::::::::::::::::
+
+If you don't want to build `sherpa-onnx`_ from source, please download pre-built libraries
+from ``_.
+
+We suggest that you always use the latest release.
+
+.. 
list-table:: + + * - **** + - Required dynamic library files + - Required static library files + * - Windows + - ``sherpa-onnx-c-api.dll``, ``onnxruntime.dll`` + - ``N/A`` (We only support dynamic linking with sherpa-onnx in Lazarus on Windows) + * - Linux + - - ``libsherpa-onnx-c-api.so`` + - ``libonnxruntime.so`` + - - ``libsherpa-onnx-c-api.a`` + - ``libsherpa-onnx-core.a`` + - ``libkaldi-decoder-core.a`` + - ``libsherpa-onnx-kaldifst-core.a`` + - ``libsherpa-onnx-fstfar.a`` + - ``libsherpa-onnx-fst.a`` + - ``libkaldi-native-fbank-core.a`` + - ``libpiper_phonemize`` + - ``liblibespeak-ng.a`` + - ``libucd.a`` + - ``liblibonnxruntime.a`` + - ``libssentencepiece_core.a`` + * - macOS + - - ``libsherpa-onnx-c-api.dylib`` + - ``libonnxruntime.1.17.1.dylib`` + - - ``libsherpa-onnx-c-api.a`` + - ``libsherpa-onnx-core.a`` + - ``libkaldi-decoder-core.a`` + - ``libsherpa-onnx-kaldifst-core.a`` + - ``libsherpa-onnx-fstfar.a`` + - ``libsherpa-onnx-fst.a`` + - ``libkaldi-native-fbank-core.a`` + - ``libpiper_phonemize`` + - ``liblibespeak-ng.a`` + - ``libucd.a`` + - ``liblibonnxruntime.a`` + - ``libssentencepiece_core.a`` + +If you download ``shared`` libraries, please create a ``build`` directory +inside the ``sherpa-onnx`` project directory and put the library files into +``build/install/lib``. An example on my macOS is given below:: + + (py38) fangjuns-MacBook-Pro:sherpa-onnx fangjun$ pwd + /Users/fangjun/open-source/sherpa-onnx + (py38) fangjuns-MacBook-Pro:sherpa-onnx fangjun$ ls -lh build/install/lib + total 59696 + -rw-r--r-- 1 fangjun staff 25M Aug 14 14:09 libonnxruntime.1.17.1.dylib + lrwxr-xr-x 1 fangjun staff 27B Aug 14 14:18 libonnxruntime.dylib -> libonnxruntime.1.17.1.dylib + -rwxr-xr-x 1 fangjun staff 3.9M Aug 15 15:01 libsherpa-onnx-c-api.dylib + + +If you download ``static`` libraries, please create a ``build-static`` directory +inside the ``sherpa-onnx`` project directory and put the library files into +``build-static/install/lib``. An example on my macOS is given below:: + + (py38) fangjuns-MacBook-Pro:sherpa-onnx fangjun$ pwd + /Users/fangjun/open-source/sherpa-onnx + (py38) fangjuns-MacBook-Pro:sherpa-onnx fangjun$ ls -lh build-static/install/lib + total 138176 + -rw-r--r-- 1 fangjun staff 438K Aug 15 15:03 libespeak-ng.a + -rw-r--r-- 1 fangjun staff 726K Aug 15 15:03 libkaldi-decoder-core.a + -rw-r--r-- 1 fangjun staff 198K Aug 15 15:03 libkaldi-native-fbank-core.a + -rw-r--r-- 1 fangjun staff 56M Aug 14 14:25 libonnxruntime.a + -rw-r--r-- 1 fangjun staff 421K Aug 15 15:03 libpiper_phonemize.a + -rw-r--r-- 1 fangjun staff 87K Aug 15 15:03 libsherpa-onnx-c-api.a + -rw-r--r-- 1 fangjun staff 5.7M Aug 15 15:03 libsherpa-onnx-core.a + -rw-r--r-- 1 fangjun staff 2.3M Aug 15 15:03 libsherpa-onnx-fst.a + -rw-r--r-- 1 fangjun staff 30K Aug 15 15:03 libsherpa-onnx-fstfar.a + -rw-r--r-- 1 fangjun staff 1.6M Aug 15 15:03 libsherpa-onnx-kaldifst-core.a + -rw-r--r-- 1 fangjun staff 131K Aug 15 15:03 libsherpa-onnx-portaudio_static.a + -rw-r--r-- 1 fangjun staff 147K Aug 15 15:03 libssentencepiece_core.a + -rw-r--r-- 1 fangjun staff 197K Aug 15 15:03 libucd.a + +Build the generate_subtitles project +------------------------------------ + +Now you can start Lazarus and open `generate_subtitles.lpi `_ . + +Click the menu ``Run`` -> ``Compile``. It should be able to build the project without any errors. + +.. hint:: + + Please ignore warnings, if there are any. + +After building, you should find the following files inside the directory `generate_subtitles `_: + +.. tabs:: + + .. 
tab:: macOS + + .. code-block:: + + (py38) fangjuns-MacBook-Pro:generate_subtitles fangjun$ pwd + /Users/fangjun/open-source/sherpa-onnx/lazarus-examples/generate_subtitles + (py38) fangjuns-MacBook-Pro:generate_subtitles fangjun$ ls -lh generate_subtitles generate_subtitles.app/ + -rwxr-xr-x 1 fangjun staff 25M Aug 15 20:44 generate_subtitles + + generate_subtitles.app/: + total 0 + drwxr-xr-x 6 fangjun staff 192B Aug 14 23:01 Contents + + .. tab:: Windows + + .. code-block:: bash + + fangjun@M-0LQSDCC2RV398 C:\Users\fangjun\open-source\sherpa-onnx\lazarus-examples\generate_subtitles>dir generate_subtitles.exe + Volume in drive C is 系统 + Volume Serial Number is 8E17-A21F + + Directory of C:\Users\fangjun\open-source\sherpa-onnx\lazarus-examples\generate_subtitles + + 08/15/2024 09:39 PM 2,897,408 generate_subtitles.exe + 1 File(s) 2,897,408 bytes + 0 Dir(s) 38,524,039,168 bytes free + + .. tab:: Linux + + .. code-block:: bash + + cd lazarus-examples/generate_subtitles + ls -lh generate_subtitles + + -rwxr-xr-x 1 runner docker 3.1M Aug 16 03:37 generate_subtitles + +Now you can start the generated executable ``generate_subtitles`` and you should +get the screenshot like the one listed at the start of this section. + +If you get any issues about ``shared libraries not found``, please copy the shared +library files from ``build/install/lib`` to the directory ``lazarus-examples/generate_subtitles`` +or you can set the environment variable ``DYLD_LIBRARY_PATH`` (for macOS) and ``LD_LIBRARY_PATH`` (for Linux). + +Download models +--------------- + +The generated executable expects that there are model files located in the same directory. + +Download the VAD model +:::::::::::::::::::::: + +.. code-block:: bash + + cd lazarus-examples/generate_subtitles + wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx + +and put ``silero_vad.onnx`` into ``lazarus-examples/generate_subtitles``. + +.. hint:: + + If you are using macOS, please put it into ``lazarus-examples/generate_subtitles/generate_subtitles.app/Contents/Resources/`` + + +Download a speech recognition model +::::::::::::::::::::::::::::::::::: + +The executable expects a non-streaming speech recognition model. Currently, we have supported the following +types of models + + - Whisper + - Moonshine + - Zipformer transducer + - NeMo transducer + - SenseVoice + - Paraformer + - TeleSpeech CTC + +You can download them from ``_ + +Note that you have to rename the model files after downloading. + +.. list-table:: + + * - **** + - Expected filenames + * - Whisper + - - tokens.txt + - whisper-encoder.onnx + - whisper-decoder.onnx + * - Moonshine + - - tokens.txt + - moonshine-preprocessor.onnx + - moonshine-encoder.onnx + - moonshine-uncached-decoder.onnx + - moonshine-cached-decoder.onnx + * - Zipformer transducer + - - tokens.txt + - transducer-encoder.onnx + - transducer-decoder.onnx + - transducer-joiner.onnx + * - NeMo transducer + - - tokens.txt + - nemo-transducer-encoder.onnx + - nemo-transducer-decoder.onnx + - nemo-transducer-joiner.onnx + * - SenseVoice + - - tokens.txt + - sense-voice.onnx + * - Paraformer + - - tokens.txt + - paraformer.onnx + * - TeleSpeech + - - tokens.txt + - telespeech.onnx + +We give several examples below. + +1. Wisper +::::::::: + +.. 
code-block:: bash
+
+   cd lazarus-examples/generate_subtitles
+   curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-whisper-tiny.en.tar.bz2
+
+   tar xvf sherpa-onnx-whisper-tiny.en.tar.bz2
+   rm sherpa-onnx-whisper-tiny.en.tar.bz2
+
+   cd sherpa-onnx-whisper-tiny.en
+
+   mv -v tiny.en-encoder.int8.onnx ../whisper-encoder.onnx
+   mv -v tiny.en-decoder.int8.onnx ../whisper-decoder.onnx
+   mv -v tiny.en-tokens.txt ../tokens.txt
+
+   cd ..
+   rm -rf sherpa-onnx-whisper-tiny.en
+
+You can replace ``tiny.en`` with other types of Whisper models, e.g., ``tiny``, ``base``, etc.
+
+2. Zipformer transducer
+:::::::::::::::::::::::
+
+We give two examples below for the Zipformer transducer.
+
+**Example 1**
+
+.. code-block:: bash
+
+   cd lazarus-examples/generate_subtitles
+   curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/icefall-asr-zipformer-wenetspeech-20230615.tar.bz2
+   tar xvf icefall-asr-zipformer-wenetspeech-20230615.tar.bz2
+   rm icefall-asr-zipformer-wenetspeech-20230615.tar.bz2
+
+   cd icefall-asr-zipformer-wenetspeech-20230615
+
+   mv -v data/lang_char/tokens.txt ../
+
+   mv -v exp/encoder-epoch-12-avg-4.int8.onnx ../transducer-encoder.onnx
+   mv -v exp/decoder-epoch-12-avg-4.onnx ../transducer-decoder.onnx
+   mv -v exp/joiner-epoch-12-avg-4.int8.onnx ../transducer-joiner.onnx
+
+   cd ..
+   rm -rf icefall-asr-zipformer-wenetspeech-20230615
+
+**Example 2**
+
+.. code-block:: bash
+
+   cd lazarus-examples/generate_subtitles
+   curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-zipformer-ja-reazonspeech-2024-08-01.tar.bz2
+   tar xvf sherpa-onnx-zipformer-ja-reazonspeech-2024-08-01.tar.bz2
+   rm sherpa-onnx-zipformer-ja-reazonspeech-2024-08-01.tar.bz2
+
+   cd sherpa-onnx-zipformer-ja-reazonspeech-2024-08-01
+
+   mv ./tokens.txt ../
+   mv encoder-epoch-99-avg-1.int8.onnx ../transducer-encoder.onnx
+   mv decoder-epoch-99-avg-1.onnx ../transducer-decoder.onnx
+   mv joiner-epoch-99-avg-1.int8.onnx ../transducer-joiner.onnx
+
+   cd ../
+
+   rm -rf sherpa-onnx-zipformer-ja-reazonspeech-2024-08-01
+
+3. NeMo transducer
+::::::::::::::::::
+
+.. code-block:: bash
+
+   cd lazarus-examples/generate_subtitles
+   curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-nemo-fast-conformer-transducer-be-de-en-es-fr-hr-it-pl-ru-uk-20k.tar.bz2
+   tar xvf sherpa-onnx-nemo-fast-conformer-transducer-be-de-en-es-fr-hr-it-pl-ru-uk-20k.tar.bz2
+   rm sherpa-onnx-nemo-fast-conformer-transducer-be-de-en-es-fr-hr-it-pl-ru-uk-20k.tar.bz2
+
+   cd sherpa-onnx-nemo-fast-conformer-transducer-be-de-en-es-fr-hr-it-pl-ru-uk-20k
+
+   mv tokens.txt ../
+   mv encoder.onnx ../nemo-transducer-encoder.onnx
+   mv decoder.onnx ../nemo-transducer-decoder.onnx
+   mv joiner.onnx ../nemo-transducer-joiner.onnx
+
+   cd ../
+   rm -rf sherpa-onnx-nemo-fast-conformer-transducer-be-de-en-es-fr-hr-it-pl-ru-uk-20k
+
+4. SenseVoice
+:::::::::::::
+
+.. code-block:: bash
+
+   cd lazarus-examples/generate_subtitles
+   curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2
+   tar xvf sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2
+   rm sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2
+
+   cd sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17
+
+   mv tokens.txt ../
+   mv model.int8.onnx ../sense-voice.onnx
+
+   cd ../
+   rm -rf sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17
+
+5. Paraformer
+:::::::::::::
+
+.. 
code-block:: bash + + cd lazarus-examples/generate_subtitles + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-paraformer-zh-2023-09-14.tar.bz2 + + tar xvf sherpa-onnx-paraformer-zh-2023-09-14.tar.bz2 + rm sherpa-onnx-paraformer-zh-2023-09-14.tar.bz2 + + cd sherpa-onnx-paraformer-zh-2023-09-14 + + mv tokens.txt ../ + mv model.int8.onnx ../paraformer.onnx + + cd ../ + rm -rf sherpa-onnx-paraformer-zh-2023-09-14 + +6. TeleSpeech +::::::::::::: + +.. code-block:: bash + + cd lazarus-examples/generate_subtitles + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04.tar.bz2 + + tar xvf sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04.tar.bz2 + rm sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04.tar.bz2 + + cd sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04 + + mv tokens.txt ../ + mv model.int8.onnx ../telespeech.onnx + + cd ../ + rm -rf sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04 + +7. Moonshine +::::::::::::: + +.. code-block:: bash + + cd lazarus-examples/generate_subtitles + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-moonshine-tiny-en-int8.tar.bz2 + + tar xvf sherpa-onnx-moonshine-tiny-en-int8.tar.bz2 + rm sherpa-onnx-moonshine-tiny-en-int8.tar.bz2 + + cd sherpa-onnx-moonshine-tiny-en-int8 + + mv preprocess.onnx ../moonshine-preprocessor.onnx + mv encode.int8.onnx ../moonshine-encoder.onnx + mv uncached_decode.int8.onnx ../moonshine-uncached-decoder.onnx + mv cached_decode.int8.onnx ../moonshine-cached-decoder.onnx + + mv tokens.txt ../ + + cd ../ + rm -rf sherpa-onnx-moonshine-tiny-en-int8 + +For the more curious +-------------------- + +If you want to find out how we generate the APPs in +``_, +please have a look at + + - ``_ + - ``_ + - ``_ diff --git a/docs/source/onnx/lazarus/index.rst b/docs/source/onnx/lazarus/index.rst new file mode 100644 index 000000000..ebd48c7bd --- /dev/null +++ b/docs/source/onnx/lazarus/index.rst @@ -0,0 +1,20 @@ +.. _sherpa-onnx-lazarus: + +Lazarus +======= + +We also provide examples for developing with ``_ +using `Object Pascal `_. + +We provide support for the following platforms and architectures: + + - Linux-x64 + - Windows-x64 + - macOS-x64 + - macOS-arm64 + +.. 
toctree:: + :maxdepth: 5 + + ./pre-built-app.rst + ./generate-subtitles.rst diff --git a/docs/source/onnx/lazarus/pic/generate-subtitles/linux-x64.jpg b/docs/source/onnx/lazarus/pic/generate-subtitles/linux-x64.jpg new file mode 100644 index 000000000..080b404c9 Binary files /dev/null and b/docs/source/onnx/lazarus/pic/generate-subtitles/linux-x64.jpg differ diff --git a/docs/source/onnx/lazarus/pic/generate-subtitles/macos-x64.jpg b/docs/source/onnx/lazarus/pic/generate-subtitles/macos-x64.jpg new file mode 100644 index 000000000..ca05adc4f Binary files /dev/null and b/docs/source/onnx/lazarus/pic/generate-subtitles/macos-x64.jpg differ diff --git a/docs/source/onnx/lazarus/pic/generate-subtitles/windows-x64.jpg b/docs/source/onnx/lazarus/pic/generate-subtitles/windows-x64.jpg new file mode 100644 index 000000000..ae042e4ae Binary files /dev/null and b/docs/source/onnx/lazarus/pic/generate-subtitles/windows-x64.jpg differ diff --git a/docs/source/onnx/lazarus/pre-built-app.rst b/docs/source/onnx/lazarus/pre-built-app.rst new file mode 100644 index 000000000..5a8506a2b --- /dev/null +++ b/docs/source/onnx/lazarus/pre-built-app.rst @@ -0,0 +1,26 @@ +Pre-built APPs using Lazarus +---------------------------- + +This page lists some pre-built APPs using Lazarus with Object Pascal. + +.. hint:: + + It runs locally on CPU without accessing the network. + + All you need is to download it, unzip it, and then double click it to run it. + + No need to install anything! + + 本地 ``CPU`` 运行,无需联网。 + + 下载、解压、双击即可。不需要额外安装任何东西 + +.. list-table:: + + * - **** + - 中国用户 + - URL + * - Generate subtitles (生成字幕) + - `点这里 `_ + - ``_ + diff --git a/docs/source/onnx/moonshine/android.rst b/docs/source/onnx/moonshine/android.rst new file mode 100644 index 000000000..79892d2cd --- /dev/null +++ b/docs/source/onnx/moonshine/android.rst @@ -0,0 +1,20 @@ +Android APKs for Moonshine +========================== + +You can find Android APKs for `Moonshine`_ with VAD at the following page + + ``_ + +.. figure:: ./pic/moonshine-vad-apk.jpg + :alt: screenshot of APKs for Moonshine + :align: center + :width: 600 + + Android APKs about `Moonshine`_ + VAD for speech recognition + +The source code for the APK can be found at + + ``_ + +Please see :ref:`sherpa-onnx-android` for more details. + diff --git a/docs/source/onnx/moonshine/c.rst b/docs/source/onnx/moonshine/c.rst new file mode 100644 index 000000000..2c0f27a06 --- /dev/null +++ b/docs/source/onnx/moonshine/c.rst @@ -0,0 +1,13 @@ +C API examples +============== + +Please see + + ``_ + +and :ref:`sherpa-onnx-c-api`. 
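+
+Assuming you have built `sherpa-onnx`_ with ``-DBUILD_SHARED_LIBS=ON`` so that
+``libsherpa-onnx-c-api`` and ``libonnxruntime`` end up under ``build/install/lib``,
+the following is a minimal sketch of how you might compile and run your own C
+program against the C API. The source file name ``my-moonshine-demo.c`` and the
+install path are placeholders, not files shipped by `sherpa-onnx`_; please use the
+linked examples above as the authoritative reference.
+
+.. code-block:: bash
+
+   # A sketch under the assumptions stated above; adjust paths for your setup.
+   export SHERPA_ONNX_INSTALL=/path/to/sherpa-onnx/build/install
+
+   gcc -o my-moonshine-demo my-moonshine-demo.c \
+     -I $SHERPA_ONNX_INSTALL/include \
+     -L $SHERPA_ONNX_INSTALL/lib \
+     -lsherpa-onnx-c-api -lonnxruntime
+
+   # The shared libraries must be found at runtime
+   export LD_LIBRARY_PATH=$SHERPA_ONNX_INSTALL/lib:$LD_LIBRARY_PATH    # Linux
+   # export DYLD_LIBRARY_PATH=$SHERPA_ONNX_INSTALL/lib:$DYLD_LIBRARY_PATH  # macOS
+
+   ./my-moonshine-demo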
+ +If you want to use the C++ API, which is just a wrapper around the C API, please see +the following example: + + ``_ diff --git a/docs/source/onnx/moonshine/code/sherpa-onnx-moonshine-base-en-int8.txt b/docs/source/onnx/moonshine/code/sherpa-onnx-moonshine-base-en-int8.txt new file mode 100644 index 000000000..b21477580 --- /dev/null +++ b/docs/source/onnx/moonshine/code/sherpa-onnx-moonshine-base-en-int8.txt @@ -0,0 +1,14 @@ +/Users/fangjun/open-source/sherpa-onnx/sherpa-onnx/csrc/parse-options.cc:Read:375 ./build/bin/sherpa-onnx-offline --moonshine-preprocessor=./sherpa-onnx-moonshine-base-en-int8/preprocess.onnx --moonshine-encoder=./sherpa-onnx-moonshine-base-en-int8/encode.int8.onnx --moonshine-uncached-decoder=./sherpa-onnx-moonshine-base-en-int8/uncached_decode.int8.onnx --moonshine-cached-decoder=./sherpa-onnx-moonshine-base-en-int8/cached_decode.int8.onnx --tokens=./sherpa-onnx-moonshine-base-en-int8/tokens.txt --num-threads=1 ./sherpa-onnx-moonshine-base-en-int8/test_wavs/0.wav + +OfflineRecognizerConfig(feat_config=FeatureExtractorConfig(sampling_rate=16000, feature_dim=80, low_freq=20, high_freq=-400, dither=0), model_config=OfflineModelConfig(transducer=OfflineTransducerModelConfig(encoder_filename="", decoder_filename="", joiner_filename=""), paraformer=OfflineParaformerModelConfig(model=""), nemo_ctc=OfflineNemoEncDecCtcModelConfig(model=""), whisper=OfflineWhisperModelConfig(encoder="", decoder="", language="", task="transcribe", tail_paddings=-1), tdnn=OfflineTdnnModelConfig(model=""), zipformer_ctc=OfflineZipformerCtcModelConfig(model=""), wenet_ctc=OfflineWenetCtcModelConfig(model=""), sense_voice=OfflineSenseVoiceModelConfig(model="", language="auto", use_itn=False), moonshine=OfflineMoonshineModelConfig(preprocessor="./sherpa-onnx-moonshine-base-en-int8/preprocess.onnx", encoder="./sherpa-onnx-moonshine-base-en-int8/encode.int8.onnx", uncached_decoder="./sherpa-onnx-moonshine-base-en-int8/uncached_decode.int8.onnx", cached_decoder="./sherpa-onnx-moonshine-base-en-int8/cached_decode.int8.onnx"), telespeech_ctc="", tokens="./sherpa-onnx-moonshine-base-en-int8/tokens.txt", num_threads=1, debug=False, provider="cpu", model_type="", modeling_unit="cjkchar", bpe_vocab=""), lm_config=OfflineLMConfig(model="", scale=0.5), ctc_fst_decoder_config=OfflineCtcFstDecoderConfig(graph="", max_active=3000), decoding_method="greedy_search", max_active_paths=4, hotwords_file="", hotwords_score=1.5, blank_penalty=0, rule_fsts="", rule_fars="") +Creating recognizer ... +Started +Done! 
+ +./sherpa-onnx-moonshine-base-en-int8/test_wavs/0.wav +{"lang": "", "emotion": "", "event": "", "text": " After early nightfall, the yellow lamps would light up here and there the squalid quarter of the brothels.", "timestamps": [], "tokens":[" After", " early", " night", "fall", ",", " the", " yellow", " l", "amps", " would", " light", " up", " here", " and", " there", " the", " squ", "al", "id", " quarter", " of", " the", " bro", "th", "els", "."], "words": []} +---- +num threads: 1 +decoding method: greedy_search +Elapsed seconds: 0.438 s +Real time factor (RTF): 0.438 / 6.625 = 0.066 diff --git a/docs/source/onnx/moonshine/code/sherpa-onnx-moonshine-tiny-en-int8.txt b/docs/source/onnx/moonshine/code/sherpa-onnx-moonshine-tiny-en-int8.txt new file mode 100644 index 000000000..d2022d99e --- /dev/null +++ b/docs/source/onnx/moonshine/code/sherpa-onnx-moonshine-tiny-en-int8.txt @@ -0,0 +1,14 @@ +/Users/fangjun/open-source/sherpa-onnx/sherpa-onnx/csrc/parse-options.cc:Read:375 ./build/bin/sherpa-onnx-offline --moonshine-preprocessor=./sherpa-onnx-moonshine-tiny-en-int8/preprocess.onnx --moonshine-encoder=./sherpa-onnx-moonshine-tiny-en-int8/encode.int8.onnx --moonshine-uncached-decoder=./sherpa-onnx-moonshine-tiny-en-int8/uncached_decode.int8.onnx --moonshine-cached-decoder=./sherpa-onnx-moonshine-tiny-en-int8/cached_decode.int8.onnx --tokens=./sherpa-onnx-moonshine-tiny-en-int8/tokens.txt --num-threads=1 ./sherpa-onnx-moonshine-tiny-en-int8/test_wavs/0.wav + +OfflineRecognizerConfig(feat_config=FeatureExtractorConfig(sampling_rate=16000, feature_dim=80, low_freq=20, high_freq=-400, dither=0), model_config=OfflineModelConfig(transducer=OfflineTransducerModelConfig(encoder_filename="", decoder_filename="", joiner_filename=""), paraformer=OfflineParaformerModelConfig(model=""), nemo_ctc=OfflineNemoEncDecCtcModelConfig(model=""), whisper=OfflineWhisperModelConfig(encoder="", decoder="", language="", task="transcribe", tail_paddings=-1), tdnn=OfflineTdnnModelConfig(model=""), zipformer_ctc=OfflineZipformerCtcModelConfig(model=""), wenet_ctc=OfflineWenetCtcModelConfig(model=""), sense_voice=OfflineSenseVoiceModelConfig(model="", language="auto", use_itn=False), moonshine=OfflineMoonshineModelConfig(preprocessor="./sherpa-onnx-moonshine-tiny-en-int8/preprocess.onnx", encoder="./sherpa-onnx-moonshine-tiny-en-int8/encode.int8.onnx", uncached_decoder="./sherpa-onnx-moonshine-tiny-en-int8/uncached_decode.int8.onnx", cached_decoder="./sherpa-onnx-moonshine-tiny-en-int8/cached_decode.int8.onnx"), telespeech_ctc="", tokens="./sherpa-onnx-moonshine-tiny-en-int8/tokens.txt", num_threads=1, debug=False, provider="cpu", model_type="", modeling_unit="cjkchar", bpe_vocab=""), lm_config=OfflineLMConfig(model="", scale=0.5), ctc_fst_decoder_config=OfflineCtcFstDecoderConfig(graph="", max_active=3000), decoding_method="greedy_search", max_active_paths=4, hotwords_file="", hotwords_score=1.5, blank_penalty=0, rule_fsts="", rule_fars="") +Creating recognizer ... +Started +Done! 
+ +./sherpa-onnx-moonshine-tiny-en-int8/test_wavs/0.wav +{"lang": "", "emotion": "", "event": "", "text": " After early nightfall, the yellow lamps would light up here and there the squalid quarter of the brothels.", "timestamps": [], "tokens":[" After", " early", " night", "fall", ",", " the", " yellow", " l", "amps", " would", " light", " up", " here", " and", " there", " the", " squ", "al", "id", " quarter", " of", " the", " bro", "th", "els", "."], "words": []} +---- +num threads: 1 +decoding method: greedy_search +Elapsed seconds: 0.213 s +Real time factor (RTF): 0.213 / 6.625 = 0.032 diff --git a/docs/source/onnx/moonshine/csharp.rst b/docs/source/onnx/moonshine/csharp.rst new file mode 100644 index 000000000..ca73456ab --- /dev/null +++ b/docs/source/onnx/moonshine/csharp.rst @@ -0,0 +1,10 @@ +C# API examples +=============== + +Please see + + ``_ + +and + + ``_ diff --git a/docs/source/onnx/moonshine/dart.rst b/docs/source/onnx/moonshine/dart.rst new file mode 100644 index 000000000..be53d68b8 --- /dev/null +++ b/docs/source/onnx/moonshine/dart.rst @@ -0,0 +1,11 @@ +Dart API examples +================= + +Please see + + - Decoding a file: ``_ + +and + + - Decoding a file with VAD: ``_ + diff --git a/docs/source/onnx/moonshine/go.rst b/docs/source/onnx/moonshine/go.rst new file mode 100644 index 000000000..30a9c712b --- /dev/null +++ b/docs/source/onnx/moonshine/go.rst @@ -0,0 +1,11 @@ +Go API examples +=============== + +Please see + + ``_ + +and + + ``_ + diff --git a/docs/source/onnx/moonshine/huggingface-space.rst b/docs/source/onnx/moonshine/huggingface-space.rst new file mode 100644 index 000000000..4318ef81b --- /dev/null +++ b/docs/source/onnx/moonshine/huggingface-space.rst @@ -0,0 +1,28 @@ +Huggingface space +================= + +You can try `Moonshine`_ with `sherpa-onnx`_ with the following huggingface spaces + + - For short audio: ``_ + - For generating subtitles (support very long audio/video files): ``_ + +.. hint:: + + You don't need to install anything. All you need is a browser. + + You can even run it on your phone or tablet. + +.. figure:: ./pic/moonshine-hf-space-1.jpg + :alt: screenshot of hf space for Moonshine + :align: center + :width: 600 + + Try `Moonshine`_ in our Huggingface space with `sherpa-onnx`_ for short audio + + +.. figure:: ./pic/moonshine-hf-space-2.jpg + :alt: screenshot of hf space for Moonshine about generating subtitles + :align: center + :width: 600 + + Try `Moonshine`_ in our Huggingface space with `sherpa-onnx`_ for generating subtitles diff --git a/docs/source/onnx/moonshine/index.rst b/docs/source/onnx/moonshine/index.rst new file mode 100644 index 000000000..5c622b478 --- /dev/null +++ b/docs/source/onnx/moonshine/index.rst @@ -0,0 +1,22 @@ +Moonshine +========= + +This section describes how to use models from ``_. + +.. toctree:: + :maxdepth: 5 + + ./huggingface-space.rst + ./models.rst + ./android.rst + ./c.rst + ./csharp.rst + ./dart.rst + ./go.rst + ./java.rst + ./javascript.rst + ./kotlin.rst + ./pascal.rst + ./python.rst + ./swift.rst + diff --git a/docs/source/onnx/moonshine/java.rst b/docs/source/onnx/moonshine/java.rst new file mode 100644 index 000000000..c4ab29f1b --- /dev/null +++ b/docs/source/onnx/moonshine/java.rst @@ -0,0 +1,11 @@ +Java API examples +================= + +Please see + + - ``_ + - ``_ + +and :ref:`sherpa-onnx-java-api`. 
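+
+.. hint::
+
+   Regardless of which API you call it from, the decoding flow for `Moonshine`_
+   is the same: load the preprocessor, encoder, uncached decoder, cached decoder,
+   and ``tokens.txt``, feed normalized float samples, and read back the text.
+   The following is a minimal Python sketch of that flow. It assumes the Python
+   package exposes ``OfflineRecognizer.from_moonshine`` with keyword arguments
+   that mirror the ``--moonshine-*`` command-line flags; treat it as an
+   illustration rather than a verbatim copy of the linked examples.
+
+.. code-block:: python
+
+   import wave
+
+   import numpy as np
+   import sherpa_onnx
+
+   # Assumed API: keyword names mirror the --moonshine-* command-line flags.
+   recognizer = sherpa_onnx.OfflineRecognizer.from_moonshine(
+       preprocessor="./sherpa-onnx-moonshine-tiny-en-int8/preprocess.onnx",
+       encoder="./sherpa-onnx-moonshine-tiny-en-int8/encode.int8.onnx",
+       uncached_decoder="./sherpa-onnx-moonshine-tiny-en-int8/uncached_decode.int8.onnx",
+       cached_decoder="./sherpa-onnx-moonshine-tiny-en-int8/cached_decode.int8.onnx",
+       tokens="./sherpa-onnx-moonshine-tiny-en-int8/tokens.txt",
+       num_threads=1,
+   )
+
+   with wave.open("./sherpa-onnx-moonshine-tiny-en-int8/test_wavs/0.wav") as f:
+       # Single channel, 16-bit samples; the sample rate may differ from 16 kHz.
+       sample_rate = f.getframerate()
+       samples = np.frombuffer(f.readframes(f.getnframes()), dtype=np.int16)
+       samples = samples.astype(np.float32) / 32768  # scale to [-1, 1]
+
+   stream = recognizer.create_stream()
+   stream.accept_waveform(sample_rate, samples)
+   recognizer.decode_stream(stream)
+   print(stream.result.text)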
+ + diff --git a/docs/source/onnx/moonshine/javascript.rst b/docs/source/onnx/moonshine/javascript.rst new file mode 100644 index 000000000..14ac618f8 --- /dev/null +++ b/docs/source/onnx/moonshine/javascript.rst @@ -0,0 +1,39 @@ +JavaScript API examples +======================= + +We provide two npm packages. + +WebAssembly based npm package +----------------------------- + +You can find the package at + + ``_ + +This package does not support multi-threading. + +The example for `Moonshine`_ can be found at + + - ``_ + - ``_ + +node-addon based npm package +---------------------------- + +You can find the package at + + ``_ + +This package supports multi-threading. + +Please see + + ``_ + +for installation. + +The example for `Moonshine`_ can be found at + + - ``_ + - ``_ + diff --git a/docs/source/onnx/moonshine/kotlin.rst b/docs/source/onnx/moonshine/kotlin.rst new file mode 100644 index 000000000..12f815b3c --- /dev/null +++ b/docs/source/onnx/moonshine/kotlin.rst @@ -0,0 +1,7 @@ +Kotlin API examples +=================== + +Please see + + ``_ + diff --git a/docs/source/onnx/moonshine/models.rst b/docs/source/onnx/moonshine/models.rst new file mode 100644 index 000000000..c44fca7b8 --- /dev/null +++ b/docs/source/onnx/moonshine/models.rst @@ -0,0 +1,182 @@ +Models +====== + +We provide 8-bit quantized ONNX models for `Moonshine`_. + +You can find scripts for model quantization at + + ``_. + +In the following, we describe how to use `Moonshine`_ models with pre-built executables +in `sherpa-onnx`_. + +.. _sherpa-onnx-moonshine-tiny-en-int8: + +sherpa-onnx-moonshine-tiny-en-int8 +---------------------------------- + +Please use the following commands to download it. + +.. code-block:: bash + + wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-moonshine-tiny-en-int8.tar.bz2 + tar xvf sherpa-onnx-moonshine-tiny-en-int8.tar.bz2 + rm sherpa-onnx-moonshine-tiny-en-int8.tar.bz2 + +You should see something like below after downloading:: + + ls -lh sherpa-onnx-moonshine-tiny-en-int8/ + total 242160 + -rw-r--r-- 1 fangjun staff 1.0K Oct 26 09:42 LICENSE + -rw-r--r-- 1 fangjun staff 175B Oct 26 09:42 README.md + -rw-r--r-- 1 fangjun staff 43M Oct 26 09:42 cached_decode.int8.onnx + -rw-r--r-- 1 fangjun staff 17M Oct 26 09:42 encode.int8.onnx + -rw-r--r-- 1 fangjun staff 6.5M Oct 26 09:42 preprocess.onnx + drwxr-xr-x 6 fangjun staff 192B Oct 26 09:42 test_wavs + -rw-r--r-- 1 fangjun staff 426K Oct 26 09:42 tokens.txt + -rw-r--r-- 1 fangjun staff 51M Oct 26 09:42 uncached_decode.int8.onnx + +Decode wave files +~~~~~~~~~~~~~~~~~ + +.. hint:: + + It supports decoding only wave files of a single channel with 16-bit + encoded samples, while the sampling rate does not need to be 16 kHz. + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + ./build/bin/sherpa-onnx-offline \ + --moonshine-preprocessor=./sherpa-onnx-moonshine-tiny-en-int8/preprocess.onnx \ + --moonshine-encoder=./sherpa-onnx-moonshine-tiny-en-int8/encode.int8.onnx \ + --moonshine-uncached-decoder=./sherpa-onnx-moonshine-tiny-en-int8/uncached_decode.int8.onnx \ + --moonshine-cached-decoder=./sherpa-onnx-moonshine-tiny-en-int8/cached_decode.int8.onnx \ + --tokens=./sherpa-onnx-moonshine-tiny-en-int8/tokens.txt \ + --num-threads=1 \ + ./sherpa-onnx-moonshine-tiny-en-int8/test_wavs/0.wav + +.. note:: + + Please use ``./build/bin/Release/sherpa-onnx-offline.exe`` for Windows. + +You should see the following output: + +.. 
literalinclude:: ./code/sherpa-onnx-moonshine-tiny-en-int8.txt + +Speech recognition from a microphone +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + ./build/bin/sherpa-onnx-microphone-offline \ + --moonshine-preprocessor=./sherpa-onnx-moonshine-tiny-en-int8/preprocess.onnx \ + --moonshine-encoder=./sherpa-onnx-moonshine-tiny-en-int8/encode.int8.onnx \ + --moonshine-uncached-decoder=./sherpa-onnx-moonshine-tiny-en-int8/uncached_decode.int8.onnx \ + --moonshine-cached-decoder=./sherpa-onnx-moonshine-tiny-en-int8/cached_decode.int8.onnx \ + --tokens=./sherpa-onnx-moonshine-tiny-en-int8/tokens.txt + +Speech recognition from a microphone with VAD +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx + + ./build/bin/sherpa-onnx-vad-microphone-offline-asr \ + --silero-vad-model=./silero_vad.onnx \ + --moonshine-preprocessor=./sherpa-onnx-moonshine-tiny-en-int8/preprocess.onnx \ + --moonshine-encoder=./sherpa-onnx-moonshine-tiny-en-int8/encode.int8.onnx \ + --moonshine-uncached-decoder=./sherpa-onnx-moonshine-tiny-en-int8/uncached_decode.int8.onnx \ + --moonshine-cached-decoder=./sherpa-onnx-moonshine-tiny-en-int8/cached_decode.int8.onnx \ + --tokens=./sherpa-onnx-moonshine-tiny-en-int8/tokens.txt + +sherpa-onnx-moonshine-base-en-int8 +---------------------------------- + +Please use the following commands to download it. + +.. code-block:: bash + + wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-moonshine-base-en-int8.tar.bz2 + tar xvf sherpa-onnx-moonshine-base-en-int8.tar.bz2 + rm sherpa-onnx-moonshine-base-en-int8.tar.bz2 + +You should see something like below after downloading:: + + ls -lh sherpa-onnx-moonshine-base-en-int8/ + total 560448 + -rw-r--r-- 1 fangjun staff 1.0K Oct 26 09:42 LICENSE + -rw-r--r-- 1 fangjun staff 175B Oct 26 09:42 README.md + -rw-r--r-- 1 fangjun staff 95M Oct 26 09:42 cached_decode.int8.onnx + -rw-r--r-- 1 fangjun staff 48M Oct 26 09:42 encode.int8.onnx + -rw-r--r-- 1 fangjun staff 13M Oct 26 09:42 preprocess.onnx + drwxr-xr-x 6 fangjun staff 192B Oct 26 09:42 test_wavs + -rw-r--r-- 1 fangjun staff 426K Oct 26 09:42 tokens.txt + -rw-r--r-- 1 fangjun staff 116M Oct 26 09:42 uncached_decode.int8.onnx + + +Decode wave files +~~~~~~~~~~~~~~~~~ + +.. hint:: + + It supports decoding only wave files of a single channel with 16-bit + encoded samples, while the sampling rate does not need to be 16 kHz. + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + ./build/bin/sherpa-onnx-offline \ + --moonshine-preprocessor=./sherpa-onnx-moonshine-base-en-int8/preprocess.onnx \ + --moonshine-encoder=./sherpa-onnx-moonshine-base-en-int8/encode.int8.onnx \ + --moonshine-uncached-decoder=./sherpa-onnx-moonshine-base-en-int8/uncached_decode.int8.onnx \ + --moonshine-cached-decoder=./sherpa-onnx-moonshine-base-en-int8/cached_decode.int8.onnx \ + --tokens=./sherpa-onnx-moonshine-base-en-int8/tokens.txt \ + --num-threads=1 \ + ./sherpa-onnx-moonshine-base-en-int8/test_wavs/0.wav + +.. note:: + + Please use ``./build/bin/Release/sherpa-onnx-offline.exe`` for Windows. + +You should see the following output: + +.. literalinclude:: ./code/sherpa-onnx-moonshine-base-en-int8.txt + +Speech recognition from a microphone +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. 
code-block:: bash + + cd /path/to/sherpa-onnx + + ./build/bin/sherpa-onnx-microphone-offline \ + --moonshine-preprocessor=./sherpa-onnx-moonshine-base-en-int8/preprocess.onnx \ + --moonshine-encoder=./sherpa-onnx-moonshine-base-en-int8/encode.int8.onnx \ + --moonshine-uncached-decoder=./sherpa-onnx-moonshine-base-en-int8/uncached_decode.int8.onnx \ + --moonshine-cached-decoder=./sherpa-onnx-moonshine-base-en-int8/cached_decode.int8.onnx \ + --tokens=./sherpa-onnx-moonshine-base-en-int8/tokens.txt + +Speech recognition from a microphone with VAD +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx + + ./build/bin/sherpa-onnx-vad-microphone-offline-asr \ + --silero-vad-model=./silero_vad.onnx \ + --moonshine-preprocessor=./sherpa-onnx-moonshine-base-en-int8/preprocess.onnx \ + --moonshine-encoder=./sherpa-onnx-moonshine-base-en-int8/encode.int8.onnx \ + --moonshine-uncached-decoder=./sherpa-onnx-moonshine-base-en-int8/uncached_decode.int8.onnx \ + --moonshine-cached-decoder=./sherpa-onnx-moonshine-base-en-int8/cached_decode.int8.onnx \ + --tokens=./sherpa-onnx-moonshine-base-en-int8/tokens.txt diff --git a/docs/source/onnx/moonshine/pascal.rst b/docs/source/onnx/moonshine/pascal.rst new file mode 100644 index 000000000..eb6ac190f --- /dev/null +++ b/docs/source/onnx/moonshine/pascal.rst @@ -0,0 +1,9 @@ +Pascal API examples +=================== + +Please see + + - ``_ + - ``_ + + diff --git a/docs/source/onnx/moonshine/pic/moonshine-hf-space-1.jpg b/docs/source/onnx/moonshine/pic/moonshine-hf-space-1.jpg new file mode 100644 index 000000000..b317e8b0a Binary files /dev/null and b/docs/source/onnx/moonshine/pic/moonshine-hf-space-1.jpg differ diff --git a/docs/source/onnx/moonshine/pic/moonshine-hf-space-2.jpg b/docs/source/onnx/moonshine/pic/moonshine-hf-space-2.jpg new file mode 100644 index 000000000..f5aeb4f2b Binary files /dev/null and b/docs/source/onnx/moonshine/pic/moonshine-hf-space-2.jpg differ diff --git a/docs/source/onnx/moonshine/pic/moonshine-vad-apk.jpg b/docs/source/onnx/moonshine/pic/moonshine-vad-apk.jpg new file mode 100644 index 000000000..cd3728c5f Binary files /dev/null and b/docs/source/onnx/moonshine/pic/moonshine-vad-apk.jpg differ diff --git a/docs/source/onnx/moonshine/python.rst b/docs/source/onnx/moonshine/python.rst new file mode 100644 index 000000000..e8722424e --- /dev/null +++ b/docs/source/onnx/moonshine/python.rst @@ -0,0 +1,16 @@ +Python API examples +=================== + +.. note:: + + You need to install `sherpa-onnx>=1.10.30`. + + +Please see + + - ``_ + - ``_ + - ``_ + - ``_ + +for usages. diff --git a/docs/source/onnx/moonshine/swift.rst b/docs/source/onnx/moonshine/swift.rst new file mode 100644 index 000000000..7dea7f8df --- /dev/null +++ b/docs/source/onnx/moonshine/swift.rst @@ -0,0 +1,6 @@ +Swift API examples +================== + +Please see + + ``_ diff --git a/docs/source/onnx/pascal-api/index.rst b/docs/source/onnx/pascal-api/index.rst new file mode 100644 index 000000000..2310a502a --- /dev/null +++ b/docs/source/onnx/pascal-api/index.rst @@ -0,0 +1,197 @@ +.. _sherpa-onnx-pascal-api: + +Pascal API +========== + +We provide APIs for `Object Pascal `_. 
+ +In other words, you can develop the following types of applications using Object Pascal: + + - Voice activity detection + - Streaming speech recognition (i.e., real-time speech recognition) + - Non-streaming speech recognition + +on Windows, Linux, and macOS. + +.. hint:: + + For macOS, both Apple Silicon (i.e., macOS arm64, M1/M2/M3) and Intel chips + are supported. + +.. note:: + + We will support text-to-speech, audio tagging, keyword spotting, + speaker recognition, speech identification, and spoken language identification + with object pascal later. + +In the following, we describe how to use the object pascal API to decode files. + +We use macOS below as an example. You can adapt it for Linux and Windows. + +.. hint:: + + We support both static link and dynamic link; the example below uses + dynamic link. You can pass ``-DBUILD_SHARED_LIBS=OFF`` to ``cmake`` if you + want to use static link. + + + On the Windows platform, it supports only dynamic link though. + +Install free pascal +------------------- + +Please visit +``_ +for installation. + +To check that you have installed ``fpc`` successfully, please run:: + + fpc -h + +which should print the usage information of ``fpc``. + +Build sherpa-onnx +----------------- + +.. code-block:: bash + + mkdir -p $HOME/open-source + cd $HOME/open-source + git clone https://github.com/k2-fsa/sherpa-onnx + cd sherpa-onnx + + mkdir build + cd build + + cmake \ + -DBUILD_SHARED_LIBS=ON \ + -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_INSTALL_PREFIX=./install \ + .. + + cmake --build . --target install --config Release + + ls -lh install/lib + +You should get the following two shared library files:: + + (py38) fangjuns-MacBook-Pro:build fangjun$ ls -lh install/lib/ + total 59696 + -rw-r--r-- 1 fangjun staff 25M Aug 14 14:09 libonnxruntime.1.17.1.dylib + lrwxr-xr-x 1 fangjun staff 27B Aug 14 14:18 libonnxruntime.dylib -> libonnxruntime.1.17.1.dylib + -rwxr-xr-x 1 fangjun staff 3.9M Aug 15 15:01 libsherpa-onnx-c-api.dylib + +Non-streaming speech recognition from files +------------------------------------------- + +We use the ``Whisper tiny.en`` model below as an example. + +.. hint:: + + We have hardcoded the model filenames in the code. + +.. code-block:: bash + + cd $HOME/open-source/sherpa-onnx + + cd pascal-api-examples/non-streaming-asr/ + + wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-whisper-tiny.en.tar.bz2 + tar xvf sherpa-onnx-whisper-tiny.en.tar.bz2 + rm sherpa-onnx-whisper-tiny.en.tar.bz2 + + fpc \ + -dSHERPA_ONNX_USE_SHARED_LIBS \ + -Fu$HOME/open-source/sherpa-onnx/sherpa-onnx/pascal-api \ + -Fl$HOME/open-source/sherpa-onnx/build/install/lib \ + ./whisper.pas + + # It will generate a file ./whisper + +The output logs of the above ``fpc`` command are given below:: + + Free Pascal Compiler version 3.2.2 [2021/05/16] for x86_64 + Copyright (c) 1993-2021 by Florian Klaempfl and others + Target OS: Darwin for x86_64 + Compiling ./whisper.pas + Compiling /Users/fangjun/open-source/sherpa-onnx/sherpa-onnx/pascal-api/sherpa_onnx.pas + Assembling sherpa_onnx + Assembling whisper + Linking whisper + ld: warning: dylib (/Users/fangjun/open-source/sherpa-onnx/build/install/lib//libsherpa-onnx-c-api.dylib) was built for newer macOS version (10.14) tha + n being linked (10.8) + 1530 lines compiled, 3.8 sec + +Explanation of the options for the ``fpc`` command: + + - ``-dSHERPA_ONNX_USE_SHARED_LIBS`` + + It defines a symbol ``SHERPA_ONNX_USE_SHARED_LIBS``, which means + we want to use dynamic link in the code. 
If you omit it, it will use static link. + Please search for the string ``SHERPA_ONNX_USE_SHARED_LIBS`` in the file + ``_ + if you want to learn more. + + - ``-Fu$HOME/open-source/sherpa-onnx/pascal-api`` + + It specifies the unit search path. + Our `sherpa_onnx.pas `_ + is inside the directory ``$HOME/open-source/sherpa-onnx/pascal-api`` and we have to + tell ``fpc`` where to find it. + + - ``-Fl$HOME/sherpa-onnx/build/install/lib`` + + It tells ``fpc`` where to look for ``libsherpa-onnx-c-api.dylib``. + +After running the above ``fpc`` command, we will find an executable file ``whisper`` +in the current directory, i.e., ``$HOME/open-source/sherpa-onnx/pascal-api-examples/non-streaming-asr/whisper``:: + + (py38) fangjuns-MacBook-Pro:non-streaming-asr fangjun$ ls -lh ./whisper + -rwxr-xr-x 1 fangjun staff 2.3M Aug 16 12:13 ./whisper + +If we run it:: + + (py38) fangjuns-MacBook-Pro:non-streaming-asr fangjun$ ./whisper + dyld[23162]: Library not loaded: @rpath/libsherpa-onnx-c-api.dylib + Referenced from: <3AE58F60-4925-335D-89A5-B30FD7D97D7E> /Users/fangjun/open-source/sherpa-onnx/pascal-api-examples/non-streaming-asr/whisper + Reason: tried: '/Users/fangjun/py38/lib/python3.8/site-packages/libsherpa-onnx-c-api.dylib' (no such file), '/usr/local/Cellar/ghostscript/9.55.0/lib/libsherpa-onnx-c-api.dylib' (no such file), '/Users/fangjun/py38/lib/python3.8/site-packages/libsherpa-onnx-c-api.dylib' (no such file), '/usr/local/Cellar/ghostscript/9.55.0/lib/libsherpa-onnx-c-api.dylib' (no such file), '/libsherpa-onnx-c-api.dylib' (no such file), '/System/Volumes/Preboot/Cryptexes/OS@rpath/libsherpa-onnx-c-api.dylib' (no such file), '/usr/local/lib/libsherpa-onnx-c-api.dylib' (no such file), '/usr/lib/libsherpa-onnx-c-api.dylib' (no such file, not in dyld cache) + Abort trap: 6 + +You can see it cannot find ``libsherpa-onnx-c-api.dylib``. + +At the compilation time, we have used ``-Fl$HOME/sherpa-onnx/build/install/lib`` +to tell the compiler ``fpc`` where to find ``libsherpa-onnx-c-api.dylib``. + +At the runtime, we also need to do something to tell the executable where to look +for ``libsherpa-onnx-c-api.dylib``. + +The following command does exactly that:: + + (py38) fangjuns-MacBook-Pro:non-streaming-asr fangjun$ export DYLD_LIBRARY_PATH=$HOME/open-source/sherpa-onnx/build/install/lib:$DYLD_LIBRARY_PATH + (py38) fangjuns-MacBook-Pro:non-streaming-asr fangjun$ ./whisper + TSherpaOnnxOfflineRecognizerResult(Text := After early nightfall, the yellow lamps would light up here and there the squalid quarter of the brothels., Tokens := [ After, early, night, fall, ,, the, yellow, lamps, would, light, up, here, and, there, the, squ, alid, quarter, of, the, bro, the, ls, .], Timestamps := []) + NumThreads 1 + Elapsed 0.803 s + Wave duration 6.625 s + RTF = 0.803/6.625 = 0.121 + +.. hint:: + + If you are using Linux, please replace ``DYLD_LIBRARY_PATH`` with ``LD_LIBRARY_PATH``. + +Congratulations! You have successfully managed to use the object pascal API with +Whisper for speech recognition! + +You can find more examples at: + + ``_ + +Colab notebook +-------------- + +We provide a colab notebook +|use sherpa-onnx for pascal colab notebook| +for you to try this section step by step. + +.. 
|use sherpa-onnx for pascal colab notebook| image:: https://colab.research.google.com/assets/colab-badge.svg + :target: https://github.com/k2-fsa/colab/blob/master/sherpa-onnx/sherpa_onnx_pascal_api_example.ipynb diff --git a/docs/source/onnx/pretrained_models/index.rst b/docs/source/onnx/pretrained_models/index.rst new file mode 100644 index 000000000..c0b18f250 --- /dev/null +++ b/docs/source/onnx/pretrained_models/index.rst @@ -0,0 +1,53 @@ +.. _sherpa-onnx-pre-trained-models: + +Pre-trained models +================== + +The following table lists links for all pre-trained models. + + +.. list-table:: + + * - Description + - URL + * - Speech recognition (speech to text, ASR) + - ``_ + * - Text to speech (TTS) + - ``_ + * - VAD + - ``_ + * - Keyword spotting + - ``_ + * - Speech identification (Speaker ID) + - ``_ + * - Spoken language identification (Language ID) + - ``_ (multi-lingual whisper) + * - Audio tagging + - ``_ + * - Punctuation + - ``_ + + +In this section, we describe how to download and use all +available pre-trained models for speech recognition. + + +.. hint:: + + Please install `git-lfs `_ before you continue. + + Otherwise, you will be ``SAD`` later. + +.. toctree:: + :maxdepth: 5 + + online-transducer/index + online-paraformer/index + online-ctc/index + offline-transducer/index + offline-paraformer/index + offline-ctc/index + telespeech/index + whisper/index + wenet/index + small-online-models diff --git a/docs/source/onnx/pretrained_models/offline-ctc/index.rst b/docs/source/onnx/pretrained_models/offline-ctc/index.rst new file mode 100644 index 000000000..d904e2c1d --- /dev/null +++ b/docs/source/onnx/pretrained_models/offline-ctc/index.rst @@ -0,0 +1,10 @@ +Offline CTC models +================== + +This section lists available offline CTC models. + +.. toctree:: + :maxdepth: 5 + + nemo/index + yesno/index diff --git a/docs/source/onnx/pretrained_models/offline-ctc/nemo/code-english/stt_en_citrinet_512.txt b/docs/source/onnx/pretrained_models/offline-ctc/nemo/code-english/stt_en_citrinet_512.txt new file mode 100644 index 000000000..e17bf1cc9 --- /dev/null +++ b/docs/source/onnx/pretrained_models/offline-ctc/nemo/code-english/stt_en_citrinet_512.txt @@ -0,0 +1,24 @@ +/Users/fangjun/open-source/sherpa-onnx/sherpa-onnx/csrc/parse-options.cc:Read:361 ./build/bin/sherpa-onnx-offline --tokens=./sherpa-onnx-nemo-ctc-en-citrinet-512/tokens.txt --nemo-ctc-model=./sherpa-onnx-nemo-ctc-en-citrinet-512/model.onnx --num-threads=2 --decoding-method=greedy_search --debug=false ./sherpa-onnx-nemo-ctc-en-citrinet-512/test_wavs/0.wav ./sherpa-onnx-nemo-ctc-en-citrinet-512/test_wavs/1.wav ./sherpa-onnx-nemo-ctc-en-citrinet-512/test_wavs/8k.wav + +OfflineRecognizerConfig(feat_config=OfflineFeatureExtractorConfig(sampling_rate=16000, feature_dim=80), model_config=OfflineModelConfig(transducer=OfflineTransducerModelConfig(encoder_filename="", decoder_filename="", joiner_filename=""), paraformer=OfflineParaformerModelConfig(model=""), nemo_ctc=OfflineNemoEncDecCtcModelConfig(model="./sherpa-onnx-nemo-ctc-en-citrinet-512/model.onnx"), tokens="./sherpa-onnx-nemo-ctc-en-citrinet-512/tokens.txt", num_threads=2, debug=False), decoding_method="greedy_search") +Creating recognizer ... +Started +/Users/fangjun/open-source/sherpa-onnx/sherpa-onnx/csrc/offline-stream.cc:AcceptWaveformImpl:105 Creating a resampler: + in_sample_rate: 8000 + output_sample_rate: 16000 + +Done! 
+ +./sherpa-onnx-nemo-ctc-en-citrinet-512/test_wavs/0.wav + after early nightfall the yellow lamps would light up here and there the squalid quarter of the brothels +---- +./sherpa-onnx-nemo-ctc-en-citrinet-512/test_wavs/1.wav + god as a direct consequence of the sin which man thus punished had given her a lovely child whose place was on that same dishonoured bosom to connect her parent for ever with the race and descent of mortals and to be finally a blessed soul in heaven +---- +./sherpa-onnx-nemo-ctc-en-citrinet-512/test_wavs/8k.wav + yet these thoughts affected hester prynne less with hope than apprehension +---- +num threads: 2 +decoding method: greedy_search +Elapsed seconds: 4.963 s +Real time factor (RTF): 4.963 / 28.165 = 0.176 diff --git a/docs/source/onnx/pretrained_models/offline-ctc/nemo/code-english/stt_en_conformer_ctc_large.txt b/docs/source/onnx/pretrained_models/offline-ctc/nemo/code-english/stt_en_conformer_ctc_large.txt new file mode 100644 index 000000000..372d1af6f --- /dev/null +++ b/docs/source/onnx/pretrained_models/offline-ctc/nemo/code-english/stt_en_conformer_ctc_large.txt @@ -0,0 +1,24 @@ +/Users/fangjun/open-source/sherpa-onnx/sherpa-onnx/csrc/parse-options.cc:Read:361 ./build/bin/sherpa-onnx-offline --tokens=./sherpa-onnx-nemo-ctc-en-conformer-large/tokens.txt --nemo-ctc-model=./sherpa-onnx-nemo-ctc-en-conformer-large/model.onnx --num-threads=2 --decoding-method=greedy_search --debug=false ./sherpa-onnx-nemo-ctc-en-conformer-large/test_wavs/0.wav ./sherpa-onnx-nemo-ctc-en-conformer-large/test_wavs/1.wav ./sherpa-onnx-nemo-ctc-en-conformer-large/test_wavs/8k.wav + +OfflineRecognizerConfig(feat_config=OfflineFeatureExtractorConfig(sampling_rate=16000, feature_dim=80), model_config=OfflineModelConfig(transducer=OfflineTransducerModelConfig(encoder_filename="", decoder_filename="", joiner_filename=""), paraformer=OfflineParaformerModelConfig(model=""), nemo_ctc=OfflineNemoEncDecCtcModelConfig(model="./sherpa-onnx-nemo-ctc-en-conformer-large/model.onnx"), tokens="./sherpa-onnx-nemo-ctc-en-conformer-large/tokens.txt", num_threads=2, debug=False), decoding_method="greedy_search") +Creating recognizer ... +Started +/Users/fangjun/open-source/sherpa-onnx/sherpa-onnx/csrc/offline-stream.cc:AcceptWaveformImpl:105 Creating a resampler: + in_sample_rate: 8000 + output_sample_rate: 16000 + +Done! 
+ +./sherpa-onnx-nemo-ctc-en-conformer-large/test_wavs/0.wav + after early nightfall the yellow lamps would light up here and there the squalid quarter of the brothels +---- +./sherpa-onnx-nemo-ctc-en-conformer-large/test_wavs/1.wav + god as a direct consequence of the sin which man thus punished had given her a lovely child whose place was on that same dishonored bosom to connect her parent for ever with the race and descent of mortals and to be finally a blesed soul in heaven +---- +./sherpa-onnx-nemo-ctc-en-conformer-large/test_wavs/8k.wav + yet these thoughts afected hester pryne les with hope than aprehension +---- +num threads: 2 +decoding method: greedy_search +Elapsed seconds: 3.553 s +Real time factor (RTF): 3.553 / 28.165 = 0.126 diff --git a/docs/source/onnx/pretrained_models/offline-ctc/nemo/code-english/stt_en_conformer_ctc_medium.txt b/docs/source/onnx/pretrained_models/offline-ctc/nemo/code-english/stt_en_conformer_ctc_medium.txt new file mode 100644 index 000000000..b0100e934 --- /dev/null +++ b/docs/source/onnx/pretrained_models/offline-ctc/nemo/code-english/stt_en_conformer_ctc_medium.txt @@ -0,0 +1,24 @@ +/Users/fangjun/open-source/sherpa-onnx/sherpa-onnx/csrc/parse-options.cc:Read:361 ./build/bin/sherpa-onnx-offline --tokens=./sherpa-onnx-nemo-ctc-en-conformer-medium/tokens.txt --nemo-ctc-model=./sherpa-onnx-nemo-ctc-en-conformer-medium/model.onnx --num-threads=2 --decoding-method=greedy_search --debug=false ./sherpa-onnx-nemo-ctc-en-conformer-medium/test_wavs/0.wav ./sherpa-onnx-nemo-ctc-en-conformer-medium/test_wavs/1.wav ./sherpa-onnx-nemo-ctc-en-conformer-medium/test_wavs/8k.wav + +OfflineRecognizerConfig(feat_config=OfflineFeatureExtractorConfig(sampling_rate=16000, feature_dim=80), model_config=OfflineModelConfig(transducer=OfflineTransducerModelConfig(encoder_filename="", decoder_filename="", joiner_filename=""), paraformer=OfflineParaformerModelConfig(model=""), nemo_ctc=OfflineNemoEncDecCtcModelConfig(model="./sherpa-onnx-nemo-ctc-en-conformer-medium/model.onnx"), tokens="./sherpa-onnx-nemo-ctc-en-conformer-medium/tokens.txt", num_threads=2, debug=False), decoding_method="greedy_search") +Creating recognizer ... +Started +/Users/fangjun/open-source/sherpa-onnx/sherpa-onnx/csrc/offline-stream.cc:AcceptWaveformImpl:105 Creating a resampler: + in_sample_rate: 8000 + output_sample_rate: 16000 + +Done! 
+ +./sherpa-onnx-nemo-ctc-en-conformer-medium/test_wavs/0.wav + after early nightfall the yellow lamps would light up here and there the squalid quarter of the brothels +---- +./sherpa-onnx-nemo-ctc-en-conformer-medium/test_wavs/1.wav + god as a direct consequence of the sin which man thus punished had given her a lovely child whose place was on that same dishonored bosom to connect her parent for ever with the race and descent of mortals and to be finally a blessed soul in heaven +---- +./sherpa-onnx-nemo-ctc-en-conformer-medium/test_wavs/8k.wav + yet these thoughts affected hester pryne less with hope than apprehension +---- +num threads: 2 +decoding method: greedy_search +Elapsed seconds: 1.184 s +Real time factor (RTF): 1.184 / 28.165 = 0.042 diff --git a/docs/source/onnx/pretrained_models/offline-ctc/nemo/code-english/stt_en_conformer_ctc_small.txt b/docs/source/onnx/pretrained_models/offline-ctc/nemo/code-english/stt_en_conformer_ctc_small.txt new file mode 100644 index 000000000..0d384ddf4 --- /dev/null +++ b/docs/source/onnx/pretrained_models/offline-ctc/nemo/code-english/stt_en_conformer_ctc_small.txt @@ -0,0 +1,24 @@ +/Users/fangjun/open-source/sherpa-onnx/sherpa-onnx/csrc/parse-options.cc:Read:361 ./build/bin/sherpa-onnx-offline --tokens=./sherpa-onnx-nemo-ctc-en-conformer-small/tokens.txt --nemo-ctc-model=./sherpa-onnx-nemo-ctc-en-conformer-small/model.onnx --num-threads=2 --decoding-method=greedy_search --debug=false ./sherpa-onnx-nemo-ctc-en-conformer-small/test_wavs/0.wav ./sherpa-onnx-nemo-ctc-en-conformer-small/test_wavs/1.wav ./sherpa-onnx-nemo-ctc-en-conformer-small/test_wavs/8k.wav + +OfflineRecognizerConfig(feat_config=OfflineFeatureExtractorConfig(sampling_rate=16000, feature_dim=80), model_config=OfflineModelConfig(transducer=OfflineTransducerModelConfig(encoder_filename="", decoder_filename="", joiner_filename=""), paraformer=OfflineParaformerModelConfig(model=""), nemo_ctc=OfflineNemoEncDecCtcModelConfig(model="./sherpa-onnx-nemo-ctc-en-conformer-small/model.onnx"), tokens="./sherpa-onnx-nemo-ctc-en-conformer-small/tokens.txt", num_threads=2, debug=False), decoding_method="greedy_search") +Creating recognizer ... +Started +/Users/fangjun/open-source/sherpa-onnx/sherpa-onnx/csrc/offline-stream.cc:AcceptWaveformImpl:105 Creating a resampler: + in_sample_rate: 8000 + output_sample_rate: 16000 + +Done! 
+ +./sherpa-onnx-nemo-ctc-en-conformer-small/test_wavs/0.wav + after early nightfall the yellow lamps would light up here and there the squalid quarter of the brothels +---- +./sherpa-onnx-nemo-ctc-en-conformer-small/test_wavs/1.wav + god as a direct consequence of the sin which man thus punished had given her a lovely child whose place was on that same dishonoured bosom to connect her parent for ever with the race and descent of mortals and to be finally a blessed soul in heaven +---- +./sherpa-onnx-nemo-ctc-en-conformer-small/test_wavs/8k.wav + yet these thoughts affected hester prin less with hope than apprehension +---- +num threads: 2 +decoding method: greedy_search +Elapsed seconds: 0.665 s +Real time factor (RTF): 0.665 / 28.165 = 0.024 diff --git a/docs/source/onnx/pretrained_models/offline-ctc/nemo/code-russian/sherpa-onnx-nemo-ctc-giga-am-russian-2024-10-24.int8.txt b/docs/source/onnx/pretrained_models/offline-ctc/nemo/code-russian/sherpa-onnx-nemo-ctc-giga-am-russian-2024-10-24.int8.txt new file mode 100644 index 000000000..ac66b4af9 --- /dev/null +++ b/docs/source/onnx/pretrained_models/offline-ctc/nemo/code-russian/sherpa-onnx-nemo-ctc-giga-am-russian-2024-10-24.int8.txt @@ -0,0 +1,14 @@ +/Users/fangjun/open-source/sherpa-onnx/sherpa-onnx/csrc/parse-options.cc:Read:375 ./build/bin/sherpa-onnx-offline --nemo-ctc-model=./sherpa-onnx-nemo-ctc-giga-am-russian-2024-10-24/model.int8.onnx --tokens=./sherpa-onnx-nemo-ctc-giga-am-russian-2024-10-24/tokens.txt ./sherpa-onnx-nemo-ctc-giga-am-russian-2024-10-24/test_wavs/example.wav + +OfflineRecognizerConfig(feat_config=FeatureExtractorConfig(sampling_rate=16000, feature_dim=80, low_freq=20, high_freq=-400, dither=0), model_config=OfflineModelConfig(transducer=OfflineTransducerModelConfig(encoder_filename="", decoder_filename="", joiner_filename=""), paraformer=OfflineParaformerModelConfig(model=""), nemo_ctc=OfflineNemoEncDecCtcModelConfig(model="./sherpa-onnx-nemo-ctc-giga-am-russian-2024-10-24/model.int8.onnx"), whisper=OfflineWhisperModelConfig(encoder="", decoder="", language="", task="transcribe", tail_paddings=-1), tdnn=OfflineTdnnModelConfig(model=""), zipformer_ctc=OfflineZipformerCtcModelConfig(model=""), wenet_ctc=OfflineWenetCtcModelConfig(model=""), sense_voice=OfflineSenseVoiceModelConfig(model="", language="auto", use_itn=False), telespeech_ctc="", tokens="./sherpa-onnx-nemo-ctc-giga-am-russian-2024-10-24/tokens.txt", num_threads=2, debug=False, provider="cpu", model_type="", modeling_unit="cjkchar", bpe_vocab=""), lm_config=OfflineLMConfig(model="", scale=0.5), ctc_fst_decoder_config=OfflineCtcFstDecoderConfig(graph="", max_active=3000), decoding_method="greedy_search", max_active_paths=4, hotwords_file="", hotwords_score=1.5, blank_penalty=0, rule_fsts="", rule_fars="") +Creating recognizer ... +Started +Done! 
+ +./sherpa-onnx-nemo-ctc-giga-am-russian-2024-10-24/test_wavs/example.wav +{"lang": "", "emotion": "", "event": "", "text": "ничьих не требуя похвал счастлив уж я надеждой сладкой что дева с трепетом любви посмотрит может быть украдкой на песни грешные мои у лукоморья дуп зеленый", "timestamps": [0.04, 0.12, 0.20, 0.24, 0.32, 0.40, 0.44, 0.56, 0.60, 0.64, 0.72, 0.76, 0.80, 0.84, 0.88, 1.00, 1.04, 1.16, 1.20, 1.28, 1.36, 1.40, 1.48, 1.64, 1.76, 1.84, 1.88, 1.92, 2.00, 2.04, 2.08, 2.16, 2.20, 2.28, 2.36, 2.40, 2.52, 2.56, 2.68, 2.72, 2.80, 2.84, 2.92, 3.00, 3.04, 3.08, 3.12, 3.20, 3.28, 3.32, 3.36, 3.44, 3.48, 3.56, 3.60, 3.68, 3.72, 3.76, 3.80, 3.88, 3.96, 4.00, 4.04, 4.12, 4.20, 4.24, 4.32, 4.36, 4.40, 4.48, 4.52, 4.56, 4.64, 4.68, 4.76, 4.88, 4.92, 4.96, 5.04, 5.08, 5.20, 5.40, 5.44, 5.56, 5.64, 5.68, 5.72, 5.80, 5.84, 5.92, 5.96, 6.08, 6.12, 6.16, 6.20, 6.24, 6.28, 6.36, 6.40, 6.48, 6.52, 6.56, 6.64, 6.72, 6.76, 6.80, 6.84, 6.96, 7.00, 7.04, 7.08, 7.20, 7.24, 7.28, 7.36, 7.40, 7.44, 7.52, 7.56, 7.64, 7.72, 7.80, 7.84, 7.92, 8.04, 8.08, 8.16, 8.20, 8.32, 8.36, 8.44, 9.12, 9.28, 9.32, 9.44, 9.48, 9.56, 9.60, 9.72, 9.76, 9.88, 9.92, 10.04, 10.08, 10.20, 10.24, 10.36, 10.40, 10.52, 10.56, 10.64, 10.68, 10.80, 10.84, 10.92], "tokens":["н", "и", "ч", "ь", "и", "х", " ", "н", "е", " ", "т", "р", "е", "б", "у", "я", " ", "п", "о", "х", "в", "а", "л", " ", "с", "ч", "а", "с", "т", "л", "и", "в", " ", "у", "ж", " ", "я", " ", "н", "а", "д", "е", "ж", "д", "о", "й", " ", "с", "л", "а", "д", "к", "о", "й", " ", "ч", "т", "о", " ", "д", "е", "в", "а", " ", "с", " ", "т", "р", "е", "п", "е", "т", "о", "м", " ", "л", "ю", "б", "в", "и", " ", "п", "о", "с", "м", "о", "т", "р", "и", "т", " ", "м", "о", "ж", "е", "т", " ", "б", "ы", "т", "ь", " ", "у", "к", "р", "а", "д", "к", "о", "й", " ", "н", "а", " ", "п", "е", "с", "н", "и", " ", "г", "р", "е", "ш", "н", "ы", "е", " ", "м", "о", "и", " ", "у", " ", "л", "у", "к", "о", "м", "о", "р", "ь", "я", " ", "д", "у", "п", " ", "з", "е", "л", "е", "н", "ы", "й"], "words": []} +---- +num threads: 2 +decoding method: greedy_search +Elapsed seconds: 1.868 s +Real time factor (RTF): 1.868 / 11.290 = 0.165 diff --git a/docs/source/onnx/pretrained_models/offline-ctc/nemo/english.rst b/docs/source/onnx/pretrained_models/offline-ctc/nemo/english.rst new file mode 100644 index 000000000..41f611baa --- /dev/null +++ b/docs/source/onnx/pretrained_models/offline-ctc/nemo/english.rst @@ -0,0 +1,299 @@ +English +======= + +.. hint:: + + Please refer to :ref:`install_sherpa_onnx` to install `sherpa-onnx`_ + before you read this section. + +.. note:: + + We use `./build/bin/sherpa-offline `_ + as an example in this section. You can use other scripts such as + + - `./build/bin/sherpa-onnx-microphone-offline `_ + - `./build/bin/sherpa-onnx-offline-websocket-server `_ + - `python-api-examples/offline-decode-files.py `_ + +This page lists offline CTC models from `NeMo`_ for English. + +stt_en_citrinet_512 +------------------- + +This model is converted from + + ``_ + +Citrinet-512 model which has been trained on the ASR Set dataset +with over 7000 hours of english speech. + +In the following, we describe how to download it and use it with `sherpa-onnx`_. + +Download the model +~~~~~~~~~~~~~~~~~~ + +Please use the following commands to download it. + +.. 
code-block:: bash + + cd /path/to/sherpa-onnx + + wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-nemo-ctc-en-citrinet-512.tar.bz2 + + tar xvf sherpa-onnx-nemo-ctc-en-citrinet-512.tar.bz2 + rm sherpa-onnx-nemo-ctc-en-citrinet-512.tar.bz2 + +Please check that the file sizes of the pre-trained models are correct. See +the file sizes of ``*.onnx`` files below. + +.. code-block:: bash + + sherpa-onnx-nemo-ctc-en-citrinet-512 fangjun$ ls -lh *.onnx + -rw-r--r-- 1 fangjun staff 36M Apr 7 16:10 model.int8.onnx + -rw-r--r-- 1 fangjun staff 142M Apr 7 14:24 model.onnx + +Decode wave files +~~~~~~~~~~~~~~~~~ + +.. hint:: + + It supports decoding only wave files of a single channel with 16-bit + encoded samples, while the sampling rate does not need to be 16 kHz. + +The following code shows how to use ``fp32`` models to decode wave files. +Please replace ``model.onnx`` with ``model.int8.onnx`` to use ``int8`` +quantized model. + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + ./build/bin/sherpa-onnx-offline \ + --tokens=./sherpa-onnx-nemo-ctc-en-citrinet-512/tokens.txt \ + --nemo-ctc-model=./sherpa-onnx-nemo-ctc-en-citrinet-512/model.onnx \ + --num-threads=2 \ + --decoding-method=greedy_search \ + --debug=false \ + ./sherpa-onnx-nemo-ctc-en-citrinet-512/test_wavs/0.wav \ + ./sherpa-onnx-nemo-ctc-en-citrinet-512/test_wavs/1.wav \ + ./sherpa-onnx-nemo-ctc-en-citrinet-512/test_wavs/8k.wav + +.. note:: + + Please use ``./build/bin/Release/sherpa-onnx-offline.exe`` for Windows. + +You should see the following output: + +.. literalinclude:: ./code-english/stt_en_citrinet_512.txt + +stt_en_conformer_ctc_small +-------------------------- + +This model is converted from + + ``_ + +It contains small size versions of Conformer-CTC (13M parameters) trained on +NeMo ASRSet with around 16000 hours of english speech. The model transcribes +speech in lower case english alphabet along with spaces and apostrophes. + +In the following, we describe how to download it and use it with `sherpa-onnx`_. + +Download the model +~~~~~~~~~~~~~~~~~~ + +Please use the following commands to download it. + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-nemo-ctc-en-conformer-small.tar.bz2 + + tar xvf sherpa-onnx-nemo-ctc-en-conformer-small.tar.bz2 + rm sherpa-onnx-nemo-ctc-en-conformer-small.tar.bz2 + +Please check that the file sizes of the pre-trained models are correct. See +the file sizes of ``*.onnx`` files below. + +.. code-block:: bash + + sherpa-onnx-nemo-ctc-en-conformer-small fangjun$ ls -lh *.onnx + -rw-r--r-- 1 fangjun staff 44M Apr 7 20:24 model.int8.onnx + -rw-r--r-- 1 fangjun staff 81M Apr 7 18:56 model.onnx + +Decode wave files +~~~~~~~~~~~~~~~~~ + +.. hint:: + + It supports decoding only wave files of a single channel with 16-bit + encoded samples, while the sampling rate does not need to be 16 kHz. + +The following code shows how to use ``fp32`` models to decode wave files. +Please replace ``model.onnx`` with ``model.int8.onnx`` to use ``int8`` +quantized model. + +.. 
code-block:: bash + + cd /path/to/sherpa-onnx + + ./build/bin/sherpa-onnx-offline \ + --tokens=./sherpa-onnx-nemo-ctc-en-conformer-small/tokens.txt \ + --nemo-ctc-model=./sherpa-onnx-nemo-ctc-en-conformer-small/model.onnx \ + --num-threads=2 \ + --decoding-method=greedy_search \ + --debug=false \ + ./sherpa-onnx-nemo-ctc-en-conformer-small/test_wavs/0.wav \ + ./sherpa-onnx-nemo-ctc-en-conformer-small/test_wavs/1.wav \ + ./sherpa-onnx-nemo-ctc-en-conformer-small/test_wavs/8k.wav + +.. note:: + + Please use ``./build/bin/Release/sherpa-onnx-offline.exe`` for Windows. + +You should see the following output: + +.. literalinclude:: ./code-english/stt_en_conformer_ctc_small.txt + +.. _stt-en-conformer-ctc-medium-nemo-sherpa-onnx: + +stt_en_conformer_ctc_medium +--------------------------- + +This model is converted from + + ``_ + +It contains medium size versions of Conformer-CTC (around 30M parameters) +trained on NeMo ASRSet with around 16000 hours of english speech. The model +transcribes speech in lower case english alphabet along with spaces and apostrophes. + +In the following, we describe how to download it and use it with `sherpa-onnx`_. + +Download the model +~~~~~~~~~~~~~~~~~~ + +Please use the following commands to download it. + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-nemo-ctc-en-conformer-medium.tar.bz2 + + tar xvf sherpa-onnx-nemo-ctc-en-conformer-medium.tar.bz2 + rm sherpa-onnx-nemo-ctc-en-conformer-medium.tar.bz2 + +Please check that the file sizes of the pre-trained models are correct. See +the file sizes of ``*.onnx`` files below. + +.. code-block:: bash + + sherpa-onnx-nemo-ctc-en-conformer-medium fangjun$ ls -lh *.onnx + -rw-r--r-- 1 fangjun staff 64M Apr 7 20:44 model.int8.onnx + -rw-r--r-- 1 fangjun staff 152M Apr 7 20:43 model.onnx + +Decode wave files +~~~~~~~~~~~~~~~~~ + +.. hint:: + + It supports decoding only wave files of a single channel with 16-bit + encoded samples, while the sampling rate does not need to be 16 kHz. + +The following code shows how to use ``fp32`` models to decode wave files. +Please replace ``model.onnx`` with ``model.int8.onnx`` to use ``int8`` +quantized model. + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + ./build/bin/sherpa-onnx-offline \ + --tokens=./sherpa-onnx-nemo-ctc-en-conformer-medium/tokens.txt \ + --nemo-ctc-model=./sherpa-onnx-nemo-ctc-en-conformer-medium/model.onnx \ + --num-threads=2 \ + --decoding-method=greedy_search \ + --debug=false \ + ./sherpa-onnx-nemo-ctc-en-conformer-medium/test_wavs/0.wav \ + ./sherpa-onnx-nemo-ctc-en-conformer-medium/test_wavs/1.wav \ + ./sherpa-onnx-nemo-ctc-en-conformer-medium/test_wavs/8k.wav + +.. note:: + + Please use ``./build/bin/Release/sherpa-onnx-offline.exe`` for Windows. + +You should see the following output: + +.. literalinclude:: ./code-english/stt_en_conformer_ctc_medium.txt + +stt_en_conformer_ctc_large +--------------------------- + +This model is converted from + + ``_ + +It contains large size versions of Conformer-CTC (around 120M parameters) +trained on NeMo ASRSet with around 24500 hours of english speech. The model +transcribes speech in lower case english alphabet along with spaces and apostrophes + +In the following, we describe how to download it and use it with `sherpa-onnx`_. + +Download the model +~~~~~~~~~~~~~~~~~~ + +Please use the following commands to download it. + +.. 
code-block:: bash + + cd /path/to/sherpa-onnx + + wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-nemo-ctc-en-conformer-large.tar.bz2 + + tar xvf sherpa-onnx-nemo-ctc-en-conformer-large.tar.bz2 + rm sherpa-onnx-nemo-ctc-en-conformer-large.tar.bz2 + +Please check that the file sizes of the pre-trained models are correct. See +the file sizes of ``*.onnx`` files below. + +.. code-block:: bash + + sherpa-onnx-nemo-ctc-en-conformer-large fangjun$ ls -lh *.onnx + -rw-r--r-- 1 fangjun staff 162M Apr 7 22:01 model.int8.onnx + -rw-r--r-- 1 fangjun staff 508M Apr 7 22:01 model.onnx + +Decode wave files +~~~~~~~~~~~~~~~~~ + +.. hint:: + + It supports decoding only wave files of a single channel with 16-bit + encoded samples, while the sampling rate does not need to be 16 kHz. + +The following code shows how to use ``fp32`` models to decode wave files. +Please replace ``model.onnx`` with ``model.int8.onnx`` to use ``int8`` +quantized model. + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + ./build/bin/sherpa-onnx-offline \ + --tokens=./sherpa-onnx-nemo-ctc-en-conformer-large/tokens.txt \ + --nemo-ctc-model=./sherpa-onnx-nemo-ctc-en-conformer-large/model.onnx \ + --num-threads=2 \ + --decoding-method=greedy_search \ + --debug=false \ + ./sherpa-onnx-nemo-ctc-en-conformer-large/test_wavs/0.wav \ + ./sherpa-onnx-nemo-ctc-en-conformer-large/test_wavs/1.wav \ + ./sherpa-onnx-nemo-ctc-en-conformer-large/test_wavs/8k.wav + +.. note:: + + Please use ``./build/bin/Release/sherpa-onnx-offline.exe`` for Windows. + +You should see the following output: + +.. literalinclude:: ./code-english/stt_en_conformer_ctc_large.txt diff --git a/docs/source/onnx/pretrained_models/offline-ctc/nemo/how-to-export.rst b/docs/source/onnx/pretrained_models/offline-ctc/nemo/how-to-export.rst new file mode 100644 index 000000000..48a301d5b --- /dev/null +++ b/docs/source/onnx/pretrained_models/offline-ctc/nemo/how-to-export.rst @@ -0,0 +1,75 @@ +How to export models from NeMo to sherpa-onnx +============================================= + +This section describes how to export CTC models from NeMo to `sherpa-onnx`_. + +.. hint:: + + Please refer to ``_ + for a list of pre-trained NeMo models. + + You can use method described in this section to convert more models + to `sherpa-onnx`_. + +Let us take the following model as an example: + +``_. + +.. hint:: + + You can find the exported files in this example by visiting + + ``_ + +The steps to export it to `sherpa-onnx`_ are given below. + +Step 1: Export model.onnx +------------------------- + +The first step is to obtain ``model.onnx``. + +.. code-block:: python + + import nemo.collections.asr as nemo_asr + m = nemo_asr.models.EncDecCTCModelBPE.from_pretrained('stt_en_conformer_ctc_small') + m.export('model.onnx') + +Step 2: Add metadata +-------------------- + +To be usable in `sherpa-onnx`_, we have to use `add-model-metadata.py `_ to add metadata to ``model.onnx``. + +.. code-block:: bash + + wget https://huggingface.co/csukuangfj/sherpa-onnx-nemo-ctc-en-conformer-small/resolve/main/add-model-metadata.py + + # The following command changes model.onnx in-place + python3 add-model-metadata.py + + +Step 3: Obtain model.int8.onnx +------------------------------ + +We can use `quantize-model.py `_ to obtain a quantized version of ``model.onnx``: + +.. 
code-block:: bash + + wget https://huggingface.co/csukuangfj/sherpa-onnx-nemo-ctc-en-conformer-small/resolve/main/quantize-model.py + + # The following command will generate model.int8.onnx + python3 ./quantize-model.py + +Step 4: Obtain tokens.txt +------------------------- + +Use the following command to obtain ``tokens.txt``: + +.. code-block:: python + + import nemo.collections.asr as nemo_asr + m = nemo_asr.models.EncDecCTCModelBPE.from_pretrained('stt_en_conformer_ctc_small') + + with open('tokens.txt', 'w') as f: + for i, s in enumerate(m.decoder.vocabulary): + f.write(f"{s} {i}\n") + f.write(f"<blk> {i+1}\n") diff --git a/docs/source/onnx/pretrained_models/offline-ctc/nemo/index.rst b/docs/source/onnx/pretrained_models/offline-ctc/nemo/index.rst new file mode 100644 index 000000000..d8fcec9db --- /dev/null +++ b/docs/source/onnx/pretrained_models/offline-ctc/nemo/index.rst @@ -0,0 +1,16 @@ +NeMo +==== + +This page lists all offline CTC models from `NeMo`_. + +.. hint:: + + Please refer to ``_ + for a list of pre-trained NeMo models. + +.. toctree:: + :maxdepth: 5 + + how-to-export + english + russian diff --git a/docs/source/onnx/pretrained_models/offline-ctc/nemo/russian.rst b/docs/source/onnx/pretrained_models/offline-ctc/nemo/russian.rst new file mode 100644 index 000000000..f0250136b --- /dev/null +++ b/docs/source/onnx/pretrained_models/offline-ctc/nemo/russian.rst @@ -0,0 +1,112 @@ +Russian +======= + +.. hint:: + + Please refer to :ref:`install_sherpa_onnx` to install `sherpa-onnx`_ + before you read this section. + +This page lists offline CTC models from `NeMo`_ for Russian. + +sherpa-onnx-nemo-ctc-giga-am-russian-2024-10-24 +----------------------------------------------- + +This model is converted from + + ``_ + +You can find the conversion script at + + ``_ + +.. warning:: + + The license of the model can be found at ``_. + + It is for non-commercial use only. + +In the following, we describe how to download it and use it with `sherpa-onnx`_. + +Download the model +~~~~~~~~~~~~~~~~~~ + +Please use the following commands to download it. + +.. code-block:: bash + + wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-nemo-ctc-giga-am-russian-2024-10-24.tar.bz2 + tar xvf sherpa-onnx-nemo-ctc-giga-am-russian-2024-10-24.tar.bz2 + rm sherpa-onnx-nemo-ctc-giga-am-russian-2024-10-24.tar.bz2 + +You should see something like below after downloading:: + + ls -lh sherpa-onnx-nemo-ctc-giga-am-russian-2024-10-24/ + total 558904 + -rw-r--r-- 1 fangjun staff 89K Oct 24 21:20 GigaAM%20License_NC.pdf + -rw-r--r-- 1 fangjun staff 318B Oct 24 21:20 README.md + -rwxr-xr-x 1 fangjun staff 3.5K Oct 24 21:20 export-onnx-ctc.py + -rw-r--r-- 1 fangjun staff 262M Oct 24 21:24 model.int8.onnx + -rwxr-xr-x 1 fangjun staff 1.2K Oct 24 21:20 run-ctc.sh + -rwxr-xr-x 1 fangjun staff 4.1K Oct 24 21:20 test-onnx-ctc.py + drwxr-xr-x 4 fangjun staff 128B Oct 24 21:24 test_wavs + -rw-r--r--@ 1 fangjun staff 196B Oct 24 21:31 tokens.txt + +Decode wave files +~~~~~~~~~~~~~~~~~ + +.. hint:: + + It supports decoding only wave files of a single channel with 16-bit + encoded samples, while the sampling rate does not need to be 16 kHz. + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + ./build/bin/sherpa-onnx-offline \ + --nemo-ctc-model=./sherpa-onnx-nemo-ctc-giga-am-russian-2024-10-24/model.int8.onnx \ + --tokens=./sherpa-onnx-nemo-ctc-giga-am-russian-2024-10-24/tokens.txt \ + ./sherpa-onnx-nemo-ctc-giga-am-russian-2024-10-24/test_wavs/example.wav + +..
note:: + + Please use ``./build/bin/Release/sherpa-onnx-offline.exe`` for Windows. + +.. caution:: + + If you use Windows and get encoding issues, please run: + + .. code-block:: bash + + CHCP 65001 + + in your commandline. + +You should see the following output: + +.. literalinclude:: ./code-russian/sherpa-onnx-nemo-ctc-giga-am-russian-2024-10-24.int8.txt + +Speech recognition from a microphone +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + ./build/bin/sherpa-onnx-microphone-offline \ + --nemo-ctc-model=./sherpa-onnx-nemo-ctc-giga-am-russian-2024-10-24/model.int8.onnx \ + --tokens=./sherpa-onnx-nemo-ctc-giga-am-russian-2024-10-24/tokens.txt + +Speech recognition from a microphone with VAD +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx + + ./build/bin/sherpa-onnx-vad-microphone-offline-asr \ + --silero-vad-model=./silero_vad.onnx \ + --nemo-ctc-model=./sherpa-onnx-nemo-ctc-giga-am-russian-2024-10-24/model.int8.onnx \ + --tokens=./sherpa-onnx-nemo-ctc-giga-am-russian-2024-10-24/tokens.txt diff --git a/docs/source/onnx/pretrained_models/offline-ctc/yesno/index.rst b/docs/source/onnx/pretrained_models/offline-ctc/yesno/index.rst new file mode 100644 index 000000000..4c412b850 --- /dev/null +++ b/docs/source/onnx/pretrained_models/offline-ctc/yesno/index.rst @@ -0,0 +1,100 @@ +yesno +===== + +This section describes how to use the `tdnn `_ +model of the `yesno`_ dataset from `icefall`_ in `sherpa-onnx`_. + +.. note:: + + It is a **non-streaming** model and it can only recognize + two words in `Hebrew `_: + ``yes`` and ``no``. + +To download the model, please use: + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-tdnn-yesno.tar.bz2 + + tar xvf sherpa-onnx-tdnn-yesno.tar.bz2 + rm sherpa-onnx-tdnn-yesno.tar.bz2 + +Please check that the file sizes of the pre-trained models are correct. See +the file sizes of ``*.onnx`` files below. + +.. code-block:: bash + + sherpa-onnx-tdnn-yesno fangjun$ ls -lh *.onnx + -rw-r--r-- 1 fangjun staff 55K Aug 12 17:02 model-epoch-14-avg-2.int8.onnx + -rw-r--r-- 1 fangjun staff 54K Aug 12 17:02 model-epoch-14-avg-2.onnx + +Decode wave files +~~~~~~~~~~~~~~~~~ + +.. hint:: + + It supports decoding only wave files of a single channel with 16-bit + encoded samples, while the sampling rate does not need to be 16 kHz. + +The following code shows how to use ``fp32`` models to decode wave files. +Please replace ``model-epoch-14-avg-2.onnx`` with ``model-epoch-14-avg-2.int8.onnx`` +to use the ``int8`` quantized model. + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + ./build/bin/sherpa-onnx-offline \ + --sample-rate=8000 \ + --feat-dim=23 \ + --tokens=./sherpa-onnx-tdnn-yesno/tokens.txt \ + --tdnn-model=./sherpa-onnx-tdnn-yesno/model-epoch-14-avg-2.onnx \ + ./sherpa-onnx-tdnn-yesno/test_wavs/0_0_0_1_0_0_0_1.wav \ + ./sherpa-onnx-tdnn-yesno/test_wavs/0_0_1_0_0_0_1_0.wav \ + ./sherpa-onnx-tdnn-yesno/test_wavs/0_0_1_0_0_1_1_1.wav \ + ./sherpa-onnx-tdnn-yesno/test_wavs/0_0_1_0_1_0_0_1.wav \ + ./sherpa-onnx-tdnn-yesno/test_wavs/0_0_1_1_0_0_0_1.wav \ + ./sherpa-onnx-tdnn-yesno/test_wavs/0_0_1_1_0_1_1_0.wav + +The output is given below: + +..
code-block:: bash + + OfflineRecognizerConfig(feat_config=OfflineFeatureExtractorConfig(sampling_rate=8000, feature_dim=23), model_config=OfflineModelConfig(transducer=OfflineTransducerModelConfig(encoder_filename="", decoder_filename="", joiner_filename=""), paraformer=OfflineParaformerModelConfig(model=""), nemo_ctc=OfflineNemoEncDecCtcModelConfig(model=""), whisper=OfflineWhisperModelConfig(encoder="", decoder=""), tdnn=OfflineTdnnModelConfig(model="./sherpa-onnx-tdnn-yesno/model-epoch-14-avg-2.onnx"), tokens="./sherpa-onnx-tdnn-yesno/tokens.txt", num_threads=2, debug=False, provider="cpu", model_type=""), lm_config=OfflineLMConfig(model="", scale=0.5), decoding_method="greedy_search", max_active_paths=4, context_score=1.5) + Creating recognizer ... + Started + Done! + + ./sherpa-onnx-tdnn-yesno/test_wavs/0_0_0_1_0_0_0_1.wav + {"text":"NNNYNNNY","timestamps":"[]","tokens":["N","N","N","Y","N","N","N","Y"]} + ---- + ./sherpa-onnx-tdnn-yesno/test_wavs/0_0_1_0_0_0_1_0.wav + {"text":"NNYNNNYN","timestamps":"[]","tokens":["N","N","Y","N","N","N","Y","N"]} + ---- + ./sherpa-onnx-tdnn-yesno/test_wavs/0_0_1_0_0_1_1_1.wav + {"text":"NNYNNYYY","timestamps":"[]","tokens":["N","N","Y","N","N","Y","Y","Y"]} + ---- + ./sherpa-onnx-tdnn-yesno/test_wavs/0_0_1_0_1_0_0_1.wav + {"text":"NNYNYNNY","timestamps":"[]","tokens":["N","N","Y","N","Y","N","N","Y"]} + ---- + ./sherpa-onnx-tdnn-yesno/test_wavs/0_0_1_1_0_0_0_1.wav + {"text":"NNYYNNNY","timestamps":"[]","tokens":["N","N","Y","Y","N","N","N","Y"]} + ---- + ./sherpa-onnx-tdnn-yesno/test_wavs/0_0_1_1_0_1_1_0.wav + {"text":"NNYYNYYN","timestamps":"[]","tokens":["N","N","Y","Y","N","Y","Y","N"]} + ---- + num threads: 2 + decoding method: greedy_search + Elapsed seconds: 0.071 s + Real time factor (RTF): 0.071 / 38.530 = 0.002 + +.. note:: + + In the above output, ``N`` represents ``NO``, while ``Y`` is ``YES``. + So for the last wave, ``NNYYNYYN`` means ``NO NO YES YES NO YES YES NO``. + + In the filename of the last wave ``0_0_1_1_0_1_1_0.wav``, 0 means ``NO`` + and 1 means ``YES``. So the ground truth of the last wave is + ``NO NO YES YES NO YES YES NO``. 
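+
+To make the relationship between the recognizer output and the wave filenames
+concrete, here is a small self-contained Python sketch. The helper names are
+made up for this illustration; they are not part of `sherpa-onnx`_.
+
+.. code-block:: python
+
+   def tokens_to_words(text: str) -> str:
+       """Map recognizer output such as "NNYYNYYN" to "NO NO YES YES NO YES YES NO"."""
+       return " ".join({"N": "NO", "Y": "YES"}[t] for t in text)
+
+
+   def ground_truth_from_filename(name: str) -> str:
+       """Recover the ground truth from a filename such as "0_0_1_1_0_1_1_0.wav"."""
+       stem = name.rsplit(".", 1)[0]
+       return " ".join("YES" if digit == "1" else "NO" for digit in stem.split("_"))
+
+
+   assert tokens_to_words("NNYYNYYN") == ground_truth_from_filename("0_0_1_1_0_1_1_0.wav")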
diff --git a/docs/source/onnx/pretrained_models/offline-paraformer/code-paraformer/sherpa-onnx-paraformer-en-2024-03-09-int8.txt b/docs/source/onnx/pretrained_models/offline-paraformer/code-paraformer/sherpa-onnx-paraformer-en-2024-03-09-int8.txt new file mode 100644 index 000000000..f3e186e81 --- /dev/null +++ b/docs/source/onnx/pretrained_models/offline-paraformer/code-paraformer/sherpa-onnx-paraformer-en-2024-03-09-int8.txt @@ -0,0 +1,24 @@ +/project/sherpa-onnx/csrc/parse-options.cc:Read:361 sherpa-onnx-offline --tokens=./sherpa-onnx-paraformer-en-2024-03-09/tokens.txt --paraformer=./sherpa-onnx-paraformer-en-2024-03-09/model.int8.onnx ./sherpa-onnx-paraformer-en-2024-03-09/test_wavs/0.wav ./sherpa-onnx-paraformer-en-2024-03-09/test_wavs/1.wav ./sherpa-onnx-paraformer-en-2024-03-09/test_wavs/8k.wav + +OfflineRecognizerConfig(feat_config=OfflineFeatureExtractorConfig(sampling_rate=16000, feature_dim=80), model_config=OfflineModelConfig(transducer=OfflineTransducerModelConfig(encoder_filename="", decoder_filename="", joiner_filename=""), paraformer=OfflineParaformerModelConfig(model="./sherpa-onnx-paraformer-en-2024-03-09/model.int8.onnx"), nemo_ctc=OfflineNemoEncDecCtcModelConfig(model=""), whisper=OfflineWhisperModelConfig(encoder="", decoder="", language="", task="transcribe", tail_paddings=-1), tdnn=OfflineTdnnModelConfig(model=""), zipformer_ctc=OfflineZipformerCtcModelConfig(model=""), wenet_ctc=OfflineWenetCtcModelConfig(model=""), tokens="./sherpa-onnx-paraformer-en-2024-03-09/tokens.txt", num_threads=2, debug=False, provider="cpu", model_type=""), lm_config=OfflineLMConfig(model="", scale=0.5), ctc_fst_decoder_config=OfflineCtcFstDecoderConfig(graph="", max_active=3000), decoding_method="greedy_search", max_active_paths=4, hotwords_file="", hotwords_score=1.5, blank_penalty=0) +Creating recognizer ... +Started +/project/sherpa-onnx/csrc/offline-stream.cc:AcceptWaveformImpl:119 Creating a resampler: + in_sample_rate: 8000 + output_sample_rate: 16000 + +Done! 
+ +./sherpa-onnx-paraformer-en-2024-03-09/test_wavs/0.wav +{"text": " after early nightfall the yellow lamps would light up here and there the squalid quarter of the brothels", "timestamps": [], "tokens":["after", "early", "ni@@", "ght@@", "fall", "the", "yel@@", "low", "la@@", "mp@@", "s", "would", "light", "up", "here", "and", "there", "the", "squ@@", "al@@", "id", "quarter", "of", "the", "bro@@", "the@@", "ls"]} +---- +./sherpa-onnx-paraformer-en-2024-03-09/test_wavs/1.wav +{"text": " god as a direct consequence of the sin which man thus punished had given her a lovely child whose place was 'on' that same dishonoured bosom to connect her parent for ever with the race and descent of mortals and to be finally a blessed soul in heaven", "timestamps": [], "tokens":["god", "as", "a", "direct", "con@@", "sequence", "of", "the", "sin", "which", "man", "thus", "p@@", "uni@@", "shed", "had", "given", "her", "a", "lo@@", "vely", "child", "whose", "place", "was", "'on'", "that", "same", "di@@", "sh@@", "on@@", "ou@@", "red", "bo@@", "so@@", "m", "to", "connect", "her", "paren@@", "t", "for", "ever", "with", "the", "race", "and", "des@@", "cent", "of", "mor@@", "tal@@", "s", "and", "to", "be", "finally", "a", "bl@@", "essed", "soul", "in", "hea@@", "ven"]} +---- +./sherpa-onnx-paraformer-en-2024-03-09/test_wavs/8k.wav +{"text": " yet these thoughts affected hester prynne less with hope than apprehension", "timestamps": [], "tokens":["yet", "these", "thoughts", "aff@@", "ected", "he@@", "ster", "pr@@", "y@@", "n@@", "ne", "less", "with", "hope", "than", "ap@@", "pre@@", "hen@@", "sion"]} +---- +num threads: 2 +decoding method: greedy_search +Elapsed seconds: 5.492 s +Real time factor (RTF): 5.492 / 28.165 = 0.195 diff --git a/docs/source/onnx/pretrained_models/offline-paraformer/code-paraformer/sherpa-onnx-paraformer-en-2024-03-09.txt b/docs/source/onnx/pretrained_models/offline-paraformer/code-paraformer/sherpa-onnx-paraformer-en-2024-03-09.txt new file mode 100644 index 000000000..8d47d6870 --- /dev/null +++ b/docs/source/onnx/pretrained_models/offline-paraformer/code-paraformer/sherpa-onnx-paraformer-en-2024-03-09.txt @@ -0,0 +1,24 @@ +/project/sherpa-onnx/csrc/parse-options.cc:Read:361 sherpa-onnx-offline --tokens=./sherpa-onnx-paraformer-en-2024-03-09/tokens.txt --paraformer=./sherpa-onnx-paraformer-en-2024-03-09/model.onnx ./sherpa-onnx-paraformer-en-2024-03-09/test_wavs/0.wav ./sherpa-onnx-paraformer-en-2024-03-09/test_wavs/1.wav ./sherpa-onnx-paraformer-en-2024-03-09/test_wavs/8k.wav + +OfflineRecognizerConfig(feat_config=OfflineFeatureExtractorConfig(sampling_rate=16000, feature_dim=80), model_config=OfflineModelConfig(transducer=OfflineTransducerModelConfig(encoder_filename="", decoder_filename="", joiner_filename=""), paraformer=OfflineParaformerModelConfig(model="./sherpa-onnx-paraformer-en-2024-03-09/model.onnx"), nemo_ctc=OfflineNemoEncDecCtcModelConfig(model=""), whisper=OfflineWhisperModelConfig(encoder="", decoder="", language="", task="transcribe", tail_paddings=-1), tdnn=OfflineTdnnModelConfig(model=""), zipformer_ctc=OfflineZipformerCtcModelConfig(model=""), wenet_ctc=OfflineWenetCtcModelConfig(model=""), tokens="./sherpa-onnx-paraformer-en-2024-03-09/tokens.txt", num_threads=2, debug=False, provider="cpu", model_type=""), lm_config=OfflineLMConfig(model="", scale=0.5), ctc_fst_decoder_config=OfflineCtcFstDecoderConfig(graph="", max_active=3000), decoding_method="greedy_search", max_active_paths=4, hotwords_file="", hotwords_score=1.5, blank_penalty=0) +Creating recognizer ... 
+Started +/project/sherpa-onnx/csrc/offline-stream.cc:AcceptWaveformImpl:119 Creating a resampler: + in_sample_rate: 8000 + output_sample_rate: 16000 + +Done! + +./sherpa-onnx-paraformer-en-2024-03-09/test_wavs/0.wav +{"text": " after early nightfall the yellow lamps would light up here and there the squalid quarter of the brothels", "timestamps": [], "tokens":["after", "early", "ni@@", "ght@@", "fall", "the", "yel@@", "low", "la@@", "mp@@", "s", "would", "light", "up", "here", "and", "there", "the", "squ@@", "al@@", "id", "quarter", "of", "the", "bro@@", "the@@", "ls"]} +---- +./sherpa-onnx-paraformer-en-2024-03-09/test_wavs/1.wav +{"text": " god as a direct consequence of the sin which man thus punished had given her a lovely child whose place was 'on' that same dishonoured bosom to connect her parent for ever with the race and descent of mortals and to be finally a blessed soul in heaven", "timestamps": [], "tokens":["god", "as", "a", "direct", "con@@", "sequence", "of", "the", "sin", "which", "man", "thus", "p@@", "uni@@", "shed", "had", "given", "her", "a", "lo@@", "vely", "child", "whose", "place", "was", "'on'", "that", "same", "di@@", "sh@@", "on@@", "ou@@", "red", "bo@@", "so@@", "m", "to", "connect", "her", "paren@@", "t", "for", "ever", "with", "the", "race", "and", "des@@", "cent", "of", "mor@@", "tal@@", "s", "and", "to", "be", "finally", "a", "bl@@", "essed", "soul", "in", "hea@@", "ven"]} +---- +./sherpa-onnx-paraformer-en-2024-03-09/test_wavs/8k.wav +{"text": " yet these thoughts affected hester prynne less with hope than apprehension", "timestamps": [], "tokens":["yet", "these", "thoughts", "aff@@", "ected", "he@@", "ster", "pr@@", "y@@", "n@@", "ne", "less", "with", "hope", "than", "ap@@", "pre@@", "hen@@", "sion"]} +---- +num threads: 2 +decoding method: greedy_search +Elapsed seconds: 7.173 s +Real time factor (RTF): 7.173 / 28.165 = 0.255 diff --git a/docs/source/onnx/pretrained_models/offline-paraformer/code-paraformer/sherpa-onnx-paraformer-trilingual-zh-cantonese-en-int8.txt b/docs/source/onnx/pretrained_models/offline-paraformer/code-paraformer/sherpa-onnx-paraformer-trilingual-zh-cantonese-en-int8.txt new file mode 100644 index 000000000..143700caf --- /dev/null +++ b/docs/source/onnx/pretrained_models/offline-paraformer/code-paraformer/sherpa-onnx-paraformer-trilingual-zh-cantonese-en-int8.txt @@ -0,0 +1,35 @@ +/project/sherpa-onnx/csrc/parse-options.cc:Read:361 sherpa-onnx-offline --tokens=./sherpa-onnx-paraformer-trilingual-zh-cantonese-en/tokens.txt --paraformer=./sherpa-onnx-paraformer-trilingual-zh-cantonese-en/model.int8.onnx ./sherpa-onnx-paraformer-trilingual-zh-cantonese-en/test_wavs/1.wav ./sherpa-onnx-paraformer-trilingual-zh-cantonese-en/test_wavs/2.wav ./sherpa-onnx-paraformer-trilingual-zh-cantonese-en/test_wavs/3-sichuan.wav ./sherpa-onnx-paraformer-trilingual-zh-cantonese-en/test_wavs/4-tianjin.wav ./sherpa-onnx-paraformer-trilingual-zh-cantonese-en/test_wavs/5-henan.wav ./sherpa-onnx-paraformer-trilingual-zh-cantonese-en/test_wavs/6-zh-en.wav + +OfflineRecognizerConfig(feat_config=OfflineFeatureExtractorConfig(sampling_rate=16000, feature_dim=80), model_config=OfflineModelConfig(transducer=OfflineTransducerModelConfig(encoder_filename="", decoder_filename="", joiner_filename=""), paraformer=OfflineParaformerModelConfig(model="./sherpa-onnx-paraformer-trilingual-zh-cantonese-en/model.int8.onnx"), nemo_ctc=OfflineNemoEncDecCtcModelConfig(model=""), whisper=OfflineWhisperModelConfig(encoder="", decoder="", language="", task="transcribe", 
tail_paddings=-1), tdnn=OfflineTdnnModelConfig(model=""), zipformer_ctc=OfflineZipformerCtcModelConfig(model=""), wenet_ctc=OfflineWenetCtcModelConfig(model=""), tokens="./sherpa-onnx-paraformer-trilingual-zh-cantonese-en/tokens.txt", num_threads=2, debug=False, provider="cpu", model_type=""), lm_config=OfflineLMConfig(model="", scale=0.5), ctc_fst_decoder_config=OfflineCtcFstDecoderConfig(graph="", max_active=3000), decoding_method="greedy_search", max_active_paths=4, hotwords_file="", hotwords_score=1.5, blank_penalty=0) +Creating recognizer ... +Started +/project/sherpa-onnx/csrc/offline-paraformer-greedy-search-decoder.cc:Decode:65 time stamp for batch: 0, 13 vs -1 +/project/sherpa-onnx/csrc/offline-paraformer-greedy-search-decoder.cc:Decode:65 time stamp for batch: 1, 15 vs -1 +/project/sherpa-onnx/csrc/offline-paraformer-greedy-search-decoder.cc:Decode:65 time stamp for batch: 2, 40 vs -1 +/project/sherpa-onnx/csrc/offline-paraformer-greedy-search-decoder.cc:Decode:65 time stamp for batch: 3, 41 vs -1 +/project/sherpa-onnx/csrc/offline-paraformer-greedy-search-decoder.cc:Decode:65 time stamp for batch: 4, 37 vs -1 +/project/sherpa-onnx/csrc/offline-paraformer-greedy-search-decoder.cc:Decode:65 time stamp for batch: 5, 16 vs -1 +Done! + +./sherpa-onnx-paraformer-trilingual-zh-cantonese-en/test_wavs/1.wav +{"text": "有无人知道湾仔活道系点去㗎", "timestamps": [], "tokens":["有", "无", "人", "知", "道", "湾", "仔", "活", "道", "系", "点", "去", "㗎"]} +---- +./sherpa-onnx-paraformer-trilingual-zh-cantonese-en/test_wavs/2.wav +{"text": "我喺黄大仙九龙塘联合道荡失路啊", "timestamps": [], "tokens":["我", "喺", "黄", "大", "仙", "九", "龙", "塘", "联", "合", "道", "荡", "失", "路", "啊"]} +---- +./sherpa-onnx-paraformer-trilingual-zh-cantonese-en/test_wavs/3-sichuan.wav +{"text": "自己就是在那个在那个就是在情节里面就是感觉是演得特别好就是好像很真实一样你知道吧", "timestamps": [], "tokens":["自", "己", "就", "是", "在", "那", "个", "在", "那", "个", "就", "是", "在", "情", "节", "里", "面", "就", "是", "感", "觉", "是", "演", "得", "特", "别", "好", "就", "是", "好", "像", "很", "真", "实", "一", "样", "你", "知", "道", "吧"]} +---- +./sherpa-onnx-paraformer-trilingual-zh-cantonese-en/test_wavs/4-tianjin.wav +{"text": "其实他就是怕每个人都可以守法就这意思法律意识太单薄了而且就是嗯也不顾及到别人的感受", "timestamps": [], "tokens":["其", "实", "他", "就", "是", "怕", "每", "个", "人", "都", "可", "以", "守", "法", "就", "这", "意", "思", "法", "律", "意", "识", "太", "单", "薄", "了", "而", "且", "就", "是", "嗯", "也", "不", "顾", "及", "到", "别", "人", "的", "感", "受"]} +---- +./sherpa-onnx-paraformer-trilingual-zh-cantonese-en/test_wavs/5-henan.wav +{"text": "它这个管一下都通到有时候都通到七八层楼高然后它这管一下就可以浇到那那柱子上", "timestamps": [], "tokens":["它", "这", "个", "管", "一", "下", "都", "通", "到", "有", "时", "候", "都", "通", "到", "七", "八", "层", "楼", "高", "然", "后", "它", "这", "管", "一", "下", "就", "可", "以", "浇", "到", "那", "那", "柱", "子", "上"]} +---- +./sherpa-onnx-paraformer-trilingual-zh-cantonese-en/test_wavs/6-zh-en.wav +{"text": " yesterday was 星期一 today is tuesday 明天是星期三", "timestamps": [], "tokens":["yesterday", "was", "星", "期", "一", "today", "is", "tu@@", "es@@", "day", "明", "天", "是", "星", "期", "三"]} +---- +num threads: 2 +decoding method: greedy_search +Elapsed seconds: 6.290 s +Real time factor (RTF): 6.290 / 42.054 = 0.150 diff --git a/docs/source/onnx/pretrained_models/offline-paraformer/code-paraformer/sherpa-onnx-paraformer-trilingual-zh-cantonese-en.txt b/docs/source/onnx/pretrained_models/offline-paraformer/code-paraformer/sherpa-onnx-paraformer-trilingual-zh-cantonese-en.txt new file mode 100644 index 000000000..b6c9c55d4 --- /dev/null +++ 
b/docs/source/onnx/pretrained_models/offline-paraformer/code-paraformer/sherpa-onnx-paraformer-trilingual-zh-cantonese-en.txt @@ -0,0 +1,35 @@ +/project/sherpa-onnx/csrc/parse-options.cc:Read:361 sherpa-onnx-offline --tokens=./sherpa-onnx-paraformer-trilingual-zh-cantonese-en/tokens.txt --paraformer=./sherpa-onnx-paraformer-trilingual-zh-cantonese-en/model.onnx ./sherpa-onnx-paraformer-trilingual-zh-cantonese-en/test_wavs/1.wav ./sherpa-onnx-paraformer-trilingual-zh-cantonese-en/test_wavs/2.wav ./sherpa-onnx-paraformer-trilingual-zh-cantonese-en/test_wavs/3-sichuan.wav ./sherpa-onnx-paraformer-trilingual-zh-cantonese-en/test_wavs/4-tianjin.wav ./sherpa-onnx-paraformer-trilingual-zh-cantonese-en/test_wavs/5-henan.wav ./sherpa-onnx-paraformer-trilingual-zh-cantonese-en/test_wavs/6-zh-en.wav + +OfflineRecognizerConfig(feat_config=OfflineFeatureExtractorConfig(sampling_rate=16000, feature_dim=80), model_config=OfflineModelConfig(transducer=OfflineTransducerModelConfig(encoder_filename="", decoder_filename="", joiner_filename=""), paraformer=OfflineParaformerModelConfig(model="./sherpa-onnx-paraformer-trilingual-zh-cantonese-en/model.onnx"), nemo_ctc=OfflineNemoEncDecCtcModelConfig(model=""), whisper=OfflineWhisperModelConfig(encoder="", decoder="", language="", task="transcribe", tail_paddings=-1), tdnn=OfflineTdnnModelConfig(model=""), zipformer_ctc=OfflineZipformerCtcModelConfig(model=""), wenet_ctc=OfflineWenetCtcModelConfig(model=""), tokens="./sherpa-onnx-paraformer-trilingual-zh-cantonese-en/tokens.txt", num_threads=2, debug=False, provider="cpu", model_type=""), lm_config=OfflineLMConfig(model="", scale=0.5), ctc_fst_decoder_config=OfflineCtcFstDecoderConfig(graph="", max_active=3000), decoding_method="greedy_search", max_active_paths=4, hotwords_file="", hotwords_score=1.5, blank_penalty=0) +Creating recognizer ... +Started +/project/sherpa-onnx/csrc/offline-paraformer-greedy-search-decoder.cc:Decode:65 time stamp for batch: 0, 13 vs -1 +/project/sherpa-onnx/csrc/offline-paraformer-greedy-search-decoder.cc:Decode:65 time stamp for batch: 1, 15 vs -1 +/project/sherpa-onnx/csrc/offline-paraformer-greedy-search-decoder.cc:Decode:65 time stamp for batch: 2, 40 vs -1 +/project/sherpa-onnx/csrc/offline-paraformer-greedy-search-decoder.cc:Decode:65 time stamp for batch: 3, 41 vs -1 +/project/sherpa-onnx/csrc/offline-paraformer-greedy-search-decoder.cc:Decode:65 time stamp for batch: 4, 37 vs -1 +/project/sherpa-onnx/csrc/offline-paraformer-greedy-search-decoder.cc:Decode:65 time stamp for batch: 5, 16 vs -1 +Done! 
+ +./sherpa-onnx-paraformer-trilingual-zh-cantonese-en/test_wavs/1.wav +{"text": "有无人知道湾仔活道系点去㗎", "timestamps": [], "tokens":["有", "无", "人", "知", "道", "湾", "仔", "活", "道", "系", "点", "去", "㗎"]} +---- +./sherpa-onnx-paraformer-trilingual-zh-cantonese-en/test_wavs/2.wav +{"text": "我喺黄大仙九龙塘联合道荡失路啊", "timestamps": [], "tokens":["我", "喺", "黄", "大", "仙", "九", "龙", "塘", "联", "合", "道", "荡", "失", "路", "啊"]} +---- +./sherpa-onnx-paraformer-trilingual-zh-cantonese-en/test_wavs/3-sichuan.wav +{"text": "自己就是在那个在那个就是在情节里面就是感觉是演得特别好就是好像很真实一样你知道吧", "timestamps": [], "tokens":["自", "己", "就", "是", "在", "那", "个", "在", "那", "个", "就", "是", "在", "情", "节", "里", "面", "就", "是", "感", "觉", "是", "演", "得", "特", "别", "好", "就", "是", "好", "像", "很", "真", "实", "一", "样", "你", "知", "道", "吧"]} +---- +./sherpa-onnx-paraformer-trilingual-zh-cantonese-en/test_wavs/4-tianjin.wav +{"text": "其实他就是怕每个人都可以守法就这意思法律意识太单薄了而且就是嗯也不顾及到别人的感受", "timestamps": [], "tokens":["其", "实", "他", "就", "是", "怕", "每", "个", "人", "都", "可", "以", "守", "法", "就", "这", "意", "思", "法", "律", "意", "识", "太", "单", "薄", "了", "而", "且", "就", "是", "嗯", "也", "不", "顾", "及", "到", "别", "人", "的", "感", "受"]} +---- +./sherpa-onnx-paraformer-trilingual-zh-cantonese-en/test_wavs/5-henan.wav +{"text": "它这个管一下都通到有时候都通到七八层楼高然后它这管一下就可以浇到那那柱子上", "timestamps": [], "tokens":["它", "这", "个", "管", "一", "下", "都", "通", "到", "有", "时", "候", "都", "通", "到", "七", "八", "层", "楼", "高", "然", "后", "它", "这", "管", "一", "下", "就", "可", "以", "浇", "到", "那", "那", "柱", "子", "上"]} +---- +./sherpa-onnx-paraformer-trilingual-zh-cantonese-en/test_wavs/6-zh-en.wav +{"text": " yesterday was 星期一 today is tuesday 明天是星期三", "timestamps": [], "tokens":["yesterday", "was", "星", "期", "一", "today", "is", "tu@@", "es@@", "day", "明", "天", "是", "星", "期", "三"]} +---- +num threads: 2 +decoding method: greedy_search +Elapsed seconds: 6.871 s +Real time factor (RTF): 6.871 / 42.054 = 0.163 diff --git a/docs/source/onnx/pretrained_models/offline-paraformer/code-paraformer/sherpa-onnx-paraformer-zh-2023-03-28-int8.txt b/docs/source/onnx/pretrained_models/offline-paraformer/code-paraformer/sherpa-onnx-paraformer-zh-2023-03-28-int8.txt new file mode 100644 index 000000000..1e9cd1462 --- /dev/null +++ b/docs/source/onnx/pretrained_models/offline-paraformer/code-paraformer/sherpa-onnx-paraformer-zh-2023-03-28-int8.txt @@ -0,0 +1,39 @@ +/project/sherpa-onnx/csrc/parse-options.cc:Read:361 sherpa-onnx-offline --tokens=./sherpa-onnx-paraformer-zh-2023-03-28/tokens.txt --paraformer=./sherpa-onnx-paraformer-zh-2023-03-28/model.int8.onnx ./sherpa-onnx-paraformer-zh-2023-03-28/test_wavs/0.wav ./sherpa-onnx-paraformer-zh-2023-03-28/test_wavs/1.wav ./sherpa-onnx-paraformer-zh-2023-03-28/test_wavs/2.wav ./sherpa-onnx-paraformer-zh-2023-03-28/test_wavs/3-sichuan.wav ./sherpa-onnx-paraformer-zh-2023-03-28/test_wavs/4-tianjin.wav ./sherpa-onnx-paraformer-zh-2023-03-28/test_wavs/5-henan.wav ./sherpa-onnx-paraformer-zh-2023-03-28/test_wavs/6-zh-en.wav ./sherpa-onnx-paraformer-zh-2023-03-28/test_wavs/8k.wav + +OfflineRecognizerConfig(feat_config=OfflineFeatureExtractorConfig(sampling_rate=16000, feature_dim=80), model_config=OfflineModelConfig(transducer=OfflineTransducerModelConfig(encoder_filename="", decoder_filename="", joiner_filename=""), paraformer=OfflineParaformerModelConfig(model="./sherpa-onnx-paraformer-zh-2023-03-28/model.int8.onnx"), nemo_ctc=OfflineNemoEncDecCtcModelConfig(model=""), whisper=OfflineWhisperModelConfig(encoder="", decoder="", language="", task="transcribe", tail_paddings=-1), tdnn=OfflineTdnnModelConfig(model=""), 
zipformer_ctc=OfflineZipformerCtcModelConfig(model=""), wenet_ctc=OfflineWenetCtcModelConfig(model=""), tokens="./sherpa-onnx-paraformer-zh-2023-03-28/tokens.txt", num_threads=2, debug=False, provider="cpu", model_type=""), lm_config=OfflineLMConfig(model="", scale=0.5), ctc_fst_decoder_config=OfflineCtcFstDecoderConfig(graph="", max_active=3000), decoding_method="greedy_search", max_active_paths=4, hotwords_file="", hotwords_score=1.5, blank_penalty=0) +Creating recognizer ... +Started +/project/sherpa-onnx/csrc/offline-stream.cc:AcceptWaveformImpl:119 Creating a resampler: + in_sample_rate: 8000 + output_sample_rate: 16000 + +Done! + +./sherpa-onnx-paraformer-zh-2023-03-28/test_wavs/0.wav +{"text": "对我做了介绍啊那么我想说的是呢大家如果对我的研究感兴趣呢你", "timestamps": [], "tokens":["对", "我", "做", "了", "介", "绍", "啊", "那", "么", "我", "想", "说", "的", "是", "呢", "大", "家", "如", "果", "对", "我", "的", "研", "究", "感", "兴", "趣", "呢", "你"]} +---- +./sherpa-onnx-paraformer-zh-2023-03-28/test_wavs/1.wav +{"text": "重点呢想谈三个问题首先呢就是这一轮全球金融动荡的表现", "timestamps": [], "tokens":["重", "点", "呢", "想", "谈", "三", "个", "问", "题", "首", "先", "呢", "就", "是", "这", "一", "轮", "全", "球", "金", "融", "动", "荡", "的", "表", "现"]} +---- +./sherpa-onnx-paraformer-zh-2023-03-28/test_wavs/2.wav +{"text": "深入的分析这一次全球金融动荡背后的根源", "timestamps": [], "tokens":["深", "入", "的", "分", "析", "这", "一", "次", "全", "球", "金", "融", "动", "荡", "背", "后", "的", "根", "源"]} +---- +./sherpa-onnx-paraformer-zh-2023-03-28/test_wavs/3-sichuan.wav +{"text": "自己就是在那个在那个就是在情节里面就是感觉是演的特别好就是好像很真实一样你知道吧", "timestamps": [], "tokens":["自", "己", "就", "是", "在", "那", "个", "在", "那", "个", "就", "是", "在", "情", "节", "里", "面", "就", "是", "感", "觉", "是", "演", "的", "特", "别", "好", "就", "是", "好", "像", "很", "真", "实", "一", "样", "你", "知", "道", "吧"]} +---- +./sherpa-onnx-paraformer-zh-2023-03-28/test_wavs/4-tianjin.wav +{"text": "其实他就是那每个人都可以守法就这意思法律意识太单薄了而且就是嗯也不顾及到别人的感受", "timestamps": [], "tokens":["其", "实", "他", "就", "是", "那", "每", "个", "人", "都", "可", "以", "守", "法", "就", "这", "意", "思", "法", "律", "意", "识", "太", "单", "薄", "了", "而", "且", "就", "是", "嗯", "也", "不", "顾", "及", "到", "别", "人", "的", "感", "受"]} +---- +./sherpa-onnx-paraformer-zh-2023-03-28/test_wavs/5-henan.wav +{"text": "他这个管一向都通到有时候都通到七八层楼高然后他这管一向就可以浇到那个那柱子上", "timestamps": [], "tokens":["他", "这", "个", "管", "一", "向", "都", "通", "到", "有", "时", "候", "都", "通", "到", "七", "八", "层", "楼", "高", "然", "后", "他", "这", "管", "一", "向", "就", "可", "以", "浇", "到", "那", "个", "那", "柱", "子", "上"]} +---- +./sherpa-onnx-paraformer-zh-2023-03-28/test_wavs/6-zh-en.wav +{"text": " yesterday was 星期一 today is tuesday 明天是星期三", "timestamps": [], "tokens":["ye@@", "ster@@", "day", "was", "星", "期", "一", "today", "is", "tu@@", "es@@", "day", "明", "天", "是", "星", "期", "三"]} +---- +./sherpa-onnx-paraformer-zh-2023-03-28/test_wavs/8k.wav +{"text": "甚至出现交易几乎停滞的情况", "timestamps": [], "tokens":["甚", "至", "出", "现", "交", "易", "几", "乎", "停", "滞", "的", "情", "况"]} +---- +num threads: 2 +decoding method: greedy_search +Elapsed seconds: 6.439 s +Real time factor (RTF): 6.439 / 51.236 = 0.126 diff --git a/docs/source/onnx/pretrained_models/offline-paraformer/code-paraformer/sherpa-onnx-paraformer-zh-2023-03-28.txt b/docs/source/onnx/pretrained_models/offline-paraformer/code-paraformer/sherpa-onnx-paraformer-zh-2023-03-28.txt new file mode 100644 index 000000000..48d585ee5 --- /dev/null +++ b/docs/source/onnx/pretrained_models/offline-paraformer/code-paraformer/sherpa-onnx-paraformer-zh-2023-03-28.txt @@ -0,0 +1,39 @@ +/project/sherpa-onnx/csrc/parse-options.cc:Read:361 sherpa-onnx-offline 
--tokens=./sherpa-onnx-paraformer-zh-2023-03-28/tokens.txt --paraformer=./sherpa-onnx-paraformer-zh-2023-03-28/model.onnx ./sherpa-onnx-paraformer-zh-2023-03-28/test_wavs/0.wav ./sherpa-onnx-paraformer-zh-2023-03-28/test_wavs/1.wav ./sherpa-onnx-paraformer-zh-2023-03-28/test_wavs/2.wav ./sherpa-onnx-paraformer-zh-2023-03-28/test_wavs/3-sichuan.wav ./sherpa-onnx-paraformer-zh-2023-03-28/test_wavs/4-tianjin.wav ./sherpa-onnx-paraformer-zh-2023-03-28/test_wavs/5-henan.wav ./sherpa-onnx-paraformer-zh-2023-03-28/test_wavs/6-zh-en.wav ./sherpa-onnx-paraformer-zh-2023-03-28/test_wavs/8k.wav + +OfflineRecognizerConfig(feat_config=OfflineFeatureExtractorConfig(sampling_rate=16000, feature_dim=80), model_config=OfflineModelConfig(transducer=OfflineTransducerModelConfig(encoder_filename="", decoder_filename="", joiner_filename=""), paraformer=OfflineParaformerModelConfig(model="./sherpa-onnx-paraformer-zh-2023-03-28/model.onnx"), nemo_ctc=OfflineNemoEncDecCtcModelConfig(model=""), whisper=OfflineWhisperModelConfig(encoder="", decoder="", language="", task="transcribe", tail_paddings=-1), tdnn=OfflineTdnnModelConfig(model=""), zipformer_ctc=OfflineZipformerCtcModelConfig(model=""), wenet_ctc=OfflineWenetCtcModelConfig(model=""), tokens="./sherpa-onnx-paraformer-zh-2023-03-28/tokens.txt", num_threads=2, debug=False, provider="cpu", model_type=""), lm_config=OfflineLMConfig(model="", scale=0.5), ctc_fst_decoder_config=OfflineCtcFstDecoderConfig(graph="", max_active=3000), decoding_method="greedy_search", max_active_paths=4, hotwords_file="", hotwords_score=1.5, blank_penalty=0) +Creating recognizer ... +Started +/project/sherpa-onnx/csrc/offline-stream.cc:AcceptWaveformImpl:119 Creating a resampler: + in_sample_rate: 8000 + output_sample_rate: 16000 + +Done! + +./sherpa-onnx-paraformer-zh-2023-03-28/test_wavs/0.wav +{"text": "对我做了介绍啊那么我想说的是呢大家如果对我的研究感兴趣呢你", "timestamps": [], "tokens":["对", "我", "做", "了", "介", "绍", "啊", "那", "么", "我", "想", "说", "的", "是", "呢", "大", "家", "如", "果", "对", "我", "的", "研", "究", "感", "兴", "趣", "呢", "你"]} +---- +./sherpa-onnx-paraformer-zh-2023-03-28/test_wavs/1.wav +{"text": "重点呢想谈三个问题首先呢就是这一轮全球金融动荡的表现", "timestamps": [], "tokens":["重", "点", "呢", "想", "谈", "三", "个", "问", "题", "首", "先", "呢", "就", "是", "这", "一", "轮", "全", "球", "金", "融", "动", "荡", "的", "表", "现"]} +---- +./sherpa-onnx-paraformer-zh-2023-03-28/test_wavs/2.wav +{"text": "深入的分析这一次全球金融动荡背后的根源", "timestamps": [], "tokens":["深", "入", "的", "分", "析", "这", "一", "次", "全", "球", "金", "融", "动", "荡", "背", "后", "的", "根", "源"]} +---- +./sherpa-onnx-paraformer-zh-2023-03-28/test_wavs/3-sichuan.wav +{"text": "自己就是在那个在那个就是在情节里面就是感觉是演的特别好就是好像很真实一样你知道吧", "timestamps": [], "tokens":["自", "己", "就", "是", "在", "那", "个", "在", "那", "个", "就", "是", "在", "情", "节", "里", "面", "就", "是", "感", "觉", "是", "演", "的", "特", "别", "好", "就", "是", "好", "像", "很", "真", "实", "一", "样", "你", "知", "道", "吧"]} +---- +./sherpa-onnx-paraformer-zh-2023-03-28/test_wavs/4-tianjin.wav +{"text": "其实他就是那每个人都可以守法就这意思法律意识太单薄了而且就是嗯也不顾及到别人的感受", "timestamps": [], "tokens":["其", "实", "他", "就", "是", "那", "每", "个", "人", "都", "可", "以", "守", "法", "就", "这", "意", "思", "法", "律", "意", "识", "太", "单", "薄", "了", "而", "且", "就", "是", "嗯", "也", "不", "顾", "及", "到", "别", "人", "的", "感", "受"]} +---- +./sherpa-onnx-paraformer-zh-2023-03-28/test_wavs/5-henan.wav +{"text": "他这个管一向都通到有时候都通到七八层楼高然后他这管一向就可以浇到那个那柱子上", "timestamps": [], "tokens":["他", "这", "个", "管", "一", "向", "都", "通", "到", "有", "时", "候", "都", "通", "到", "七", "八", "层", "楼", "高", "然", "后", "他", "这", "管", "一", "向", "就", "可", "以", "浇", "到", "那", 
"个", "那", "柱", "子", "上"]} +---- +./sherpa-onnx-paraformer-zh-2023-03-28/test_wavs/6-zh-en.wav +{"text": " yesterday was 星期一 today is tuesday 明天是星期三", "timestamps": [], "tokens":["ye@@", "ster@@", "day", "was", "星", "期", "一", "today", "is", "tu@@", "es@@", "day", "明", "天", "是", "星", "期", "三"]} +---- +./sherpa-onnx-paraformer-zh-2023-03-28/test_wavs/8k.wav +{"text": "甚至出现交易几乎停滞的情况", "timestamps": [], "tokens":["甚", "至", "出", "现", "交", "易", "几", "乎", "停", "滞", "的", "情", "况"]} +---- +num threads: 2 +decoding method: greedy_search +Elapsed seconds: 8.547 s +Real time factor (RTF): 8.547 / 51.236 = 0.167 diff --git a/docs/source/onnx/pretrained_models/offline-paraformer/code-paraformer/sherpa-onnx-paraformer-zh-2023-09-14-int8.txt b/docs/source/onnx/pretrained_models/offline-paraformer/code-paraformer/sherpa-onnx-paraformer-zh-2023-09-14-int8.txt new file mode 100644 index 000000000..b2a594816 --- /dev/null +++ b/docs/source/onnx/pretrained_models/offline-paraformer/code-paraformer/sherpa-onnx-paraformer-zh-2023-09-14-int8.txt @@ -0,0 +1,39 @@ +/project/sherpa-onnx/csrc/parse-options.cc:Read:361 sherpa-onnx-offline --tokens=./sherpa-onnx-paraformer-zh-2023-09-14/tokens.txt --paraformer=./sherpa-onnx-paraformer-zh-2023-09-14/model.int8.onnx --model-type=paraformer ./sherpa-onnx-paraformer-zh-2023-09-14/test_wavs/0.wav ./sherpa-onnx-paraformer-zh-2023-09-14/test_wavs/1.wav ./sherpa-onnx-paraformer-zh-2023-09-14/test_wavs/2.wav ./sherpa-onnx-paraformer-zh-2023-09-14/test_wavs/3-sichuan.wav ./sherpa-onnx-paraformer-zh-2023-09-14/test_wavs/4-tianjin.wav ./sherpa-onnx-paraformer-zh-2023-09-14/test_wavs/5-henan.wav ./sherpa-onnx-paraformer-zh-2023-09-14/test_wavs/6-zh-en.wav ./sherpa-onnx-paraformer-zh-2023-09-14/test_wavs/8k.wav + +OfflineRecognizerConfig(feat_config=OfflineFeatureExtractorConfig(sampling_rate=16000, feature_dim=80), model_config=OfflineModelConfig(transducer=OfflineTransducerModelConfig(encoder_filename="", decoder_filename="", joiner_filename=""), paraformer=OfflineParaformerModelConfig(model="./sherpa-onnx-paraformer-zh-2023-09-14/model.int8.onnx"), nemo_ctc=OfflineNemoEncDecCtcModelConfig(model=""), whisper=OfflineWhisperModelConfig(encoder="", decoder="", language="", task="transcribe", tail_paddings=-1), tdnn=OfflineTdnnModelConfig(model=""), zipformer_ctc=OfflineZipformerCtcModelConfig(model=""), wenet_ctc=OfflineWenetCtcModelConfig(model=""), tokens="./sherpa-onnx-paraformer-zh-2023-09-14/tokens.txt", num_threads=2, debug=False, provider="cpu", model_type="paraformer"), lm_config=OfflineLMConfig(model="", scale=0.5), ctc_fst_decoder_config=OfflineCtcFstDecoderConfig(graph="", max_active=3000), decoding_method="greedy_search", max_active_paths=4, hotwords_file="", hotwords_score=1.5, blank_penalty=0) +Creating recognizer ... +Started +/project/sherpa-onnx/csrc/offline-stream.cc:AcceptWaveformImpl:119 Creating a resampler: + in_sample_rate: 8000 + output_sample_rate: 16000 + +Done! 
+ +./sherpa-onnx-paraformer-zh-2023-09-14/test_wavs/0.wav +{"text": "对我做了介绍啊那么我想说的是呢大家如果对我的研究感兴趣呢你", "timestamps": [0.36, 0.48, 0.62, 0.72, 0.86, 1.02, 1.32, 1.74, 1.90, 2.12, 2.20, 2.38, 2.50, 2.62, 2.74, 3.18, 3.32, 3.52, 3.62, 3.74, 3.82, 3.90, 3.98, 4.08, 4.20, 4.34, 4.56, 4.74, 5.10], "tokens":["对", "我", "做", "了", "介", "绍", "啊", "那", "么", "我", "想", "说", "的", "是", "呢", "大", "家", "如", "果", "对", "我", "的", "研", "究", "感", "兴", "趣", "呢", "你"]} +---- +./sherpa-onnx-paraformer-zh-2023-09-14/test_wavs/1.wav +{"text": "重点呢想谈三个问题首先呢就是这一轮全球金融动荡的表现", "timestamps": [0.16, 0.30, 0.42, 0.56, 0.72, 0.96, 1.08, 1.20, 1.30, 2.08, 2.26, 2.44, 2.58, 2.72, 2.98, 3.14, 3.26, 3.46, 3.62, 3.80, 3.88, 4.02, 4.12, 4.20, 4.36, 4.56], "tokens":["重", "点", "呢", "想", "谈", "三", "个", "问", "题", "首", "先", "呢", "就", "是", "这", "一", "轮", "全", "球", "金", "融", "动", "荡", "的", "表", "现"]} +---- +./sherpa-onnx-paraformer-zh-2023-09-14/test_wavs/2.wav +{"text": "深入的分析这一次全球金融动荡背后的根源", "timestamps": [0.34, 0.54, 0.66, 0.80, 1.08, 1.52, 1.72, 1.90, 2.40, 2.68, 2.86, 2.96, 3.16, 3.26, 3.46, 3.54, 3.66, 3.80, 3.90], "tokens":["深", "入", "的", "分", "析", "这", "一", "次", "全", "球", "金", "融", "动", "荡", "背", "后", "的", "根", "源"]} +---- +./sherpa-onnx-paraformer-zh-2023-09-14/test_wavs/3-sichuan.wav +{"text": "自己就是在那个在那个就是在情节里面就是感觉是演的特别好就是好像很真实一样你知道吧", "timestamps": [0.16, 0.30, 0.56, 0.72, 0.92, 1.18, 1.32, 1.88, 2.24, 2.40, 3.16, 3.28, 3.40, 3.54, 3.76, 3.88, 4.06, 4.24, 4.36, 4.56, 4.66, 4.88, 5.14, 5.30, 5.44, 5.60, 5.72, 5.84, 5.96, 6.14, 6.24, 6.38, 6.56, 6.78, 6.98, 7.08, 7.22, 7.38, 7.50, 7.62], "tokens":["自", "己", "就", "是", "在", "那", "个", "在", "那", "个", "就", "是", "在", "情", "节", "里", "面", "就", "是", "感", "觉", "是", "演", "的", "特", "别", "好", "就", "是", "好", "像", "很", "真", "实", "一", "样", "你", "知", "道", "吧"]} +---- +./sherpa-onnx-paraformer-zh-2023-09-14/test_wavs/4-tianjin.wav +{"text": "其实他就是那每个人都可以守法就这意思法律意识太单薄了而且就是嗯也不顾及到别人的感受", "timestamps": [0.08, 0.24, 0.36, 0.56, 0.66, 0.78, 1.04, 1.14, 1.26, 1.38, 1.50, 1.58, 1.70, 1.84, 2.28, 2.38, 2.64, 2.74, 3.08, 3.28, 3.66, 3.80, 3.94, 4.14, 4.34, 4.64, 4.84, 4.94, 5.12, 5.24, 5.84, 6.10, 6.24, 6.44, 6.54, 6.66, 6.86, 7.02, 7.14, 7.24, 7.44], "tokens":["其", "实", "他", "就", "是", "那", "每", "个", "人", "都", "可", "以", "守", "法", "就", "这", "意", "思", "法", "律", "意", "识", "太", "单", "薄", "了", "而", "且", "就", "是", "嗯", "也", "不", "顾", "及", "到", "别", "人", "的", "感", "受"]} +---- +./sherpa-onnx-paraformer-zh-2023-09-14/test_wavs/5-henan.wav +{"text": "他这个管一向都通到有时候都通到七八层楼高然后他这管一向就可以浇到那个那柱子上", "timestamps": [0.08, 0.20, 0.30, 0.42, 0.94, 1.14, 1.26, 1.46, 1.66, 2.28, 2.50, 2.62, 2.70, 2.82, 2.98, 3.14, 3.28, 3.52, 3.70, 3.86, 4.94, 5.06, 5.18, 5.30, 5.42, 5.66, 5.76, 5.94, 6.08, 6.24, 6.38, 6.60, 6.78, 6.96, 7.10, 7.30, 7.50, 7.62], "tokens":["他", "这", "个", "管", "一", "向", "都", "通", "到", "有", "时", "候", "都", "通", "到", "七", "八", "层", "楼", "高", "然", "后", "他", "这", "管", "一", "向", "就", "可", "以", "浇", "到", "那", "个", "那", "柱", "子", "上"]} +---- +./sherpa-onnx-paraformer-zh-2023-09-14/test_wavs/6-zh-en.wav +{"text": " yesterday was 星期一 today is tuesday 明天是星期三", "timestamps": [0.36, 0.60, 0.84, 1.22, 2.24, 2.44, 2.74, 3.52, 4.06, 4.68, 5.00, 5.12, 5.76, 5.96, 6.24, 6.82, 7.02, 7.26], "tokens":["ye@@", "ster@@", "day", "was", "星", "期", "一", "today", "is", "tu@@", "es@@", "day", "明", "天", "是", "星", "期", "三"]} +---- +./sherpa-onnx-paraformer-zh-2023-09-14/test_wavs/8k.wav +{"text": "甚至出现交易几乎停滞的情况", "timestamps": [0.48, 0.78, 1.04, 1.18, 1.52, 1.78, 2.06, 2.18, 2.50, 2.66, 2.88, 3.10, 3.30], "tokens":["甚", "至", "出", "现", "交", "易", "几", 
"乎", "停", "滞", "的", "情", "况"]} +---- +num threads: 2 +decoding method: greedy_search +Elapsed seconds: 9.206 s +Real time factor (RTF): 9.206 / 51.236 = 0.180 diff --git a/docs/source/onnx/pretrained_models/offline-paraformer/code-paraformer/sherpa-onnx-paraformer-zh-2024-03-09-int8.txt b/docs/source/onnx/pretrained_models/offline-paraformer/code-paraformer/sherpa-onnx-paraformer-zh-2024-03-09-int8.txt new file mode 100644 index 000000000..5ead0e44d --- /dev/null +++ b/docs/source/onnx/pretrained_models/offline-paraformer/code-paraformer/sherpa-onnx-paraformer-zh-2024-03-09-int8.txt @@ -0,0 +1,36 @@ +/project/sherpa-onnx/csrc/parse-options.cc:Read:361 sherpa-onnx-offline --tokens=./sherpa-onnx-paraformer-zh-2024-03-09/tokens.txt --paraformer=./sherpa-onnx-paraformer-zh-2024-03-09/model.int8.onnx ./sherpa-onnx-paraformer-zh-2024-03-09/test_wavs/0.wav ./sherpa-onnx-paraformer-zh-2024-03-09/test_wavs/1.wav ./sherpa-onnx-paraformer-zh-2024-03-09/test_wavs/8k.wav ./sherpa-onnx-paraformer-zh-2024-03-09/test_wavs/2-zh-en.wav ./sherpa-onnx-paraformer-zh-2024-03-09/test_wavs/3-sichuan.wav ./sherpa-onnx-paraformer-zh-2024-03-09/test_wavs/4-tianjin.wav ./sherpa-onnx-paraformer-zh-2024-03-09/test_wavs/5-henan.wav + +OfflineRecognizerConfig(feat_config=OfflineFeatureExtractorConfig(sampling_rate=16000, feature_dim=80), model_config=OfflineModelConfig(transducer=OfflineTransducerModelConfig(encoder_filename="", decoder_filename="", joiner_filename=""), paraformer=OfflineParaformerModelConfig(model="./sherpa-onnx-paraformer-zh-2024-03-09/model.int8.onnx"), nemo_ctc=OfflineNemoEncDecCtcModelConfig(model=""), whisper=OfflineWhisperModelConfig(encoder="", decoder="", language="", task="transcribe", tail_paddings=-1), tdnn=OfflineTdnnModelConfig(model=""), zipformer_ctc=OfflineZipformerCtcModelConfig(model=""), wenet_ctc=OfflineWenetCtcModelConfig(model=""), tokens="./sherpa-onnx-paraformer-zh-2024-03-09/tokens.txt", num_threads=2, debug=False, provider="cpu", model_type=""), lm_config=OfflineLMConfig(model="", scale=0.5), ctc_fst_decoder_config=OfflineCtcFstDecoderConfig(graph="", max_active=3000), decoding_method="greedy_search", max_active_paths=4, hotwords_file="", hotwords_score=1.5, blank_penalty=0) +Creating recognizer ... +Started +/project/sherpa-onnx/csrc/offline-stream.cc:AcceptWaveformImpl:119 Creating a resampler: + in_sample_rate: 8000 + output_sample_rate: 16000 + +Done! 
+ +./sherpa-onnx-paraformer-zh-2024-03-09/test_wavs/0.wav +{"text": "对我做了介绍啊那么我想说的是呢大家如果对我的研究感兴趣呢你", "timestamps": [], "tokens":["对", "我", "做", "了", "介", "绍", "啊", "那", "么", "我", "想", "说", "的", "是", "呢", "大", "家", "如", "果", "对", "我", "的", "研", "究", "感", "兴", "趣", "呢", "你"]} +---- +./sherpa-onnx-paraformer-zh-2024-03-09/test_wavs/1.wav +{"text": "重点呢想谈三个问题首先呢就是这一轮全球金融动荡的表现", "timestamps": [], "tokens":["重", "点", "呢", "想", "谈", "三", "个", "问", "题", "首", "先", "呢", "就", "是", "这", "一", "轮", "全", "球", "金", "融", "动", "荡", "的", "表", "现"]} +---- +./sherpa-onnx-paraformer-zh-2024-03-09/test_wavs/8k.wav +{"text": "深入的分析这一次全球金融动荡背后的根源", "timestamps": [], "tokens":["深", "入", "的", "分", "析", "这", "一", "次", "全", "球", "金", "融", "动", "荡", "背", "后", "的", "根", "源"]} +---- +./sherpa-onnx-paraformer-zh-2024-03-09/test_wavs/2-zh-en.wav +{"text": " yesterday was 星期一 today is tuesday 明天是星期三", "timestamps": [], "tokens":["ye@@", "ster@@", "day", "was", "星", "期", "一", "today", "is", "tu@@", "es@@", "day", "明", "天", "是", "星", "期", "三"]} +---- +./sherpa-onnx-paraformer-zh-2024-03-09/test_wavs/3-sichuan.wav +{"text": "自己就是在那个在那个就是在情节里面就是感觉是演的特别好就是好像很真实一样你知道吧", "timestamps": [], "tokens":["自", "己", "就", "是", "在", "那", "个", "在", "那", "个", "就", "是", "在", "情", "节", "里", "面", "就", "是", "感", "觉", "是", "演", "的", "特", "别", "好", "就", "是", "好", "像", "很", "真", "实", "一", "样", "你", "知", "道", "吧"]} +---- +./sherpa-onnx-paraformer-zh-2024-03-09/test_wavs/4-tianjin.wav +{"text": "其实他就是怕每个人都可以守法就这意思法律意识太单薄了而且就是嗯也不顾及到别人的感受", "timestamps": [], "tokens":["其", "实", "他", "就", "是", "怕", "每", "个", "人", "都", "可", "以", "守", "法", "就", "这", "意", "思", "法", "律", "意", "识", "太", "单", "薄", "了", "而", "且", "就", "是", "嗯", "也", "不", "顾", "及", "到", "别", "人", "的", "感", "受"]} +---- +./sherpa-onnx-paraformer-zh-2024-03-09/test_wavs/5-henan.wav +{"text": "他这个管一向都通到有时候都通到七八层楼高然后他的管一向就可以交到那个那柱子上", "timestamps": [], "tokens":["他", "这", "个", "管", "一", "向", "都", "通", "到", "有", "时", "候", "都", "通", "到", "七", "八", "层", "楼", "高", "然", "后", "他", "的", "管", "一", "向", "就", "可", "以", "交", "到", "那", "个", "那", "柱", "子", "上"]} +---- +num threads: 2 +decoding method: greedy_search +Elapsed seconds: 7.133 s +Real time factor (RTF): 7.133 / 47.023 = 0.152 diff --git a/docs/source/onnx/pretrained_models/offline-paraformer/code-paraformer/sherpa-onnx-paraformer-zh-2024-03-09.txt b/docs/source/onnx/pretrained_models/offline-paraformer/code-paraformer/sherpa-onnx-paraformer-zh-2024-03-09.txt new file mode 100644 index 000000000..2ddcfb3f7 --- /dev/null +++ b/docs/source/onnx/pretrained_models/offline-paraformer/code-paraformer/sherpa-onnx-paraformer-zh-2024-03-09.txt @@ -0,0 +1,36 @@ +/project/sherpa-onnx/csrc/parse-options.cc:Read:361 sherpa-onnx-offline --tokens=./sherpa-onnx-paraformer-zh-2024-03-09/tokens.txt --paraformer=./sherpa-onnx-paraformer-zh-2024-03-09/model.onnx ./sherpa-onnx-paraformer-zh-2024-03-09/test_wavs/0.wav ./sherpa-onnx-paraformer-zh-2024-03-09/test_wavs/1.wav ./sherpa-onnx-paraformer-zh-2024-03-09/test_wavs/8k.wav ./sherpa-onnx-paraformer-zh-2024-03-09/test_wavs/2-zh-en.wav ./sherpa-onnx-paraformer-zh-2024-03-09/test_wavs/3-sichuan.wav ./sherpa-onnx-paraformer-zh-2024-03-09/test_wavs/4-tianjin.wav ./sherpa-onnx-paraformer-zh-2024-03-09/test_wavs/5-henan.wav + +OfflineRecognizerConfig(feat_config=OfflineFeatureExtractorConfig(sampling_rate=16000, feature_dim=80), model_config=OfflineModelConfig(transducer=OfflineTransducerModelConfig(encoder_filename="", decoder_filename="", joiner_filename=""), 
paraformer=OfflineParaformerModelConfig(model="./sherpa-onnx-paraformer-zh-2024-03-09/model.onnx"), nemo_ctc=OfflineNemoEncDecCtcModelConfig(model=""), whisper=OfflineWhisperModelConfig(encoder="", decoder="", language="", task="transcribe", tail_paddings=-1), tdnn=OfflineTdnnModelConfig(model=""), zipformer_ctc=OfflineZipformerCtcModelConfig(model=""), wenet_ctc=OfflineWenetCtcModelConfig(model=""), tokens="./sherpa-onnx-paraformer-zh-2024-03-09/tokens.txt", num_threads=2, debug=False, provider="cpu", model_type=""), lm_config=OfflineLMConfig(model="", scale=0.5), ctc_fst_decoder_config=OfflineCtcFstDecoderConfig(graph="", max_active=3000), decoding_method="greedy_search", max_active_paths=4, hotwords_file="", hotwords_score=1.5, blank_penalty=0) +Creating recognizer ... +Started +/project/sherpa-onnx/csrc/offline-stream.cc:AcceptWaveformImpl:119 Creating a resampler: + in_sample_rate: 8000 + output_sample_rate: 16000 + +Done! + +./sherpa-onnx-paraformer-zh-2024-03-09/test_wavs/0.wav +{"text": "对我做了介绍啊那么我想说的是呢大家如果对我的研究感兴趣呢你", "timestamps": [], "tokens":["对", "我", "做", "了", "介", "绍", "啊", "那", "么", "我", "想", "说", "的", "是", "呢", "大", "家", "如", "果", "对", "我", "的", "研", "究", "感", "兴", "趣", "呢", "你"]} +---- +./sherpa-onnx-paraformer-zh-2024-03-09/test_wavs/1.wav +{"text": "重点呢想谈三个问题首先呢就是这一轮全球金融动荡的表现", "timestamps": [], "tokens":["重", "点", "呢", "想", "谈", "三", "个", "问", "题", "首", "先", "呢", "就", "是", "这", "一", "轮", "全", "球", "金", "融", "动", "荡", "的", "表", "现"]} +---- +./sherpa-onnx-paraformer-zh-2024-03-09/test_wavs/8k.wav +{"text": "深入的分析这一次全球金融动荡背后的根源", "timestamps": [], "tokens":["深", "入", "的", "分", "析", "这", "一", "次", "全", "球", "金", "融", "动", "荡", "背", "后", "的", "根", "源"]} +---- +./sherpa-onnx-paraformer-zh-2024-03-09/test_wavs/2-zh-en.wav +{"text": " yesterday was 星期一 today is tuesday 明天是星期三", "timestamps": [], "tokens":["ye@@", "ster@@", "day", "was", "星", "期", "一", "today", "is", "tu@@", "es@@", "day", "明", "天", "是", "星", "期", "三"]} +---- +./sherpa-onnx-paraformer-zh-2024-03-09/test_wavs/3-sichuan.wav +{"text": "自己就是在那个在那个就是在情节里面就是感觉是演的特别好就是好像很真实一样你知道吧", "timestamps": [], "tokens":["自", "己", "就", "是", "在", "那", "个", "在", "那", "个", "就", "是", "在", "情", "节", "里", "面", "就", "是", "感", "觉", "是", "演", "的", "特", "别", "好", "就", "是", "好", "像", "很", "真", "实", "一", "样", "你", "知", "道", "吧"]} +---- +./sherpa-onnx-paraformer-zh-2024-03-09/test_wavs/4-tianjin.wav +{"text": "其实他就是怕每个人都可以守法就这意思法律意识太单薄了而且就是嗯也不顾及到别人的感受", "timestamps": [], "tokens":["其", "实", "他", "就", "是", "怕", "每", "个", "人", "都", "可", "以", "守", "法", "就", "这", "意", "思", "法", "律", "意", "识", "太", "单", "薄", "了", "而", "且", "就", "是", "嗯", "也", "不", "顾", "及", "到", "别", "人", "的", "感", "受"]} +---- +./sherpa-onnx-paraformer-zh-2024-03-09/test_wavs/5-henan.wav +{"text": "他这个管一向都通到有时候都通到七八层楼高然后他的管一向就可以交到那个那柱子上", "timestamps": [], "tokens":["他", "这", "个", "管", "一", "向", "都", "通", "到", "有", "时", "候", "都", "通", "到", "七", "八", "层", "楼", "高", "然", "后", "他", "的", "管", "一", "向", "就", "可", "以", "交", "到", "那", "个", "那", "柱", "子", "上"]} +---- +num threads: 2 +decoding method: greedy_search +Elapsed seconds: 6.829 s +Real time factor (RTF): 6.829 / 47.023 = 0.145 diff --git a/docs/source/onnx/pretrained_models/offline-paraformer/code-paraformer/sherpa-onnx-paraformer-zh-small-2024-03-09-int8.txt b/docs/source/onnx/pretrained_models/offline-paraformer/code-paraformer/sherpa-onnx-paraformer-zh-small-2024-03-09-int8.txt new file mode 100644 index 000000000..e2ea10c8e --- /dev/null +++ 
b/docs/source/onnx/pretrained_models/offline-paraformer/code-paraformer/sherpa-onnx-paraformer-zh-small-2024-03-09-int8.txt @@ -0,0 +1,36 @@ +/project/sherpa-onnx/csrc/parse-options.cc:Read:361 sherpa-onnx-offline --tokens=./sherpa-onnx-paraformer-zh-small-2024-03-09/tokens.txt --paraformer=./sherpa-onnx-paraformer-zh-small-2024-03-09/model.int8.onnx ./sherpa-onnx-paraformer-zh-small-2024-03-09/test_wavs/0.wav ./sherpa-onnx-paraformer-zh-small-2024-03-09/test_wavs/1.wav ./sherpa-onnx-paraformer-zh-small-2024-03-09/test_wavs/8k.wav ./sherpa-onnx-paraformer-zh-small-2024-03-09/test_wavs/2-zh-en.wav ./sherpa-onnx-paraformer-zh-small-2024-03-09/test_wavs/3-sichuan.wav ./sherpa-onnx-paraformer-zh-small-2024-03-09/test_wavs/4-tianjin.wav ./sherpa-onnx-paraformer-zh-small-2024-03-09/test_wavs/5-henan.wav + +OfflineRecognizerConfig(feat_config=OfflineFeatureExtractorConfig(sampling_rate=16000, feature_dim=80), model_config=OfflineModelConfig(transducer=OfflineTransducerModelConfig(encoder_filename="", decoder_filename="", joiner_filename=""), paraformer=OfflineParaformerModelConfig(model="./sherpa-onnx-paraformer-zh-small-2024-03-09/model.int8.onnx"), nemo_ctc=OfflineNemoEncDecCtcModelConfig(model=""), whisper=OfflineWhisperModelConfig(encoder="", decoder="", language="", task="transcribe", tail_paddings=-1), tdnn=OfflineTdnnModelConfig(model=""), zipformer_ctc=OfflineZipformerCtcModelConfig(model=""), wenet_ctc=OfflineWenetCtcModelConfig(model=""), tokens="./sherpa-onnx-paraformer-zh-small-2024-03-09/tokens.txt", num_threads=2, debug=False, provider="cpu", model_type=""), lm_config=OfflineLMConfig(model="", scale=0.5), ctc_fst_decoder_config=OfflineCtcFstDecoderConfig(graph="", max_active=3000), decoding_method="greedy_search", max_active_paths=4, hotwords_file="", hotwords_score=1.5, blank_penalty=0) +Creating recognizer ... +Started +/project/sherpa-onnx/csrc/offline-stream.cc:AcceptWaveformImpl:119 Creating a resampler: + in_sample_rate: 8000 + output_sample_rate: 16000 + +Done! 
+ +./sherpa-onnx-paraformer-zh-small-2024-03-09/test_wavs/0.wav +{"text": "对我做了介绍啊那么我想说的是呢大家如果对我的研究感兴趣呢", "timestamps": [], "tokens":["对", "我", "做", "了", "介", "绍", "啊", "那", "么", "我", "想", "说", "的", "是", "呢", "大", "家", "如", "果", "对", "我", "的", "研", "究", "感", "兴", "趣", "呢"]} +---- +./sherpa-onnx-paraformer-zh-small-2024-03-09/test_wavs/1.wav +{"text": "重点呢想谈三个问题首先呢就是这一轮全球金融动荡的表现", "timestamps": [], "tokens":["重", "点", "呢", "想", "谈", "三", "个", "问", "题", "首", "先", "呢", "就", "是", "这", "一", "轮", "全", "球", "金", "融", "动", "荡", "的", "表", "现"]} +---- +./sherpa-onnx-paraformer-zh-small-2024-03-09/test_wavs/8k.wav +{"text": "深入的分析这一次全球金融动荡背后的根源", "timestamps": [], "tokens":["深", "入", "的", "分", "析", "这", "一", "次", "全", "球", "金", "融", "动", "荡", "背", "后", "的", "根", "源"]} +---- +./sherpa-onnx-paraformer-zh-small-2024-03-09/test_wavs/2-zh-en.wav +{"text": " yesterday was 星期一 today is tuesday 明天是星期三", "timestamps": [], "tokens":["ye@@", "ster@@", "day", "was", "星", "期", "一", "today", "is", "tu@@", "es@@", "day", "明", "天", "是", "星", "期", "三"]} +---- +./sherpa-onnx-paraformer-zh-small-2024-03-09/test_wavs/3-sichuan.wav +{"text": "自己就是在那个在那个就是在情节里面就是感觉是演的特别好就是好像很真实一样你知道吧", "timestamps": [], "tokens":["自", "己", "就", "是", "在", "那", "个", "在", "那", "个", "就", "是", "在", "情", "节", "里", "面", "就", "是", "感", "觉", "是", "演", "的", "特", "别", "好", "就", "是", "好", "像", "很", "真", "实", "一", "样", "你", "知", "道", "吧"]} +---- +./sherpa-onnx-paraformer-zh-small-2024-03-09/test_wavs/4-tianjin.wav +{"text": "其实他就是怕每个人都可以守法就这意思法律意识太单薄了而且就是嗯也不顾及到别人的感受", "timestamps": [], "tokens":["其", "实", "他", "就", "是", "怕", "每", "个", "人", "都", "可", "以", "守", "法", "就", "这", "意", "思", "法", "律", "意", "识", "太", "单", "薄", "了", "而", "且", "就", "是", "嗯", "也", "不", "顾", "及", "到", "别", "人", "的", "感", "受"]} +---- +./sherpa-onnx-paraformer-zh-small-2024-03-09/test_wavs/5-henan.wav +{"text": "他这个管一向都通到有时候都通到七八层楼缸然后他这管一向就可以浇到那个那柱子上", "timestamps": [], "tokens":["他", "这", "个", "管", "一", "向", "都", "通", "到", "有", "时", "候", "都", "通", "到", "七", "八", "层", "楼", "缸", "然", "后", "他", "这", "管", "一", "向", "就", "可", "以", "浇", "到", "那", "个", "那", "柱", "子", "上"]} +---- +num threads: 2 +decoding method: greedy_search +Elapsed seconds: 3.562 s +Real time factor (RTF): 3.562 / 47.023 = 0.076 diff --git a/docs/source/onnx/pretrained_models/offline-paraformer/index.rst b/docs/source/onnx/pretrained_models/offline-paraformer/index.rst new file mode 100644 index 000000000..70429eaa5 --- /dev/null +++ b/docs/source/onnx/pretrained_models/offline-paraformer/index.rst @@ -0,0 +1,10 @@ +Offline paraformer models +========================= + +This section lists available offline paraformer models. + +.. toctree:: + :maxdepth: 5 + + paraformer-models + diff --git a/docs/source/onnx/pretrained_models/offline-paraformer/paraformer-models.rst b/docs/source/onnx/pretrained_models/offline-paraformer/paraformer-models.rst new file mode 100644 index 000000000..200f3224b --- /dev/null +++ b/docs/source/onnx/pretrained_models/offline-paraformer/paraformer-models.rst @@ -0,0 +1,708 @@ +Paraformer models +================= + +.. hint:: + + Please refer to :ref:`install_sherpa_onnx` to install `sherpa-onnx`_ + before you read this section. + +.. _sherpa_onnx_offline_paraformer_trilingual_zh_cantonese_en: + +csukuangfj/sherpa-onnx-paraformer-trilingual-zh-cantonese-en (Chinese + English + Cantonese 粤语) +------------------------------------------------------------------------------------------------- + +.. note:: + + This model does not support timestamps. 
It is a trilingual model, supporting + Chinese, Cantonese, and English. (支持普通话、``粤语``、河南话、天津话、四川话等方言) + +This model is converted from + +``_ + +In the following, we describe how to download it and use it with `sherpa-onnx`_. + +Download the model +~~~~~~~~~~~~~~~~~~ + +Please use the following commands to download it. + +.. code-block:: bash + + cd /path/to/sherpa-onnx + wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-paraformer-trilingual-zh-cantonese-en.tar.bz2 + + tar xvf sherpa-onnx-paraformer-trilingual-zh-cantonese-en.tar.bz2 + +Please check that the file sizes of the pre-trained models are correct. See +the file sizes of ``*.onnx`` files below. + +.. code-block:: bash + + sherpa-onnx-paraformer-trilingual-zh-cantonese-en$ ls -lh *.onnx + + -rw-r--r-- 1 1001 127 234M Mar 10 02:12 model.int8.onnx + -rw-r--r-- 1 1001 127 831M Mar 10 02:12 model.onnx + +Decode wave files +~~~~~~~~~~~~~~~~~ + +.. hint:: + + It supports decoding only wave files of a single channel with 16-bit + encoded samples, while the sampling rate does not need to be 16 kHz. + +fp32 +^^^^ + +The following code shows how to use ``fp32`` models to decode wave files: + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + ./build/bin/sherpa-onnx-offline \ + --tokens=./sherpa-onnx-paraformer-trilingual-zh-cantonese-en/tokens.txt \ + --paraformer=./sherpa-onnx-paraformer-trilingual-zh-cantonese-en/model.onnx \ + ./sherpa-onnx-paraformer-trilingual-zh-cantonese-en/test_wavs/1.wav \ + ./sherpa-onnx-paraformer-trilingual-zh-cantonese-en/test_wavs/2.wav \ + ./sherpa-onnx-paraformer-trilingual-zh-cantonese-en/test_wavs/3-sichuan.wav \ + ./sherpa-onnx-paraformer-trilingual-zh-cantonese-en/test_wavs/4-tianjin.wav \ + ./sherpa-onnx-paraformer-trilingual-zh-cantonese-en/test_wavs/5-henan.wav \ + ./sherpa-onnx-paraformer-trilingual-zh-cantonese-en/test_wavs/6-zh-en.wav + +.. note:: + + Please use ``./build/bin/Release/sherpa-onnx-offline.exe`` for Windows. + +.. caution:: + + If you use Windows and get encoding issues, please run: + + .. code-block:: bash + + CHCP 65001 + + in your commandline. + +You should see the following output: + +.. literalinclude:: ./code-paraformer/sherpa-onnx-paraformer-trilingual-zh-cantonese-en.txt + +int8 +^^^^ + +The following code shows how to use ``int8`` models to decode wave files: + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + ./build/bin/sherpa-onnx-offline \ + --tokens=./sherpa-onnx-paraformer-trilingual-zh-cantonese-en/tokens.txt \ + --paraformer=./sherpa-onnx-paraformer-trilingual-zh-cantonese-en/model.int8.onnx \ + ./sherpa-onnx-paraformer-trilingual-zh-cantonese-en/test_wavs/1.wav \ + ./sherpa-onnx-paraformer-trilingual-zh-cantonese-en/test_wavs/2.wav \ + ./sherpa-onnx-paraformer-trilingual-zh-cantonese-en/test_wavs/3-sichuan.wav \ + ./sherpa-onnx-paraformer-trilingual-zh-cantonese-en/test_wavs/4-tianjin.wav \ + ./sherpa-onnx-paraformer-trilingual-zh-cantonese-en/test_wavs/5-henan.wav \ + ./sherpa-onnx-paraformer-trilingual-zh-cantonese-en/test_wavs/6-zh-en.wav + +.. note:: + + Please use ``./build/bin/Release/sherpa-onnx-offline.exe`` for Windows. + +.. caution:: + + If you use Windows and get encoding issues, please run: + + .. code-block:: bash + + CHCP 65001 + + in your commandline. + +You should see the following output: + +.. literalinclude:: ./code-paraformer/sherpa-onnx-paraformer-trilingual-zh-cantonese-en-int8.txt + +Speech recognition from a microphone +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +..
code-block:: bash + + cd /path/to/sherpa-onnx + + ./build/bin/sherpa-onnx-microphone-offline \ + --tokens=./sherpa-onnx-paraformer-trilingual-zh-cantonese-en/tokens.txt \ + --paraformer=./sherpa-onnx-paraformer-trilingual-zh-cantonese-en/model.int8.onnx + +.. _sherpa_onnx_offline_paraformer_en_2024_03_09_english: + +csukuangfj/sherpa-onnx-paraformer-en-2024-03-09 (English) +--------------------------------------------------------- + +.. note:: + + This model does not support timestamps. It supports only English. + +This model is converted from + +``_ + +In the following, we describe how to download it and use it with `sherpa-onnx`_. + +Download the model +~~~~~~~~~~~~~~~~~~ + +Please use the following commands to download it. + +.. code-block:: bash + + cd /path/to/sherpa-onnx + wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-paraformer-en-2024-03-09.tar.bz2 + + tar xvf sherpa-onnx-paraformer-en-2024-03-09.tar.bz2 + +Please check that the file sizes of the pre-trained models are correct. See +the file sizes of ``*.onnx`` files below. + +.. code-block:: bash + + sherpa-onnx-paraformer-en-2024-03-09$ ls -lh *.onnx + + -rw-r--r-- 1 1001 127 220M Mar 10 02:12 model.int8.onnx + -rw-r--r-- 1 1001 127 817M Mar 10 02:12 model.onnx + +Decode wave files +~~~~~~~~~~~~~~~~~ + +.. hint:: + + It supports decoding only wave files of a single channel with 16-bit + encoded samples, while the sampling rate does not need to be 16 kHz. + +fp32 +^^^^ + +The following code shows how to use ``fp32`` models to decode wave files: + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + ./build/bin/sherpa-onnx-offline \ + --tokens=./sherpa-onnx-paraformer-en-2024-03-09/tokens.txt \ + --paraformer=./sherpa-onnx-paraformer-en-2024-03-09/model.onnx \ + ./sherpa-onnx-paraformer-en-2024-03-09/test_wavs/0.wav \ + ./sherpa-onnx-paraformer-en-2024-03-09/test_wavs/1.wav \ + ./sherpa-onnx-paraformer-en-2024-03-09/test_wavs/8k.wav + +.. note:: + + Please use ``./build/bin/Release/sherpa-onnx-offline.exe`` for Windows. + +You should see the following output: + +.. literalinclude:: ./code-paraformer/sherpa-onnx-paraformer-en-2024-03-09.txt + +int8 +^^^^ + +The following code shows how to use ``int8`` models to decode wave files: + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + ./build/bin/sherpa-onnx-offline \ + --tokens=./sherpa-onnx-paraformer-en-2024-03-09/tokens.txt \ + --paraformer=./sherpa-onnx-paraformer-en-2024-03-09/model.int8.onnx \ + ./sherpa-onnx-paraformer-en-2024-03-09/test_wavs/0.wav \ + ./sherpa-onnx-paraformer-en-2024-03-09/test_wavs/1.wav \ + ./sherpa-onnx-paraformer-en-2024-03-09/test_wavs/8k.wav + +.. note:: + + Please use ``./build/bin/Release/sherpa-onnx-offline.exe`` for Windows. + +You should see the following output: + +.. literalinclude:: ./code-paraformer/sherpa-onnx-paraformer-en-2024-03-09-int8.txt + +Speech recognition from a microphone +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + ./build/bin/sherpa-onnx-microphone-offline \ + --tokens=./sherpa-onnx-paraformer-en-2024-03-09/tokens.txt \ + --paraformer=./sherpa-onnx-paraformer-en-2024-03-09/model.int8.onnx + +.. _sherpa_onnx_offline_paraformer_zh_small_2024_03_09_chinese_english: + +csukuangfj/sherpa-onnx-paraformer-zh-small-2024-03-09 (Chinese + English) +------------------------------------------------------------------------- + +.. note:: + + This model does not support timestamps. It is a bilingual model, supporting + both Chinese and English. 
(支持普通话、河南话、天津话、四川话等方言) + +This model is converted from + +``_ + +In the following, we describe how to download it and use it with `sherpa-onnx`_. + +Download the model +~~~~~~~~~~~~~~~~~~ + +Please use the following commands to download it. + +.. code-block:: bash + + cd /path/to/sherpa-onnx + wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-paraformer-zh-small-2024-03-09.tar.bz2 + + tar xvf sherpa-onnx-paraformer-zh-small-2024-03-09.tar.bz2 + +Please check that the file sizes of the pre-trained models are correct. See +the file sizes of ``*.onnx`` files below. + +.. code-block:: bash + + sherpa-onnx-paraformer-zh-small-2024-03-09$ ls -lh *.onnx + + -rw-r--r-- 1 1001 127 79M Mar 10 00:48 model.int8.onnx + +Decode wave files +~~~~~~~~~~~~~~~~~ + +.. hint:: + + It supports decoding only wave files of a single channel with 16-bit + encoded samples, while the sampling rate does not need to be 16 kHz. + +int8 +^^^^ + +The following code shows how to use ``int8`` models to decode wave files: + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + ./build/bin/sherpa-onnx-offline \ + --tokens=./sherpa-onnx-paraformer-zh-small-2024-03-09/tokens.txt \ + --paraformer=./sherpa-onnx-paraformer-zh-small-2024-03-09/model.int8.onnx \ + ./sherpa-onnx-paraformer-zh-small-2024-03-09/test_wavs/0.wav \ + ./sherpa-onnx-paraformer-zh-small-2024-03-09/test_wavs/1.wav \ + ./sherpa-onnx-paraformer-zh-small-2024-03-09/test_wavs/8k.wav \ + ./sherpa-onnx-paraformer-zh-small-2024-03-09/test_wavs/2-zh-en.wav \ + ./sherpa-onnx-paraformer-zh-small-2024-03-09/test_wavs/3-sichuan.wav \ + ./sherpa-onnx-paraformer-zh-small-2024-03-09/test_wavs/4-tianjin.wav \ + ./sherpa-onnx-paraformer-zh-small-2024-03-09/test_wavs/5-henan.wav + +.. note:: + + Please use ``./build/bin/Release/sherpa-onnx-offline.exe`` for Windows. + +.. caution:: + + If you use Windows and get encoding issues, please run: + + .. code-block:: bash + + CHCP 65001 + + in your commandline. + +You should see the following output: + +.. literalinclude:: ./code-paraformer/sherpa-onnx-paraformer-zh-small-2024-03-09-int8.txt + +Speech recognition from a microphone +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + ./build/bin/sherpa-onnx-microphone-offline \ + --tokens=./sherpa-onnx-paraformer-zh-small-2024-03-09/tokens.txt \ + --paraformer=./sherpa-onnx-paraformer-zh-small-2024-03-09/model.int8.onnx + +.. _sherpa_onnx_offline_paraformer_zh_2024_03_09_chinese_english: + +csukuangfj/sherpa-onnx-paraformer-zh-2024-03-09 (Chinese + English) +------------------------------------------------------------------------- + +.. note:: + + This model does not support timestamps. It is a bilingual model, supporting + both Chinese and English. (支持普通话、河南话、天津话、四川话等方言) + +This model is converted from + +``_ + +In the following, we describe how to download it and use it with `sherpa-onnx`_. + +Download the model +~~~~~~~~~~~~~~~~~~ + +Please use the following commands to download it. + +.. code-block:: bash + + cd /path/to/sherpa-onnx + wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-paraformer-zh-2024-03-09.tar.bz2 + + tar xvf sherpa-onnx-paraformer-zh-2024-03-09.tar.bz2 + +Please check that the file sizes of the pre-trained models are correct. See +the file sizes of ``*.onnx`` files below. + +.. 
code-block:: bash + + sherpa-onnx-paraformer-zh-2024-03-09$ ls -lh *.onnx + + -rw-r--r-- 1 1001 127 217M Mar 10 02:22 model.int8.onnx + -rw-r--r-- 1 1001 127 785M Mar 10 02:22 model.onnx + +Decode wave files +~~~~~~~~~~~~~~~~~ + +.. hint:: + + It supports decoding only wave files of a single channel with 16-bit + encoded samples, while the sampling rate does not need to be 16 kHz. + +fp32 +^^^^ + +The following code shows how to use ``fp32`` models to decode wave files: + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + ./build/bin/sherpa-onnx-offline \ + --tokens=./sherpa-onnx-paraformer-zh-2024-03-09/tokens.txt \ + --paraformer=./sherpa-onnx-paraformer-zh-2024-03-09/model.onnx \ + ./sherpa-onnx-paraformer-zh-2024-03-09/test_wavs/0.wav \ + ./sherpa-onnx-paraformer-zh-2024-03-09/test_wavs/1.wav \ + ./sherpa-onnx-paraformer-zh-2024-03-09/test_wavs/8k.wav \ + ./sherpa-onnx-paraformer-zh-2024-03-09/test_wavs/2-zh-en.wav \ + ./sherpa-onnx-paraformer-zh-2024-03-09/test_wavs/3-sichuan.wav \ + ./sherpa-onnx-paraformer-zh-2024-03-09/test_wavs/4-tianjin.wav \ + ./sherpa-onnx-paraformer-zh-2024-03-09/test_wavs/5-henan.wav + +.. note:: + + Please use ``./build/bin/Release/sherpa-onnx-offline.exe`` for Windows. + +.. caution:: + + If you use Windows and get encoding issues, please run: + + .. code-block:: bash + + CHCP 65001 + + in your commandline. + +You should see the following output: + +.. literalinclude:: ./code-paraformer/sherpa-onnx-paraformer-zh-2024-03-09.txt + +int8 +^^^^ + +The following code shows how to use ``int8`` models to decode wave files: + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + ./build/bin/sherpa-onnx-offline \ + --tokens=./sherpa-onnx-paraformer-zh-2024-03-09/tokens.txt \ + --paraformer=./sherpa-onnx-paraformer-zh-2024-03-09/model.int8.onnx \ + ./sherpa-onnx-paraformer-zh-2024-03-09/test_wavs/0.wav \ + ./sherpa-onnx-paraformer-zh-2024-03-09/test_wavs/1.wav \ + ./sherpa-onnx-paraformer-zh-2024-03-09/test_wavs/8k.wav \ + ./sherpa-onnx-paraformer-zh-2024-03-09/test_wavs/2-zh-en.wav \ + ./sherpa-onnx-paraformer-zh-2024-03-09/test_wavs/3-sichuan.wav \ + ./sherpa-onnx-paraformer-zh-2024-03-09/test_wavs/4-tianjin.wav \ + ./sherpa-onnx-paraformer-zh-2024-03-09/test_wavs/5-henan.wav + +.. note:: + + Please use ``./build/bin/Release/sherpa-onnx-offline.exe`` for Windows. + +.. caution:: + + If you use Windows and get encoding issues, please run: + + .. code-block:: bash + + CHCP 65001 + + in your commandline. + +You should see the following output: + +.. literalinclude:: ./code-paraformer/sherpa-onnx-paraformer-zh-2024-03-09-int8.txt + +Speech recognition from a microphone +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + ./build/bin/sherpa-onnx-microphone-offline \ + --tokens=./sherpa-onnx-paraformer-zh-2024-03-09/tokens.txt \ + --paraformer=./sherpa-onnx-paraformer-zh-2024-03-09/model.int8.onnx + + +.. _sherpa_onnx_offline_paraformer_zh_2023_03_28_chinese: + +csukuangfj/sherpa-onnx-paraformer-zh-2023-03-28 (Chinese + English) +------------------------------------------------------------------- + +.. note:: + + This model does not support timestamps. It is a bilingual model, supporting + both Chinese and English. (支持普通话、河南话、天津话、四川话等方言) + + +This model is converted from + +``_ + +The code for converting can be found at + +``_ + + +In the following, we describe how to download it and use it with `sherpa-onnx`_. + +Download the model +~~~~~~~~~~~~~~~~~~ + +Please use the following commands to download it. + +..
code-block:: bash + + cd /path/to/sherpa-onnx + wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-paraformer-zh-2023-03-28.tar.bz2 + + tar xvf sherpa-onnx-paraformer-zh-2023-03-28.tar.bz2 + +Please check that the file sizes of the pre-trained models are correct. See +the file sizes of ``*.onnx`` files below. + +.. code-block:: bash + + sherpa-onnx-paraformer-zh-2023-03-28$ ls -lh *.onnx + -rw-r--r-- 1 kuangfangjun root 214M Apr 1 07:28 model.int8.onnx + -rw-r--r-- 1 kuangfangjun root 824M Apr 1 07:28 model.onnx + +Decode wave files +~~~~~~~~~~~~~~~~~ + +.. hint:: + + It supports decoding only wave files of a single channel with 16-bit + encoded samples, while the sampling rate does not need to be 16 kHz. + +fp32 +^^^^ + +The following code shows how to use ``fp32`` models to decode wave files: + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + ./build/bin/sherpa-onnx-offline \ + --tokens=./sherpa-onnx-paraformer-zh-2023-03-28/tokens.txt \ + --paraformer=./sherpa-onnx-paraformer-zh-2023-03-28/model.onnx \ + ./sherpa-onnx-paraformer-zh-2023-03-28/test_wavs/0.wav \ + ./sherpa-onnx-paraformer-zh-2023-03-28/test_wavs/1.wav \ + ./sherpa-onnx-paraformer-zh-2023-03-28/test_wavs/2.wav \ + ./sherpa-onnx-paraformer-zh-2023-03-28/test_wavs/3-sichuan.wav \ + ./sherpa-onnx-paraformer-zh-2023-03-28/test_wavs/4-tianjin.wav \ + ./sherpa-onnx-paraformer-zh-2023-03-28/test_wavs/5-henan.wav \ + ./sherpa-onnx-paraformer-zh-2023-03-28/test_wavs/6-zh-en.wav \ + ./sherpa-onnx-paraformer-zh-2023-03-28/test_wavs/8k.wav + +.. note:: + + Please use ``./build/bin/Release/sherpa-onnx-offline.exe`` for Windows. + +.. caution:: + + If you use Windows and get encoding issues, please run: + + .. code-block:: bash + + CHCP 65001 + + in your commandline. + +You should see the following output: + +.. literalinclude:: ./code-paraformer/sherpa-onnx-paraformer-zh-2023-03-28.txt + +int8 +^^^^ + +The following code shows how to use ``int8`` models to decode wave files: + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + ./build/bin/sherpa-onnx-offline \ + --tokens=./sherpa-onnx-paraformer-zh-2023-03-28/tokens.txt \ + --paraformer=./sherpa-onnx-paraformer-zh-2023-03-28/model.int8.onnx \ + ./sherpa-onnx-paraformer-zh-2023-03-28/test_wavs/0.wav \ + ./sherpa-onnx-paraformer-zh-2023-03-28/test_wavs/1.wav \ + ./sherpa-onnx-paraformer-zh-2023-03-28/test_wavs/2.wav \ + ./sherpa-onnx-paraformer-zh-2023-03-28/test_wavs/3-sichuan.wav \ + ./sherpa-onnx-paraformer-zh-2023-03-28/test_wavs/4-tianjin.wav \ + ./sherpa-onnx-paraformer-zh-2023-03-28/test_wavs/5-henan.wav \ + ./sherpa-onnx-paraformer-zh-2023-03-28/test_wavs/6-zh-en.wav \ + ./sherpa-onnx-paraformer-zh-2023-03-28/test_wavs/8k.wav + +.. note:: + + Please use ``./build/bin/Release/sherpa-onnx-offline.exe`` for Windows. + +.. caution:: + + If you use Windows and get encoding issues, please run: + + .. code-block:: bash + + CHCP 65001 + + in your commandline. + +You should see the following output: + +.. literalinclude:: ./code-paraformer/sherpa-onnx-paraformer-zh-2023-03-28-int8.txt + +Speech recognition from a microphone +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + ./build/bin/sherpa-onnx-microphone-offline \ + --tokens=./sherpa-onnx-paraformer-zh-2023-03-28/tokens.txt \ + --paraformer=./sherpa-onnx-paraformer-zh-2023-03-28/model.int8.onnx + +.. 
_sherpa_onnx_offline_paraformer_zh_2023_09_14_chinese: + +csukuangfj/sherpa-onnx-paraformer-zh-2023-09-14 (Chinese + English) +--------------------------------------------------------------------- + +.. note:: + + This model supports timestamps. It is a bilingual model, supporting + both Chinese and English. (支持普通话、河南话、天津话、四川话等方言) + + +This model is converted from + +``_ + +In the following, we describe how to download it and use it with `sherpa-onnx`_. + +Download the model +~~~~~~~~~~~~~~~~~~ + +Please use the following commands to download it. + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-paraformer-zh-2023-09-14.tar.bz2 + + tar xvf sherpa-onnx-paraformer-zh-2023-09-14.tar.bz2 + +Please check that the file sizes of the pre-trained models are correct. See +the file sizes of ``*.onnx`` files below. + +.. code-block:: bash + + sherpa-onnx-paraformer-zh-2023-09-14$ ls -lh *.onnx + -rw-r--r-- 1 fangjun staff 232M Sep 14 13:46 model.int8.onnx + +Decode wave files +~~~~~~~~~~~~~~~~~ + +.. hint:: + + It supports decoding only wave files of a single channel with 16-bit + encoded samples, while the sampling rate does not need to be 16 kHz. + +int8 +^^^^ + +The following code shows how to use ``int8`` models to decode wave files: + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + ./build/bin/sherpa-onnx-offline \ + --tokens=./sherpa-onnx-paraformer-zh-2023-09-14/tokens.txt \ + --paraformer=./sherpa-onnx-paraformer-zh-2023-09-14/model.int8.onnx \ + --model-type=paraformer \ + ./sherpa-onnx-paraformer-zh-2023-09-14/test_wavs/0.wav \ + ./sherpa-onnx-paraformer-zh-2023-09-14/test_wavs/1.wav \ + ./sherpa-onnx-paraformer-zh-2023-09-14/test_wavs/2.wav \ + ./sherpa-onnx-paraformer-zh-2023-09-14/test_wavs/3-sichuan.wav \ + ./sherpa-onnx-paraformer-zh-2023-09-14/test_wavs/4-tianjin.wav \ + ./sherpa-onnx-paraformer-zh-2023-09-14/test_wavs/5-henan.wav \ + ./sherpa-onnx-paraformer-zh-2023-09-14/test_wavs/6-zh-en.wav \ + ./sherpa-onnx-paraformer-zh-2023-09-14/test_wavs/8k.wav + +.. note:: + + Please use ``./build/bin/Release/sherpa-onnx-offline.exe`` for Windows. + +.. caution:: + + If you use Windows and get encoding issues, please run: + + .. code-block:: bash + + CHCP 65001 + + in your commandline. + +You should see the following output: + +.. literalinclude:: ./code-paraformer/sherpa-onnx-paraformer-zh-2023-09-14-int8.txt + +Speech recognition from a microphone +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. 
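Before trying the real-time microphone demo below, it can help to first confirm that your
capture device produces usable audio. The following is a minimal sketch and is not part of
sherpa-onnx: it assumes ``arecord`` (ALSA, Linux) is installed and uses the hypothetical file
name ``mic-test.wav``. It records a short clip in the expected format (16 kHz, 16-bit, single
channel) and decodes it with the same offline command and flags shown in the sections above.

.. code-block:: bash

   cd /path/to/sherpa-onnx

   # Record 5 seconds from the default capture device as a 16 kHz, 16-bit, mono wave file.
   # `arecord` is an assumption (ALSA/Linux); use a comparable recorder on other platforms.
   arecord -t wav -f S16_LE -r 16000 -c 1 -d 5 mic-test.wav

   # Decode the test clip with the same model and flags used above.
   ./build/bin/sherpa-onnx-offline \
     --tokens=./sherpa-onnx-paraformer-zh-2023-09-14/tokens.txt \
     --paraformer=./sherpa-onnx-paraformer-zh-2023-09-14/model.int8.onnx \
     --model-type=paraformer \
     ./mic-test.wav

.. 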
code-block:: bash + + cd /path/to/sherpa-onnx + + ./build/bin/sherpa-onnx-microphone-offline \ + --tokens=./sherpa-onnx-paraformer-zh-2023-09-14/tokens.txt \ + --paraformer=./sherpa-onnx-paraformer-zh-2023-09-14/model.int8.onnx \ + --model-type=paraformer diff --git a/docs/source/onnx/pretrained_models/offline-transducer/code-conformer/sherpa-onnx-conformer-en-2023-03-18-int8.txt b/docs/source/onnx/pretrained_models/offline-transducer/code-conformer/sherpa-onnx-conformer-en-2023-03-18-int8.txt new file mode 100644 index 000000000..09dec70cd --- /dev/null +++ b/docs/source/onnx/pretrained_models/offline-transducer/code-conformer/sherpa-onnx-conformer-en-2023-03-18-int8.txt @@ -0,0 +1,24 @@ +OfflineRecognizerConfig(feat_config=OfflineFeatureExtractorConfig(sampling_rate=16000, feature_dim=80), model_config=OfflineModelConfig(transducer=OfflineTransducerModelConfig(encoder_filename="./sherpa-onnx-conformer-en-2023-03-18/encoder-epoch-99-avg-1.int8.onnx", decoder_filename="./sherpa-onnx-conformer-en-2023-03-18/decoder-epoch-99-avg-1.onnx", joiner_filename="./sherpa-onnx-conformer-en-2023-03-18/joiner-epoch-99-avg-1.int8.onnx"), paraformer=OfflineParaformerModelConfig(model=""), tokens="./sherpa-onnx-conformer-en-2023-03-18/tokens.txt", num_threads=2, debug=False), decoding_method="greedy_search") +Creating recognizer ... +2023-04-01 07:13:26.514109433 [E:onnxruntime:, env.cc:251 ThreadMain] pthread_setaffinity_np failed for thread: 608419, index: 15, mask: {16, 52, }, error code: 22 error msg: Invalid argument. Specify the number of threads explicitly so the affinity is not set. +2023-04-01 07:13:26.514112711 [E:onnxruntime:, env.cc:251 ThreadMain] pthread_setaffinity_np failed for thread: 608420, index: 16, mask: {17, 53, }, error code: 22 error msg: Invalid argument. Specify the number of threads explicitly so the affinity is not set. +Started +Creating a resampler: + in_sample_rate: 8000 + output_sample_rate: 16000 + +Done! 
+ +./sherpa-onnx-conformer-en-2023-03-18/test_wavs/0.wav + AFTER EARLY NIGHTFALL THE YELLOW LAMPS WOULD LIGHT UP HERE AND THERE THE SQUALID QUARTER OF THE BROTHELS +---- +./sherpa-onnx-conformer-en-2023-03-18/test_wavs/1.wav + GOD AS A DIRECT CONSEQUENCE OF THE SIN WHICH MAN THUS PUNISHED HAD GIVEN HER A LOVELY CHILD WHOSE PLACE WAS ON THAT SAME DISHONORED BOSOM TO CONNECT HER PARENT FOREVER WITH THE RACE AND DESCENT OF MORTALS AND TO BE FINALLY A BLESSED SOUL IN HEAVEN +---- +./sherpa-onnx-conformer-en-2023-03-18/test_wavs/8k.wav + YET THESE THOUGHTS AFFECTED HESTER PRYNNE LESS WITH HOPE THAN APPREHENSION +---- +num threads: 2 +decoding method: greedy_search +Elapsed seconds: 1.370 s +Real time factor (RTF): 1.370 / 28.165 = 0.049 diff --git a/docs/source/onnx/pretrained_models/offline-transducer/code-conformer/sherpa-onnx-conformer-en-2023-03-18.txt b/docs/source/onnx/pretrained_models/offline-transducer/code-conformer/sherpa-onnx-conformer-en-2023-03-18.txt new file mode 100644 index 000000000..476cb2d4c --- /dev/null +++ b/docs/source/onnx/pretrained_models/offline-transducer/code-conformer/sherpa-onnx-conformer-en-2023-03-18.txt @@ -0,0 +1,24 @@ +OfflineRecognizerConfig(feat_config=OfflineFeatureExtractorConfig(sampling_rate=16000, feature_dim=80), model_config=OfflineModelConfig(transducer=OfflineTransducerModelConfig(encoder_filename="./sherpa-onnx-conformer-en-2023-03-18/encoder-epoch-99-avg-1.onnx", decoder_filename="./sherpa-onnx-conformer-en-2023-03-18/decoder-epoch-99-avg-1.onnx", joiner_filename="./sherpa-onnx-conformer-en-2023-03-18/joiner-epoch-99-avg-1.onnx"), paraformer=OfflineParaformerModelConfig(model=""), tokens="./sherpa-onnx-conformer-en-2023-03-18/tokens.txt", num_threads=2, debug=False), decoding_method="greedy_search") +Creating recognizer ... +2023-04-01 07:11:51.666456713 [E:onnxruntime:, env.cc:251 ThreadMain] pthread_setaffinity_np failed for thread: 608379, index: 15, mask: {16, 52, }, error code: 22 error msg: Invalid argument. Specify the number of threads explicitly so the affinity is not set. +2023-04-01 07:11:51.666458525 [E:onnxruntime:, env.cc:251 ThreadMain] pthread_setaffinity_np failed for thread: 608380, index: 16, mask: {17, 53, }, error code: 22 error msg: Invalid argument. Specify the number of threads explicitly so the affinity is not set. +Started +Creating a resampler: + in_sample_rate: 8000 + output_sample_rate: 16000 + +Done! 
+ +./sherpa-onnx-conformer-en-2023-03-18/test_wavs/0.wav + AFTER EARLY NIGHTFALL THE YELLOW LAMPS WOULD LIGHT UP HERE AND THERE THE SQUALID QUARTER OF THE BROTHELS +---- +./sherpa-onnx-conformer-en-2023-03-18/test_wavs/1.wav + GOD AS A DIRECT CONSEQUENCE OF THE SIN WHICH MAN THUS PUNISHED HAD GIVEN HER A LOVELY CHILD WHOSE PLACE WAS ON THAT SAME DISHONORED BOSOM TO CONNECT HER PARENT FOREVER WITH THE RACE AND DESCENT OF MORTALS AND TO BE FINALLY A BLESSED SOUL IN HEAVEN +---- +./sherpa-onnx-conformer-en-2023-03-18/test_wavs/8k.wav + YET THESE THOUGHTS AFFECTED HESTER PRYNNE LESS WITH HOPE THAN APPREHENSION +---- +num threads: 2 +decoding method: greedy_search +Elapsed seconds: 2.264 s +Real time factor (RTF): 2.264 / 28.165 = 0.080 diff --git a/docs/source/onnx/pretrained_models/offline-transducer/code-conformer/sherpa-onnx-conformer-zh-2023-05-23.int8.txt b/docs/source/onnx/pretrained_models/offline-transducer/code-conformer/sherpa-onnx-conformer-zh-2023-05-23.int8.txt new file mode 100644 index 000000000..0723c3d6d --- /dev/null +++ b/docs/source/onnx/pretrained_models/offline-transducer/code-conformer/sherpa-onnx-conformer-zh-2023-05-23.int8.txt @@ -0,0 +1,20 @@ +/Users/fangjun/open-source/sherpa-onnx/sherpa-onnx/csrc/parse-options.cc:Read:361 ./build/bin/sherpa-onnx-offline --decoding-method=greedy_search --tokens=./sherpa-onnx-conformer-zh-2023-05-23/tokens.txt --encoder=./sherpa-onnx-conformer-zh-2023-05-23/encoder-epoch-99-avg-1.int8.onnx --decoder=./sherpa-onnx-conformer-zh-2023-05-23/decoder-epoch-99-avg-1.onnx --joiner=./sherpa-onnx-conformer-zh-2023-05-23/joiner-epoch-99-avg-1.int8.onnx ./sherpa-onnx-conformer-zh-2023-05-23/test_wavs/0.wav ./sherpa-onnx-conformer-zh-2023-05-23/test_wavs/1.wav ./sherpa-onnx-conformer-zh-2023-05-23/test_wavs/2.wav + +OfflineRecognizerConfig(feat_config=OfflineFeatureExtractorConfig(sampling_rate=16000, feature_dim=80), model_config=OfflineModelConfig(transducer=OfflineTransducerModelConfig(encoder_filename="./sherpa-onnx-conformer-zh-2023-05-23/encoder-epoch-99-avg-1.int8.onnx", decoder_filename="./sherpa-onnx-conformer-zh-2023-05-23/decoder-epoch-99-avg-1.onnx", joiner_filename="./sherpa-onnx-conformer-zh-2023-05-23/joiner-epoch-99-avg-1.int8.onnx"), paraformer=OfflineParaformerModelConfig(model=""), nemo_ctc=OfflineNemoEncDecCtcModelConfig(model=""), tokens="./sherpa-onnx-conformer-zh-2023-05-23/tokens.txt", num_threads=2, debug=False, provider="cpu"), lm_config=OfflineLMConfig(model="", scale=0.5), decoding_method="greedy_search", max_active_paths=4) +Creating recognizer ... +Started +Done! 
+ +./sherpa-onnx-conformer-zh-2023-05-23/test_wavs/0.wav +{"text":"对我做了介绍那么我想说的是呢大家如果对我的研究感兴趣呢","timestamps":"[0.00, 0.12, 0.52, 0.64, 0.84, 1.04, 1.68, 1.80, 1.92, 2.08, 2.32, 2.48, 2.64, 2.76, 3.08, 3.20, 3.44, 3.52, 3.64, 3.76, 3.88, 4.00, 4.16, 4.32, 4.48, 4.60, 4.84]","tokens":["对","我","做","了","介","绍","那","么","我","想","说","的","是","呢","大","家","如","果","对","我","的","研","究","感","兴","趣","呢"]} +---- +./sherpa-onnx-conformer-zh-2023-05-23/test_wavs/1.wav +{"text":"重点呢想谈三个问题首先呢就是这一轮全球金融动荡的表现","timestamps":"[0.04, 0.16, 0.36, 0.48, 0.68, 0.92, 1.08, 1.24, 1.44, 1.88, 2.08, 2.36, 2.52, 2.64, 2.88, 3.00, 3.16, 3.40, 3.56, 3.72, 3.84, 4.04, 4.20, 4.32, 4.56, 4.76]","tokens":["重","点","呢","想","谈","三","个","问","题","首","先","呢","就","是","这","一","轮","全","球","金","融","动","荡","的","表","现"]} +---- +./sherpa-onnx-conformer-zh-2023-05-23/test_wavs/2.wav +{"text":"深度地分析这一次全球金融动荡背后的根源","timestamps":"[0.00, 0.12, 0.60, 0.84, 1.04, 1.44, 1.64, 1.84, 2.28, 2.52, 2.80, 2.92, 3.08, 3.28, 3.36, 3.60, 3.72, 3.84, 4.12]","tokens":["深","度","地","分","析","这","一","次","全","球","金","融","动","荡","背","后","的","根","源"]} +---- +num threads: 2 +decoding method: greedy_search +Elapsed seconds: 0.502 s +Real time factor (RTF): 0.502 / 15.289 = 0.033 diff --git a/docs/source/onnx/pretrained_models/offline-transducer/code-conformer/sherpa-onnx-conformer-zh-2023-05-23.txt b/docs/source/onnx/pretrained_models/offline-transducer/code-conformer/sherpa-onnx-conformer-zh-2023-05-23.txt new file mode 100644 index 000000000..e8ed1d8e1 --- /dev/null +++ b/docs/source/onnx/pretrained_models/offline-transducer/code-conformer/sherpa-onnx-conformer-zh-2023-05-23.txt @@ -0,0 +1,20 @@ +/Users/fangjun/open-source/sherpa-onnx/sherpa-onnx/csrc/parse-options.cc:Read:361 ./build/bin/sherpa-onnx-offline --tokens=./sherpa-onnx-conformer-zh-2023-05-23/tokens.txt --encoder=./sherpa-onnx-conformer-zh-2023-05-23/encoder-epoch-99-avg-1.onnx --decoder=./sherpa-onnx-conformer-zh-2023-05-23/decoder-epoch-99-avg-1.onnx --joiner=./sherpa-onnx-conformer-zh-2023-05-23/joiner-epoch-99-avg-1.onnx ./sherpa-onnx-conformer-zh-2023-05-23/test_wavs/0.wav ./sherpa-onnx-conformer-zh-2023-05-23/test_wavs/1.wav ./sherpa-onnx-conformer-zh-2023-05-23/test_wavs/2.wav + +OfflineRecognizerConfig(feat_config=OfflineFeatureExtractorConfig(sampling_rate=16000, feature_dim=80), model_config=OfflineModelConfig(transducer=OfflineTransducerModelConfig(encoder_filename="./sherpa-onnx-conformer-zh-2023-05-23/encoder-epoch-99-avg-1.onnx", decoder_filename="./sherpa-onnx-conformer-zh-2023-05-23/decoder-epoch-99-avg-1.onnx", joiner_filename="./sherpa-onnx-conformer-zh-2023-05-23/joiner-epoch-99-avg-1.onnx"), paraformer=OfflineParaformerModelConfig(model=""), nemo_ctc=OfflineNemoEncDecCtcModelConfig(model=""), tokens="./sherpa-onnx-conformer-zh-2023-05-23/tokens.txt", num_threads=2, debug=False, provider="cpu"), lm_config=OfflineLMConfig(model="", scale=0.5), decoding_method="greedy_search", max_active_paths=4) +Creating recognizer ... +Started +Done! 
+ +./sherpa-onnx-conformer-zh-2023-05-23/test_wavs/0.wav +{"text":"对我做了介绍那么我想说的是呢大家如果对我的研究感兴趣呢","timestamps":"[0.00, 0.12, 0.52, 0.64, 0.84, 1.04, 1.68, 1.80, 1.92, 2.12, 2.32, 2.48, 2.64, 2.76, 3.08, 3.20, 3.44, 3.52, 3.64, 3.76, 3.88, 4.00, 4.16, 4.32, 4.48, 4.64, 4.84]","tokens":["对","我","做","了","介","绍","那","么","我","想","说","的","是","呢","大","家","如","果","对","我","的","研","究","感","兴","趣","呢"]} +---- +./sherpa-onnx-conformer-zh-2023-05-23/test_wavs/1.wav +{"text":"重点呢想谈三个问题首先呢就是这一轮全球金融动荡的表现","timestamps":"[0.04, 0.16, 0.36, 0.48, 0.68, 0.92, 1.08, 1.24, 1.44, 1.84, 2.08, 2.36, 2.52, 2.68, 2.88, 3.04, 3.16, 3.40, 3.56, 3.72, 3.84, 4.04, 4.16, 4.32, 4.56, 4.76]","tokens":["重","点","呢","想","谈","三","个","问","题","首","先","呢","就","是","这","一","轮","全","球","金","融","动","荡","的","表","现"]} +---- +./sherpa-onnx-conformer-zh-2023-05-23/test_wavs/2.wav +{"text":"深度地分析这一次全球金融动荡背后的根源","timestamps":"[0.00, 0.12, 0.60, 0.84, 1.04, 1.44, 1.68, 1.84, 2.28, 2.52, 2.80, 2.92, 3.08, 3.24, 3.40, 3.60, 3.72, 3.84, 4.12]","tokens":["深","度","地","分","析","这","一","次","全","球","金","融","动","荡","背","后","的","根","源"]} +---- +num threads: 2 +decoding method: greedy_search +Elapsed seconds: 0.706 s +Real time factor (RTF): 0.706 / 15.289 = 0.046 diff --git a/docs/source/onnx/pretrained_models/offline-transducer/code-conformer/sherpa-onnx-conformer-zh-stateless2-2023-05-23.int8.txt b/docs/source/onnx/pretrained_models/offline-transducer/code-conformer/sherpa-onnx-conformer-zh-stateless2-2023-05-23.int8.txt new file mode 100644 index 000000000..7295f7800 --- /dev/null +++ b/docs/source/onnx/pretrained_models/offline-transducer/code-conformer/sherpa-onnx-conformer-zh-stateless2-2023-05-23.int8.txt @@ -0,0 +1,20 @@ +/Users/fangjun/open-source/sherpa-onnx/sherpa-onnx/csrc/parse-options.cc:Read:361 ./build/bin/sherpa-onnx-offline --tokens=./sherpa-onnx-conformer-zh-stateless2-2023-05-23/tokens.txt --encoder=./sherpa-onnx-conformer-zh-stateless2-2023-05-23/encoder-epoch-99-avg-1.int8.onnx --decoder=./sherpa-onnx-conformer-zh-stateless2-2023-05-23/decoder-epoch-99-avg-1.onnx --joiner=./sherpa-onnx-conformer-zh-stateless2-2023-05-23/joiner-epoch-99-avg-1.int8.onnx ./sherpa-onnx-conformer-zh-stateless2-2023-05-23/test_wavs/0.wav ./sherpa-onnx-conformer-zh-stateless2-2023-05-23/test_wavs/1.wav ./sherpa-onnx-conformer-zh-stateless2-2023-05-23/test_wavs/2.wav + +OfflineRecognizerConfig(feat_config=OfflineFeatureExtractorConfig(sampling_rate=16000, feature_dim=80), model_config=OfflineModelConfig(transducer=OfflineTransducerModelConfig(encoder_filename="./sherpa-onnx-conformer-zh-stateless2-2023-05-23/encoder-epoch-99-avg-1.int8.onnx", decoder_filename="./sherpa-onnx-conformer-zh-stateless2-2023-05-23/decoder-epoch-99-avg-1.onnx", joiner_filename="./sherpa-onnx-conformer-zh-stateless2-2023-05-23/joiner-epoch-99-avg-1.int8.onnx"), paraformer=OfflineParaformerModelConfig(model=""), nemo_ctc=OfflineNemoEncDecCtcModelConfig(model=""), tokens="./sherpa-onnx-conformer-zh-stateless2-2023-05-23/tokens.txt", num_threads=2, debug=False, provider="cpu"), lm_config=OfflineLMConfig(model="", scale=0.5), decoding_method="greedy_search", max_active_paths=4) +Creating recognizer ... +Started +Done! 
+ +./sherpa-onnx-conformer-zh-stateless2-2023-05-23/test_wavs/0.wav +{"text":"对我做了介绍那么我想说的是呢大家如果对我的研究感兴趣呢","timestamps":"[0.00, 0.12, 0.44, 0.64, 0.84, 1.08, 1.64, 1.72, 1.88, 2.08, 2.28, 2.44, 2.56, 2.76, 3.08, 3.20, 3.32, 3.48, 3.64, 3.76, 3.88, 4.00, 4.16, 4.24, 4.48, 4.60, 4.84]","tokens":["对","我","做","了","介","绍","那","么","我","想","说","的","是","呢","大","家","如","果","对","我","的","研","究","感","兴","趣","呢"]} +---- +./sherpa-onnx-conformer-zh-stateless2-2023-05-23/test_wavs/1.wav +{"text":"重点想谈三个问题首先呢就是这一轮全球金融动荡的表现","timestamps":"[0.00, 0.08, 0.48, 0.64, 0.88, 1.08, 1.28, 1.48, 1.80, 2.08, 2.40, 2.56, 2.68, 2.88, 3.04, 3.16, 3.36, 3.56, 3.68, 3.84, 4.00, 4.16, 4.32, 4.56, 4.76]","tokens":["重","点","想","谈","三","个","问","题","首","先","呢","就","是","这","一","轮","全","球","金","融","动","荡","的","表","现"]} +---- +./sherpa-onnx-conformer-zh-stateless2-2023-05-23/test_wavs/2.wav +{"text":"深入地分析这一次全球金融动荡背后的根源","timestamps":"[0.00, 0.12, 0.56, 0.84, 1.08, 1.40, 1.64, 1.84, 2.24, 2.52, 2.72, 2.92, 3.08, 3.24, 3.40, 3.56, 3.72, 3.88, 4.12]","tokens":["深","入","地","分","析","这","一","次","全","球","金","融","动","荡","背","后","的","根","源"]} +---- +num threads: 2 +decoding method: greedy_search +Elapsed seconds: 0.439 s +Real time factor (RTF): 0.439 / 15.289 = 0.029 diff --git a/docs/source/onnx/pretrained_models/offline-transducer/code-conformer/sherpa-onnx-conformer-zh-stateless2-2023-05-23.txt b/docs/source/onnx/pretrained_models/offline-transducer/code-conformer/sherpa-onnx-conformer-zh-stateless2-2023-05-23.txt new file mode 100644 index 000000000..4f9a05782 --- /dev/null +++ b/docs/source/onnx/pretrained_models/offline-transducer/code-conformer/sherpa-onnx-conformer-zh-stateless2-2023-05-23.txt @@ -0,0 +1,20 @@ +/Users/fangjun/open-source/sherpa-onnx/sherpa-onnx/csrc/parse-options.cc:Read:361 ./build/bin/sherpa-onnx-offline --tokens=./sherpa-onnx-conformer-zh-stateless2-2023-05-23/tokens.txt --encoder=./sherpa-onnx-conformer-zh-stateless2-2023-05-23/encoder-epoch-99-avg-1.onnx --decoder=./sherpa-onnx-conformer-zh-stateless2-2023-05-23/decoder-epoch-99-avg-1.onnx --joiner=./sherpa-onnx-conformer-zh-stateless2-2023-05-23/joiner-epoch-99-avg-1.onnx ./sherpa-onnx-conformer-zh-stateless2-2023-05-23/test_wavs/0.wav ./sherpa-onnx-conformer-zh-stateless2-2023-05-23/test_wavs/1.wav ./sherpa-onnx-conformer-zh-stateless2-2023-05-23/test_wavs/2.wav + +OfflineRecognizerConfig(feat_config=OfflineFeatureExtractorConfig(sampling_rate=16000, feature_dim=80), model_config=OfflineModelConfig(transducer=OfflineTransducerModelConfig(encoder_filename="./sherpa-onnx-conformer-zh-stateless2-2023-05-23/encoder-epoch-99-avg-1.onnx", decoder_filename="./sherpa-onnx-conformer-zh-stateless2-2023-05-23/decoder-epoch-99-avg-1.onnx", joiner_filename="./sherpa-onnx-conformer-zh-stateless2-2023-05-23/joiner-epoch-99-avg-1.onnx"), paraformer=OfflineParaformerModelConfig(model=""), nemo_ctc=OfflineNemoEncDecCtcModelConfig(model=""), tokens="./sherpa-onnx-conformer-zh-stateless2-2023-05-23/tokens.txt", num_threads=2, debug=False, provider="cpu"), lm_config=OfflineLMConfig(model="", scale=0.5), decoding_method="greedy_search", max_active_paths=4) +Creating recognizer ... +Started +Done! 
+ +./sherpa-onnx-conformer-zh-stateless2-2023-05-23/test_wavs/0.wav +{"text":"对我做了介绍那么我想说的是呢大家如果对我的研究感兴趣呢","timestamps":"[0.00, 0.12, 0.44, 0.64, 0.84, 1.04, 1.64, 1.72, 1.88, 2.08, 2.28, 2.44, 2.56, 2.76, 3.08, 3.20, 3.32, 3.48, 3.64, 3.76, 3.88, 4.00, 4.16, 4.24, 4.44, 4.60, 4.84]","tokens":["对","我","做","了","介","绍","那","么","我","想","说","的","是","呢","大","家","如","果","对","我","的","研","究","感","兴","趣","呢"]} +---- +./sherpa-onnx-conformer-zh-stateless2-2023-05-23/test_wavs/1.wav +{"text":"重点想谈三个问题首先呢就是这一轮全球金融动荡的表现","timestamps":"[0.00, 0.12, 0.48, 0.64, 0.88, 1.08, 1.28, 1.48, 1.80, 2.12, 2.40, 2.56, 2.68, 2.88, 3.04, 3.16, 3.36, 3.56, 3.68, 3.84, 4.00, 4.16, 4.32, 4.56, 4.76]","tokens":["重","点","想","谈","三","个","问","题","首","先","呢","就","是","这","一","轮","全","球","金","融","动","荡","的","表","现"]} +---- +./sherpa-onnx-conformer-zh-stateless2-2023-05-23/test_wavs/2.wav +{"text":"深入地分析这一次全球金融动荡背后的根源","timestamps":"[0.00, 0.16, 0.60, 0.88, 1.08, 1.36, 1.64, 1.84, 2.24, 2.52, 2.72, 2.92, 3.08, 3.24, 3.40, 3.56, 3.72, 3.88, 4.12]","tokens":["深","入","地","分","析","这","一","次","全","球","金","融","动","荡","背","后","的","根","源"]} +---- +num threads: 2 +decoding method: greedy_search +Elapsed seconds: 0.596 s +Real time factor (RTF): 0.596 / 15.289 = 0.039 diff --git a/docs/source/onnx/pretrained_models/offline-transducer/code-nemo/sherpa-onnx-nemo-transducer-giga-am-russian-2024-10-24.int8.txt b/docs/source/onnx/pretrained_models/offline-transducer/code-nemo/sherpa-onnx-nemo-transducer-giga-am-russian-2024-10-24.int8.txt new file mode 100644 index 000000000..a5d249ea4 --- /dev/null +++ b/docs/source/onnx/pretrained_models/offline-transducer/code-nemo/sherpa-onnx-nemo-transducer-giga-am-russian-2024-10-24.int8.txt @@ -0,0 +1,14 @@ +/Users/fangjun/open-source/sherpa-onnx/sherpa-onnx/csrc/parse-options.cc:Read:375 ./build/bin/sherpa-onnx-offline --encoder=./sherpa-onnx-nemo-transducer-giga-am-russian-2024-10-24/encoder.int8.onnx --decoder=./sherpa-onnx-nemo-transducer-giga-am-russian-2024-10-24/decoder.onnx --joiner=./sherpa-onnx-nemo-transducer-giga-am-russian-2024-10-24/joiner.onnx --tokens=./sherpa-onnx-nemo-transducer-giga-am-russian-2024-10-24/tokens.txt --model-type=nemo_transducer ./sherpa-onnx-nemo-transducer-giga-am-russian-2024-10-24/test_wavs/example.wav + +OfflineRecognizerConfig(feat_config=FeatureExtractorConfig(sampling_rate=16000, feature_dim=80, low_freq=20, high_freq=-400, dither=0), model_config=OfflineModelConfig(transducer=OfflineTransducerModelConfig(encoder_filename="./sherpa-onnx-nemo-transducer-giga-am-russian-2024-10-24/encoder.int8.onnx", decoder_filename="./sherpa-onnx-nemo-transducer-giga-am-russian-2024-10-24/decoder.onnx", joiner_filename="./sherpa-onnx-nemo-transducer-giga-am-russian-2024-10-24/joiner.onnx"), paraformer=OfflineParaformerModelConfig(model=""), nemo_ctc=OfflineNemoEncDecCtcModelConfig(model=""), whisper=OfflineWhisperModelConfig(encoder="", decoder="", language="", task="transcribe", tail_paddings=-1), tdnn=OfflineTdnnModelConfig(model=""), zipformer_ctc=OfflineZipformerCtcModelConfig(model=""), wenet_ctc=OfflineWenetCtcModelConfig(model=""), sense_voice=OfflineSenseVoiceModelConfig(model="", language="auto", use_itn=False), telespeech_ctc="", tokens="./sherpa-onnx-nemo-transducer-giga-am-russian-2024-10-24/tokens.txt", num_threads=2, debug=False, provider="cpu", model_type="nemo_transducer", modeling_unit="cjkchar", bpe_vocab=""), lm_config=OfflineLMConfig(model="", scale=0.5), ctc_fst_decoder_config=OfflineCtcFstDecoderConfig(graph="", max_active=3000), 
decoding_method="greedy_search", max_active_paths=4, hotwords_file="", hotwords_score=1.5, blank_penalty=0, rule_fsts="", rule_fars="") +Creating recognizer ... +Started +Done! + +./sherpa-onnx-nemo-transducer-giga-am-russian-2024-10-24/test_wavs/example.wav +{"lang": "", "emotion": "", "event": "", "text": " ничьих не требуя похвал счастлив уж я надеждой сладкой что дева с трепетом любви посмотрит может быть украдкой на песни грешные мои у лукоморья дуб зеленый", "timestamps": [0.04, 0.16, 0.24, 0.28, 0.40, 0.48, 0.60, 0.68, 0.80, 0.92, 1.04, 1.20, 1.28, 1.44, 1.76, 1.88, 2.00, 2.08, 2.16, 2.28, 2.36, 2.44, 2.64, 2.76, 2.92, 3.00, 3.04, 3.16, 3.24, 3.36, 3.48, 3.56, 3.68, 3.88, 4.04, 4.16, 4.24, 4.32, 4.40, 4.56, 4.76, 4.88, 4.92, 5.36, 5.64, 5.84, 5.92, 6.04, 6.32, 6.52, 6.60, 6.72, 6.84, 6.92, 7.04, 7.16, 7.28, 7.36, 7.44, 7.56, 7.68, 7.72, 7.88, 8.00, 8.20, 8.36, 9.28, 9.40, 9.44, 9.52, 9.68, 9.84, 9.88, 9.92, 10.12, 10.32, 10.40, 10.52, 10.56, 10.76, 10.84], "tokens":[" ни", "ч", "ь", "и", "х", " не", " т", "ре", "бу", "я", " по", "х", "ва", "л", " с", "ча", "ст", "ли", "в", " у", "ж", " я", " на", "де", "ж", "до", "й", " с", "ла", "д", "ко", "й", " что", " де", "ва", " с", " т", "ре", "пе", "том", " лю", "б", "ви", " пос", "мот", "ри", "т", " может", " быть", " у", "к", "ра", "д", "ко", "й", " на", " п", "е", "с", "ни", " г", "ре", "ш", "ные", " мо", "и", " у", " ", "лу", "ко", "мо", "р", "ь", "я", " ду", "б", " з", "е", "лен", "ы", "й"], "words": []} +---- +num threads: 2 +decoding method: greedy_search +Elapsed seconds: 1.775 s +Real time factor (RTF): 1.775 / 11.290 = 0.157 diff --git a/docs/source/onnx/pretrained_models/offline-transducer/code-zipformer/icefall-asr-cv-corpus-13.0-2023-03-09-en-pruned-transducer-stateless7-2023-04-17-int8.txt b/docs/source/onnx/pretrained_models/offline-transducer/code-zipformer/icefall-asr-cv-corpus-13.0-2023-03-09-en-pruned-transducer-stateless7-2023-04-17-int8.txt new file mode 100644 index 000000000..28676c6ca --- /dev/null +++ b/docs/source/onnx/pretrained_models/offline-transducer/code-zipformer/icefall-asr-cv-corpus-13.0-2023-03-09-en-pruned-transducer-stateless7-2023-04-17-int8.txt @@ -0,0 +1,20 @@ +/Users/fangjun/open-source/sherpa-onnx/sherpa-onnx/csrc/parse-options.cc:Read:361 ./build/bin/sherpa-onnx-offline --tokens=./icefall-asr-cv-corpus-13.0-2023-03-09-en-pruned-transducer-stateless7-2023-04-17/data/lang_bpe_500/tokens.txt --encoder=./icefall-asr-cv-corpus-13.0-2023-03-09-en-pruned-transducer-stateless7-2023-04-17/exp/encoder-epoch-60-avg-20.int8.onnx --decoder=./icefall-asr-cv-corpus-13.0-2023-03-09-en-pruned-transducer-stateless7-2023-04-17/exp/decoder-epoch-60-avg-20.onnx --joiner=./icefall-asr-cv-corpus-13.0-2023-03-09-en-pruned-transducer-stateless7-2023-04-17/exp/joiner-epoch-60-avg-20.int8.onnx ./icefall-asr-cv-corpus-13.0-2023-03-09-en-pruned-transducer-stateless7-2023-04-17/test_wavs/1089-134686-0001.wav ./icefall-asr-cv-corpus-13.0-2023-03-09-en-pruned-transducer-stateless7-2023-04-17/test_wavs/1221-135766-0001.wav ./icefall-asr-cv-corpus-13.0-2023-03-09-en-pruned-transducer-stateless7-2023-04-17/test_wavs/1221-135766-0002.wav + +OfflineRecognizerConfig(feat_config=OfflineFeatureExtractorConfig(sampling_rate=16000, feature_dim=80), model_config=OfflineModelConfig(transducer=OfflineTransducerModelConfig(encoder_filename="./icefall-asr-cv-corpus-13.0-2023-03-09-en-pruned-transducer-stateless7-2023-04-17/exp/encoder-epoch-60-avg-20.int8.onnx", 
decoder_filename="./icefall-asr-cv-corpus-13.0-2023-03-09-en-pruned-transducer-stateless7-2023-04-17/exp/decoder-epoch-60-avg-20.onnx", joiner_filename="./icefall-asr-cv-corpus-13.0-2023-03-09-en-pruned-transducer-stateless7-2023-04-17/exp/joiner-epoch-60-avg-20.int8.onnx"), paraformer=OfflineParaformerModelConfig(model=""), nemo_ctc=OfflineNemoEncDecCtcModelConfig(model=""), tokens="./icefall-asr-cv-corpus-13.0-2023-03-09-en-pruned-transducer-stateless7-2023-04-17/data/lang_bpe_500/tokens.txt", num_threads=2, debug=False, provider="cpu"), lm_config=OfflineLMConfig(model="", scale=0.5), decoding_method="greedy_search", max_active_paths=4, context_score=1.5) +Creating recognizer ... +Started +Done! + +./icefall-asr-cv-corpus-13.0-2023-03-09-en-pruned-transducer-stateless7-2023-04-17/test_wavs/1089-134686-0001.wav +{"text":" AFTER EARLY NIGHTFALL THE YELLOW LAMPS WOULD LIGHT UP HERE AND THERE THE SQUALID QUARTER OF THE BROTHELS","timestamps":"[0.00, 0.64, 0.76, 0.84, 1.04, 1.08, 1.16, 1.36, 1.44, 1.56, 1.72, 1.84, 1.88, 1.92, 1.96, 2.04, 2.20, 2.32, 2.48, 2.56, 2.76, 2.80, 2.84, 3.08, 3.28, 3.40, 3.52, 3.68, 4.00, 4.24, 4.28, 4.52, 4.68, 4.84, 4.88, 4.96, 5.04, 5.28, 5.36, 5.52, 5.72, 5.88, 6.08]","tokens":[" AFTER"," E","AR","LY"," ","N","IGHT","F","AL","L"," THE"," ","Y","E","LL","OW"," LA","MP","S"," WOULD"," ","L","IGHT"," UP"," HE","RE"," AND"," THERE"," THE"," S","QUA","LI","D"," ","QUA","R","TER"," OF"," THE"," BRO","TH","EL","S"]} +---- +./icefall-asr-cv-corpus-13.0-2023-03-09-en-pruned-transducer-stateless7-2023-04-17/test_wavs/1221-135766-0001.wav +{"text":" GOD AS A DIRECT CONSEQUENCE OF THE SIN WHICH MAN THUS PUNISHED HAD GIVEN HER A LOVELY CHILD WHOSE PLACE WAS ON THAT SAME DISHONORED BOSOM TO CONNECT HER PARENT FOREVER WITH THE RACE AND DESCENT OF MORTALS AND TO BE FINALLY A BLESSED SOUL IN HEAVEN","timestamps":"[0.04, 0.44, 0.64, 0.84, 0.96, 1.32, 1.52, 1.68, 1.84, 1.88, 2.04, 2.16, 2.32, 2.40, 2.64, 2.88, 3.12, 3.24, 3.44, 3.52, 3.72, 3.88, 4.20, 4.40, 4.48, 4.60, 4.76, 4.96, 5.08, 5.24, 5.36, 5.56, 5.80, 6.20, 6.32, 6.52, 6.92, 7.16, 7.32, 7.60, 7.76, 7.92, 8.16, 8.28, 8.40, 8.48, 8.60, 8.76, 8.84, 9.08, 9.24, 9.44, 9.48, 9.72, 9.88, 10.04, 10.12, 10.52, 10.76, 10.84, 11.08, 11.24, 11.36, 11.60, 11.76, 11.96, 12.08, 12.24, 12.28, 12.48, 12.72, 12.84, 12.92, 13.00, 13.20, 13.52, 13.76, 13.88, 14.08, 14.28, 14.52, 14.64, 14.76, 14.96, 15.04, 15.24, 15.48, 15.68, 15.84, 16.00, 16.04]","tokens":[" GO","D"," AS"," A"," DIRECT"," CON","SE","QUE","N","CE"," OF"," THE"," S","IN"," WHICH"," MAN"," TH","US"," P","UN","ISH","ED"," HAD"," G","IVE","N"," HER"," A"," LO","VE","LY"," CHI","LD"," WHO","SE"," PLACE"," WAS"," ON"," THAT"," SA","ME"," DIS","HO","N","OR","ED"," BO","S","OM"," TO"," CON","N","ECT"," HER"," PA","R","ENT"," FOR","E","VER"," WITH"," THE"," RA","CE"," AND"," DE","S","C","ENT"," OF"," MO","R","T","AL","S"," AND"," TO"," BE"," FIN","ALLY"," A"," B","LES","S","ED"," SO","UL"," IN"," HE","A","VEN"]} +---- +./icefall-asr-cv-corpus-13.0-2023-03-09-en-pruned-transducer-stateless7-2023-04-17/test_wavs/1221-135766-0002.wav +{"text":" YET THESE THOUGHTS AFFECTED HESTER PRIN LESS WITH HOPE THAN APPREHENSION","timestamps":"[0.00, 0.04, 0.12, 0.56, 0.80, 0.88, 1.00, 1.04, 1.12, 1.20, 1.28, 1.40, 1.52, 1.64, 1.76, 1.84, 2.04, 2.24, 2.40, 2.64, 2.68, 2.84, 3.04, 3.24, 3.44, 3.52, 3.72, 3.92, 4.00, 4.16, 4.24, 4.36]","tokens":[" ","Y","ET"," THESE"," TH","O","UGH","T","S"," A","FF","ECT","ED"," HE","S","TER"," PRI","N"," LE","S","S"," WITH"," HO","PE"," TH","AN"," 
APP","RE","HE","N","S","ION"]} +---- +num threads: 2 +decoding method: greedy_search +Elapsed seconds: 1.368 s +Real time factor (RTF): 1.368 / 28.165 = 0.049 diff --git a/docs/source/onnx/pretrained_models/offline-transducer/code-zipformer/icefall-asr-cv-corpus-13.0-2023-03-09-en-pruned-transducer-stateless7-2023-04-17.txt b/docs/source/onnx/pretrained_models/offline-transducer/code-zipformer/icefall-asr-cv-corpus-13.0-2023-03-09-en-pruned-transducer-stateless7-2023-04-17.txt new file mode 100644 index 000000000..e339e7994 --- /dev/null +++ b/docs/source/onnx/pretrained_models/offline-transducer/code-zipformer/icefall-asr-cv-corpus-13.0-2023-03-09-en-pruned-transducer-stateless7-2023-04-17.txt @@ -0,0 +1,20 @@ +/Users/fangjun/open-source/sherpa-onnx/sherpa-onnx/csrc/parse-options.cc:Read:361 ./build/bin/sherpa-onnx-offline --tokens=./icefall-asr-cv-corpus-13.0-2023-03-09-en-pruned-transducer-stateless7-2023-04-17/data/lang_bpe_500/tokens.txt --encoder=./icefall-asr-cv-corpus-13.0-2023-03-09-en-pruned-transducer-stateless7-2023-04-17/exp/encoder-epoch-60-avg-20.onnx --decoder=./icefall-asr-cv-corpus-13.0-2023-03-09-en-pruned-transducer-stateless7-2023-04-17/exp/decoder-epoch-60-avg-20.onnx --joiner=./icefall-asr-cv-corpus-13.0-2023-03-09-en-pruned-transducer-stateless7-2023-04-17/exp/joiner-epoch-60-avg-20.onnx ./icefall-asr-cv-corpus-13.0-2023-03-09-en-pruned-transducer-stateless7-2023-04-17/test_wavs/1089-134686-0001.wav ./icefall-asr-cv-corpus-13.0-2023-03-09-en-pruned-transducer-stateless7-2023-04-17/test_wavs/1221-135766-0001.wav ./icefall-asr-cv-corpus-13.0-2023-03-09-en-pruned-transducer-stateless7-2023-04-17/test_wavs/1221-135766-0002.wav + +OfflineRecognizerConfig(feat_config=OfflineFeatureExtractorConfig(sampling_rate=16000, feature_dim=80), model_config=OfflineModelConfig(transducer=OfflineTransducerModelConfig(encoder_filename="./icefall-asr-cv-corpus-13.0-2023-03-09-en-pruned-transducer-stateless7-2023-04-17/exp/encoder-epoch-60-avg-20.onnx", decoder_filename="./icefall-asr-cv-corpus-13.0-2023-03-09-en-pruned-transducer-stateless7-2023-04-17/exp/decoder-epoch-60-avg-20.onnx", joiner_filename="./icefall-asr-cv-corpus-13.0-2023-03-09-en-pruned-transducer-stateless7-2023-04-17/exp/joiner-epoch-60-avg-20.onnx"), paraformer=OfflineParaformerModelConfig(model=""), nemo_ctc=OfflineNemoEncDecCtcModelConfig(model=""), tokens="./icefall-asr-cv-corpus-13.0-2023-03-09-en-pruned-transducer-stateless7-2023-04-17/data/lang_bpe_500/tokens.txt", num_threads=2, debug=False, provider="cpu"), lm_config=OfflineLMConfig(model="", scale=0.5), decoding_method="greedy_search", max_active_paths=4, context_score=1.5) +Creating recognizer ... +Started +Done! 
+ +./icefall-asr-cv-corpus-13.0-2023-03-09-en-pruned-transducer-stateless7-2023-04-17/test_wavs/1089-134686-0001.wav +{"text":" AFTER EARLY NIGHTFALL THE YELLOW LAMPS WOULD LIGHT UP HERE AND THERE THE SQUALID QUARTER OF THE BROTHELS","timestamps":"[0.00, 0.64, 0.76, 0.84, 1.04, 1.08, 1.16, 1.32, 1.44, 1.56, 1.72, 1.84, 1.88, 1.92, 1.96, 2.04, 2.16, 2.32, 2.48, 2.56, 2.76, 2.80, 2.84, 3.08, 3.28, 3.40, 3.52, 3.68, 4.00, 4.24, 4.28, 4.52, 4.68, 4.84, 4.88, 4.96, 5.04, 5.28, 5.40, 5.52, 5.72, 5.88, 6.08]","tokens":[" AFTER"," E","AR","LY"," ","N","IGHT","F","AL","L"," THE"," ","Y","E","LL","OW"," LA","MP","S"," WOULD"," ","L","IGHT"," UP"," HE","RE"," AND"," THERE"," THE"," S","QUA","LI","D"," ","QUA","R","TER"," OF"," THE"," BRO","TH","EL","S"]} +---- +./icefall-asr-cv-corpus-13.0-2023-03-09-en-pruned-transducer-stateless7-2023-04-17/test_wavs/1221-135766-0001.wav +{"text":" GOD AS A DIRECT CONSEQUENCE OF THE SIN WHICH MAN THUS PUNISHED HAD GIVEN HER A LOVELY CHILD WHOSE PLACE WAS ON THAT SAME DISHONORED BOSOM TO CONNECT HER PARENT FOREVER WITH THE RACE AND DESCENT OF MORTALS AND TO BE FINALLY A BLESSED SOUL IN HEAVEN","timestamps":"[0.04, 0.44, 0.64, 0.84, 0.96, 1.32, 1.52, 1.68, 1.84, 1.88, 2.04, 2.16, 2.32, 2.40, 2.64, 2.88, 3.12, 3.24, 3.44, 3.52, 3.72, 3.88, 4.20, 4.40, 4.48, 4.60, 4.76, 4.96, 5.08, 5.24, 5.36, 5.56, 5.80, 6.20, 6.32, 6.52, 6.92, 7.16, 7.36, 7.60, 7.76, 7.92, 8.16, 8.28, 8.40, 8.48, 8.60, 8.76, 8.84, 9.08, 9.24, 9.44, 9.48, 9.72, 9.88, 10.04, 10.12, 10.52, 10.76, 10.84, 11.08, 11.24, 11.36, 11.60, 11.76, 11.96, 12.08, 12.24, 12.28, 12.48, 12.72, 12.84, 12.92, 13.00, 13.20, 13.52, 13.76, 13.88, 14.08, 14.28, 14.52, 14.64, 14.76, 14.96, 15.04, 15.24, 15.48, 15.68, 15.84, 16.00, 16.04]","tokens":[" GO","D"," AS"," A"," DIRECT"," CON","SE","QUE","N","CE"," OF"," THE"," S","IN"," WHICH"," MAN"," TH","US"," P","UN","ISH","ED"," HAD"," G","IVE","N"," HER"," A"," LO","VE","LY"," CHI","LD"," WHO","SE"," PLACE"," WAS"," ON"," THAT"," SA","ME"," DIS","HO","N","OR","ED"," BO","S","OM"," TO"," CON","N","ECT"," HER"," PA","R","ENT"," FOR","E","VER"," WITH"," THE"," RA","CE"," AND"," DE","S","C","ENT"," OF"," MO","R","T","AL","S"," AND"," TO"," BE"," FIN","ALLY"," A"," B","LES","S","ED"," SO","UL"," IN"," HE","A","VEN"]} +---- +./icefall-asr-cv-corpus-13.0-2023-03-09-en-pruned-transducer-stateless7-2023-04-17/test_wavs/1221-135766-0002.wav +{"text":" YET THESE THOUGHTS AFFECTED HESTER PRIN LESS WITH HOPE THAN APPREHENSION","timestamps":"[0.00, 0.04, 0.12, 0.56, 0.80, 0.88, 1.00, 1.04, 1.12, 1.20, 1.28, 1.40, 1.52, 1.64, 1.76, 1.84, 2.04, 2.24, 2.40, 2.64, 2.68, 2.84, 3.04, 3.24, 3.44, 3.52, 3.72, 3.92, 4.00, 4.16, 4.24, 4.36]","tokens":[" ","Y","ET"," THESE"," TH","O","UGH","T","S"," A","FF","ECT","ED"," HE","S","TER"," PRI","N"," LE","S","S"," WITH"," HO","PE"," TH","AN"," APP","RE","HE","N","S","ION"]} +---- +num threads: 2 +decoding method: greedy_search +Elapsed seconds: 1.611 s +Real time factor (RTF): 1.611 / 28.165 = 0.057 diff --git a/docs/source/onnx/pretrained_models/offline-transducer/code-zipformer/icefall-asr-zipformer-wenetspeech-20230615-int8.txt b/docs/source/onnx/pretrained_models/offline-transducer/code-zipformer/icefall-asr-zipformer-wenetspeech-20230615-int8.txt new file mode 100644 index 000000000..617e48d10 --- /dev/null +++ b/docs/source/onnx/pretrained_models/offline-transducer/code-zipformer/icefall-asr-zipformer-wenetspeech-20230615-int8.txt @@ -0,0 +1,20 @@ +/Users/fangjun/open-source/sherpa-onnx/sherpa-onnx/csrc/parse-options.cc:Read:361 
./build/bin/sherpa-onnx-offline --tokens=./icefall-asr-zipformer-wenetspeech-20230615/data/lang_char/tokens.txt --encoder=./icefall-asr-zipformer-wenetspeech-20230615/exp/encoder-epoch-12-avg-4.int8.onnx --decoder=./icefall-asr-zipformer-wenetspeech-20230615/exp/decoder-epoch-12-avg-4.onnx --joiner=./icefall-asr-zipformer-wenetspeech-20230615/exp/joiner-epoch-12-avg-4.int8.onnx ./icefall-asr-zipformer-wenetspeech-20230615/test_wavs/DEV_T0000000000.wav ./icefall-asr-zipformer-wenetspeech-20230615/test_wavs/DEV_T0000000001.wav ./icefall-asr-zipformer-wenetspeech-20230615/test_wavs/DEV_T0000000002.wav + +OfflineRecognizerConfig(feat_config=OfflineFeatureExtractorConfig(sampling_rate=16000, feature_dim=80), model_config=OfflineModelConfig(transducer=OfflineTransducerModelConfig(encoder_filename="./icefall-asr-zipformer-wenetspeech-20230615/exp/encoder-epoch-12-avg-4.int8.onnx", decoder_filename="./icefall-asr-zipformer-wenetspeech-20230615/exp/decoder-epoch-12-avg-4.onnx", joiner_filename="./icefall-asr-zipformer-wenetspeech-20230615/exp/joiner-epoch-12-avg-4.int8.onnx"), paraformer=OfflineParaformerModelConfig(model=""), nemo_ctc=OfflineNemoEncDecCtcModelConfig(model=""), tokens="./icefall-asr-zipformer-wenetspeech-20230615/data/lang_char/tokens.txt", num_threads=2, debug=False, provider="cpu"), lm_config=OfflineLMConfig(model="", scale=0.5), decoding_method="greedy_search", max_active_paths=4, context_score=1.5) +Creating recognizer ... +Started +Done! + +./icefall-asr-zipformer-wenetspeech-20230615/test_wavs/DEV_T0000000000.wav +{"text":"对我做了介绍那么我想说的是大家如果对我的研究感兴趣呢","timestamps":"[0.00, 0.12, 0.48, 0.60, 0.80, 1.08, 1.64, 1.76, 1.92, 2.08, 2.32, 2.48, 2.64, 3.08, 3.20, 3.28, 3.44, 3.60, 3.72, 3.84, 3.92, 4.12, 4.28, 4.48, 4.72, 4.84]","tokens":["对","我","做","了","介","绍","那","么","我","想","说","的","是","大","家","如","果","对","我","的","研","究","感","兴","趣","呢"]} +---- +./icefall-asr-zipformer-wenetspeech-20230615/test_wavs/DEV_T0000000001.wav +{"text":"重点想谈三个问题首先呢就是这一轮全球金融动荡的表现","timestamps":"[0.00, 0.16, 0.48, 0.68, 0.84, 1.08, 1.20, 1.48, 1.64, 2.08, 2.36, 2.52, 2.64, 2.84, 3.00, 3.16, 3.40, 3.52, 3.72, 3.84, 4.00, 4.16, 4.32, 4.56, 4.84]","tokens":["重","点","想","谈","三","个","问","题","首","先","呢","就","是","这","一","轮","全","球","金","融","动","荡","的","表","现"]} +---- +./icefall-asr-zipformer-wenetspeech-20230615/test_wavs/DEV_T0000000002.wav +{"text":"深入地分析这一次全球金融动荡荡背后的根源","timestamps":"[0.00, 0.12, 0.48, 0.84, 1.08, 1.44, 1.60, 1.84, 2.24, 2.48, 2.76, 2.88, 3.12, 3.24, 3.28, 3.36, 3.60, 3.72, 3.84, 4.16]","tokens":["深","入","地","分","析","这","一","次","全","球","金","融","动","荡","荡","背","后","的","根","源"]} +---- +num threads: 2 +decoding method: greedy_search +Elapsed seconds: 0.338 s +Real time factor (RTF): 0.338 / 15.289 = 0.022 diff --git a/docs/source/onnx/pretrained_models/offline-transducer/code-zipformer/icefall-asr-zipformer-wenetspeech-20230615.txt b/docs/source/onnx/pretrained_models/offline-transducer/code-zipformer/icefall-asr-zipformer-wenetspeech-20230615.txt new file mode 100644 index 000000000..a00da7d24 --- /dev/null +++ b/docs/source/onnx/pretrained_models/offline-transducer/code-zipformer/icefall-asr-zipformer-wenetspeech-20230615.txt @@ -0,0 +1,20 @@ +/Users/fangjun/open-source/sherpa-onnx/sherpa-onnx/csrc/parse-options.cc:Read:361 ./build/bin/sherpa-onnx-offline --tokens=./icefall-asr-zipformer-wenetspeech-20230615/data/lang_char/tokens.txt --encoder=./icefall-asr-zipformer-wenetspeech-20230615/exp/encoder-epoch-12-avg-4.onnx 
--decoder=./icefall-asr-zipformer-wenetspeech-20230615/exp/decoder-epoch-12-avg-4.onnx --joiner=./icefall-asr-zipformer-wenetspeech-20230615/exp/joiner-epoch-12-avg-4.onnx ./icefall-asr-zipformer-wenetspeech-20230615/test_wavs/DEV_T0000000000.wav ./icefall-asr-zipformer-wenetspeech-20230615/test_wavs/DEV_T0000000001.wav ./icefall-asr-zipformer-wenetspeech-20230615/test_wavs/DEV_T0000000002.wav + +OfflineRecognizerConfig(feat_config=OfflineFeatureExtractorConfig(sampling_rate=16000, feature_dim=80), model_config=OfflineModelConfig(transducer=OfflineTransducerModelConfig(encoder_filename="./icefall-asr-zipformer-wenetspeech-20230615/exp/encoder-epoch-12-avg-4.onnx", decoder_filename="./icefall-asr-zipformer-wenetspeech-20230615/exp/decoder-epoch-12-avg-4.onnx", joiner_filename="./icefall-asr-zipformer-wenetspeech-20230615/exp/joiner-epoch-12-avg-4.onnx"), paraformer=OfflineParaformerModelConfig(model=""), nemo_ctc=OfflineNemoEncDecCtcModelConfig(model=""), tokens="./icefall-asr-zipformer-wenetspeech-20230615/data/lang_char/tokens.txt", num_threads=2, debug=False, provider="cpu"), lm_config=OfflineLMConfig(model="", scale=0.5), decoding_method="greedy_search", max_active_paths=4, context_score=1.5) +Creating recognizer ... +Started +Done! + +./icefall-asr-zipformer-wenetspeech-20230615/test_wavs/DEV_T0000000000.wav +{"text":"对我做了介绍那么我想说的是大家如果对我的研究感兴趣呢","timestamps":"[0.00, 0.12, 0.48, 0.64, 0.88, 1.16, 1.64, 1.76, 1.92, 2.08, 2.32, 2.48, 2.64, 3.08, 3.20, 3.40, 3.48, 3.64, 3.76, 3.88, 3.96, 4.12, 4.28, 4.52, 4.72, 4.84]","tokens":["对","我","做","了","介","绍","那","么","我","想","说","的","是","大","家","如","果","对","我","的","研","究","感","兴","趣","呢"]} +---- +./icefall-asr-zipformer-wenetspeech-20230615/test_wavs/DEV_T0000000001.wav +{"text":"重点想谈三个问题首先就是这一轮全球金融动荡的表现","timestamps":"[0.00, 0.16, 0.48, 0.72, 0.92, 1.08, 1.28, 1.52, 1.92, 2.08, 2.52, 2.64, 2.88, 3.04, 3.20, 3.40, 3.56, 3.76, 3.84, 4.00, 4.16, 4.32, 4.56, 4.84]","tokens":["重","点","想","谈","三","个","问","题","首","先","就","是","这","一","轮","全","球","金","融","动","荡","的","表","现"]} +---- +./icefall-asr-zipformer-wenetspeech-20230615/test_wavs/DEV_T0000000002.wav +{"text":"深入地分析这一次全球金融动荡背后的根源","timestamps":"[0.00, 0.32, 0.56, 0.84, 1.12, 1.44, 1.68, 1.84, 2.28, 2.48, 2.76, 2.92, 3.12, 3.28, 3.44, 3.60, 3.72, 3.92, 4.20]","tokens":["深","入","地","分","析","这","一","次","全","球","金","融","动","荡","背","后","的","根","源"]} +---- +num threads: 2 +decoding method: greedy_search +Elapsed seconds: 0.458 s +Real time factor (RTF): 0.458 / 15.289 = 0.030 diff --git a/docs/source/onnx/pretrained_models/offline-transducer/code-zipformer/sherpa-onnx-small-zipformer-ru-2024-09-18.int8.txt b/docs/source/onnx/pretrained_models/offline-transducer/code-zipformer/sherpa-onnx-small-zipformer-ru-2024-09-18.int8.txt new file mode 100644 index 000000000..1008d541f --- /dev/null +++ b/docs/source/onnx/pretrained_models/offline-transducer/code-zipformer/sherpa-onnx-small-zipformer-ru-2024-09-18.int8.txt @@ -0,0 +1,14 @@ +/Users/fangjun/open-source/sherpa-onnx/sherpa-onnx/csrc/parse-options.cc:Read:375 ./build/bin/sherpa-onnx-offline --tokens=./sherpa-onnx-small-zipformer-ru-2024-09-18/tokens.txt --encoder=./sherpa-onnx-small-zipformer-ru-2024-09-18/encoder.int8.onnx --decoder=./sherpa-onnx-small-zipformer-ru-2024-09-18/decoder.onnx --joiner=./sherpa-onnx-small-zipformer-ru-2024-09-18/joiner.int8.onnx --num-threads=1 ./sherpa-onnx-small-zipformer-ru-2024-09-18/test_wavs/1.wav + +OfflineRecognizerConfig(feat_config=FeatureExtractorConfig(sampling_rate=16000, feature_dim=80, low_freq=20, 
high_freq=-400, dither=0), model_config=OfflineModelConfig(transducer=OfflineTransducerModelConfig(encoder_filename="./sherpa-onnx-small-zipformer-ru-2024-09-18/encoder.int8.onnx", decoder_filename="./sherpa-onnx-small-zipformer-ru-2024-09-18/decoder.onnx", joiner_filename="./sherpa-onnx-small-zipformer-ru-2024-09-18/joiner.int8.onnx"), paraformer=OfflineParaformerModelConfig(model=""), nemo_ctc=OfflineNemoEncDecCtcModelConfig(model=""), whisper=OfflineWhisperModelConfig(encoder="", decoder="", language="", task="transcribe", tail_paddings=-1), tdnn=OfflineTdnnModelConfig(model=""), zipformer_ctc=OfflineZipformerCtcModelConfig(model=""), wenet_ctc=OfflineWenetCtcModelConfig(model=""), sense_voice=OfflineSenseVoiceModelConfig(model="", language="auto", use_itn=False), telespeech_ctc="", tokens="./sherpa-onnx-small-zipformer-ru-2024-09-18/tokens.txt", num_threads=1, debug=False, provider="cpu", model_type="", modeling_unit="cjkchar", bpe_vocab=""), lm_config=OfflineLMConfig(model="", scale=0.5), ctc_fst_decoder_config=OfflineCtcFstDecoderConfig(graph="", max_active=3000), decoding_method="greedy_search", max_active_paths=4, hotwords_file="", hotwords_score=1.5, blank_penalty=0, rule_fsts="", rule_fars="") +Creating recognizer ... +Started +Done! + +./sherpa-onnx-small-zipformer-ru-2024-09-18/test_wavs/1.wav +{"lang": "", "emotion": "", "event": "", "text": " родион потапыч высчитывал каждый новый вершок углубления и давно определил про себя", "timestamps": [0.00, 0.20, 0.28, 0.48, 0.68, 0.84, 0.92, 1.04, 1.48, 1.64, 1.76, 1.92, 2.08, 2.16, 2.40, 2.52, 2.60, 2.84, 3.00, 3.04, 3.20, 3.40, 3.48, 3.60, 3.68, 3.80, 3.88, 4.00, 4.12, 4.16, 4.72, 4.92, 5.12, 5.20, 5.48, 5.60, 5.68, 5.92, 6.28, 6.48], "tokens":[" ро", "ди", "он", " по", "та", "п", "ы", "ч", " вы", "с", "чи", "ты", "ва", "л", " ка", "жд", "ый", " но", "в", "ый", " вер", "ш", "о", "к", " у", "г", "лу", "б", "л", "ения", " и", " да", "в", "но", " оп", "ре", "дел", "ил", " про", " себя"], "words": []} +---- +num threads: 1 +decoding method: greedy_search +Elapsed seconds: 0.183 s +Real time factor (RTF): 0.183 / 7.080 = 0.026 diff --git a/docs/source/onnx/pretrained_models/offline-transducer/code-zipformer/sherpa-onnx-small-zipformer-ru-2024-09-18.txt b/docs/source/onnx/pretrained_models/offline-transducer/code-zipformer/sherpa-onnx-small-zipformer-ru-2024-09-18.txt new file mode 100644 index 000000000..a0897bf4d --- /dev/null +++ b/docs/source/onnx/pretrained_models/offline-transducer/code-zipformer/sherpa-onnx-small-zipformer-ru-2024-09-18.txt @@ -0,0 +1,14 @@ +/Users/fangjun/open-source/sherpa-onnx/sherpa-onnx/csrc/parse-options.cc:Read:375 ./build/bin/sherpa-onnx-offline --tokens=./sherpa-onnx-small-zipformer-ru-2024-09-18/tokens.txt --encoder=./sherpa-onnx-small-zipformer-ru-2024-09-18/encoder.onnx --decoder=./sherpa-onnx-small-zipformer-ru-2024-09-18/decoder.onnx --joiner=./sherpa-onnx-small-zipformer-ru-2024-09-18/joiner.onnx --num-threads=1 ./sherpa-onnx-small-zipformer-ru-2024-09-18/test_wavs/1.wav + +OfflineRecognizerConfig(feat_config=FeatureExtractorConfig(sampling_rate=16000, feature_dim=80, low_freq=20, high_freq=-400, dither=0), model_config=OfflineModelConfig(transducer=OfflineTransducerModelConfig(encoder_filename="./sherpa-onnx-small-zipformer-ru-2024-09-18/encoder.onnx", decoder_filename="./sherpa-onnx-small-zipformer-ru-2024-09-18/decoder.onnx", joiner_filename="./sherpa-onnx-small-zipformer-ru-2024-09-18/joiner.onnx"), paraformer=OfflineParaformerModelConfig(model=""), 
nemo_ctc=OfflineNemoEncDecCtcModelConfig(model=""), whisper=OfflineWhisperModelConfig(encoder="", decoder="", language="", task="transcribe", tail_paddings=-1), tdnn=OfflineTdnnModelConfig(model=""), zipformer_ctc=OfflineZipformerCtcModelConfig(model=""), wenet_ctc=OfflineWenetCtcModelConfig(model=""), sense_voice=OfflineSenseVoiceModelConfig(model="", language="auto", use_itn=False), telespeech_ctc="", tokens="./sherpa-onnx-small-zipformer-ru-2024-09-18/tokens.txt", num_threads=1, debug=False, provider="cpu", model_type="", modeling_unit="cjkchar", bpe_vocab=""), lm_config=OfflineLMConfig(model="", scale=0.5), ctc_fst_decoder_config=OfflineCtcFstDecoderConfig(graph="", max_active=3000), decoding_method="greedy_search", max_active_paths=4, hotwords_file="", hotwords_score=1.5, blank_penalty=0, rule_fsts="", rule_fars="") +Creating recognizer ... +Started +Done! + +./sherpa-onnx-small-zipformer-ru-2024-09-18/test_wavs/1.wav +{"lang": "", "emotion": "", "event": "", "text": " родион потапыч высчитывал каждый новый вершок углубления и давно определил про себя", "timestamps": [0.00, 0.20, 0.28, 0.48, 0.68, 0.84, 0.92, 1.04, 1.48, 1.64, 1.76, 1.92, 2.08, 2.16, 2.40, 2.52, 2.60, 2.84, 3.00, 3.04, 3.20, 3.40, 3.48, 3.60, 3.68, 3.80, 3.88, 4.00, 4.12, 4.16, 4.72, 4.92, 5.12, 5.20, 5.48, 5.60, 5.68, 5.92, 6.28, 6.48], "tokens":[" ро", "ди", "он", " по", "та", "п", "ы", "ч", " вы", "с", "чи", "ты", "ва", "л", " ка", "жд", "ый", " но", "в", "ый", " вер", "ш", "о", "к", " у", "г", "лу", "б", "л", "ения", " и", " да", "в", "но", " оп", "ре", "дел", "ил", " про", " себя"], "words": []} +---- +num threads: 1 +decoding method: greedy_search +Elapsed seconds: 0.228 s +Real time factor (RTF): 0.228 / 7.080 = 0.032 diff --git a/docs/source/onnx/pretrained_models/offline-transducer/code-zipformer/sherpa-onnx-zipformer-cantonese-2024-03-13-int8.txt b/docs/source/onnx/pretrained_models/offline-transducer/code-zipformer/sherpa-onnx-zipformer-cantonese-2024-03-13-int8.txt new file mode 100644 index 000000000..4ad010609 --- /dev/null +++ b/docs/source/onnx/pretrained_models/offline-transducer/code-zipformer/sherpa-onnx-zipformer-cantonese-2024-03-13-int8.txt @@ -0,0 +1,17 @@ +/project/sherpa-onnx/csrc/parse-options.cc:Read:361 sherpa-onnx-offline --blank-penalty=1.2 --tokens=./sherpa-onnx-zipformer-cantonese-2024-03-13/tokens.txt --encoder=./sherpa-onnx-zipformer-cantonese-2024-03-13/encoder-epoch-45-avg-35.int8.onnx --decoder=./sherpa-onnx-zipformer-cantonese-2024-03-13/decoder-epoch-45-avg-35.onnx --joiner=./sherpa-onnx-zipformer-cantonese-2024-03-13/joiner-epoch-45-avg-35.int8.onnx ./sherpa-onnx-zipformer-cantonese-2024-03-13/test_wavs/test_wavs_1.wav ./sherpa-onnx-zipformer-cantonese-2024-03-13/test_wavs/test_wavs_2.wav + +OfflineRecognizerConfig(feat_config=OfflineFeatureExtractorConfig(sampling_rate=16000, feature_dim=80), model_config=OfflineModelConfig(transducer=OfflineTransducerModelConfig(encoder_filename="./sherpa-onnx-zipformer-cantonese-2024-03-13/encoder-epoch-45-avg-35.int8.onnx", decoder_filename="./sherpa-onnx-zipformer-cantonese-2024-03-13/decoder-epoch-45-avg-35.onnx", joiner_filename="./sherpa-onnx-zipformer-cantonese-2024-03-13/joiner-epoch-45-avg-35.int8.onnx"), paraformer=OfflineParaformerModelConfig(model=""), nemo_ctc=OfflineNemoEncDecCtcModelConfig(model=""), whisper=OfflineWhisperModelConfig(encoder="", decoder="", language="", task="transcribe", tail_paddings=-1), tdnn=OfflineTdnnModelConfig(model=""), zipformer_ctc=OfflineZipformerCtcModelConfig(model=""), 
wenet_ctc=OfflineWenetCtcModelConfig(model=""), tokens="./sherpa-onnx-zipformer-cantonese-2024-03-13/tokens.txt", num_threads=2, debug=False, provider="cpu", model_type=""), lm_config=OfflineLMConfig(model="", scale=0.5), ctc_fst_decoder_config=OfflineCtcFstDecoderConfig(graph="", max_active=3000), decoding_method="greedy_search", max_active_paths=4, hotwords_file="", hotwords_score=1.5, blank_penalty=1.2) +Creating recognizer ... +Started +Done! + +./sherpa-onnx-zipformer-cantonese-2024-03-13/test_wavs/test_wavs_1.wav +{"text": "啊有冇人知道灣仔活道係點去㗎", "timestamps": [0.00, 0.88, 1.28, 1.52, 1.84, 2.08, 2.32, 2.56, 2.80, 3.04, 3.20, 3.44, 3.68, 3.92], "tokens":["啊", "有", "冇", "人", "知", "道", "灣", "仔", "活", "道", "係", "點", "去", "㗎"]} +---- +./sherpa-onnx-zipformer-cantonese-2024-03-13/test_wavs/test_wavs_2.wav +{"text": "我喺黃大仙九龍塘聯合到當失路啊", "timestamps": [0.00, 0.64, 0.88, 1.12, 1.28, 1.60, 1.80, 2.16, 2.36, 2.56, 2.88, 3.08, 3.32, 3.44, 3.60], "tokens":["我", "喺", "黃", "大", "仙", "九", "龍", "塘", "聯", "合", "到", "當", "失", "路", "啊"]} +---- +num threads: 2 +decoding method: greedy_search +Elapsed seconds: 0.907 s +Real time factor (RTF): 0.907 / 10.320 = 0.088 diff --git a/docs/source/onnx/pretrained_models/offline-transducer/code-zipformer/sherpa-onnx-zipformer-cantonese-2024-03-13.txt b/docs/source/onnx/pretrained_models/offline-transducer/code-zipformer/sherpa-onnx-zipformer-cantonese-2024-03-13.txt new file mode 100644 index 000000000..2b1f3785c --- /dev/null +++ b/docs/source/onnx/pretrained_models/offline-transducer/code-zipformer/sherpa-onnx-zipformer-cantonese-2024-03-13.txt @@ -0,0 +1,17 @@ +/project/sherpa-onnx/csrc/parse-options.cc:Read:361 sherpa-onnx-offline --blank-penalty=1.2 --tokens=./sherpa-onnx-zipformer-cantonese-2024-03-13/tokens.txt --encoder=./sherpa-onnx-zipformer-cantonese-2024-03-13/encoder-epoch-45-avg-35.onnx --decoder=./sherpa-onnx-zipformer-cantonese-2024-03-13/decoder-epoch-45-avg-35.onnx --joiner=./sherpa-onnx-zipformer-cantonese-2024-03-13/joiner-epoch-45-avg-35.onnx ./sherpa-onnx-zipformer-cantonese-2024-03-13/test_wavs/test_wavs_1.wav ./sherpa-onnx-zipformer-cantonese-2024-03-13/test_wavs/test_wavs_2.wav + +OfflineRecognizerConfig(feat_config=OfflineFeatureExtractorConfig(sampling_rate=16000, feature_dim=80), model_config=OfflineModelConfig(transducer=OfflineTransducerModelConfig(encoder_filename="./sherpa-onnx-zipformer-cantonese-2024-03-13/encoder-epoch-45-avg-35.onnx", decoder_filename="./sherpa-onnx-zipformer-cantonese-2024-03-13/decoder-epoch-45-avg-35.onnx", joiner_filename="./sherpa-onnx-zipformer-cantonese-2024-03-13/joiner-epoch-45-avg-35.onnx"), paraformer=OfflineParaformerModelConfig(model=""), nemo_ctc=OfflineNemoEncDecCtcModelConfig(model=""), whisper=OfflineWhisperModelConfig(encoder="", decoder="", language="", task="transcribe", tail_paddings=-1), tdnn=OfflineTdnnModelConfig(model=""), zipformer_ctc=OfflineZipformerCtcModelConfig(model=""), wenet_ctc=OfflineWenetCtcModelConfig(model=""), tokens="./sherpa-onnx-zipformer-cantonese-2024-03-13/tokens.txt", num_threads=2, debug=False, provider="cpu", model_type=""), lm_config=OfflineLMConfig(model="", scale=0.5), ctc_fst_decoder_config=OfflineCtcFstDecoderConfig(graph="", max_active=3000), decoding_method="greedy_search", max_active_paths=4, hotwords_file="", hotwords_score=1.5, blank_penalty=1.2) +Creating recognizer ... +Started +Done! 
+ +./sherpa-onnx-zipformer-cantonese-2024-03-13/test_wavs/test_wavs_1.wav +{"text": "啊有冇人知道灣仔活道係點去㗎", "timestamps": [0.00, 0.88, 1.28, 1.52, 1.84, 2.08, 2.32, 2.56, 2.80, 3.04, 3.20, 3.44, 3.68, 3.92], "tokens":["啊", "有", "冇", "人", "知", "道", "灣", "仔", "活", "道", "係", "點", "去", "㗎"]} +---- +./sherpa-onnx-zipformer-cantonese-2024-03-13/test_wavs/test_wavs_2.wav +{"text": "我喺黃大仙九龍塘聯合到當失路啊", "timestamps": [0.00, 0.64, 0.88, 1.12, 1.28, 1.60, 1.80, 2.16, 2.36, 2.56, 2.88, 3.08, 3.32, 3.44, 3.60], "tokens":["我", "喺", "黃", "大", "仙", "九", "龍", "塘", "聯", "合", "到", "當", "失", "路", "啊"]} +---- +num threads: 2 +decoding method: greedy_search +Elapsed seconds: 1.349 s +Real time factor (RTF): 1.349 / 10.320 = 0.131 diff --git a/docs/source/onnx/pretrained_models/offline-transducer/code-zipformer/sherpa-onnx-zipformer-en-2023-03-30-int8.txt b/docs/source/onnx/pretrained_models/offline-transducer/code-zipformer/sherpa-onnx-zipformer-en-2023-03-30-int8.txt new file mode 100644 index 000000000..6b333929b --- /dev/null +++ b/docs/source/onnx/pretrained_models/offline-transducer/code-zipformer/sherpa-onnx-zipformer-en-2023-03-30-int8.txt @@ -0,0 +1,24 @@ +OfflineRecognizerConfig(feat_config=OfflineFeatureExtractorConfig(sampling_rate=16000, feature_dim=80), model_config=OfflineModelConfig(transducer=OfflineTransducerModelConfig(encoder_filename="./sherpa-onnx-zipformer-en-2023-03-30/encoder-epoch-99-avg-1.int8.onnx", decoder_filename="./sherpa-onnx-zipformer-en-2023-03-30/decoder-epoch-99-avg-1.onnx", joiner_filename="./sherpa-onnx-zipformer-en-2023-03-30/joiner-epoch-99-avg-1.int8.onnx"), paraformer=OfflineParaformerModelConfig(model=""), tokens="./sherpa-onnx-zipformer-en-2023-03-30/tokens.txt", num_threads=2, debug=False), decoding_method="greedy_search") +Creating recognizer ... +2023-04-01 06:49:34.370117205 [E:onnxruntime:, env.cc:251 ThreadMain] pthread_setaffinity_np failed for thread: 607732, index: 16, mask: {17, 53, }, error code: 22 error msg: Invalid argument. Specify the number of threads explicitly so the affinity is not set. +2023-04-01 06:49:34.370115197 [E:onnxruntime:, env.cc:251 ThreadMain] pthread_setaffinity_np failed for thread: 607731, index: 15, mask: {16, 52, }, error code: 22 error msg: Invalid argument. Specify the number of threads explicitly so the affinity is not set. +Started +Creating a resampler: + in_sample_rate: 8000 + output_sample_rate: 16000 + +Done! 
+ +./sherpa-onnx-zipformer-en-2023-03-30/test_wavs/0.wav + AFTER EARLY NIGHTFALL THE YELLOW LAMPS WOULD LIGHT UP HERE AND THERE THE SQUALID QUARTER OF THE BROTHELS +---- +./sherpa-onnx-zipformer-en-2023-03-30/test_wavs/1.wav + GOD AS A DIRECT CONSEQUENCE OF THE SIN WHICH MAN THUS PUNISHED HAD GIVEN HER A LOVELY CHILD WHOSE PLACE WAS ON THAT SAME DISHONOURED BOSOM TO CONNECT HER PARENT FOR EVER WITH THE RACE AND DESCENT OF MORTALS AND TO BE FINALLY A BLESSED SOUL IN HEAVEN +---- +./sherpa-onnx-zipformer-en-2023-03-30/test_wavs/8k.wav + YET THESE THOUGHTS AFFECTED HESTER PRYNNE LESS WITH HOPE THAN APPREHENSION +---- +num threads: 2 +decoding method: greedy_search +Elapsed seconds: 1.710 s +Real time factor (RTF): 1.710 / 28.165 = 0.061 diff --git a/docs/source/onnx/pretrained_models/offline-transducer/code-zipformer/sherpa-onnx-zipformer-en-2023-03-30.txt b/docs/source/onnx/pretrained_models/offline-transducer/code-zipformer/sherpa-onnx-zipformer-en-2023-03-30.txt new file mode 100644 index 000000000..c1506934b --- /dev/null +++ b/docs/source/onnx/pretrained_models/offline-transducer/code-zipformer/sherpa-onnx-zipformer-en-2023-03-30.txt @@ -0,0 +1,24 @@ +OfflineRecognizerConfig(feat_config=OfflineFeatureExtractorConfig(sampling_rate=16000, feature_dim=80), model_config=OfflineModelConfig(transducer=OfflineTransducerModelConfig(encoder_filename="./sherpa-onnx-zipformer-en-2023-03-30/encoder-epoch-99-avg-1.onnx", decoder_filename="./sherpa-onnx-zipformer-en-2023-03-30/decoder-epoch-99-avg-1.onnx", joiner_filename="./sherpa-onnx-zipformer-en-2023-03-30/joiner-epoch-99-avg-1.onnx"), paraformer=OfflineParaformerModelConfig(model=""), tokens="./sherpa-onnx-zipformer-en-2023-03-30/tokens.txt", num_threads=2, debug=False), decoding_method="greedy_search") +Creating recognizer ... +2023-04-01 06:47:56.620698024 [E:onnxruntime:, env.cc:251 ThreadMain] pthread_setaffinity_np failed for thread: 607690, index: 15, mask: {16, 52, }, error code: 22 error msg: Invalid argument. Specify the number of threads explicitly so the affinity is not set. +2023-04-01 06:47:56.620700026 [E:onnxruntime:, env.cc:251 ThreadMain] pthread_setaffinity_np failed for thread: 607691, index: 16, mask: {17, 53, }, error code: 22 error msg: Invalid argument. Specify the number of threads explicitly so the affinity is not set. +Started +Creating a resampler: + in_sample_rate: 8000 + output_sample_rate: 16000 + +Done! 
+ +./sherpa-onnx-zipformer-en-2023-03-30/test_wavs/0.wav + AFTER EARLY NIGHTFALL THE YELLOW LAMPS WOULD LIGHT UP HERE AND THERE THE SQUALID QUARTER OF THE BROTHELS +---- +./sherpa-onnx-zipformer-en-2023-03-30/test_wavs/1.wav + GOD AS A DIRECT CONSEQUENCE OF THE SIN WHICH MAN THUS PUNISHED HAD GIVEN HER A LOVELY CHILD WHOSE PLACE WAS ON THAT SAME DISHONOURED BOSOM TO CONNECT HER PARENT FOR EVER WITH THE RACE AND DESCENT OF MORTALS AND TO BE FINALLY A BLESSED SOUL IN HEAVEN +---- +./sherpa-onnx-zipformer-en-2023-03-30/test_wavs/8k.wav + YET THESE THOUGHTS AFFECTED HESTER PRYNNE LESS WITH HOPE THAN APPREHENSION +---- +num threads: 2 +decoding method: greedy_search +Elapsed seconds: 1.950 s +Real time factor (RTF): 1.950 / 28.165 = 0.069 diff --git a/docs/source/onnx/pretrained_models/offline-transducer/code-zipformer/sherpa-onnx-zipformer-en-2023-04-01-int8.txt b/docs/source/onnx/pretrained_models/offline-transducer/code-zipformer/sherpa-onnx-zipformer-en-2023-04-01-int8.txt new file mode 100644 index 000000000..6c6a07162 --- /dev/null +++ b/docs/source/onnx/pretrained_models/offline-transducer/code-zipformer/sherpa-onnx-zipformer-en-2023-04-01-int8.txt @@ -0,0 +1,24 @@ +OfflineRecognizerConfig(feat_config=OfflineFeatureExtractorConfig(sampling_rate=16000, feature_dim=80), model_config=OfflineModelConfig(transducer=OfflineTransducerModelConfig(encoder_filename="./sherpa-onnx-zipformer-en-2023-04-01/encoder-epoch-99-avg-1.int8.onnx", decoder_filename="./sherpa-onnx-zipformer-en-2023-04-01/decoder-epoch-99-avg-1.onnx", joiner_filename="./sherpa-onnx-zipformer-en-2023-04-01/joiner-epoch-99-avg-1.int8.onnx"), paraformer=OfflineParaformerModelConfig(model=""), tokens="./sherpa-onnx-zipformer-en-2023-04-01/tokens.txt", num_threads=2, debug=False), decoding_method="greedy_search") +Creating recognizer ... +2023-04-01 14:42:00.407939001 [E:onnxruntime:, env.cc:251 ThreadMain] pthread_setaffinity_np failed for thread: 638195, index: 15, mask: {16, 52, }, error code: 22 error msg: Invalid argument. Specify the number of threads explicitly so the affinity is not set. +2023-04-01 14:42:00.407940827 [E:onnxruntime:, env.cc:251 ThreadMain] pthread_setaffinity_np failed for thread: 638196, index: 16, mask: {17, 53, }, error code: 22 error msg: Invalid argument. Specify the number of threads explicitly so the affinity is not set. +Started +Creating a resampler: + in_sample_rate: 8000 + output_sample_rate: 16000 + +Done! 
+ +./sherpa-onnx-zipformer-en-2023-04-01/test_wavs/0.wav + AFTER EARLY NIGHTFALL THE YELLOW LAMPS WOULD LIGHT UP HERE AND THERE THE SQUALID QUARTER OF THE BROTHELS +---- +./sherpa-onnx-zipformer-en-2023-04-01/test_wavs/1.wav + GOD AS A DIRECT CONSEQUENCE OF THE SIN WHICH MAN THUS PUNISHED HAD GIVEN HER A LOVELY CHILD WHOSE PLACE WAS ON THAT SAME DISHONOURED BOSOM TO CONNECT HER PARENT FOR EVER WITH THE RACE AND DESCENT OF MORTALS AND TO BE FINALLY A BLESSED SOUL IN HEAVEN +---- +./sherpa-onnx-zipformer-en-2023-04-01/test_wavs/8k.wav + YET THESE THOUGHTS AFFECTED HESTER PRYNNE LESS WITH HOPE THAN APPREHENSION +---- +num threads: 2 +decoding method: greedy_search +Elapsed seconds: 1.478 s +Real time factor (RTF): 1.478 / 28.165 = 0.052 diff --git a/docs/source/onnx/pretrained_models/offline-transducer/code-zipformer/sherpa-onnx-zipformer-en-2023-04-01.txt b/docs/source/onnx/pretrained_models/offline-transducer/code-zipformer/sherpa-onnx-zipformer-en-2023-04-01.txt new file mode 100644 index 000000000..71d14b8b3 --- /dev/null +++ b/docs/source/onnx/pretrained_models/offline-transducer/code-zipformer/sherpa-onnx-zipformer-en-2023-04-01.txt @@ -0,0 +1,24 @@ +OfflineRecognizerConfig(feat_config=OfflineFeatureExtractorConfig(sampling_rate=16000, feature_dim=80), model_config=OfflineModelConfig(transducer=OfflineTransducerModelConfig(encoder_filename="./sherpa-onnx-zipformer-en-2023-04-01/encoder-epoch-99-avg-1.onnx", decoder_filename="./sherpa-onnx-zipformer-en-2023-04-01/decoder-epoch-99-avg-1.onnx", joiner_filename="./sherpa-onnx-zipformer-en-2023-04-01/joiner-epoch-99-avg-1.onnx"), paraformer=OfflineParaformerModelConfig(model=""), tokens="./sherpa-onnx-zipformer-en-2023-04-01/tokens.txt", num_threads=2, debug=False), decoding_method="greedy_search") +Creating recognizer ... +2023-04-01 14:40:56.353883875 [E:onnxruntime:, env.cc:251 ThreadMain] pthread_setaffinity_np failed for thread: 638155, index: 16, mask: {17, 53, }, error code: 22 error msg: Invalid argument. Specify the number of threads explicitly so the affinity is not set. +2023-04-01 14:40:56.353881478 [E:onnxruntime:, env.cc:251 ThreadMain] pthread_setaffinity_np failed for thread: 638154, index: 15, mask: {16, 52, }, error code: 22 error msg: Invalid argument. Specify the number of threads explicitly so the affinity is not set. +Started +Creating a resampler: + in_sample_rate: 8000 + output_sample_rate: 16000 + +Done! 
+ +./sherpa-onnx-zipformer-en-2023-04-01/test_wavs/0.wav + AFTER EARLY NIGHTFALL THE YELLOW LAMPS WOULD LIGHT UP HERE AND THERE THE SQUALID QUARTER OF THE BROTHELS +---- +./sherpa-onnx-zipformer-en-2023-04-01/test_wavs/1.wav + GOD AS A DIRECT CONSEQUENCE OF THE SIN WHICH MAN THUS PUNISHED HAD GIVEN HER A LOVELY CHILD WHOSE PLACE WAS ON THAT SAME DISHONOURED BOSOM TO CONNECT HER PARENT FOR EVER WITH THE RACE AND DESCENT OF MORTALS AND TO BE FINALLY A BLESSED SOUL IN HEAVEN +---- +./sherpa-onnx-zipformer-en-2023-04-01/test_wavs/8k.wav + YET THESE THOUGHTS AFFECTED HESTER PRYNNE LESS WITH HOPE THAN APPREHENSION +---- +num threads: 2 +decoding method: greedy_search +Elapsed seconds: 2.151 s +Real time factor (RTF): 2.151 / 28.165 = 0.076 diff --git a/docs/source/onnx/pretrained_models/offline-transducer/code-zipformer/sherpa-onnx-zipformer-en-2023-06-26-int8.txt b/docs/source/onnx/pretrained_models/offline-transducer/code-zipformer/sherpa-onnx-zipformer-en-2023-06-26-int8.txt new file mode 100644 index 000000000..defee5e7f --- /dev/null +++ b/docs/source/onnx/pretrained_models/offline-transducer/code-zipformer/sherpa-onnx-zipformer-en-2023-06-26-int8.txt @@ -0,0 +1,24 @@ +/Users/fangjun/open-source/sherpa-onnx/sherpa-onnx/csrc/parse-options.cc:Read:361 ./build/bin/sherpa-onnx-offline --tokens=./sherpa-onnx-zipformer-en-2023-06-26/tokens.txt --encoder=./sherpa-onnx-zipformer-en-2023-06-26/encoder-epoch-99-avg-1.int8.onnx --decoder=./sherpa-onnx-zipformer-en-2023-06-26/decoder-epoch-99-avg-1.onnx --joiner=./sherpa-onnx-zipformer-en-2023-06-26/joiner-epoch-99-avg-1.int8.onnx ./sherpa-onnx-zipformer-en-2023-06-26/test_wavs/0.wav ./sherpa-onnx-zipformer-en-2023-06-26/test_wavs/1.wav ./sherpa-onnx-zipformer-en-2023-06-26/test_wavs/8k.wav + +OfflineRecognizerConfig(feat_config=OfflineFeatureExtractorConfig(sampling_rate=16000, feature_dim=80), model_config=OfflineModelConfig(transducer=OfflineTransducerModelConfig(encoder_filename="./sherpa-onnx-zipformer-en-2023-06-26/encoder-epoch-99-avg-1.int8.onnx", decoder_filename="./sherpa-onnx-zipformer-en-2023-06-26/decoder-epoch-99-avg-1.onnx", joiner_filename="./sherpa-onnx-zipformer-en-2023-06-26/joiner-epoch-99-avg-1.int8.onnx"), paraformer=OfflineParaformerModelConfig(model=""), nemo_ctc=OfflineNemoEncDecCtcModelConfig(model=""), tokens="./sherpa-onnx-zipformer-en-2023-06-26/tokens.txt", num_threads=2, debug=False, provider="cpu"), lm_config=OfflineLMConfig(model="", scale=0.5), decoding_method="greedy_search", max_active_paths=4, context_score=1.5) +Creating recognizer ... +Started +/Users/fangjun/open-source/sherpa-onnx/sherpa-onnx/csrc/offline-stream.cc:AcceptWaveformImpl:108 Creating a resampler: + in_sample_rate: 8000 + output_sample_rate: 16000 + +Done! 
+ +./sherpa-onnx-zipformer-en-2023-06-26/test_wavs/0.wav +{"text":" AFTER EARLY NIGHTFALL THE YELLOW LAMPS WOULD LIGHT UP HERE AND THERE THE SQUALID QUARTER OF THE BROTHELS","timestamps":"[0.00, 0.56, 0.64, 0.80, 1.08, 1.36, 1.40, 1.52, 1.68, 1.84, 1.96, 2.04, 2.20, 2.32, 2.40, 2.48, 2.60, 2.76, 3.04, 3.28, 3.40, 3.56, 3.76, 4.08, 4.24, 4.32, 4.48, 4.64, 4.80, 4.84, 5.00, 5.04, 5.28, 5.40, 5.56, 5.60, 5.76, 5.96, 6.12]","tokens":[" AFTER"," E","AR","LY"," NIGHT","F","A","LL"," THE"," YE","LL","OW"," LA","M","P","S"," WOULD"," LIGHT"," UP"," HE","RE"," AND"," THERE"," THE"," S","QUA","LI","D"," ","QUA","R","TER"," OF"," THE"," B","RO","TH","EL","S"]} +---- +./sherpa-onnx-zipformer-en-2023-06-26/test_wavs/1.wav +{"text":" GOD AS A DIRECT CONSEQUENCE OF THE SIN WHICH MAN THUS PUNISHED HAD GIVEN HER A LOVELY CHILD WHOSE PLACE WAS ON THAT SAME DISHONORED BOSOM TO CONNECT HER PARENT FOREVER WITH THE RACE AND DESCENT OF MORTALS AND TO BE FINALLY A BLESSED SOUL IN HEAVEN","timestamps":"[0.00, 0.24, 0.56, 0.76, 0.92, 1.04, 1.16, 1.20, 1.36, 1.52, 1.64, 1.80, 1.88, 2.00, 2.16, 2.32, 2.40, 2.64, 2.88, 3.12, 3.24, 3.48, 3.56, 3.72, 3.92, 4.12, 4.40, 4.52, 4.72, 4.96, 5.12, 5.40, 5.64, 6.12, 6.28, 6.52, 6.88, 7.12, 7.32, 7.60, 7.92, 8.16, 8.28, 8.40, 8.48, 8.64, 8.76, 8.88, 9.04, 9.28, 9.44, 9.52, 9.60, 9.72, 9.92, 9.96, 10.16, 10.48, 10.72, 10.80, 11.04, 11.20, 11.36, 11.56, 11.76, 12.00, 12.12, 12.28, 12.32, 12.52, 12.72, 12.84, 12.92, 13.04, 13.20, 13.44, 13.68, 13.84, 14.00, 14.16, 14.28, 14.40, 14.56, 14.72, 14.76, 15.00, 15.28, 15.48, 15.68, 15.96, 16.16, 16.20, 16.28]","tokens":[" GO","D"," AS"," A"," DI","RE","C","T"," CON","SE","QUE","N","CE"," OF"," THE"," S","IN"," WHICH"," MAN"," TH","US"," P","UN","ISH","ED"," HAD"," GIVE","N"," HER"," A"," LOVE","LY"," CHILD"," WHO","SE"," PLACE"," WAS"," ON"," THAT"," SAME"," DIS","HO","N","OR","ED"," BO","S","OM"," TO"," CON","NE","C","T"," HER"," P","AR","ENT"," FOR","E","VER"," WITH"," THE"," RA","CE"," AND"," DE","S","C","ENT"," OF"," MO","R","T","AL","S"," AND"," TO"," BE"," FI","N","AL","LY"," A"," B","LESS","ED"," SO","UL"," IN"," HE","A","VE","N"]} +---- +./sherpa-onnx-zipformer-en-2023-06-26/test_wavs/8k.wav +{"text":" YET THESE THOUGHTS AFFECTED HESTER PRYNNE LESS WITH HOPE THAN APPREHENSION","timestamps":"[0.00, 0.24, 0.40, 0.60, 0.80, 1.04, 1.16, 1.28, 1.36, 1.44, 1.48, 1.68, 1.76, 1.88, 2.00, 2.08, 2.24, 2.28, 2.48, 2.52, 2.80, 3.08, 3.28, 3.52, 3.68, 3.84, 3.96, 4.12, 4.20, 4.32, 4.44]","tokens":[" YE","T"," THE","SE"," THOUGHT","S"," A","FF","E","C","TED"," HE","S","TER"," P","RY","N","NE"," ","LESS"," WITH"," HO","PE"," THAN"," A","PP","RE","HE","N","S","ION"]} +---- +num threads: 2 +decoding method: greedy_search +Elapsed seconds: 1.106 s +Real time factor (RTF): 1.106 / 28.165 = 0.039 diff --git a/docs/source/onnx/pretrained_models/offline-transducer/code-zipformer/sherpa-onnx-zipformer-en-2023-06-26.txt b/docs/source/onnx/pretrained_models/offline-transducer/code-zipformer/sherpa-onnx-zipformer-en-2023-06-26.txt new file mode 100644 index 000000000..9ba6d8735 --- /dev/null +++ b/docs/source/onnx/pretrained_models/offline-transducer/code-zipformer/sherpa-onnx-zipformer-en-2023-06-26.txt @@ -0,0 +1,24 @@ +/Users/fangjun/open-source/sherpa-onnx/sherpa-onnx/csrc/parse-options.cc:Read:361 ./build/bin/sherpa-onnx-offline --tokens=./sherpa-onnx-zipformer-en-2023-06-26/tokens.txt --encoder=./sherpa-onnx-zipformer-en-2023-06-26/encoder-epoch-99-avg-1.onnx --decoder=./sherpa-onnx-zipformer-en-2023-06-26/decoder-epoch-99-avg-1.onnx 
--joiner=./sherpa-onnx-zipformer-en-2023-06-26/joiner-epoch-99-avg-1.onnx ./sherpa-onnx-zipformer-en-2023-06-26/test_wavs/0.wav ./sherpa-onnx-zipformer-en-2023-06-26/test_wavs/1.wav ./sherpa-onnx-zipformer-en-2023-06-26/test_wavs/8k.wav + +OfflineRecognizerConfig(feat_config=OfflineFeatureExtractorConfig(sampling_rate=16000, feature_dim=80), model_config=OfflineModelConfig(transducer=OfflineTransducerModelConfig(encoder_filename="./sherpa-onnx-zipformer-en-2023-06-26/encoder-epoch-99-avg-1.onnx", decoder_filename="./sherpa-onnx-zipformer-en-2023-06-26/decoder-epoch-99-avg-1.onnx", joiner_filename="./sherpa-onnx-zipformer-en-2023-06-26/joiner-epoch-99-avg-1.onnx"), paraformer=OfflineParaformerModelConfig(model=""), nemo_ctc=OfflineNemoEncDecCtcModelConfig(model=""), tokens="./sherpa-onnx-zipformer-en-2023-06-26/tokens.txt", num_threads=2, debug=False, provider="cpu"), lm_config=OfflineLMConfig(model="", scale=0.5), decoding_method="greedy_search", max_active_paths=4, context_score=1.5) +Creating recognizer ... +Started +/Users/fangjun/open-source/sherpa-onnx/sherpa-onnx/csrc/offline-stream.cc:AcceptWaveformImpl:108 Creating a resampler: + in_sample_rate: 8000 + output_sample_rate: 16000 + +Done! + +./sherpa-onnx-zipformer-en-2023-06-26/test_wavs/0.wav +{"text":" AFTER EARLY NIGHTFALL THE YELLOW LAMPS WOULD LIGHT UP HERE AND THERE THE SQUALID QUARTER OF THE BROTHELS","timestamps":"[0.00, 0.56, 0.64, 0.80, 1.08, 1.36, 1.40, 1.52, 1.68, 1.84, 1.96, 2.04, 2.20, 2.32, 2.40, 2.48, 2.60, 2.80, 3.04, 3.28, 3.40, 3.56, 3.76, 4.08, 4.24, 4.32, 4.48, 4.64, 4.80, 4.84, 5.00, 5.04, 5.28, 5.40, 5.56, 5.60, 5.76, 5.96, 6.12]","tokens":[" AFTER"," E","AR","LY"," NIGHT","F","A","LL"," THE"," YE","LL","OW"," LA","M","P","S"," WOULD"," LIGHT"," UP"," HE","RE"," AND"," THERE"," THE"," S","QUA","LI","D"," ","QUA","R","TER"," OF"," THE"," B","RO","TH","EL","S"]} +---- +./sherpa-onnx-zipformer-en-2023-06-26/test_wavs/1.wav +{"text":" GOD AS A DIRECT CONSEQUENCE OF THE SIN WHICH MAN THUS PUNISHED HAD GIVEN HER A LOVELY CHILD WHOSE PLACE WAS ON THAT SAME DISHONORED BOSOM TO CONNECT HER PARENT FOREVER WITH THE RACE AND DESCENT OF MORTALS AND TO BE FINALLY A BLESSED SOUL IN HEAVEN","timestamps":"[0.00, 0.24, 0.56, 0.76, 0.92, 1.04, 1.16, 1.20, 1.36, 1.52, 1.64, 1.80, 1.88, 2.00, 2.16, 2.32, 2.40, 2.64, 2.88, 3.12, 3.24, 3.48, 3.56, 3.72, 3.92, 4.12, 4.40, 4.52, 4.72, 4.96, 5.16, 5.36, 5.64, 6.12, 6.28, 6.52, 6.88, 7.12, 7.32, 7.56, 7.92, 8.16, 8.28, 8.40, 8.48, 8.64, 8.76, 8.88, 9.04, 9.28, 9.44, 9.52, 9.60, 9.72, 9.92, 9.96, 10.16, 10.48, 10.72, 10.80, 11.04, 11.20, 11.36, 11.56, 11.76, 12.00, 12.12, 12.28, 12.32, 12.52, 12.72, 12.84, 12.92, 13.04, 13.20, 13.44, 13.68, 13.84, 14.00, 14.16, 14.28, 14.40, 14.56, 14.72, 14.76, 15.00, 15.28, 15.48, 15.68, 15.96, 16.16, 16.20, 16.28]","tokens":[" GO","D"," AS"," A"," DI","RE","C","T"," CON","SE","QUE","N","CE"," OF"," THE"," S","IN"," WHICH"," MAN"," TH","US"," P","UN","ISH","ED"," HAD"," GIVE","N"," HER"," A"," LOVE","LY"," CHILD"," WHO","SE"," PLACE"," WAS"," ON"," THAT"," SAME"," DIS","HO","N","OR","ED"," BO","S","OM"," TO"," CON","NE","C","T"," HER"," P","AR","ENT"," FOR","E","VER"," WITH"," THE"," RA","CE"," AND"," DE","S","C","ENT"," OF"," MO","R","T","AL","S"," AND"," TO"," BE"," FI","N","AL","LY"," A"," B","LESS","ED"," SO","UL"," IN"," HE","A","VE","N"]} +---- +./sherpa-onnx-zipformer-en-2023-06-26/test_wavs/8k.wav +{"text":" YET THESE THOUGHTS AFFECTED HESTER PRYNNE LESS WITH HOPE THAN APPREHENSION","timestamps":"[0.00, 0.24, 0.40, 0.60, 0.80, 1.04, 1.16, 1.28, 
1.36, 1.44, 1.48, 1.68, 1.76, 1.88, 2.00, 2.12, 2.24, 2.28, 2.48, 2.52, 2.80, 3.08, 3.28, 3.52, 3.68, 3.84, 3.96, 4.12, 4.20, 4.32, 4.44]","tokens":[" YE","T"," THE","SE"," THOUGHT","S"," A","FF","E","C","TED"," HE","S","TER"," P","RY","N","NE"," ","LESS"," WITH"," HO","PE"," THAN"," A","PP","RE","HE","N","S","ION"]} +---- +num threads: 2 +decoding method: greedy_search +Elapsed seconds: 1.301 s +Real time factor (RTF): 1.301 / 28.165 = 0.046 diff --git a/docs/source/onnx/pretrained_models/offline-transducer/code-zipformer/sherpa-onnx-zipformer-gigaspeech-2023-12-12-int8.txt b/docs/source/onnx/pretrained_models/offline-transducer/code-zipformer/sherpa-onnx-zipformer-gigaspeech-2023-12-12-int8.txt new file mode 100644 index 000000000..a14dfa330 --- /dev/null +++ b/docs/source/onnx/pretrained_models/offline-transducer/code-zipformer/sherpa-onnx-zipformer-gigaspeech-2023-12-12-int8.txt @@ -0,0 +1,20 @@ +/Users/fangjun/open-source/sherpa-onnx/sherpa-onnx/csrc/parse-options.cc:Read:361 ./build/bin/sherpa-onnx-offline --tokens=./sherpa-onnx-zipformer-gigaspeech-2023-12-12/tokens.txt --encoder=./sherpa-onnx-zipformer-gigaspeech-2023-12-12/encoder-epoch-30-avg-1.int8.onnx --decoder=./sherpa-onnx-zipformer-gigaspeech-2023-12-12/decoder-epoch-30-avg-1.onnx --joiner=./sherpa-onnx-zipformer-gigaspeech-2023-12-12/joiner-epoch-30-avg-1.int8.onnx ./sherpa-onnx-zipformer-gigaspeech-2023-12-12/test_wavs/1089-134686-0001.wav ./sherpa-onnx-zipformer-gigaspeech-2023-12-12/test_wavs/1221-135766-0001.wav ./sherpa-onnx-zipformer-gigaspeech-2023-12-12/test_wavs/1221-135766-0002.wav + +OfflineRecognizerConfig(feat_config=OfflineFeatureExtractorConfig(sampling_rate=16000, feature_dim=80), model_config=OfflineModelConfig(transducer=OfflineTransducerModelConfig(encoder_filename="./sherpa-onnx-zipformer-gigaspeech-2023-12-12/encoder-epoch-30-avg-1.int8.onnx", decoder_filename="./sherpa-onnx-zipformer-gigaspeech-2023-12-12/decoder-epoch-30-avg-1.onnx", joiner_filename="./sherpa-onnx-zipformer-gigaspeech-2023-12-12/joiner-epoch-30-avg-1.int8.onnx"), paraformer=OfflineParaformerModelConfig(model=""), nemo_ctc=OfflineNemoEncDecCtcModelConfig(model=""), whisper=OfflineWhisperModelConfig(encoder="", decoder="", language="", task="transcribe", tail_paddings=-1), tdnn=OfflineTdnnModelConfig(model=""), zipformer_ctc=OfflineZipformerCtcModelConfig(model=""), wenet_ctc=OfflineWenetCtcModelConfig(model=""), tokens="./sherpa-onnx-zipformer-gigaspeech-2023-12-12/tokens.txt", num_threads=2, debug=False, provider="cpu", model_type=""), lm_config=OfflineLMConfig(model="", scale=0.5), ctc_fst_decoder_config=OfflineCtcFstDecoderConfig(graph="", max_active=3000), decoding_method="greedy_search", max_active_paths=4, hotwords_file="", hotwords_score=1.5) +Creating recognizer ... +Started +Done! 
+ +./sherpa-onnx-zipformer-gigaspeech-2023-12-12/test_wavs/1089-134686-0001.wav +{"text": " AFTER EARLY NIGHTFALL THE YELLOW LAMPS WOULD LIGHT UP HERE AND THERE THE SQUALID QUARTER OF THE BROTHELS", "timestamps": [0.00, 0.36, 0.52, 0.68, 0.96, 1.00, 1.08, 1.28, 1.40, 1.48, 1.60, 1.76, 1.80, 1.88, 1.92, 2.00, 2.20, 2.32, 2.36, 2.48, 2.60, 2.80, 2.84, 2.92, 3.12, 3.32, 3.56, 3.76, 4.04, 4.24, 4.32, 4.40, 4.56, 4.80, 4.92, 5.08, 5.36, 5.48, 5.64, 5.72, 5.88, 6.04, 6.24], "tokens":[" AFTER", " E", "AR", "LY", " ", "N", "IGHT", "F", "AL", "L", " THE", " ", "Y", "E", "LL", "OW", " LA", "M", "P", "S", " WOULD", " ", "L", "IGHT", " UP", " HERE", " AND", " THERE", " THE", " S", "QU", "AL", "ID", " QU", "AR", "TER", " OF", " THE", " B", "RO", "TH", "EL", "S"]} +---- +./sherpa-onnx-zipformer-gigaspeech-2023-12-12/test_wavs/1221-135766-0001.wav +{"text": " GOD AS A DIRECT CONSEQUENCE OF THE SIN WHICH MAN THUS PUNISHED HAD GIVEN HER A LOVELY CHILD WHOSE PLACE WAS ON THAT SAME DISHONORED BOSOM TO CONNECT HER PARENT FOR EVER WITH THE RACE AND DESCENT OF MORTALS AND TO BE FINALLY A BLESSED SOUL IN HEAVEN", "timestamps": [0.00, 0.16, 0.40, 0.68, 0.84, 0.96, 1.08, 1.12, 1.32, 1.52, 1.68, 1.76, 2.00, 2.12, 2.28, 2.40, 2.64, 2.92, 3.20, 3.32, 3.52, 3.64, 3.76, 3.96, 4.12, 4.36, 4.52, 4.72, 4.92, 5.16, 5.40, 5.64, 5.76, 5.88, 6.12, 6.28, 6.52, 6.84, 7.08, 7.32, 7.60, 7.92, 8.12, 8.24, 8.36, 8.48, 8.64, 8.76, 8.88, 9.12, 9.32, 9.48, 9.56, 9.60, 9.76, 10.00, 10.12, 10.20, 10.44, 10.68, 10.80, 11.00, 11.20, 11.36, 11.52, 11.76, 12.00, 12.12, 12.24, 12.28, 12.52, 12.72, 12.84, 12.96, 13.04, 13.24, 13.44, 13.64, 13.76, 14.00, 14.08, 14.24, 14.52, 14.68, 14.80, 15.00, 15.04, 15.28, 15.48, 15.76, 16.00, 16.12, 16.16, 16.32], "tokens":[" GO", "D", " AS", " A", " DI", "RE", "C", "T", " CON", "SE", "QU", "ENCE", " OF", " THE", " S", "IN", " WHICH", " MAN", " TH", "US", " P", "UN", "ISH", "ED", " HAD", " GIVE", "N", " HER", " A", " LOVE", "LY", " CHI", "L", "D", " WHO", "SE", " PLACE", " WAS", " ON", " THAT", " SAME", " DIS", "HO", "N", "OR", "ED", " BO", "S", "OM", " TO", " CON", "NE", "C", "T", " HER", " PA", "R", "ENT", " FOR", " E", "VER", " WITH", " THE", " RA", "CE", " AND", " DE", "S", "C", "ENT", " OF", " MO", "R", "T", "AL", "S", " AND", " TO", " BE", " F", "IN", "ALLY", " A", " B", "LES", "S", "ED", " SO", "UL", " IN", " HE", "A", "VE", "N"]} +---- +./sherpa-onnx-zipformer-gigaspeech-2023-12-12/test_wavs/1221-135766-0002.wav +{"text": " YET THESE THOUGHTS AFFECTED HESTER PRYNNE LESS WITH HOPE THAN APPREHENSION", "timestamps": [0.00, 0.04, 0.12, 0.40, 0.68, 0.88, 0.96, 1.12, 1.24, 1.32, 1.44, 1.48, 1.64, 1.76, 1.88, 2.04, 2.16, 2.28, 2.32, 2.52, 2.68, 2.72, 2.88, 3.12, 3.32, 3.52, 3.80, 4.00, 4.16, 4.24, 4.40, 4.48], "tokens":[" ", "Y", "ET", " THESE", " THOUGH", "T", "S", " A", "FF", "E", "C", "TED", " HE", "S", "TER", " P", "RY", "N", "NE", " LE", "S", "S", " WITH", " HO", "PE", " THAN", " APP", "RE", "HE", "N", "S", "ION"]} +---- +num threads: 2 +decoding method: greedy_search +Elapsed seconds: 1.101 s +Real time factor (RTF): 1.101 / 28.165 = 0.039 diff --git a/docs/source/onnx/pretrained_models/offline-transducer/code-zipformer/sherpa-onnx-zipformer-gigaspeech-2023-12-12.txt b/docs/source/onnx/pretrained_models/offline-transducer/code-zipformer/sherpa-onnx-zipformer-gigaspeech-2023-12-12.txt new file mode 100644 index 000000000..dc01d36e0 --- /dev/null +++ b/docs/source/onnx/pretrained_models/offline-transducer/code-zipformer/sherpa-onnx-zipformer-gigaspeech-2023-12-12.txt @@ -0,0 +1,20 @@ 
+/Users/fangjun/open-source/sherpa-onnx/sherpa-onnx/csrc/parse-options.cc:Read:361 ./build/bin/sherpa-onnx-offline --tokens=./sherpa-onnx-zipformer-gigaspeech-2023-12-12/tokens.txt --encoder=./sherpa-onnx-zipformer-gigaspeech-2023-12-12/encoder-epoch-30-avg-1.onnx --decoder=./sherpa-onnx-zipformer-gigaspeech-2023-12-12/decoder-epoch-30-avg-1.onnx --joiner=./sherpa-onnx-zipformer-gigaspeech-2023-12-12/joiner-epoch-30-avg-1.onnx ./sherpa-onnx-zipformer-gigaspeech-2023-12-12/test_wavs/1089-134686-0001.wav ./sherpa-onnx-zipformer-gigaspeech-2023-12-12/test_wavs/1221-135766-0001.wav ./sherpa-onnx-zipformer-gigaspeech-2023-12-12/test_wavs/1221-135766-0002.wav + +OfflineRecognizerConfig(feat_config=OfflineFeatureExtractorConfig(sampling_rate=16000, feature_dim=80), model_config=OfflineModelConfig(transducer=OfflineTransducerModelConfig(encoder_filename="./sherpa-onnx-zipformer-gigaspeech-2023-12-12/encoder-epoch-30-avg-1.onnx", decoder_filename="./sherpa-onnx-zipformer-gigaspeech-2023-12-12/decoder-epoch-30-avg-1.onnx", joiner_filename="./sherpa-onnx-zipformer-gigaspeech-2023-12-12/joiner-epoch-30-avg-1.onnx"), paraformer=OfflineParaformerModelConfig(model=""), nemo_ctc=OfflineNemoEncDecCtcModelConfig(model=""), whisper=OfflineWhisperModelConfig(encoder="", decoder="", language="", task="transcribe", tail_paddings=-1), tdnn=OfflineTdnnModelConfig(model=""), zipformer_ctc=OfflineZipformerCtcModelConfig(model=""), wenet_ctc=OfflineWenetCtcModelConfig(model=""), tokens="./sherpa-onnx-zipformer-gigaspeech-2023-12-12/tokens.txt", num_threads=2, debug=False, provider="cpu", model_type=""), lm_config=OfflineLMConfig(model="", scale=0.5), ctc_fst_decoder_config=OfflineCtcFstDecoderConfig(graph="", max_active=3000), decoding_method="greedy_search", max_active_paths=4, hotwords_file="", hotwords_score=1.5) +Creating recognizer ... +Started +Done! 
+ +./sherpa-onnx-zipformer-gigaspeech-2023-12-12/test_wavs/1089-134686-0001.wav +{"text": " AFTER EARLY NIGHTFALL THE YELLOW LAMPS WOULD LIGHT UP HERE AND THERE THE SQUALID QUARTER OF THE BROTHELS", "timestamps": [0.00, 0.36, 0.52, 0.68, 0.96, 1.00, 1.08, 1.28, 1.40, 1.48, 1.60, 1.76, 1.80, 1.88, 1.92, 2.00, 2.20, 2.32, 2.36, 2.48, 2.60, 2.80, 2.84, 2.92, 3.12, 3.32, 3.56, 3.76, 4.04, 4.20, 4.32, 4.40, 4.56, 4.80, 4.92, 5.08, 5.36, 5.48, 5.64, 5.72, 5.88, 6.04, 6.24], "tokens":[" AFTER", " E", "AR", "LY", " ", "N", "IGHT", "F", "AL", "L", " THE", " ", "Y", "E", "LL", "OW", " LA", "M", "P", "S", " WOULD", " ", "L", "IGHT", " UP", " HERE", " AND", " THERE", " THE", " S", "QU", "AL", "ID", " QU", "AR", "TER", " OF", " THE", " B", "RO", "TH", "EL", "S"]} +---- +./sherpa-onnx-zipformer-gigaspeech-2023-12-12/test_wavs/1221-135766-0001.wav +{"text": " GOD AS A DIRECT CONSEQUENCE OF THE SIN WHICH MAN THUS PUNISHED HAD GIVEN HER A LOVELY CHILD WHOSE PLACE WAS ON THAT SAME DISHONORED BOSOM TO CONNECT HER PARENT FOR EVER WITH THE RACE AND DESCENT OF MORTALS AND TO BE FINALLY A BLESSED SOUL IN HEAVEN", "timestamps": [0.00, 0.16, 0.40, 0.68, 0.84, 0.96, 1.04, 1.12, 1.32, 1.52, 1.68, 1.76, 2.00, 2.12, 2.28, 2.40, 2.64, 2.92, 3.20, 3.32, 3.52, 3.64, 3.76, 3.96, 4.12, 4.36, 4.52, 4.72, 4.92, 5.16, 5.40, 5.64, 5.76, 5.88, 6.12, 6.28, 6.48, 6.84, 7.08, 7.32, 7.60, 7.92, 8.12, 8.24, 8.36, 8.48, 8.64, 8.76, 8.88, 9.12, 9.32, 9.48, 9.56, 9.60, 9.76, 10.00, 10.12, 10.20, 10.44, 10.68, 10.80, 11.00, 11.20, 11.36, 11.52, 11.76, 12.00, 12.12, 12.24, 12.28, 12.52, 12.72, 12.84, 12.96, 13.04, 13.24, 13.40, 13.64, 13.76, 14.00, 14.08, 14.24, 14.52, 14.68, 14.80, 15.00, 15.04, 15.28, 15.52, 15.76, 16.00, 16.12, 16.20, 16.32], "tokens":[" GO", "D", " AS", " A", " DI", "RE", "C", "T", " CON", "SE", "QU", "ENCE", " OF", " THE", " S", "IN", " WHICH", " MAN", " TH", "US", " P", "UN", "ISH", "ED", " HAD", " GIVE", "N", " HER", " A", " LOVE", "LY", " CHI", "L", "D", " WHO", "SE", " PLACE", " WAS", " ON", " THAT", " SAME", " DIS", "HO", "N", "OR", "ED", " BO", "S", "OM", " TO", " CON", "NE", "C", "T", " HER", " PA", "R", "ENT", " FOR", " E", "VER", " WITH", " THE", " RA", "CE", " AND", " DE", "S", "C", "ENT", " OF", " MO", "R", "T", "AL", "S", " AND", " TO", " BE", " F", "IN", "ALLY", " A", " B", "LES", "S", "ED", " SO", "UL", " IN", " HE", "A", "VE", "N"]} +---- +./sherpa-onnx-zipformer-gigaspeech-2023-12-12/test_wavs/1221-135766-0002.wav +{"text": " YET THESE THOUGHTS AFFECTED HESTER PRYNE LESS WITH HOPE THAN APPREHENSION", "timestamps": [0.00, 0.04, 0.12, 0.40, 0.68, 0.88, 0.96, 1.12, 1.20, 1.32, 1.44, 1.48, 1.64, 1.76, 1.88, 2.04, 2.16, 2.28, 2.52, 2.68, 2.72, 2.88, 3.12, 3.28, 3.52, 3.80, 4.00, 4.16, 4.24, 4.40, 4.48], "tokens":[" ", "Y", "ET", " THESE", " THOUGH", "T", "S", " A", "FF", "E", "C", "TED", " HE", "S", "TER", " P", "RY", "NE", " LE", "S", "S", " WITH", " HO", "PE", " THAN", " APP", "RE", "HE", "N", "S", "ION"]} +---- +num threads: 2 +decoding method: greedy_search +Elapsed seconds: 1.407 s +Real time factor (RTF): 1.407 / 28.165 = 0.050 diff --git a/docs/source/onnx/pretrained_models/offline-transducer/code-zipformer/sherpa-onnx-zipformer-ja-reazonspeech-2024-08-01-int8.txt b/docs/source/onnx/pretrained_models/offline-transducer/code-zipformer/sherpa-onnx-zipformer-ja-reazonspeech-2024-08-01-int8.txt new file mode 100644 index 000000000..2f93cf148 --- /dev/null +++ b/docs/source/onnx/pretrained_models/offline-transducer/code-zipformer/sherpa-onnx-zipformer-ja-reazonspeech-2024-08-01-int8.txt @@ -0,0 +1,14 @@ 
+/Users/fangjun/open-source/sherpa-onnx/sherpa-onnx/csrc/parse-options.cc:Read:375 ./build/bin/sherpa-onnx-offline --tokens=./sherpa-onnx-zipformer-ja-reazonspeech-2024-08-01/tokens.txt --encoder=./sherpa-onnx-zipformer-ja-reazonspeech-2024-08-01/encoder-epoch-99-avg-1.int8.onnx --decoder=./sherpa-onnx-zipformer-ja-reazonspeech-2024-08-01/decoder-epoch-99-avg-1.onnx --joiner=./sherpa-onnx-zipformer-ja-reazonspeech-2024-08-01/joiner-epoch-99-avg-1.int8.onnx --num-threads=1 ./sherpa-onnx-zipformer-ja-reazonspeech-2024-08-01/test_wavs/1.wav + +OfflineRecognizerConfig(feat_config=FeatureExtractorConfig(sampling_rate=16000, feature_dim=80, low_freq=20, high_freq=-400, dither=0), model_config=OfflineModelConfig(transducer=OfflineTransducerModelConfig(encoder_filename="./sherpa-onnx-zipformer-ja-reazonspeech-2024-08-01/encoder-epoch-99-avg-1.int8.onnx", decoder_filename="./sherpa-onnx-zipformer-ja-reazonspeech-2024-08-01/decoder-epoch-99-avg-1.onnx", joiner_filename="./sherpa-onnx-zipformer-ja-reazonspeech-2024-08-01/joiner-epoch-99-avg-1.int8.onnx"), paraformer=OfflineParaformerModelConfig(model=""), nemo_ctc=OfflineNemoEncDecCtcModelConfig(model=""), whisper=OfflineWhisperModelConfig(encoder="", decoder="", language="", task="transcribe", tail_paddings=-1), tdnn=OfflineTdnnModelConfig(model=""), zipformer_ctc=OfflineZipformerCtcModelConfig(model=""), wenet_ctc=OfflineWenetCtcModelConfig(model=""), sense_voice=OfflineSenseVoiceModelConfig(model="", language="auto", use_itn=False), telespeech_ctc="", tokens="./sherpa-onnx-zipformer-ja-reazonspeech-2024-08-01/tokens.txt", num_threads=1, debug=False, provider="cpu", model_type="", modeling_unit="cjkchar", bpe_vocab=""), lm_config=OfflineLMConfig(model="", scale=0.5), ctc_fst_decoder_config=OfflineCtcFstDecoderConfig(graph="", max_active=3000), decoding_method="greedy_search", max_active_paths=4, hotwords_file="", hotwords_score=1.5, blank_penalty=0, rule_fsts="", rule_fars="") +Creating recognizer ... +Started +Done! 
+ +./sherpa-onnx-zipformer-ja-reazonspeech-2024-08-01/test_wavs/1.wav +{"text": "気象庁は雪や路面の凍結による交通への影響暴風雪や高波に警戒するとともに雪崩や屋根からの落雪にも十分注意するよう呼びかけています", "timestamps": [0.00, 0.48, 0.64, 0.88, 1.24, 1.44, 1.80, 2.00, 2.12, 2.40, 2.56, 2.80, 2.96, 3.04, 3.44, 3.60, 3.88, 4.00, 4.28, 4.40, 4.76, 4.96, 5.20, 5.40, 5.72, 5.92, 6.20, 6.48, 6.64, 6.88, 6.96, 7.08, 7.28, 7.48, 7.64, 8.00, 8.16, 8.36, 8.68, 8.80, 9.04, 9.12, 9.28, 9.64, 9.80, 10.00, 10.16, 10.44, 10.64, 10.92, 11.04, 11.24, 11.36, 11.52, 11.60, 11.88, 11.92, 12.16, 12.28, 12.44, 12.64, 13.16, 13.20], "tokens":["気", "象", "庁", "は", "雪", "や", "路", "面", "の", "凍", "結", "に", "よ", "る", "交", "通", "へ", "の", "影", "響", "暴", "風", "雪", "や", "高", "波", "に", "警", "戒", "す", "る", "と", "と", "も", "に", "雪", "崩", "や", "屋", "根", "か", "ら", "の", "落", "雪", "に", "も", "十", "分", "注", "意", "す", "る", "よ", "う", "呼", "び", "か", "け", "て", "い", "ま", "す"], "words": []} +---- +num threads: 1 +decoding method: greedy_search +Elapsed seconds: 0.719 s +Real time factor (RTF): 0.719 / 13.433 = 0.054 diff --git a/docs/source/onnx/pretrained_models/offline-transducer/code-zipformer/sherpa-onnx-zipformer-ja-reazonspeech-2024-08-01.txt b/docs/source/onnx/pretrained_models/offline-transducer/code-zipformer/sherpa-onnx-zipformer-ja-reazonspeech-2024-08-01.txt new file mode 100644 index 000000000..26555e7a0 --- /dev/null +++ b/docs/source/onnx/pretrained_models/offline-transducer/code-zipformer/sherpa-onnx-zipformer-ja-reazonspeech-2024-08-01.txt @@ -0,0 +1,14 @@ +/Users/fangjun/open-source/sherpa-onnx/sherpa-onnx/csrc/parse-options.cc:Read:375 ./build/bin/sherpa-onnx-offline --tokens=./sherpa-onnx-zipformer-ja-reazonspeech-2024-08-01/tokens.txt --encoder=./sherpa-onnx-zipformer-ja-reazonspeech-2024-08-01/encoder-epoch-99-avg-1.onnx --decoder=./sherpa-onnx-zipformer-ja-reazonspeech-2024-08-01/decoder-epoch-99-avg-1.onnx --joiner=./sherpa-onnx-zipformer-ja-reazonspeech-2024-08-01/joiner-epoch-99-avg-1.onnx --num-threads=1 ./sherpa-onnx-zipformer-ja-reazonspeech-2024-08-01/test_wavs/1.wav + +OfflineRecognizerConfig(feat_config=FeatureExtractorConfig(sampling_rate=16000, feature_dim=80, low_freq=20, high_freq=-400, dither=0), model_config=OfflineModelConfig(transducer=OfflineTransducerModelConfig(encoder_filename="./sherpa-onnx-zipformer-ja-reazonspeech-2024-08-01/encoder-epoch-99-avg-1.onnx", decoder_filename="./sherpa-onnx-zipformer-ja-reazonspeech-2024-08-01/decoder-epoch-99-avg-1.onnx", joiner_filename="./sherpa-onnx-zipformer-ja-reazonspeech-2024-08-01/joiner-epoch-99-avg-1.onnx"), paraformer=OfflineParaformerModelConfig(model=""), nemo_ctc=OfflineNemoEncDecCtcModelConfig(model=""), whisper=OfflineWhisperModelConfig(encoder="", decoder="", language="", task="transcribe", tail_paddings=-1), tdnn=OfflineTdnnModelConfig(model=""), zipformer_ctc=OfflineZipformerCtcModelConfig(model=""), wenet_ctc=OfflineWenetCtcModelConfig(model=""), sense_voice=OfflineSenseVoiceModelConfig(model="", language="auto", use_itn=False), telespeech_ctc="", tokens="./sherpa-onnx-zipformer-ja-reazonspeech-2024-08-01/tokens.txt", num_threads=1, debug=False, provider="cpu", model_type="", modeling_unit="cjkchar", bpe_vocab=""), lm_config=OfflineLMConfig(model="", scale=0.5), ctc_fst_decoder_config=OfflineCtcFstDecoderConfig(graph="", max_active=3000), decoding_method="greedy_search", max_active_paths=4, hotwords_file="", hotwords_score=1.5, blank_penalty=0, rule_fsts="", rule_fars="") +Creating recognizer ... +Started +Done! 
+ +./sherpa-onnx-zipformer-ja-reazonspeech-2024-08-01/test_wavs/1.wav +{"text": "気象庁は雪や路面の凍結による交通への影響暴風雪や高波に警戒するとともに雪崩や屋根からの落雪にも十分注意するよう呼びかけています", "timestamps": [0.00, 0.48, 0.64, 0.88, 1.24, 1.44, 1.80, 2.00, 2.12, 2.40, 2.56, 2.80, 2.96, 3.04, 3.44, 3.60, 3.88, 4.00, 4.28, 4.40, 4.76, 4.96, 5.20, 5.40, 5.72, 5.92, 6.16, 6.48, 6.64, 6.88, 6.96, 7.08, 7.28, 7.48, 7.64, 8.00, 8.16, 8.36, 8.68, 8.80, 9.04, 9.12, 9.28, 9.64, 9.80, 10.00, 10.16, 10.44, 10.64, 10.92, 11.04, 11.24, 11.36, 11.52, 11.64, 11.88, 11.92, 12.16, 12.28, 12.44, 12.64, 13.16, 13.20], "tokens":["気", "象", "庁", "は", "雪", "や", "路", "面", "の", "凍", "結", "に", "よ", "る", "交", "通", "へ", "の", "影", "響", "暴", "風", "雪", "や", "高", "波", "に", "警", "戒", "す", "る", "と", "と", "も", "に", "雪", "崩", "や", "屋", "根", "か", "ら", "の", "落", "雪", "に", "も", "十", "分", "注", "意", "す", "る", "よ", "う", "呼", "び", "か", "け", "て", "い", "ま", "す"], "words": []} +---- +num threads: 1 +decoding method: greedy_search +Elapsed seconds: 1.101 s +Real time factor (RTF): 1.101 / 13.433 = 0.082 diff --git a/docs/source/onnx/pretrained_models/offline-transducer/code-zipformer/sherpa-onnx-zipformer-korean-2024-06-24-int8.txt b/docs/source/onnx/pretrained_models/offline-transducer/code-zipformer/sherpa-onnx-zipformer-korean-2024-06-24-int8.txt new file mode 100644 index 000000000..2982fa1cd --- /dev/null +++ b/docs/source/onnx/pretrained_models/offline-transducer/code-zipformer/sherpa-onnx-zipformer-korean-2024-06-24-int8.txt @@ -0,0 +1,14 @@ +/Users/fangjun/open-source/sherpa-onnx/sherpa-onnx/csrc/parse-options.cc:Read:360 ./build/bin/sherpa-onnx-offline --tokens=./sherpa-onnx-zipformer-korean-2024-06-24/tokens.txt --encoder=./sherpa-onnx-zipformer-korean-2024-06-24/encoder-epoch-99-avg-1.int8.onnx --decoder=./sherpa-onnx-zipformer-korean-2024-06-24/decoder-epoch-99-avg-1.onnx --joiner=./sherpa-onnx-zipformer-korean-2024-06-24/joiner-epoch-99-avg-1.int8.onnx ./sherpa-onnx-zipformer-korean-2024-06-24/test_wavs/0.wav + +OfflineRecognizerConfig(feat_config=FeatureExtractorConfig(sampling_rate=16000, feature_dim=80, low_freq=20, high_freq=-400, dither=0), model_config=OfflineModelConfig(transducer=OfflineTransducerModelConfig(encoder_filename="./sherpa-onnx-zipformer-korean-2024-06-24/encoder-epoch-99-avg-1.int8.onnx", decoder_filename="./sherpa-onnx-zipformer-korean-2024-06-24/decoder-epoch-99-avg-1.onnx", joiner_filename="./sherpa-onnx-zipformer-korean-2024-06-24/joiner-epoch-99-avg-1.int8.onnx"), paraformer=OfflineParaformerModelConfig(model=""), nemo_ctc=OfflineNemoEncDecCtcModelConfig(model=""), whisper=OfflineWhisperModelConfig(encoder="", decoder="", language="", task="transcribe", tail_paddings=-1), tdnn=OfflineTdnnModelConfig(model=""), zipformer_ctc=OfflineZipformerCtcModelConfig(model=""), wenet_ctc=OfflineWenetCtcModelConfig(model=""), telespeech_ctc="", tokens="./sherpa-onnx-zipformer-korean-2024-06-24/tokens.txt", num_threads=2, debug=False, provider="cpu", model_type="", modeling_unit="cjkchar", bpe_vocab=""), lm_config=OfflineLMConfig(model="", scale=0.5), ctc_fst_decoder_config=OfflineCtcFstDecoderConfig(graph="", max_active=3000), decoding_method="greedy_search", max_active_paths=4, hotwords_file="", hotwords_score=1.5, blank_penalty=0, rule_fsts="", rule_fars="") +Creating recognizer ... +Started +Done! 
+ +./sherpa-onnx-zipformer-korean-2024-06-24/test_wavs/0.wav +{"text": " 그는 괜찮은 척하려고 애쓰는 것 같았다.", "timestamps": [0.12, 0.24, 0.56, 1.00, 1.20, 1.32, 2.00, 2.16, 2.32, 2.52, 2.68, 2.84, 3.08, 3.28], "tokens":[" 그", "는", " 괜찮은", " 척", "하", "려고", " 애", "쓰", "는", " 것", " 같", "았", "다", "."], "words": []} +---- +num threads: 2 +decoding method: greedy_search +Elapsed seconds: 0.092 s +Real time factor (RTF): 0.092 / 3.526 = 0.026 diff --git a/docs/source/onnx/pretrained_models/offline-transducer/code-zipformer/sherpa-onnx-zipformer-korean-2024-06-24.txt b/docs/source/onnx/pretrained_models/offline-transducer/code-zipformer/sherpa-onnx-zipformer-korean-2024-06-24.txt new file mode 100644 index 000000000..69e3456e1 --- /dev/null +++ b/docs/source/onnx/pretrained_models/offline-transducer/code-zipformer/sherpa-onnx-zipformer-korean-2024-06-24.txt @@ -0,0 +1,14 @@ +/Users/fangjun/open-source/sherpa-onnx/sherpa-onnx/csrc/parse-options.cc:Read:360 ./build/bin/sherpa-onnx-offline --tokens=./sherpa-onnx-zipformer-korean-2024-06-24/tokens.txt --encoder=./sherpa-onnx-zipformer-korean-2024-06-24/encoder-epoch-99-avg-1.onnx --decoder=./sherpa-onnx-zipformer-korean-2024-06-24/decoder-epoch-99-avg-1.onnx --joiner=./sherpa-onnx-zipformer-korean-2024-06-24/joiner-epoch-99-avg-1.onnx ./sherpa-onnx-zipformer-korean-2024-06-24/test_wavs/0.wav + +OfflineRecognizerConfig(feat_config=FeatureExtractorConfig(sampling_rate=16000, feature_dim=80, low_freq=20, high_freq=-400, dither=0), model_config=OfflineModelConfig(transducer=OfflineTransducerModelConfig(encoder_filename="./sherpa-onnx-zipformer-korean-2024-06-24/encoder-epoch-99-avg-1.onnx", decoder_filename="./sherpa-onnx-zipformer-korean-2024-06-24/decoder-epoch-99-avg-1.onnx", joiner_filename="./sherpa-onnx-zipformer-korean-2024-06-24/joiner-epoch-99-avg-1.onnx"), paraformer=OfflineParaformerModelConfig(model=""), nemo_ctc=OfflineNemoEncDecCtcModelConfig(model=""), whisper=OfflineWhisperModelConfig(encoder="", decoder="", language="", task="transcribe", tail_paddings=-1), tdnn=OfflineTdnnModelConfig(model=""), zipformer_ctc=OfflineZipformerCtcModelConfig(model=""), wenet_ctc=OfflineWenetCtcModelConfig(model=""), telespeech_ctc="", tokens="./sherpa-onnx-zipformer-korean-2024-06-24/tokens.txt", num_threads=2, debug=False, provider="cpu", model_type="", modeling_unit="cjkchar", bpe_vocab=""), lm_config=OfflineLMConfig(model="", scale=0.5), ctc_fst_decoder_config=OfflineCtcFstDecoderConfig(graph="", max_active=3000), decoding_method="greedy_search", max_active_paths=4, hotwords_file="", hotwords_score=1.5, blank_penalty=0, rule_fsts="", rule_fars="") +Creating recognizer ... +Started +Done! 
+ +./sherpa-onnx-zipformer-korean-2024-06-24/test_wavs/0.wav +{"text": " 그는 괜찮은 척하려고 애쓰는 것 같았다.", "timestamps": [0.12, 0.24, 0.56, 1.00, 1.20, 1.32, 2.00, 2.16, 2.32, 2.52, 2.68, 2.80, 3.08, 3.28], "tokens":[" 그", "는", " 괜찮은", " 척", "하", "려고", " 애", "쓰", "는", " 것", " 같", "았", "다", "."], "words": []} +---- +num threads: 2 +decoding method: greedy_search +Elapsed seconds: 0.119 s +Real time factor (RTF): 0.119 / 3.526 = 0.034 diff --git a/docs/source/onnx/pretrained_models/offline-transducer/code-zipformer/sherpa-onnx-zipformer-large-en-2023-06-26-int8.txt b/docs/source/onnx/pretrained_models/offline-transducer/code-zipformer/sherpa-onnx-zipformer-large-en-2023-06-26-int8.txt new file mode 100644 index 000000000..eb50e79d7 --- /dev/null +++ b/docs/source/onnx/pretrained_models/offline-transducer/code-zipformer/sherpa-onnx-zipformer-large-en-2023-06-26-int8.txt @@ -0,0 +1,24 @@ +/Users/fangjun/open-source/sherpa-onnx/sherpa-onnx/csrc/parse-options.cc:Read:361 ./build/bin/sherpa-onnx-offline --tokens=./sherpa-onnx-zipformer-large-en-2023-06-26/tokens.txt --encoder=./sherpa-onnx-zipformer-large-en-2023-06-26/encoder-epoch-99-avg-1.int8.onnx --decoder=./sherpa-onnx-zipformer-large-en-2023-06-26/decoder-epoch-99-avg-1.onnx --joiner=./sherpa-onnx-zipformer-large-en-2023-06-26/joiner-epoch-99-avg-1.int8.onnx ./sherpa-onnx-zipformer-large-en-2023-06-26/test_wavs/0.wav ./sherpa-onnx-zipformer-large-en-2023-06-26/test_wavs/1.wav ./sherpa-onnx-zipformer-large-en-2023-06-26/test_wavs/8k.wav + +OfflineRecognizerConfig(feat_config=OfflineFeatureExtractorConfig(sampling_rate=16000, feature_dim=80), model_config=OfflineModelConfig(transducer=OfflineTransducerModelConfig(encoder_filename="./sherpa-onnx-zipformer-large-en-2023-06-26/encoder-epoch-99-avg-1.int8.onnx", decoder_filename="./sherpa-onnx-zipformer-large-en-2023-06-26/decoder-epoch-99-avg-1.onnx", joiner_filename="./sherpa-onnx-zipformer-large-en-2023-06-26/joiner-epoch-99-avg-1.int8.onnx"), paraformer=OfflineParaformerModelConfig(model=""), nemo_ctc=OfflineNemoEncDecCtcModelConfig(model=""), tokens="./sherpa-onnx-zipformer-large-en-2023-06-26/tokens.txt", num_threads=2, debug=False, provider="cpu"), lm_config=OfflineLMConfig(model="", scale=0.5), decoding_method="greedy_search", max_active_paths=4, context_score=1.5) +Creating recognizer ... +Started +/Users/fangjun/open-source/sherpa-onnx/sherpa-onnx/csrc/offline-stream.cc:AcceptWaveformImpl:108 Creating a resampler: + in_sample_rate: 8000 + output_sample_rate: 16000 + +Done! 
+ +./sherpa-onnx-zipformer-large-en-2023-06-26/test_wavs/0.wav +{"text":" AFTER EARLY NIGHTFALL THE YELLOW LAMPS WOULD LIGHT UP HERE AND THERE THE SQUALID QUARTER OF THE BROTHELS","timestamps":"[0.00, 0.48, 0.60, 0.72, 1.04, 1.28, 1.36, 1.48, 1.60, 1.84, 1.96, 2.00, 2.16, 2.32, 2.40, 2.48, 2.60, 2.80, 3.04, 3.28, 3.40, 3.56, 3.76, 4.04, 4.24, 4.28, 4.48, 4.64, 4.80, 4.84, 5.00, 5.04, 5.28, 5.40, 5.56, 5.60, 5.76, 5.96, 6.16]","tokens":[" AFTER"," E","AR","LY"," NIGHT","F","A","LL"," THE"," YE","LL","OW"," LA","M","P","S"," WOULD"," LIGHT"," UP"," HE","RE"," AND"," THERE"," THE"," S","QUA","LI","D"," ","QUA","R","TER"," OF"," THE"," B","RO","TH","EL","S"]} +---- +./sherpa-onnx-zipformer-large-en-2023-06-26/test_wavs/1.wav +{"text":" GOD AS A DIRECT CONSEQUENCE OF THE SIN WHICH MAN THUS PUNISHED HAD GIVEN HER A LOVELY CHILD WHOSE PLACE WAS ON THAT SAME DISHONORED BOSOM TO CONNECT HER PARENT FOR EVER WITH THE RACE AND DESCENT OF MORTALS AND TO BE FINALLY A BLESSED SOUL IN HEAVEN","timestamps":"[0.00, 0.20, 0.48, 0.72, 0.88, 1.04, 1.12, 1.20, 1.36, 1.52, 1.64, 1.84, 1.88, 2.00, 2.12, 2.32, 2.36, 2.60, 2.84, 3.12, 3.24, 3.48, 3.56, 3.76, 3.92, 4.12, 4.36, 4.52, 4.72, 4.96, 5.16, 5.44, 5.68, 6.12, 6.28, 6.48, 6.88, 7.12, 7.36, 7.56, 7.92, 8.16, 8.28, 8.40, 8.48, 8.60, 8.76, 8.88, 9.08, 9.28, 9.44, 9.52, 9.60, 9.72, 9.92, 10.00, 10.12, 10.48, 10.68, 10.76, 11.00, 11.20, 11.36, 11.56, 11.76, 12.00, 12.12, 12.28, 12.32, 12.52, 12.72, 12.84, 12.92, 13.04, 13.20, 13.44, 13.64, 13.76, 14.00, 14.08, 14.24, 14.36, 14.52, 14.72, 14.76, 15.04, 15.28, 15.52, 15.76, 16.00, 16.20, 16.24, 16.32]","tokens":[" GO","D"," AS"," A"," DI","RE","C","T"," CON","SE","QUE","N","CE"," OF"," THE"," S","IN"," WHICH"," MAN"," TH","US"," P","UN","ISH","ED"," HAD"," GIVE","N"," HER"," A"," LOVE","LY"," CHILD"," WHO","SE"," PLACE"," WAS"," ON"," THAT"," SAME"," DIS","HO","N","OR","ED"," BO","S","OM"," TO"," CON","NE","C","T"," HER"," P","AR","ENT"," FOR"," E","VER"," WITH"," THE"," RA","CE"," AND"," DE","S","C","ENT"," OF"," MO","R","T","AL","S"," AND"," TO"," BE"," FI","N","AL","LY"," A"," B","LESS","ED"," SO","UL"," IN"," HE","A","VE","N"]} +---- +./sherpa-onnx-zipformer-large-en-2023-06-26/test_wavs/8k.wav +{"text":" YET THESE THOUGHTS AFFECTED HESTER PRYNNE LESS WITH HOPE THAN APPREHENSION","timestamps":"[0.00, 0.12, 0.36, 0.48, 0.76, 0.96, 1.12, 1.24, 1.32, 1.44, 1.48, 1.68, 1.76, 1.88, 2.04, 2.12, 2.28, 2.32, 2.48, 2.52, 2.80, 3.08, 3.28, 3.52, 3.76, 3.92, 4.00, 4.16, 4.24, 4.36, 4.44]","tokens":[" YE","T"," THE","SE"," THOUGHT","S"," A","FF","E","C","TED"," HE","S","TER"," P","RY","N","NE"," ","LESS"," WITH"," HO","PE"," THAN"," A","PP","RE","HE","N","S","ION"]} +---- +num threads: 2 +decoding method: greedy_search +Elapsed seconds: 1.490 s +Real time factor (RTF): 1.490 / 28.165 = 0.053 diff --git a/docs/source/onnx/pretrained_models/offline-transducer/code-zipformer/sherpa-onnx-zipformer-large-en-2023-06-26.txt b/docs/source/onnx/pretrained_models/offline-transducer/code-zipformer/sherpa-onnx-zipformer-large-en-2023-06-26.txt new file mode 100644 index 000000000..624ca7416 --- /dev/null +++ b/docs/source/onnx/pretrained_models/offline-transducer/code-zipformer/sherpa-onnx-zipformer-large-en-2023-06-26.txt @@ -0,0 +1,24 @@ +/Users/fangjun/open-source/sherpa-onnx/sherpa-onnx/csrc/parse-options.cc:Read:361 ./build/bin/sherpa-onnx-offline --tokens=./sherpa-onnx-zipformer-large-en-2023-06-26/tokens.txt --encoder=./sherpa-onnx-zipformer-large-en-2023-06-26/encoder-epoch-99-avg-1.onnx 
--decoder=./sherpa-onnx-zipformer-large-en-2023-06-26/decoder-epoch-99-avg-1.onnx --joiner=./sherpa-onnx-zipformer-large-en-2023-06-26/joiner-epoch-99-avg-1.onnx ./sherpa-onnx-zipformer-large-en-2023-06-26/test_wavs/0.wav ./sherpa-onnx-zipformer-large-en-2023-06-26/test_wavs/1.wav ./sherpa-onnx-zipformer-large-en-2023-06-26/test_wavs/8k.wav + +OfflineRecognizerConfig(feat_config=OfflineFeatureExtractorConfig(sampling_rate=16000, feature_dim=80), model_config=OfflineModelConfig(transducer=OfflineTransducerModelConfig(encoder_filename="./sherpa-onnx-zipformer-large-en-2023-06-26/encoder-epoch-99-avg-1.onnx", decoder_filename="./sherpa-onnx-zipformer-large-en-2023-06-26/decoder-epoch-99-avg-1.onnx", joiner_filename="./sherpa-onnx-zipformer-large-en-2023-06-26/joiner-epoch-99-avg-1.onnx"), paraformer=OfflineParaformerModelConfig(model=""), nemo_ctc=OfflineNemoEncDecCtcModelConfig(model=""), tokens="./sherpa-onnx-zipformer-large-en-2023-06-26/tokens.txt", num_threads=2, debug=False, provider="cpu"), lm_config=OfflineLMConfig(model="", scale=0.5), decoding_method="greedy_search", max_active_paths=4, context_score=1.5) +Creating recognizer ... +Started +/Users/fangjun/open-source/sherpa-onnx/sherpa-onnx/csrc/offline-stream.cc:AcceptWaveformImpl:108 Creating a resampler: + in_sample_rate: 8000 + output_sample_rate: 16000 + +Done! + +./sherpa-onnx-zipformer-large-en-2023-06-26/test_wavs/0.wav +{"text":" AFTER EARLY NIGHTFALL THE YELLOW LAMPS WOULD LIGHT UP HERE AND THERE THE SQUALID QUARTER OF THE BROTHELS","timestamps":"[0.00, 0.48, 0.60, 0.72, 1.04, 1.28, 1.36, 1.48, 1.60, 1.84, 1.96, 2.00, 2.16, 2.32, 2.40, 2.48, 2.60, 2.80, 3.04, 3.28, 3.40, 3.56, 3.76, 4.04, 4.24, 4.28, 4.48, 4.64, 4.80, 4.84, 5.00, 5.04, 5.28, 5.40, 5.56, 5.60, 5.76, 5.96, 6.16]","tokens":[" AFTER"," E","AR","LY"," NIGHT","F","A","LL"," THE"," YE","LL","OW"," LA","M","P","S"," WOULD"," LIGHT"," UP"," HE","RE"," AND"," THERE"," THE"," S","QUA","LI","D"," ","QUA","R","TER"," OF"," THE"," B","RO","TH","EL","S"]} +---- +./sherpa-onnx-zipformer-large-en-2023-06-26/test_wavs/1.wav +{"text":" GOD AS A DIRECT CONSEQUENCE OF THE SIN WHICH MAN THUS PUNISHED HAD GIVEN HER A LOVELY CHILD WHOSE PLACE WAS ON THAT SAME DISHONORED BOSOM TO CONNECT HER PARENT FOR EVER WITH THE RACE AND DESCENT OF MORTALS AND TO BE FINALLY A BLESSED SOUL IN HEAVEN","timestamps":"[0.00, 0.20, 0.48, 0.72, 0.88, 1.04, 1.12, 1.20, 1.36, 1.52, 1.68, 1.84, 1.88, 2.00, 2.12, 2.32, 2.36, 2.60, 2.84, 3.12, 3.24, 3.48, 3.56, 3.76, 3.92, 4.12, 4.36, 4.56, 4.72, 4.96, 5.16, 5.44, 5.68, 6.12, 6.28, 6.48, 6.88, 7.12, 7.36, 7.56, 7.92, 8.16, 8.28, 8.40, 8.48, 8.60, 8.76, 8.88, 9.08, 9.28, 9.44, 9.52, 9.60, 9.72, 9.92, 10.00, 10.12, 10.48, 10.68, 10.76, 11.00, 11.20, 11.36, 11.56, 11.76, 12.00, 12.12, 12.28, 12.32, 12.52, 12.72, 12.84, 12.92, 13.04, 13.20, 13.44, 13.64, 13.76, 14.00, 14.12, 14.24, 14.36, 14.52, 14.72, 14.80, 15.04, 15.28, 15.52, 15.76, 16.00, 16.20, 16.24, 16.32]","tokens":[" GO","D"," AS"," A"," DI","RE","C","T"," CON","SE","QUE","N","CE"," OF"," THE"," S","IN"," WHICH"," MAN"," TH","US"," P","UN","ISH","ED"," HAD"," GIVE","N"," HER"," A"," LOVE","LY"," CHILD"," WHO","SE"," PLACE"," WAS"," ON"," THAT"," SAME"," DIS","HO","N","OR","ED"," BO","S","OM"," TO"," CON","NE","C","T"," HER"," P","AR","ENT"," FOR"," E","VER"," WITH"," THE"," RA","CE"," AND"," DE","S","C","ENT"," OF"," MO","R","T","AL","S"," AND"," TO"," BE"," FI","N","AL","LY"," A"," B","LESS","ED"," SO","UL"," IN"," HE","A","VE","N"]} +---- 
+./sherpa-onnx-zipformer-large-en-2023-06-26/test_wavs/8k.wav +{"text":" YET THESE THOUGHTS AFFECTED HESTER PRYNNE LESS WITH HOPE THAN APPREHENSION","timestamps":"[0.00, 0.12, 0.36, 0.48, 0.76, 0.96, 1.12, 1.24, 1.32, 1.44, 1.48, 1.68, 1.76, 1.88, 2.04, 2.12, 2.24, 2.28, 2.48, 2.56, 2.80, 3.08, 3.28, 3.52, 3.80, 3.92, 4.00, 4.16, 4.24, 4.36, 4.44]","tokens":[" YE","T"," THE","SE"," THOUGHT","S"," A","FF","E","C","TED"," HE","S","TER"," P","RY","N","NE"," ","LESS"," WITH"," HO","PE"," THAN"," A","PP","RE","HE","N","S","ION"]} +---- +num threads: 2 +decoding method: greedy_search +Elapsed seconds: 1.843 s +Real time factor (RTF): 1.843 / 28.165 = 0.065 diff --git a/docs/source/onnx/pretrained_models/offline-transducer/code-zipformer/sherpa-onnx-zipformer-multi-dataset-2023-05-04-int8.txt b/docs/source/onnx/pretrained_models/offline-transducer/code-zipformer/sherpa-onnx-zipformer-multi-dataset-2023-05-04-int8.txt new file mode 100644 index 000000000..4ff35f788 --- /dev/null +++ b/docs/source/onnx/pretrained_models/offline-transducer/code-zipformer/sherpa-onnx-zipformer-multi-dataset-2023-05-04-int8.txt @@ -0,0 +1,20 @@ +/Users/fangjun/open-source/sherpa-onnx/sherpa-onnx/csrc/parse-options.cc:Read:361 ./build/bin/sherpa-onnx-offline --tokens=./icefall-asr-multidataset-pruned_transducer_stateless7-2023-05-04/data/lang_bpe_500/tokens.txt --encoder=./icefall-asr-multidataset-pruned_transducer_stateless7-2023-05-04/exp/encoder-epoch-30-avg-4.int8.onnx --decoder=./icefall-asr-multidataset-pruned_transducer_stateless7-2023-05-04/exp/decoder-epoch-30-avg-4.onnx --joiner=./icefall-asr-multidataset-pruned_transducer_stateless7-2023-05-04/exp/joiner-epoch-30-avg-4.int8.onnx ./icefall-asr-multidataset-pruned_transducer_stateless7-2023-05-04/test_wavs/1089-134686-0001.wav ./icefall-asr-multidataset-pruned_transducer_stateless7-2023-05-04/test_wavs/1221-135766-0001.wav ./icefall-asr-multidataset-pruned_transducer_stateless7-2023-05-04/test_wavs/1221-135766-0002.wav + +OfflineRecognizerConfig(feat_config=OfflineFeatureExtractorConfig(sampling_rate=16000, feature_dim=80), model_config=OfflineModelConfig(transducer=OfflineTransducerModelConfig(encoder_filename="./icefall-asr-multidataset-pruned_transducer_stateless7-2023-05-04/exp/encoder-epoch-30-avg-4.int8.onnx", decoder_filename="./icefall-asr-multidataset-pruned_transducer_stateless7-2023-05-04/exp/decoder-epoch-30-avg-4.onnx", joiner_filename="./icefall-asr-multidataset-pruned_transducer_stateless7-2023-05-04/exp/joiner-epoch-30-avg-4.int8.onnx"), paraformer=OfflineParaformerModelConfig(model=""), nemo_ctc=OfflineNemoEncDecCtcModelConfig(model=""), tokens="./icefall-asr-multidataset-pruned_transducer_stateless7-2023-05-04/data/lang_bpe_500/tokens.txt", num_threads=2, debug=False, provider="cpu"), lm_config=OfflineLMConfig(model="", scale=0.5), decoding_method="greedy_search", max_active_paths=4) +Creating recognizer ... +Started +Done! 
+ +./icefall-asr-multidataset-pruned_transducer_stateless7-2023-05-04/test_wavs/1089-134686-0001.wav +{"text":" AFTER EARLY NIGHTFALL THE YELLOW LAMPS WOULD LIGHT UP HERE AND THERE THE SQUALID QUARTER OF THE BROTHELS","timestamps":"[0.00,0.40,0.56,0.64,0.96,1.24,1.32,1.44,1.56,1.76,1.88,1.96,2.16,2.32,2.36,2.48,2.60,2.80,3.08,3.28,3.36,3.56,3.80,4.04,4.24,4.32,4.48,4.64,4.84,4.88,5.00,5.08,5.32,5.44,5.56,5.64,5.80,5.96,6.20]","tokens":[" AFTER"," E","AR","LY"," NIGHT","F","A","LL"," THE"," YE","LL","OW"," LA","M","P","S"," WOULD"," LIGHT"," UP"," HE","RE"," AND"," THERE"," THE"," S","QUA","LI","D"," ","QUA","R","TER"," OF"," THE"," B","RO","TH","EL","S"]} +---- +./icefall-asr-multidataset-pruned_transducer_stateless7-2023-05-04/test_wavs/1221-135766-0001.wav +{"text":" GOD AS A DIRECT CONSEQUENCE OF THE SIN WHICH MAN THUS PUNISHED HAD GIVEN HER A LOVELY CHILD WHOSE PLACE WAS ON THAT SAME DISHONORED BOSOM TO CONNECT HER PARENT FOR EVER WITH THE RACE AND DESCENT OF MORTALS AND TO BE FINALLY A BLESSED SOUL IN HEAVEN","timestamps":"[0.00,0.12,0.44,0.68,0.80,1.00,1.12,1.16,1.32,1.48,1.64,1.80,1.84,2.00,2.12,2.28,2.40,2.64,2.88,3.16,3.28,3.56,3.60,3.76,3.92,4.12,4.36,4.52,4.72,4.92,5.16,5.44,5.72,6.12,6.24,6.48,6.84,7.08,7.28,7.56,7.88,8.12,8.28,8.36,8.48,8.60,8.76,8.88,9.12,9.28,9.48,9.56,9.64,9.80,10.00,10.04,10.16,10.44,10.68,10.80,11.04,11.20,11.40,11.56,11.80,12.00,12.16,12.28,12.32,12.52,12.72,12.84,12.96,13.04,13.24,13.40,13.64,13.80,14.00,14.16,14.24,14.36,14.56,14.72,14.80,15.08,15.32,15.52,15.76,16.04,16.16,16.24,16.36]","tokens":[" GO","D"," AS"," A"," DI","RE","C","T"," CON","SE","QUE","N","CE"," OF"," THE"," S","IN"," WHICH"," MAN"," TH","US"," P","UN","ISH","ED"," HAD"," GIVE","N"," HER"," A"," LOVE","LY"," CHILD"," WHO","SE"," PLACE"," WAS"," ON"," THAT"," SAME"," DIS","HO","N","OR","ED"," BO","S","OM"," TO"," CON","NE","C","T"," HER"," P","AR","ENT"," FOR"," E","VER"," WITH"," THE"," RA","CE"," AND"," DE","S","C","ENT"," OF"," MO","R","T","AL","S"," AND"," TO"," BE"," FI","N","AL","LY"," A"," B","LESS","ED"," SO","UL"," IN"," HE","A","VE","N"]} +---- +./icefall-asr-multidataset-pruned_transducer_stateless7-2023-05-04/test_wavs/1221-135766-0002.wav +{"text":" YET THESE THOUGHTS AFFECTED HESTER PRYNNE LESS WITH HOPE THAN APPREHENSION","timestamps":"[0.00,0.08,0.32,0.48,0.68,0.92,1.08,1.20,1.28,1.40,1.44,1.64,1.76,1.88,2.04,2.12,2.28,2.32,2.52,2.56,2.88,3.12,3.32,3.52,3.76,3.92,4.00,4.20,4.28,4.40,4.52]","tokens":[" YE","T"," THE","SE"," THOUGHT","S"," A","FF","E","C","TED"," HE","S","TER"," P","RY","N","NE"," ","LESS"," WITH"," HO","PE"," THAN"," A","PP","RE","HE","N","S","ION"]} +---- +num threads: 2 +decoding method: greedy_search +Elapsed seconds: 1.424 s +Real time factor (RTF): 1.424 / 28.165 = 0.051 diff --git a/docs/source/onnx/pretrained_models/offline-transducer/code-zipformer/sherpa-onnx-zipformer-multi-dataset-2023-05-04.txt b/docs/source/onnx/pretrained_models/offline-transducer/code-zipformer/sherpa-onnx-zipformer-multi-dataset-2023-05-04.txt new file mode 100644 index 000000000..6a9415f26 --- /dev/null +++ b/docs/source/onnx/pretrained_models/offline-transducer/code-zipformer/sherpa-onnx-zipformer-multi-dataset-2023-05-04.txt @@ -0,0 +1,20 @@ +/Users/fangjun/open-source/sherpa-onnx/sherpa-onnx/csrc/parse-options.cc:Read:361 ./build/bin/sherpa-onnx-offline --tokens=./icefall-asr-multidataset-pruned_transducer_stateless7-2023-05-04/data/lang_bpe_500/tokens.txt --encoder=./icefall-asr-multidataset-pruned_transducer_stateless7-2023-05-04/exp/encoder-epoch-30-avg-4.onnx 
--decoder=./icefall-asr-multidataset-pruned_transducer_stateless7-2023-05-04/exp/decoder-epoch-30-avg-4.onnx --joiner=./icefall-asr-multidataset-pruned_transducer_stateless7-2023-05-04/exp/joiner-epoch-30-avg-4.onnx ./icefall-asr-multidataset-pruned_transducer_stateless7-2023-05-04/test_wavs/1089-134686-0001.wav ./icefall-asr-multidataset-pruned_transducer_stateless7-2023-05-04/test_wavs/1221-135766-0001.wav ./icefall-asr-multidataset-pruned_transducer_stateless7-2023-05-04/test_wavs/1221-135766-0002.wav + +OfflineRecognizerConfig(feat_config=OfflineFeatureExtractorConfig(sampling_rate=16000, feature_dim=80), model_config=OfflineModelConfig(transducer=OfflineTransducerModelConfig(encoder_filename="./icefall-asr-multidataset-pruned_transducer_stateless7-2023-05-04/exp/encoder-epoch-30-avg-4.onnx", decoder_filename="./icefall-asr-multidataset-pruned_transducer_stateless7-2023-05-04/exp/decoder-epoch-30-avg-4.onnx", joiner_filename="./icefall-asr-multidataset-pruned_transducer_stateless7-2023-05-04/exp/joiner-epoch-30-avg-4.onnx"), paraformer=OfflineParaformerModelConfig(model=""), nemo_ctc=OfflineNemoEncDecCtcModelConfig(model=""), tokens="./icefall-asr-multidataset-pruned_transducer_stateless7-2023-05-04/data/lang_bpe_500/tokens.txt", num_threads=2, debug=False, provider="cpu"), lm_config=OfflineLMConfig(model="", scale=0.5), decoding_method="greedy_search", max_active_paths=4) +Creating recognizer ... +Started +Done! + +./icefall-asr-multidataset-pruned_transducer_stateless7-2023-05-04/test_wavs/1089-134686-0001.wav +{"text":" AFTER EARLY NIGHTFALL THE YELLOW LAMPS WOULD LIGHT UP HERE AND THERE THE SQUALID QUARTER OF THE BROTHELS","timestamps":"[0.00,0.40,0.56,0.64,0.96,1.24,1.32,1.44,1.56,1.76,1.88,1.96,2.16,2.32,2.36,2.48,2.60,2.80,3.08,3.28,3.36,3.56,3.80,4.04,4.24,4.32,4.48,4.64,4.84,4.88,5.00,5.08,5.32,5.44,5.56,5.64,5.80,5.96,6.20]","tokens":[" AFTER"," E","AR","LY"," NIGHT","F","A","LL"," THE"," YE","LL","OW"," LA","M","P","S"," WOULD"," LIGHT"," UP"," HE","RE"," AND"," THERE"," THE"," S","QUA","LI","D"," ","QUA","R","TER"," OF"," THE"," B","RO","TH","EL","S"]} +---- +./icefall-asr-multidataset-pruned_transducer_stateless7-2023-05-04/test_wavs/1221-135766-0001.wav +{"text":" GOD AS A DIRECT CONSEQUENCE OF THE SIN WHICH MAN THUS PUNISHED HAD GIVEN HER A LOVELY CHILD WHOSE PLACE WAS ON THAT SAME DISHONORED BOSOM TO CONNECT HER PARENT FOR EVER WITH THE RACE AND DESCENT OF MORTALS AND TO BE FINALLY A BLESSED SOUL IN HEAVEN","timestamps":"[0.00,0.16,0.44,0.68,0.84,1.00,1.12,1.16,1.32,1.48,1.64,1.80,1.84,2.00,2.12,2.28,2.40,2.64,2.88,3.16,3.28,3.56,3.60,3.76,3.92,4.12,4.36,4.52,4.72,4.92,5.16,5.44,5.72,6.12,6.24,6.48,6.84,7.08,7.28,7.56,7.88,8.12,8.28,8.36,8.48,8.60,8.76,8.88,9.12,9.28,9.48,9.56,9.64,9.80,10.00,10.04,10.20,10.44,10.68,10.80,11.04,11.20,11.40,11.56,11.80,12.00,12.12,12.28,12.32,12.52,12.72,12.84,12.96,13.04,13.24,13.40,13.64,13.80,14.00,14.16,14.24,14.36,14.56,14.72,14.80,15.08,15.32,15.52,15.76,16.04,16.16,16.24,16.36]","tokens":[" GO","D"," AS"," A"," DI","RE","C","T"," CON","SE","QUE","N","CE"," OF"," THE"," S","IN"," WHICH"," MAN"," TH","US"," P","UN","ISH","ED"," HAD"," GIVE","N"," HER"," A"," LOVE","LY"," CHILD"," WHO","SE"," PLACE"," WAS"," ON"," THAT"," SAME"," DIS","HO","N","OR","ED"," BO","S","OM"," TO"," CON","NE","C","T"," HER"," P","AR","ENT"," FOR"," E","VER"," WITH"," THE"," RA","CE"," AND"," DE","S","C","ENT"," OF"," MO","R","T","AL","S"," AND"," TO"," BE"," FI","N","AL","LY"," A"," B","LESS","ED"," SO","UL"," IN"," HE","A","VE","N"]} +---- 
+./icefall-asr-multidataset-pruned_transducer_stateless7-2023-05-04/test_wavs/1221-135766-0002.wav +{"text":" YET THESE THOUGHTS AFFECTED HESTER PRYNNE LESS WITH HOPE THAN APPREHENSION","timestamps":"[0.00,0.08,0.32,0.48,0.68,0.92,1.08,1.20,1.28,1.40,1.44,1.64,1.76,1.88,2.04,2.12,2.24,2.32,2.48,2.56,2.88,3.12,3.32,3.52,3.76,3.92,4.00,4.20,4.28,4.40,4.52]","tokens":[" YE","T"," THE","SE"," THOUGHT","S"," A","FF","E","C","TED"," HE","S","TER"," P","RY","N","NE"," ","LESS"," WITH"," HO","PE"," THAN"," A","PP","RE","HE","N","S","ION"]} +---- +num threads: 2 +decoding method: greedy_search +Elapsed seconds: 1.662 s +Real time factor (RTF): 1.662 / 28.165 = 0.059 diff --git a/docs/source/onnx/pretrained_models/offline-transducer/code-zipformer/sherpa-onnx-zipformer-multi-zh-hans-2023-9-2-int8.txt b/docs/source/onnx/pretrained_models/offline-transducer/code-zipformer/sherpa-onnx-zipformer-multi-zh-hans-2023-9-2-int8.txt new file mode 100644 index 000000000..aaff03f20 --- /dev/null +++ b/docs/source/onnx/pretrained_models/offline-transducer/code-zipformer/sherpa-onnx-zipformer-multi-zh-hans-2023-9-2-int8.txt @@ -0,0 +1,24 @@ +/Users/runner/work/sherpa-onnx/sherpa-onnx/sherpa-onnx/csrc/parse-options.cc:Read:361 sherpa-onnx-offline --tokens=./sherpa-onnx-zipformer-multi-zh-hans-2023-9-2/tokens.txt --encoder=./sherpa-onnx-zipformer-multi-zh-hans-2023-9-2/encoder-epoch-20-avg-1.int8.onnx --decoder=./sherpa-onnx-zipformer-multi-zh-hans-2023-9-2/decoder-epoch-20-avg-1.onnx --joiner=./sherpa-onnx-zipformer-multi-zh-hans-2023-9-2/joiner-epoch-20-avg-1.int8.onnx ./sherpa-onnx-zipformer-multi-zh-hans-2023-9-2/test_wavs/0.wav ./sherpa-onnx-zipformer-multi-zh-hans-2023-9-2/test_wavs/1.wav ./sherpa-onnx-zipformer-multi-zh-hans-2023-9-2/test_wavs/8k.wav + +OfflineRecognizerConfig(feat_config=OfflineFeatureExtractorConfig(sampling_rate=16000, feature_dim=80), model_config=OfflineModelConfig(transducer=OfflineTransducerModelConfig(encoder_filename="./sherpa-onnx-zipformer-multi-zh-hans-2023-9-2/encoder-epoch-20-avg-1.int8.onnx", decoder_filename="./sherpa-onnx-zipformer-multi-zh-hans-2023-9-2/decoder-epoch-20-avg-1.onnx", joiner_filename="./sherpa-onnx-zipformer-multi-zh-hans-2023-9-2/joiner-epoch-20-avg-1.int8.onnx"), paraformer=OfflineParaformerModelConfig(model=""), nemo_ctc=OfflineNemoEncDecCtcModelConfig(model=""), whisper=OfflineWhisperModelConfig(encoder="", decoder="", language="", task="transcribe"), tdnn=OfflineTdnnModelConfig(model=""), tokens="./sherpa-onnx-zipformer-multi-zh-hans-2023-9-2/tokens.txt", num_threads=2, debug=False, provider="cpu", model_type=""), lm_config=OfflineLMConfig(model="", scale=0.5), decoding_method="greedy_search", max_active_paths=4, hotwords_file="", hotwords_score=1.5) +Creating recognizer ... +Started +/Users/runner/work/sherpa-onnx/sherpa-onnx/sherpa-onnx/csrc/offline-stream.cc:AcceptWaveformImpl:117 Creating a resampler: + in_sample_rate: 8000 + output_sample_rate: 16000 + +Done! 
+ +./sherpa-onnx-zipformer-multi-zh-hans-2023-9-2/test_wavs/0.wav +{"text":" 对我做了介绍那么我想说的是大家如果对我的研究感兴趣","timestamps":"[0.00, 0.16, 0.40, 0.60, 0.84, 1.08, 1.60, 1.72, 1.88, 2.04, 2.28, 2.44, 2.60, 2.96, 3.12, 3.32, 3.40, 3.60, 3.76, 3.84, 4.00, 4.16, 4.32, 4.52, 4.56]","tokens":[" 对","我","做","了","介","绍","那","么","我","想","说","的","是","大","家","如","果","对","我","的","研","究","感","兴","趣"]} +---- +./sherpa-onnx-zipformer-multi-zh-hans-2023-9-2/test_wavs/1.wav +{"text":" 重点想谈三个问题首先就是这一轮全球金融动<0xE8><0x8D><0xA1>的表现","timestamps":"[0.00, 0.12, 0.48, 0.68, 0.92, 1.12, 1.28, 1.48, 1.80, 2.04, 2.40, 2.56, 2.76, 2.96, 3.08, 3.32, 3.48, 3.68, 3.84, 4.00, 4.20, 4.24, 4.28, 4.40, 4.60, 4.84]","tokens":[" 重","点","想","谈","三","个","问","题","首","先","就","是","这","一","轮","全","球","金","融","动","<0xE8>","<0x8D>","<0xA1>","的","表","现"]} +---- +./sherpa-onnx-zipformer-multi-zh-hans-2023-9-2/test_wavs/8k.wav +{"text":" 深入地分析这一次全球金融动<0xE8><0x8D><0xA1>背后的根源","timestamps":"[0.00, 0.04, 0.24, 0.52, 0.76, 1.00, 1.40, 1.64, 1.80, 2.12, 2.36, 2.64, 2.80, 3.04, 3.16, 3.20, 3.24, 3.44, 3.64, 3.76, 3.96, 4.20]","tokens":[" ","深","入","地","分","析","这","一","次","全","球","金","融","动","<0xE8>","<0x8D>","<0xA1>","背","后","的","根","源"]} +---- +num threads: 2 +decoding method: greedy_search +Elapsed seconds: 0.305 s +Real time factor (RTF): 0.305 / 15.289 = 0.020 diff --git a/docs/source/onnx/pretrained_models/offline-transducer/code-zipformer/sherpa-onnx-zipformer-multi-zh-hans-2023-9-2.txt b/docs/source/onnx/pretrained_models/offline-transducer/code-zipformer/sherpa-onnx-zipformer-multi-zh-hans-2023-9-2.txt new file mode 100644 index 000000000..23d13da07 --- /dev/null +++ b/docs/source/onnx/pretrained_models/offline-transducer/code-zipformer/sherpa-onnx-zipformer-multi-zh-hans-2023-9-2.txt @@ -0,0 +1,24 @@ +/Users/runner/work/sherpa-onnx/sherpa-onnx/sherpa-onnx/csrc/parse-options.cc:Read:361 sherpa-onnx-offline --tokens=./sherpa-onnx-zipformer-multi-zh-hans-2023-9-2/tokens.txt --encoder=./sherpa-onnx-zipformer-multi-zh-hans-2023-9-2/encoder-epoch-20-avg-1.onnx --decoder=./sherpa-onnx-zipformer-multi-zh-hans-2023-9-2/decoder-epoch-20-avg-1.onnx --joiner=./sherpa-onnx-zipformer-multi-zh-hans-2023-9-2/joiner-epoch-20-avg-1.onnx ./sherpa-onnx-zipformer-multi-zh-hans-2023-9-2/test_wavs/0.wav ./sherpa-onnx-zipformer-multi-zh-hans-2023-9-2/test_wavs/1.wav ./sherpa-onnx-zipformer-multi-zh-hans-2023-9-2/test_wavs/8k.wav + +OfflineRecognizerConfig(feat_config=OfflineFeatureExtractorConfig(sampling_rate=16000, feature_dim=80), model_config=OfflineModelConfig(transducer=OfflineTransducerModelConfig(encoder_filename="./sherpa-onnx-zipformer-multi-zh-hans-2023-9-2/encoder-epoch-20-avg-1.onnx", decoder_filename="./sherpa-onnx-zipformer-multi-zh-hans-2023-9-2/decoder-epoch-20-avg-1.onnx", joiner_filename="./sherpa-onnx-zipformer-multi-zh-hans-2023-9-2/joiner-epoch-20-avg-1.onnx"), paraformer=OfflineParaformerModelConfig(model=""), nemo_ctc=OfflineNemoEncDecCtcModelConfig(model=""), whisper=OfflineWhisperModelConfig(encoder="", decoder="", language="", task="transcribe"), tdnn=OfflineTdnnModelConfig(model=""), tokens="./sherpa-onnx-zipformer-multi-zh-hans-2023-9-2/tokens.txt", num_threads=2, debug=False, provider="cpu", model_type=""), lm_config=OfflineLMConfig(model="", scale=0.5), decoding_method="greedy_search", max_active_paths=4, hotwords_file="", hotwords_score=1.5) +Creating recognizer ... 
+Started +/Users/runner/work/sherpa-onnx/sherpa-onnx/sherpa-onnx/csrc/offline-stream.cc:AcceptWaveformImpl:117 Creating a resampler: + in_sample_rate: 8000 + output_sample_rate: 16000 + +Done! + +./sherpa-onnx-zipformer-multi-zh-hans-2023-9-2/test_wavs/0.wav +{"text":" 对我做了介绍那么我想说的是大家如果对我的研究感兴趣","timestamps":"[0.00, 0.16, 0.40, 0.60, 0.84, 1.08, 1.60, 1.72, 1.88, 2.04, 2.24, 2.44, 2.60, 2.96, 3.12, 3.32, 3.40, 3.60, 3.72, 3.84, 4.00, 4.16, 4.32, 4.52, 4.68]","tokens":[" 对","我","做","了","介","绍","那","么","我","想","说","的","是","大","家","如","果","对","我","的","研","究","感","兴","趣"]} +---- +./sherpa-onnx-zipformer-multi-zh-hans-2023-9-2/test_wavs/1.wav +{"text":" 重点想谈三个问题首先就是这一轮全球金融动<0xE8><0x8D><0xA1>的表现","timestamps":"[0.00, 0.12, 0.48, 0.68, 0.92, 1.12, 1.28, 1.48, 1.80, 2.04, 2.40, 2.56, 2.76, 2.96, 3.08, 3.32, 3.48, 3.68, 3.84, 4.00, 4.20, 4.24, 4.28, 4.40, 4.60, 4.84]","tokens":[" 重","点","想","谈","三","个","问","题","首","先","就","是","这","一","轮","全","球","金","融","动","<0xE8>","<0x8D>","<0xA1>","的","表","现"]} +---- +./sherpa-onnx-zipformer-multi-zh-hans-2023-9-2/test_wavs/8k.wav +{"text":" 深入地分析这一次全球金融动<0xE8><0x8D><0xA1>背后的根源","timestamps":"[0.00, 0.04, 0.24, 0.52, 0.76, 1.00, 1.40, 1.64, 1.80, 2.12, 2.32, 2.64, 2.80, 3.00, 3.20, 3.24, 3.28, 3.44, 3.64, 3.76, 3.96, 4.20]","tokens":[" ","深","入","地","分","析","这","一","次","全","球","金","融","动","<0xE8>","<0x8D>","<0xA1>","背","后","的","根","源"]} +---- +num threads: 2 +decoding method: greedy_search +Elapsed seconds: 0.362 s +Real time factor (RTF): 0.362 / 15.289 = 0.024 diff --git a/docs/source/onnx/pretrained_models/offline-transducer/code-zipformer/sherpa-onnx-zipformer-ru-2024-09-18.int8.txt b/docs/source/onnx/pretrained_models/offline-transducer/code-zipformer/sherpa-onnx-zipformer-ru-2024-09-18.int8.txt new file mode 100644 index 000000000..9384d0812 --- /dev/null +++ b/docs/source/onnx/pretrained_models/offline-transducer/code-zipformer/sherpa-onnx-zipformer-ru-2024-09-18.int8.txt @@ -0,0 +1,14 @@ +/Users/fangjun/open-source/sherpa-onnx/sherpa-onnx/csrc/parse-options.cc:Read:375 ./build/bin/sherpa-onnx-offline --tokens=./sherpa-onnx-zipformer-ru-2024-09-18/tokens.txt --encoder=./sherpa-onnx-zipformer-ru-2024-09-18/encoder.int8.onnx --decoder=./sherpa-onnx-zipformer-ru-2024-09-18/decoder.onnx --joiner=./sherpa-onnx-zipformer-ru-2024-09-18/joiner.int8.onnx --num-threads=1 ./sherpa-onnx-zipformer-ru-2024-09-18/test_wavs/1.wav + +OfflineRecognizerConfig(feat_config=FeatureExtractorConfig(sampling_rate=16000, feature_dim=80, low_freq=20, high_freq=-400, dither=0), model_config=OfflineModelConfig(transducer=OfflineTransducerModelConfig(encoder_filename="./sherpa-onnx-zipformer-ru-2024-09-18/encoder.int8.onnx", decoder_filename="./sherpa-onnx-zipformer-ru-2024-09-18/decoder.onnx", joiner_filename="./sherpa-onnx-zipformer-ru-2024-09-18/joiner.int8.onnx"), paraformer=OfflineParaformerModelConfig(model=""), nemo_ctc=OfflineNemoEncDecCtcModelConfig(model=""), whisper=OfflineWhisperModelConfig(encoder="", decoder="", language="", task="transcribe", tail_paddings=-1), tdnn=OfflineTdnnModelConfig(model=""), zipformer_ctc=OfflineZipformerCtcModelConfig(model=""), wenet_ctc=OfflineWenetCtcModelConfig(model=""), sense_voice=OfflineSenseVoiceModelConfig(model="", language="auto", use_itn=False), telespeech_ctc="", tokens="./sherpa-onnx-zipformer-ru-2024-09-18/tokens.txt", num_threads=1, debug=False, provider="cpu", model_type="", modeling_unit="cjkchar", bpe_vocab=""), lm_config=OfflineLMConfig(model="", scale=0.5), ctc_fst_decoder_config=OfflineCtcFstDecoderConfig(graph="", 
max_active=3000), decoding_method="greedy_search", max_active_paths=4, hotwords_file="", hotwords_score=1.5, blank_penalty=0, rule_fsts="", rule_fars="") +Creating recognizer ... +Started +Done! + +./sherpa-onnx-zipformer-ru-2024-09-18/test_wavs/1.wav +{"lang": "", "emotion": "", "event": "", "text": " родион потапыч высчитывал каждый новый вершок углубления и давно определил про себя", "timestamps": [0.00, 0.16, 0.28, 0.52, 0.68, 0.84, 0.96, 1.12, 1.44, 1.64, 1.76, 1.92, 2.08, 2.16, 2.36, 2.52, 2.60, 2.80, 2.96, 3.04, 3.20, 3.40, 3.44, 3.60, 3.68, 3.80, 3.88, 4.00, 4.16, 4.20, 4.68, 4.88, 5.08, 5.20, 5.44, 5.64, 5.68, 5.88, 6.32, 6.56], "tokens":[" ро", "ди", "он", " по", "та", "п", "ы", "ч", " вы", "с", "чи", "ты", "ва", "л", " ка", "жд", "ый", " но", "в", "ый", " вер", "ш", "о", "к", " у", "г", "лу", "б", "л", "ения", " и", " да", "в", "но", " оп", "ре", "дел", "ил", " про", " себя"], "words": []} +---- +num threads: 1 +decoding method: greedy_search +Elapsed seconds: 0.280 s +Real time factor (RTF): 0.280 / 7.080 = 0.040 diff --git a/docs/source/onnx/pretrained_models/offline-transducer/code-zipformer/sherpa-onnx-zipformer-ru-2024-09-18.txt b/docs/source/onnx/pretrained_models/offline-transducer/code-zipformer/sherpa-onnx-zipformer-ru-2024-09-18.txt new file mode 100644 index 000000000..ac4c6908d --- /dev/null +++ b/docs/source/onnx/pretrained_models/offline-transducer/code-zipformer/sherpa-onnx-zipformer-ru-2024-09-18.txt @@ -0,0 +1,14 @@ +/Users/fangjun/open-source/sherpa-onnx/sherpa-onnx/csrc/parse-options.cc:Read:375 ./build/bin/sherpa-onnx-offline --tokens=./sherpa-onnx-zipformer-ru-2024-09-18/tokens.txt --encoder=./sherpa-onnx-zipformer-ru-2024-09-18/encoder.onnx --decoder=./sherpa-onnx-zipformer-ru-2024-09-18/decoder.onnx --joiner=./sherpa-onnx-zipformer-ru-2024-09-18/joiner.onnx --num-threads=1 ./sherpa-onnx-zipformer-ru-2024-09-18/test_wavs/1.wav + +OfflineRecognizerConfig(feat_config=FeatureExtractorConfig(sampling_rate=16000, feature_dim=80, low_freq=20, high_freq=-400, dither=0), model_config=OfflineModelConfig(transducer=OfflineTransducerModelConfig(encoder_filename="./sherpa-onnx-zipformer-ru-2024-09-18/encoder.onnx", decoder_filename="./sherpa-onnx-zipformer-ru-2024-09-18/decoder.onnx", joiner_filename="./sherpa-onnx-zipformer-ru-2024-09-18/joiner.onnx"), paraformer=OfflineParaformerModelConfig(model=""), nemo_ctc=OfflineNemoEncDecCtcModelConfig(model=""), whisper=OfflineWhisperModelConfig(encoder="", decoder="", language="", task="transcribe", tail_paddings=-1), tdnn=OfflineTdnnModelConfig(model=""), zipformer_ctc=OfflineZipformerCtcModelConfig(model=""), wenet_ctc=OfflineWenetCtcModelConfig(model=""), sense_voice=OfflineSenseVoiceModelConfig(model="", language="auto", use_itn=False), telespeech_ctc="", tokens="./sherpa-onnx-zipformer-ru-2024-09-18/tokens.txt", num_threads=1, debug=False, provider="cpu", model_type="", modeling_unit="cjkchar", bpe_vocab=""), lm_config=OfflineLMConfig(model="", scale=0.5), ctc_fst_decoder_config=OfflineCtcFstDecoderConfig(graph="", max_active=3000), decoding_method="greedy_search", max_active_paths=4, hotwords_file="", hotwords_score=1.5, blank_penalty=0, rule_fsts="", rule_fars="") +Creating recognizer ... +Started +Done! 
+ +./sherpa-onnx-zipformer-ru-2024-09-18/test_wavs/1.wav +{"lang": "", "emotion": "", "event": "", "text": " родион потапыч высчитывал каждый новый вершок углубления и давно определил про себя", "timestamps": [0.00, 0.16, 0.28, 0.52, 0.68, 0.84, 0.96, 1.12, 1.44, 1.64, 1.76, 1.92, 2.08, 2.16, 2.36, 2.48, 2.60, 2.80, 2.96, 3.04, 3.20, 3.40, 3.44, 3.56, 3.68, 3.80, 3.88, 4.00, 4.16, 4.20, 4.64, 4.88, 5.08, 5.20, 5.44, 5.64, 5.68, 5.92, 6.32, 6.56], "tokens":[" ро", "ди", "он", " по", "та", "п", "ы", "ч", " вы", "с", "чи", "ты", "ва", "л", " ка", "жд", "ый", " но", "в", "ый", " вер", "ш", "о", "к", " у", "г", "лу", "б", "л", "ения", " и", " да", "в", "но", " оп", "ре", "дел", "ил", " про", " себя"], "words": []} +---- +num threads: 1 +decoding method: greedy_search +Elapsed seconds: 0.336 s +Real time factor (RTF): 0.336 / 7.080 = 0.047 diff --git a/docs/source/onnx/pretrained_models/offline-transducer/code-zipformer/sherpa-onnx-zipformer-small-en-2023-06-26-int8.txt b/docs/source/onnx/pretrained_models/offline-transducer/code-zipformer/sherpa-onnx-zipformer-small-en-2023-06-26-int8.txt new file mode 100644 index 000000000..fa9dde9bb --- /dev/null +++ b/docs/source/onnx/pretrained_models/offline-transducer/code-zipformer/sherpa-onnx-zipformer-small-en-2023-06-26-int8.txt @@ -0,0 +1,24 @@ +/Users/fangjun/open-source/sherpa-onnx/sherpa-onnx/csrc/parse-options.cc:Read:361 ./build/bin/sherpa-onnx-offline --tokens=./sherpa-onnx-zipformer-small-en-2023-06-26/tokens.txt --encoder=./sherpa-onnx-zipformer-small-en-2023-06-26/encoder-epoch-99-avg-1.int8.onnx --decoder=./sherpa-onnx-zipformer-small-en-2023-06-26/decoder-epoch-99-avg-1.onnx --joiner=./sherpa-onnx-zipformer-small-en-2023-06-26/joiner-epoch-99-avg-1.int8.onnx ./sherpa-onnx-zipformer-small-en-2023-06-26/test_wavs/0.wav ./sherpa-onnx-zipformer-small-en-2023-06-26/test_wavs/1.wav ./sherpa-onnx-zipformer-small-en-2023-06-26/test_wavs/8k.wav + +OfflineRecognizerConfig(feat_config=OfflineFeatureExtractorConfig(sampling_rate=16000, feature_dim=80), model_config=OfflineModelConfig(transducer=OfflineTransducerModelConfig(encoder_filename="./sherpa-onnx-zipformer-small-en-2023-06-26/encoder-epoch-99-avg-1.int8.onnx", decoder_filename="./sherpa-onnx-zipformer-small-en-2023-06-26/decoder-epoch-99-avg-1.onnx", joiner_filename="./sherpa-onnx-zipformer-small-en-2023-06-26/joiner-epoch-99-avg-1.int8.onnx"), paraformer=OfflineParaformerModelConfig(model=""), nemo_ctc=OfflineNemoEncDecCtcModelConfig(model=""), tokens="./sherpa-onnx-zipformer-small-en-2023-06-26/tokens.txt", num_threads=2, debug=False, provider="cpu"), lm_config=OfflineLMConfig(model="", scale=0.5), decoding_method="greedy_search", max_active_paths=4, context_score=1.5) +Creating recognizer ... +Started +/Users/fangjun/open-source/sherpa-onnx/sherpa-onnx/csrc/offline-stream.cc:AcceptWaveformImpl:108 Creating a resampler: + in_sample_rate: 8000 + output_sample_rate: 16000 + +Done! 
+ +./sherpa-onnx-zipformer-small-en-2023-06-26/test_wavs/0.wav +{"text":" AFTER EARLY NIGHTFALL THE YELLOW LAMPS WOULD LIGHT UP HERE AND THERE THE SQUALID QUARTER OF THE BROTHELS","timestamps":"[0.00, 0.64, 0.76, 0.84, 1.08, 1.36, 1.44, 1.56, 1.72, 1.84, 1.96, 2.04, 2.20, 2.32, 2.36, 2.44, 2.60, 2.76, 3.04, 3.24, 3.40, 3.52, 3.72, 4.00, 4.20, 4.28, 4.48, 4.64, 4.80, 4.84, 4.96, 5.00, 5.28, 5.40, 5.52, 5.60, 5.76, 5.92, 6.08]","tokens":[" AFTER"," E","AR","LY"," NIGHT","F","A","LL"," THE"," YE","LL","OW"," LA","M","P","S"," WOULD"," LIGHT"," UP"," HE","RE"," AND"," THERE"," THE"," S","QUA","LI","D"," ","QUA","R","TER"," OF"," THE"," B","RO","TH","EL","S"]} +---- +./sherpa-onnx-zipformer-small-en-2023-06-26/test_wavs/1.wav +{"text":" GOD AS A DIRECT CONSEQUENCE OF THE SIN WHICH MAN THUS PUNISHED HAD GIVEN HER A LOVELY CHILD WHOSE PLACE WAS ON THAT SAME DISHONOURED BOSOM TO CONNECT HER PARENT FOREVER WITH THE RACE AND DESCENT OF MORTALS AND TO BE FINALLY A BLESSED SOUL IN HEAVEN","timestamps":"[0.00, 0.32, 0.64, 0.80, 0.96, 1.08, 1.16, 1.20, 1.32, 1.52, 1.68, 1.80, 1.88, 2.04, 2.16, 2.32, 2.40, 2.64, 2.88, 3.16, 3.20, 3.44, 3.52, 3.72, 3.88, 4.16, 4.44, 4.60, 4.76, 4.96, 5.16, 5.36, 5.60, 6.16, 6.32, 6.52, 6.88, 7.16, 7.32, 7.60, 7.96, 8.16, 8.28, 8.36, 8.48, 8.64, 8.76, 8.84, 9.04, 9.28, 9.44, 9.52, 9.60, 9.68, 9.88, 9.92, 10.12, 10.52, 10.76, 10.80, 11.08, 11.20, 11.36, 11.56, 11.76, 11.96, 12.08, 12.24, 12.28, 12.48, 12.68, 12.80, 12.92, 13.04, 13.16, 13.48, 13.72, 13.84, 14.04, 14.20, 14.28, 14.40, 14.56, 14.68, 14.76, 15.00, 15.28, 15.48, 15.68, 15.92, 16.08, 16.12, 16.20]","tokens":[" GO","D"," AS"," A"," DI","RE","C","T"," CON","SE","QUE","N","CE"," OF"," THE"," S","IN"," WHICH"," MAN"," TH","US"," P","UN","ISH","ED"," HAD"," GIVE","N"," HER"," A"," LOVE","LY"," CHILD"," WHO","SE"," PLACE"," WAS"," ON"," THAT"," SAME"," DIS","HO","N","OUR","ED"," BO","S","OM"," TO"," CON","NE","C","T"," HER"," P","AR","ENT"," FOR","E","VER"," WITH"," THE"," RA","CE"," AND"," DE","S","C","ENT"," OF"," MO","R","T","AL","S"," AND"," TO"," BE"," FI","N","AL","LY"," A"," B","LESS","ED"," SO","UL"," IN"," HE","A","VE","N"]} +---- +./sherpa-onnx-zipformer-small-en-2023-06-26/test_wavs/8k.wav +{"text":" YET THESE THOUGHTS AFFECTED HESTER PRYNNE LESS WITH HOPE THAN APPREHENSION","timestamps":"[0.00, 0.32, 0.48, 0.64, 0.84, 1.08, 1.20, 1.32, 1.36, 1.44, 1.48, 1.64, 1.76, 1.88, 2.08, 2.12, 2.24, 2.28, 2.44, 2.48, 2.80, 3.04, 3.24, 3.48, 3.72, 3.88, 3.92, 4.08, 4.16, 4.24, 4.36]","tokens":[" YE","T"," THE","SE"," THOUGHT","S"," A","FF","E","C","TED"," HE","S","TER"," P","RY","N","NE"," ","LESS"," WITH"," HO","PE"," THAN"," A","PP","RE","HE","N","S","ION"]} +---- +num threads: 2 +decoding method: greedy_search +Elapsed seconds: 0.891 s +Real time factor (RTF): 0.891 / 28.165 = 0.032 diff --git a/docs/source/onnx/pretrained_models/offline-transducer/code-zipformer/sherpa-onnx-zipformer-small-en-2023-06-26.txt b/docs/source/onnx/pretrained_models/offline-transducer/code-zipformer/sherpa-onnx-zipformer-small-en-2023-06-26.txt new file mode 100644 index 000000000..5db0cf311 --- /dev/null +++ b/docs/source/onnx/pretrained_models/offline-transducer/code-zipformer/sherpa-onnx-zipformer-small-en-2023-06-26.txt @@ -0,0 +1,24 @@ +/Users/fangjun/open-source/sherpa-onnx/sherpa-onnx/csrc/parse-options.cc:Read:361 ./build/bin/sherpa-onnx-offline --tokens=./sherpa-onnx-zipformer-small-en-2023-06-26/tokens.txt --encoder=./sherpa-onnx-zipformer-small-en-2023-06-26/encoder-epoch-99-avg-1.onnx 
--decoder=./sherpa-onnx-zipformer-small-en-2023-06-26/decoder-epoch-99-avg-1.onnx --joiner=./sherpa-onnx-zipformer-small-en-2023-06-26/joiner-epoch-99-avg-1.onnx ./sherpa-onnx-zipformer-small-en-2023-06-26/test_wavs/0.wav ./sherpa-onnx-zipformer-small-en-2023-06-26/test_wavs/1.wav ./sherpa-onnx-zipformer-small-en-2023-06-26/test_wavs/8k.wav + +OfflineRecognizerConfig(feat_config=OfflineFeatureExtractorConfig(sampling_rate=16000, feature_dim=80), model_config=OfflineModelConfig(transducer=OfflineTransducerModelConfig(encoder_filename="./sherpa-onnx-zipformer-small-en-2023-06-26/encoder-epoch-99-avg-1.onnx", decoder_filename="./sherpa-onnx-zipformer-small-en-2023-06-26/decoder-epoch-99-avg-1.onnx", joiner_filename="./sherpa-onnx-zipformer-small-en-2023-06-26/joiner-epoch-99-avg-1.onnx"), paraformer=OfflineParaformerModelConfig(model=""), nemo_ctc=OfflineNemoEncDecCtcModelConfig(model=""), tokens="./sherpa-onnx-zipformer-small-en-2023-06-26/tokens.txt", num_threads=2, debug=False, provider="cpu"), lm_config=OfflineLMConfig(model="", scale=0.5), decoding_method="greedy_search", max_active_paths=4, context_score=1.5) +Creating recognizer ... +Started +/Users/fangjun/open-source/sherpa-onnx/sherpa-onnx/csrc/offline-stream.cc:AcceptWaveformImpl:108 Creating a resampler: + in_sample_rate: 8000 + output_sample_rate: 16000 + +Done! + +./sherpa-onnx-zipformer-small-en-2023-06-26/test_wavs/0.wav +{"text":" AFTER EARLY NIGHTFALL THE YELLOW LAMPS WOULD LIGHT UP HERE AND THERE THE SQUALID QUARTER OF THE BROTHELS","timestamps":"[0.00, 0.64, 0.76, 0.84, 1.12, 1.36, 1.44, 1.56, 1.72, 1.84, 1.96, 2.04, 2.20, 2.32, 2.36, 2.44, 2.60, 2.76, 3.04, 3.24, 3.40, 3.52, 3.72, 4.04, 4.20, 4.28, 4.48, 4.64, 4.80, 4.84, 4.96, 5.00, 5.28, 5.40, 5.52, 5.60, 5.76, 5.92, 6.08]","tokens":[" AFTER"," E","AR","LY"," NIGHT","F","A","LL"," THE"," YE","LL","OW"," LA","M","P","S"," WOULD"," LIGHT"," UP"," HE","RE"," AND"," THERE"," THE"," S","QUA","LI","D"," ","QUA","R","TER"," OF"," THE"," B","RO","TH","EL","S"]} +---- +./sherpa-onnx-zipformer-small-en-2023-06-26/test_wavs/1.wav +{"text":" GOD AS A DIRECT CONSEQUENCE OF THE SIN WHICH MAN THUS PUNISHED HAD GIVEN HER A LOVELY CHILD WHOSE PLACE WAS ON THAT SAME DISHONOURED BOSOM TO CONNECT HER PARENT FOREVER WITH THE RACE AND DESCENT OF MORTALS AND TO BE FINALLY A BLESSED SOUL IN HEAVEN","timestamps":"[0.00, 0.32, 0.64, 0.80, 0.96, 1.08, 1.16, 1.20, 1.32, 1.52, 1.68, 1.80, 1.88, 2.04, 2.16, 2.32, 2.40, 2.64, 2.88, 3.16, 3.20, 3.44, 3.52, 3.72, 3.88, 4.16, 4.44, 4.60, 4.76, 4.96, 5.16, 5.36, 5.60, 6.16, 6.32, 6.52, 6.88, 7.16, 7.32, 7.60, 7.96, 8.16, 8.28, 8.36, 8.48, 8.64, 8.76, 8.84, 9.04, 9.28, 9.44, 9.52, 9.60, 9.68, 9.88, 9.92, 10.12, 10.52, 10.76, 10.80, 11.08, 11.20, 11.36, 11.56, 11.76, 11.96, 12.08, 12.24, 12.28, 12.48, 12.68, 12.80, 12.92, 13.00, 13.20, 13.48, 13.72, 13.84, 14.04, 14.20, 14.28, 14.40, 14.56, 14.68, 14.76, 15.00, 15.24, 15.48, 15.68, 15.92, 16.08, 16.12, 16.20]","tokens":[" GO","D"," AS"," A"," DI","RE","C","T"," CON","SE","QUE","N","CE"," OF"," THE"," S","IN"," WHICH"," MAN"," TH","US"," P","UN","ISH","ED"," HAD"," GIVE","N"," HER"," A"," LOVE","LY"," CHILD"," WHO","SE"," PLACE"," WAS"," ON"," THAT"," SAME"," DIS","HO","N","OUR","ED"," BO","S","OM"," TO"," CON","NE","C","T"," HER"," P","AR","ENT"," FOR","E","VER"," WITH"," THE"," RA","CE"," AND"," DE","S","C","ENT"," OF"," MO","R","T","AL","S"," AND"," TO"," BE"," FI","N","AL","LY"," A"," B","LESS","ED"," SO","UL"," IN"," HE","A","VE","N"]} +---- +./sherpa-onnx-zipformer-small-en-2023-06-26/test_wavs/8k.wav 
+{"text":" YET THESE THOUGHTS AFFECTED HESTER PRYNNE LESS WITH HOPE THAN APPREHENSION","timestamps":"[0.00, 0.32, 0.48, 0.64, 0.84, 1.08, 1.20, 1.32, 1.36, 1.44, 1.48, 1.64, 1.76, 1.88, 2.08, 2.12, 2.24, 2.28, 2.44, 2.48, 2.80, 3.04, 3.24, 3.48, 3.72, 3.88, 3.92, 4.08, 4.16, 4.24, 4.36]","tokens":[" YE","T"," THE","SE"," THOUGHT","S"," A","FF","E","C","TED"," HE","S","TER"," P","RY","N","NE"," ","LESS"," WITH"," HO","PE"," THAN"," A","PP","RE","HE","N","S","ION"]} +---- +num threads: 2 +decoding method: greedy_search +Elapsed seconds: 0.953 s +Real time factor (RTF): 0.953 / 28.165 = 0.034 diff --git a/docs/source/onnx/pretrained_models/offline-transducer/code-zipformer/sherpa-onnx-zipformer-thai-2024-06-20-int8.txt b/docs/source/onnx/pretrained_models/offline-transducer/code-zipformer/sherpa-onnx-zipformer-thai-2024-06-20-int8.txt new file mode 100644 index 000000000..ead722fea --- /dev/null +++ b/docs/source/onnx/pretrained_models/offline-transducer/code-zipformer/sherpa-onnx-zipformer-thai-2024-06-20-int8.txt @@ -0,0 +1,14 @@ +/Users/fangjun/open-source/sherpa-onnx/sherpa-onnx/csrc/parse-options.cc:Read:360 ./build/bin/sherpa-onnx-offline --tokens=./sherpa-onnx-zipformer-thai-2024-06-20/tokens.txt --encoder=./sherpa-onnx-zipformer-thai-2024-06-20/encoder-epoch-12-avg-5.int8.onnx --decoder=./sherpa-onnx-zipformer-thai-2024-06-20/decoder-epoch-12-avg-5.onnx --joiner=./sherpa-onnx-zipformer-thai-2024-06-20/joiner-epoch-12-avg-5.int8.onnx ./sherpa-onnx-zipformer-thai-2024-06-20/test_wavs/0.wav + +OfflineRecognizerConfig(feat_config=FeatureExtractorConfig(sampling_rate=16000, feature_dim=80, low_freq=20, high_freq=-400, dither=0), model_config=OfflineModelConfig(transducer=OfflineTransducerModelConfig(encoder_filename="./sherpa-onnx-zipformer-thai-2024-06-20/encoder-epoch-12-avg-5.int8.onnx", decoder_filename="./sherpa-onnx-zipformer-thai-2024-06-20/decoder-epoch-12-avg-5.onnx", joiner_filename="./sherpa-onnx-zipformer-thai-2024-06-20/joiner-epoch-12-avg-5.int8.onnx"), paraformer=OfflineParaformerModelConfig(model=""), nemo_ctc=OfflineNemoEncDecCtcModelConfig(model=""), whisper=OfflineWhisperModelConfig(encoder="", decoder="", language="", task="transcribe", tail_paddings=-1), tdnn=OfflineTdnnModelConfig(model=""), zipformer_ctc=OfflineZipformerCtcModelConfig(model=""), wenet_ctc=OfflineWenetCtcModelConfig(model=""), telespeech_ctc="", tokens="./sherpa-onnx-zipformer-thai-2024-06-20/tokens.txt", num_threads=2, debug=False, provider="cpu", model_type="", modeling_unit="cjkchar", bpe_vocab=""), lm_config=OfflineLMConfig(model="", scale=0.5), ctc_fst_decoder_config=OfflineCtcFstDecoderConfig(graph="", max_active=3000), decoding_method="greedy_search", max_active_paths=4, hotwords_file="", hotwords_score=1.5, blank_penalty=0, rule_fsts="", rule_fars="") +Creating recognizer ... +Started +Done! 
+ +./sherpa-onnx-zipformer-thai-2024-06-20/test_wavs/0.wav +{"text": " เดี๋ยวเกมในนัดต่อไปต้องไปเจอกับทางอินโดนีเซียนะครับ", "timestamps": [0.00, 0.24, 0.44, 0.64, 0.84, 1.20, 1.84, 2.32, 2.64, 3.12, 3.64, 3.80, 3.88, 4.28], "tokens":[" เดี๋ยว", "เกม", "ใน", "นัด", "ต่อไป", "ต้อง", "ไปเจอ", "กับ", "ทาง", "อิน", "โดน", "ี", "เซีย", "นะครับ"], "words": []} +---- +num threads: 2 +decoding method: greedy_search +Elapsed seconds: 0.150 s +Real time factor (RTF): 0.150 / 4.496 = 0.033 diff --git a/docs/source/onnx/pretrained_models/offline-transducer/code-zipformer/sherpa-onnx-zipformer-thai-2024-06-20.txt b/docs/source/onnx/pretrained_models/offline-transducer/code-zipformer/sherpa-onnx-zipformer-thai-2024-06-20.txt new file mode 100644 index 000000000..e4d893cae --- /dev/null +++ b/docs/source/onnx/pretrained_models/offline-transducer/code-zipformer/sherpa-onnx-zipformer-thai-2024-06-20.txt @@ -0,0 +1,14 @@ +/Users/fangjun/open-source/sherpa-onnx/sherpa-onnx/csrc/parse-options.cc:Read:360 ./build/bin/sherpa-onnx-offline --tokens=./sherpa-onnx-zipformer-thai-2024-06-20/tokens.txt --encoder=./sherpa-onnx-zipformer-thai-2024-06-20/encoder-epoch-12-avg-5.onnx --decoder=./sherpa-onnx-zipformer-thai-2024-06-20/decoder-epoch-12-avg-5.onnx --joiner=./sherpa-onnx-zipformer-thai-2024-06-20/joiner-epoch-12-avg-5.onnx ./sherpa-onnx-zipformer-thai-2024-06-20/test_wavs/0.wav + +OfflineRecognizerConfig(feat_config=FeatureExtractorConfig(sampling_rate=16000, feature_dim=80, low_freq=20, high_freq=-400, dither=0), model_config=OfflineModelConfig(transducer=OfflineTransducerModelConfig(encoder_filename="./sherpa-onnx-zipformer-thai-2024-06-20/encoder-epoch-12-avg-5.onnx", decoder_filename="./sherpa-onnx-zipformer-thai-2024-06-20/decoder-epoch-12-avg-5.onnx", joiner_filename="./sherpa-onnx-zipformer-thai-2024-06-20/joiner-epoch-12-avg-5.onnx"), paraformer=OfflineParaformerModelConfig(model=""), nemo_ctc=OfflineNemoEncDecCtcModelConfig(model=""), whisper=OfflineWhisperModelConfig(encoder="", decoder="", language="", task="transcribe", tail_paddings=-1), tdnn=OfflineTdnnModelConfig(model=""), zipformer_ctc=OfflineZipformerCtcModelConfig(model=""), wenet_ctc=OfflineWenetCtcModelConfig(model=""), telespeech_ctc="", tokens="./sherpa-onnx-zipformer-thai-2024-06-20/tokens.txt", num_threads=2, debug=False, provider="cpu", model_type="", modeling_unit="cjkchar", bpe_vocab=""), lm_config=OfflineLMConfig(model="", scale=0.5), ctc_fst_decoder_config=OfflineCtcFstDecoderConfig(graph="", max_active=3000), decoding_method="greedy_search", max_active_paths=4, hotwords_file="", hotwords_score=1.5, blank_penalty=0, rule_fsts="", rule_fars="") +Creating recognizer ... +Started +Done! 
+ +./sherpa-onnx-zipformer-thai-2024-06-20/test_wavs/0.wav +{"text": " แต่เดี๋ยวเกมในนัดต่อไปต้องไปเจอกับทางอินโดนีเซียอะไรอย่างนี้", "timestamps": [0.00, 0.08, 0.24, 0.44, 0.64, 0.84, 1.20, 1.84, 2.32, 2.64, 3.12, 3.64, 3.80, 3.88, 4.28], "tokens":[" แต่", "เดี๋ยว", "เกม", "ใน", "นัด", "ต่อไป", "ต้อง", "ไปเจอ", "กับ", "ทาง", "อิน", "โดน", "ี", "เซีย", "อะไรอย่างนี้"], "words": []} +---- +num threads: 2 +decoding method: greedy_search +Elapsed seconds: 0.181 s +Real time factor (RTF): 0.181 / 4.496 = 0.040 diff --git a/docs/source/onnx/pretrained_models/offline-transducer/code-zipformer/sherpa-onnx-zipformer-zh-en-2023-11-22-int8.txt b/docs/source/onnx/pretrained_models/offline-transducer/code-zipformer/sherpa-onnx-zipformer-zh-en-2023-11-22-int8.txt new file mode 100644 index 000000000..d5fd28da1 --- /dev/null +++ b/docs/source/onnx/pretrained_models/offline-transducer/code-zipformer/sherpa-onnx-zipformer-zh-en-2023-11-22-int8.txt @@ -0,0 +1,14 @@ +/Users/fangjun/open-source/sherpa-onnx/sherpa-onnx/csrc/parse-options.cc:Read:375 ./build/bin/sherpa-onnx-offline --tokens=./sherpa-onnx-zipformer-zh-en-2023-11-22/tokens.txt --encoder=./sherpa-onnx-zipformer-zh-en-2023-11-22/encoder-epoch-34-avg-19.int8.onnx --decoder=./sherpa-onnx-zipformer-zh-en-2023-11-22/decoder-epoch-34-avg-19.onnx --joiner=./sherpa-onnx-zipformer-zh-en-2023-11-22/joiner-epoch-34-avg-19.int8.onnx --num-threads=1 ./sherpa-onnx-zipformer-zh-en-2023-11-22/test_wavs/0.wav + +OfflineRecognizerConfig(feat_config=FeatureExtractorConfig(sampling_rate=16000, feature_dim=80, low_freq=20, high_freq=-400, dither=0), model_config=OfflineModelConfig(transducer=OfflineTransducerModelConfig(encoder_filename="./sherpa-onnx-zipformer-zh-en-2023-11-22/encoder-epoch-34-avg-19.int8.onnx", decoder_filename="./sherpa-onnx-zipformer-zh-en-2023-11-22/decoder-epoch-34-avg-19.onnx", joiner_filename="./sherpa-onnx-zipformer-zh-en-2023-11-22/joiner-epoch-34-avg-19.int8.onnx"), paraformer=OfflineParaformerModelConfig(model=""), nemo_ctc=OfflineNemoEncDecCtcModelConfig(model=""), whisper=OfflineWhisperModelConfig(encoder="", decoder="", language="", task="transcribe", tail_paddings=-1), tdnn=OfflineTdnnModelConfig(model=""), zipformer_ctc=OfflineZipformerCtcModelConfig(model=""), wenet_ctc=OfflineWenetCtcModelConfig(model=""), sense_voice=OfflineSenseVoiceModelConfig(model="", language="auto", use_itn=False), moonshine=OfflineMoonshineModelConfig(preprocessor="", encoder="", uncached_decoder="", cached_decoder=""), telespeech_ctc="", tokens="./sherpa-onnx-zipformer-zh-en-2023-11-22/tokens.txt", num_threads=1, debug=False, provider="cpu", model_type="", modeling_unit="cjkchar", bpe_vocab=""), lm_config=OfflineLMConfig(model="", scale=0.5), ctc_fst_decoder_config=OfflineCtcFstDecoderConfig(graph="", max_active=3000), decoding_method="greedy_search", max_active_paths=4, hotwords_file="", hotwords_score=1.5, blank_penalty=0, rule_fsts="", rule_fars="") +Creating recognizer ... +Started +Done! 
+ +./sherpa-onnx-zipformer-zh-en-2023-11-22/test_wavs/0.wav +{"lang": "", "emotion": "", "event": "", "text": "SPEND ON 在什么上面花费它不光是时间也可以是金钱", "timestamps": [0.00, 0.04, 0.32, 0.48, 0.64, 0.72, 0.88, 1.00, 1.24, 1.36, 1.60, 1.80, 1.96, 2.12, 2.32, 2.44, 2.60, 2.72, 2.80, 2.88, 3.00, 3.20], "tokens":["▁SP", "END", "▁ON", "▁ƌŁŎ", "▁Ƌšġ", "▁Ƌşĩ", "▁ƋŞī", "▁ƐłŇ", "▁Əīŗ", "▁ƏŚş", "▁ƌŔĤ", "▁ƋŞĮ", "▁ƌĦĪ", "▁ƍĻŕ", "▁ƍĺŜ", "▁ƐĺŚ", "▁Ƌşń", "▁ƌİŕ", "▁Ƌšŋ", "▁ƍĻŕ", "▁ƐĨĴ", "▁Ɛĵŗ"], "words": []} +---- +num threads: 1 +decoding method: greedy_search +Elapsed seconds: 0.162 s +Real time factor (RTF): 0.162 / 3.380 = 0.048 diff --git a/docs/source/onnx/pretrained_models/offline-transducer/code-zipformer/sherpa-onnx-zipformer-zh-en-2023-11-22.txt b/docs/source/onnx/pretrained_models/offline-transducer/code-zipformer/sherpa-onnx-zipformer-zh-en-2023-11-22.txt new file mode 100644 index 000000000..1319ff7d2 --- /dev/null +++ b/docs/source/onnx/pretrained_models/offline-transducer/code-zipformer/sherpa-onnx-zipformer-zh-en-2023-11-22.txt @@ -0,0 +1,14 @@ +/Users/fangjun/open-source/sherpa-onnx/sherpa-onnx/csrc/parse-options.cc:Read:375 ./build/bin/sherpa-onnx-offline --tokens=./sherpa-onnx-zipformer-zh-en-2023-11-22/tokens.txt --encoder=./sherpa-onnx-zipformer-zh-en-2023-11-22/encoder-epoch-34-avg-19.onnx --decoder=./sherpa-onnx-zipformer-zh-en-2023-11-22/decoder-epoch-34-avg-19.onnx --joiner=./sherpa-onnx-zipformer-zh-en-2023-11-22/joiner-epoch-34-avg-19.onnx --num-threads=1 ./sherpa-onnx-zipformer-zh-en-2023-11-22/test_wavs/0.wav + +OfflineRecognizerConfig(feat_config=FeatureExtractorConfig(sampling_rate=16000, feature_dim=80, low_freq=20, high_freq=-400, dither=0), model_config=OfflineModelConfig(transducer=OfflineTransducerModelConfig(encoder_filename="./sherpa-onnx-zipformer-zh-en-2023-11-22/encoder-epoch-34-avg-19.onnx", decoder_filename="./sherpa-onnx-zipformer-zh-en-2023-11-22/decoder-epoch-34-avg-19.onnx", joiner_filename="./sherpa-onnx-zipformer-zh-en-2023-11-22/joiner-epoch-34-avg-19.onnx"), paraformer=OfflineParaformerModelConfig(model=""), nemo_ctc=OfflineNemoEncDecCtcModelConfig(model=""), whisper=OfflineWhisperModelConfig(encoder="", decoder="", language="", task="transcribe", tail_paddings=-1), tdnn=OfflineTdnnModelConfig(model=""), zipformer_ctc=OfflineZipformerCtcModelConfig(model=""), wenet_ctc=OfflineWenetCtcModelConfig(model=""), sense_voice=OfflineSenseVoiceModelConfig(model="", language="auto", use_itn=False), moonshine=OfflineMoonshineModelConfig(preprocessor="", encoder="", uncached_decoder="", cached_decoder=""), telespeech_ctc="", tokens="./sherpa-onnx-zipformer-zh-en-2023-11-22/tokens.txt", num_threads=1, debug=False, provider="cpu", model_type="", modeling_unit="cjkchar", bpe_vocab=""), lm_config=OfflineLMConfig(model="", scale=0.5), ctc_fst_decoder_config=OfflineCtcFstDecoderConfig(graph="", max_active=3000), decoding_method="greedy_search", max_active_paths=4, hotwords_file="", hotwords_score=1.5, blank_penalty=0, rule_fsts="", rule_fars="") +Creating recognizer ... +Started +Done! 
+ +./sherpa-onnx-zipformer-zh-en-2023-11-22/test_wavs/0.wav +{"lang": "", "emotion": "", "event": "", "text": "SPEND ON 在什么上面花费它不光是时间也可以是金钱", "timestamps": [0.00, 0.04, 0.32, 0.48, 0.64, 0.72, 0.88, 1.00, 1.24, 1.36, 1.60, 1.80, 1.96, 2.12, 2.32, 2.44, 2.60, 2.72, 2.80, 2.88, 3.00, 3.20], "tokens":["▁SP", "END", "▁ON", "▁ƌŁŎ", "▁Ƌšġ", "▁Ƌşĩ", "▁ƋŞī", "▁ƐłŇ", "▁Əīŗ", "▁ƏŚş", "▁ƌŔĤ", "▁ƋŞĮ", "▁ƌĦĪ", "▁ƍĻŕ", "▁ƍĺŜ", "▁ƐĺŚ", "▁Ƌşń", "▁ƌİŕ", "▁Ƌšŋ", "▁ƍĻŕ", "▁ƐĨĴ", "▁Ɛĵŗ"], "words": []} +---- +num threads: 1 +decoding method: greedy_search +Elapsed seconds: 0.183 s +Real time factor (RTF): 0.183 / 3.380 = 0.054 diff --git a/docs/source/onnx/pretrained_models/offline-transducer/conformer-transducer-models.rst b/docs/source/onnx/pretrained_models/offline-transducer/conformer-transducer-models.rst new file mode 100644 index 000000000..e18373afc --- /dev/null +++ b/docs/source/onnx/pretrained_models/offline-transducer/conformer-transducer-models.rst @@ -0,0 +1,421 @@ +Conformer-transducer-based Models +================================= + +.. hint:: + + Please refer to :ref:`install_sherpa_onnx` to install `sherpa-onnx`_ + before you read this section. + +csukuangfj/sherpa-onnx-conformer-zh-stateless2-2023-05-23 (Chinese) +------------------------------------------------------------------- + +This model is converted from + +``_ + +which supports only Chinese as it is trained on the `WenetSpeech`_ corpus. + +You can find the training code at + +``_ + +In the following, we describe how to download it and use it with `sherpa-onnx`_. + +Download the model +~~~~~~~~~~~~~~~~~~ + +Please use the following commands to download it. + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-conformer-zh-stateless2-2023-05-23.tar.bz2 + + tar xvf sherpa-onnx-conformer-zh-stateless2-2023-05-23.tar.bz2 + rm sherpa-onnx-conformer-zh-stateless2-2023-05-23.tar.bz2 + +Please check that the file sizes of the pre-trained models are correct. See +the file sizes of ``*.onnx`` files below. + +.. code-block:: bash + + sherpa-onnx-conformer-zh-stateless2-2023-05-23 fangjun$ ls -lh *.onnx + -rw-r--r-- 1 fangjun staff 11M May 23 15:29 decoder-epoch-99-avg-1.int8.onnx + -rw-r--r-- 1 fangjun staff 12M May 23 15:29 decoder-epoch-99-avg-1.onnx + -rw-r--r-- 1 fangjun staff 122M May 23 15:30 encoder-epoch-99-avg-1.int8.onnx + -rw-r--r-- 1 fangjun staff 315M May 23 15:31 encoder-epoch-99-avg-1.onnx + -rw-r--r-- 1 fangjun staff 2.7M May 23 15:29 joiner-epoch-99-avg-1.int8.onnx + -rw-r--r-- 1 fangjun staff 11M May 23 15:29 joiner-epoch-99-avg-1.onnx + +Decode wave files +~~~~~~~~~~~~~~~~~ + +.. hint:: + + It supports decoding only wave files of a single channel with 16-bit + encoded samples, while the sampling rate does not need to be 16 kHz. + +fp32 +^^^^ + +The following code shows how to use ``fp32`` models to decode wave files: + +.. 
code-block:: bash + + cd /path/to/sherpa-onnx + + ./build/bin/sherpa-onnx-offline \ + --tokens=./sherpa-onnx-conformer-zh-stateless2-2023-05-23/tokens.txt \ + --encoder=./sherpa-onnx-conformer-zh-stateless2-2023-05-23/encoder-epoch-99-avg-1.onnx \ + --decoder=./sherpa-onnx-conformer-zh-stateless2-2023-05-23/decoder-epoch-99-avg-1.onnx \ + --joiner=./sherpa-onnx-conformer-zh-stateless2-2023-05-23/joiner-epoch-99-avg-1.onnx \ + ./sherpa-onnx-conformer-zh-stateless2-2023-05-23/test_wavs/0.wav \ + ./sherpa-onnx-conformer-zh-stateless2-2023-05-23/test_wavs/1.wav \ + ./sherpa-onnx-conformer-zh-stateless2-2023-05-23/test_wavs/2.wav + +.. note:: + + Please use ``./build/bin/Release/sherpa-onnx-offline.exe`` for Windows. + +You should see the following output: + +.. literalinclude:: ./code-conformer/sherpa-onnx-conformer-zh-stateless2-2023-05-23.txt + +.. caution:: + + If you use Windows and get encoding issues, please run: + + .. code-block:: bash + + CHCP 65001 + + in your commandline. + +int8 +^^^^ + +The following code shows how to use ``int8`` models to decode wave files: + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + ./build/bin/sherpa-onnx-offline \ + --tokens=./sherpa-onnx-conformer-zh-stateless2-2023-05-23/tokens.txt \ + --encoder=./sherpa-onnx-conformer-zh-stateless2-2023-05-23/encoder-epoch-99-avg-1.int8.onnx \ + --decoder=./sherpa-onnx-conformer-zh-stateless2-2023-05-23/decoder-epoch-99-avg-1.onnx \ + --joiner=./sherpa-onnx-conformer-zh-stateless2-2023-05-23/joiner-epoch-99-avg-1.int8.onnx \ + ./sherpa-onnx-conformer-zh-stateless2-2023-05-23/test_wavs/0.wav \ + ./sherpa-onnx-conformer-zh-stateless2-2023-05-23/test_wavs/1.wav \ + ./sherpa-onnx-conformer-zh-stateless2-2023-05-23/test_wavs/2.wav + +.. note:: + + Please use ``./build/bin/Release/sherpa-onnx-offline.exe`` for Windows. + +.. caution:: + + We did not use ``int8`` for the decoder model above. + +You should see the following output: + +.. literalinclude:: ./code-conformer/sherpa-onnx-conformer-zh-stateless2-2023-05-23.int8.txt + +.. caution:: + + If you use Windows and get encoding issues, please run: + + .. code-block:: bash + + CHCP 65001 + + in your commandline. + +Speech recognition from a microphone +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + ./build/bin/sherpa-onnx-microphone-offline \ + --tokens=./sherpa-onnx-conformer-zh-stateless2-2023-05-23/tokens.txt \ + --encoder=./sherpa-onnx-conformer-zh-stateless2-2023-05-23/encoder-epoch-99-avg-1.onnx \ + --decoder=./sherpa-onnx-conformer-zh-stateless2-2023-05-23/decoder-epoch-99-avg-1.onnx \ + --joiner=./sherpa-onnx-conformer-zh-stateless2-2023-05-23/joiner-epoch-99-avg-1.onnx + +.. caution:: + + If you use Windows and get encoding issues, please run: + + .. code-block:: bash + + CHCP 65001 + + in your commandline. + + +csukuangfj/sherpa-onnx-conformer-zh-2023-05-23 (Chinese) +-------------------------------------------------------- + +This model is converted from + +``_ + +which supports only Chinese as it is trained on the `WenetSpeech`_ corpus. + +You can find the training code at + +``_ + +In the following, we describe how to download it and use it with `sherpa-onnx`_. + +Download the model +~~~~~~~~~~~~~~~~~~ + +Please use the following commands to download it. + +.. 
code-block:: bash + + cd /path/to/sherpa-onnx + + wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-conformer-zh-2023-05-23.tar.bz2 + + tar xvf sherpa-onnx-conformer-zh-2023-05-23.tar.bz2 + rm sherpa-onnx-conformer-zh-2023-05-23.tar.bz2 + +Please check that the file sizes of the pre-trained models are correct. See +the file sizes of ``*.onnx`` files below. + +.. code-block:: bash + + sherpa-onnx-conformer-zh-2023-05-23 fangjun$ ls -lh *.onnx + -rw-r--r-- 1 fangjun staff 11M May 23 13:45 decoder-epoch-99-avg-1.int8.onnx + -rw-r--r-- 1 fangjun staff 12M May 23 13:45 decoder-epoch-99-avg-1.onnx + -rw-r--r-- 1 fangjun staff 129M May 23 13:47 encoder-epoch-99-avg-1.int8.onnx + -rw-r--r-- 1 fangjun staff 345M May 23 13:48 encoder-epoch-99-avg-1.onnx + -rw-r--r-- 1 fangjun staff 2.7M May 23 13:45 joiner-epoch-99-avg-1.int8.onnx + -rw-r--r-- 1 fangjun staff 11M May 23 13:45 joiner-epoch-99-avg-1.onnx + +Decode wave files +~~~~~~~~~~~~~~~~~ + +.. hint:: + + It supports decoding only wave files of a single channel with 16-bit + encoded samples, while the sampling rate does not need to be 16 kHz. + +fp32 +^^^^ + +The following code shows how to use ``fp32`` models to decode wave files: + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + ./build/bin/sherpa-onnx-offline \ + --tokens=./sherpa-onnx-conformer-zh-2023-05-23/tokens.txt \ + --encoder=./sherpa-onnx-conformer-zh-2023-05-23/encoder-epoch-99-avg-1.onnx \ + --decoder=./sherpa-onnx-conformer-zh-2023-05-23/decoder-epoch-99-avg-1.onnx \ + --joiner=./sherpa-onnx-conformer-zh-2023-05-23/joiner-epoch-99-avg-1.onnx \ + ./sherpa-onnx-conformer-zh-2023-05-23/test_wavs/0.wav \ + ./sherpa-onnx-conformer-zh-2023-05-23/test_wavs/1.wav \ + ./sherpa-onnx-conformer-zh-2023-05-23/test_wavs/2.wav + +.. note:: + + Please use ``./build/bin/Release/sherpa-onnx-offline.exe`` for Windows. + +You should see the following output: + +.. literalinclude:: ./code-conformer/sherpa-onnx-conformer-zh-2023-05-23.txt + +.. caution:: + + If you use Windows and get encoding issues, please run: + + .. code-block:: bash + + CHCP 65001 + + in your commandline. + +int8 +^^^^ + +The following code shows how to use ``int8`` models to decode wave files: + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + ./build/bin/sherpa-onnx-offline \ + --tokens=./sherpa-onnx-conformer-zh-2023-05-23/tokens.txt \ + --encoder=./sherpa-onnx-conformer-zh-2023-05-23/encoder-epoch-99-avg-1.int8.onnx \ + --decoder=./sherpa-onnx-conformer-zh-2023-05-23/decoder-epoch-99-avg-1.onnx \ + --joiner=./sherpa-onnx-conformer-zh-2023-05-23/joiner-epoch-99-avg-1.int8.onnx \ + ./sherpa-onnx-conformer-zh-2023-05-23/test_wavs/0.wav \ + ./sherpa-onnx-conformer-zh-2023-05-23/test_wavs/1.wav \ + ./sherpa-onnx-conformer-zh-2023-05-23/test_wavs/2.wav + +.. note:: + + Please use ``./build/bin/Release/sherpa-onnx-offline.exe`` for Windows. + +.. caution:: + + We did not use ``int8`` for the decoder model above. + +You should see the following output: + +.. literalinclude:: ./code-conformer/sherpa-onnx-conformer-zh-2023-05-23.int8.txt + +.. caution:: + + If you use Windows and get encoding issues, please run: + + .. code-block:: bash + + CHCP 65001 + + in your commandline. + +Speech recognition from a microphone +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. 
code-block:: bash + + cd /path/to/sherpa-onnx + + ./build/bin/sherpa-onnx-microphone-offline \ + --tokens=./sherpa-onnx-conformer-zh-2023-05-23/tokens.txt \ + --encoder=./sherpa-onnx-conformer-zh-2023-05-23/encoder-epoch-99-avg-1.onnx \ + --decoder=./sherpa-onnx-conformer-zh-2023-05-23/decoder-epoch-99-avg-1.onnx \ + --joiner=./sherpa-onnx-conformer-zh-2023-05-23/joiner-epoch-99-avg-1.onnx + +.. caution:: + + If you use Windows and get encoding issues, please run: + + .. code-block:: bash + + CHCP 65001 + + in your commandline. + +csukuangfj/sherpa-onnx-conformer-en-2023-03-18 (English) +-------------------------------------------------------- + +This model is converted from + +``_ + +which supports only English as it is trained on the `LibriSpeech`_ corpus. + +You can find the training code at + +``_ + +In the following, we describe how to download it and use it with `sherpa-onnx`_. + +Download the model +~~~~~~~~~~~~~~~~~~ + +Please use the following commands to download it. + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-conformer-en-2023-03-18.tar.bz2 + + tar xvf sherpa-onnx-conformer-en-2023-03-18.tar.bz2 + rm sherpa-onnx-conformer-en-2023-03-18.tar.bz2 + +Please check that the file sizes of the pre-trained models are correct. See +the file sizes of ``*.onnx`` files below. + +.. code-block:: bash + + sherpa-onnx-en-2023-03-18$ ls -lh *.onnx + -rw-r--r-- 1 kuangfangjun root 1.3M Apr 1 07:02 decoder-epoch-99-avg-1.int8.onnx + -rw-r--r-- 1 kuangfangjun root 2.0M Apr 1 07:02 decoder-epoch-99-avg-1.onnx + -rw-r--r-- 1 kuangfangjun root 122M Apr 1 07:02 encoder-epoch-99-avg-1.int8.onnx + -rw-r--r-- 1 kuangfangjun root 315M Apr 1 07:02 encoder-epoch-99-avg-1.onnx + -rw-r--r-- 1 kuangfangjun root 254K Apr 1 07:02 joiner-epoch-99-avg-1.int8.onnx + -rw-r--r-- 1 kuangfangjun root 1003K Apr 1 07:02 joiner-epoch-99-avg-1.onnx + +Decode wave files +~~~~~~~~~~~~~~~~~ + +.. hint:: + + It supports decoding only wave files of a single channel with 16-bit + encoded samples, while the sampling rate does not need to be 16 kHz. + +fp32 +^^^^ + +The following code shows how to use ``fp32`` models to decode wave files: + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + ./build/bin/sherpa-onnx-offline \ + --tokens=./sherpa-onnx-conformer-en-2023-03-18/tokens.txt \ + --encoder=./sherpa-onnx-conformer-en-2023-03-18/encoder-epoch-99-avg-1.onnx \ + --decoder=./sherpa-onnx-conformer-en-2023-03-18/decoder-epoch-99-avg-1.onnx \ + --joiner=./sherpa-onnx-conformer-en-2023-03-18/joiner-epoch-99-avg-1.onnx \ + ./sherpa-onnx-conformer-en-2023-03-18/test_wavs/0.wav \ + ./sherpa-onnx-conformer-en-2023-03-18/test_wavs/1.wav \ + ./sherpa-onnx-conformer-en-2023-03-18/test_wavs/8k.wav + +.. note:: + + Please use ``./build/bin/Release/sherpa-onnx-offline.exe`` for Windows. + +You should see the following output: + +.. literalinclude:: ./code-conformer/sherpa-onnx-conformer-en-2023-03-18.txt + +int8 +^^^^ + +The following code shows how to use ``int8`` models to decode wave files: + +.. 
code-block:: bash + + cd /path/to/sherpa-onnx + + ./build/bin/sherpa-onnx-offline \ + --tokens=./sherpa-onnx-conformer-en-2023-03-18/tokens.txt \ + --encoder=./sherpa-onnx-conformer-en-2023-03-18/encoder-epoch-99-avg-1.int8.onnx \ + --decoder=./sherpa-onnx-conformer-en-2023-03-18/decoder-epoch-99-avg-1.onnx \ + --joiner=./sherpa-onnx-conformer-en-2023-03-18/joiner-epoch-99-avg-1.int8.onnx \ + ./sherpa-onnx-conformer-en-2023-03-18/test_wavs/0.wav \ + ./sherpa-onnx-conformer-en-2023-03-18/test_wavs/1.wav \ + ./sherpa-onnx-conformer-en-2023-03-18/test_wavs/8k.wav + +.. note:: + + Please use ``./build/bin/Release/sherpa-onnx-offline.exe`` for Windows. + +You should see the following output: + +.. literalinclude:: ./code-conformer/sherpa-onnx-conformer-en-2023-03-18-int8.txt + +Speech recognition from a microphone +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + ./build/bin/sherpa-onnx-microphone-offline \ + --tokens=./sherpa-onnx-conformer-en-2023-03-18/tokens.txt \ + --encoder=./sherpa-onnx-conformer-en-2023-03-18/encoder-epoch-99-avg-1.onnx \ + --decoder=./sherpa-onnx-conformer-en-2023-03-18/decoder-epoch-99-avg-1.onnx \ + --joiner=./sherpa-onnx-conformer-en-2023-03-18/joiner-epoch-99-avg-1.onnx diff --git a/docs/source/onnx/pretrained_models/offline-transducer/index.rst b/docs/source/onnx/pretrained_models/offline-transducer/index.rst new file mode 100644 index 000000000..c107db9d7 --- /dev/null +++ b/docs/source/onnx/pretrained_models/offline-transducer/index.rst @@ -0,0 +1,14 @@ +.. _sherpa-onnx-offline-transducer-models: + +Offline transducer models +========================= + +This section lists available offline transducer models. + +.. toctree:: + :maxdepth: 5 + + zipformer-transducer-models + conformer-transducer-models + nemo-transducer-models.rst + diff --git a/docs/source/onnx/pretrained_models/offline-transducer/nemo-transducer-models.rst b/docs/source/onnx/pretrained_models/offline-transducer/nemo-transducer-models.rst new file mode 100644 index 000000000..b5cc4d407 --- /dev/null +++ b/docs/source/onnx/pretrained_models/offline-transducer/nemo-transducer-models.rst @@ -0,0 +1,123 @@ +.. _sherpa_onnx_offline_nemo_transducer_models: + +NeMo transducer-based Models +============================ + +.. hint:: + + Please refer to :ref:`install_sherpa_onnx` to install `sherpa-onnx` + before you read this section. + +sherpa-onnx-nemo-transducer-giga-am-russian-2024-10-24 (Russian, 俄语) +---------------------------------------------------------------------- + +This model is converted from + + ``_ + +You can find the conversion script at + + ``_ + +.. warning:: + + The license of the model can be found at ``_. + + It is for non-commercial use only. + +In the following, we describe how to download it and use it with `sherpa-onnx`_. + +Download the model +~~~~~~~~~~~~~~~~~~ + +Please use the following commands to download it. + +.. 
code-block:: bash + + wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-nemo-transducer-giga-am-russian-2024-10-24.tar.bz2 + tar xvf sherpa-onnx-nemo-transducer-giga-am-russian-2024-10-24.tar.bz2 + rm sherpa-onnx-nemo-transducer-giga-am-russian-2024-10-24.tar.bz2 + +You should see something like below after downloading:: + + ls -lh sherpa-onnx-nemo-transducer-giga-am-russian-2024-10-24/ + total 548472 + -rw-r--r-- 1 fangjun staff 89K Oct 25 13:36 GigaAM%20License_NC.pdf + -rw-r--r-- 1 fangjun staff 318B Oct 25 13:37 README.md + -rw-r--r-- 1 fangjun staff 3.8M Oct 25 13:36 decoder.onnx + -rw-r--r-- 1 fangjun staff 262M Oct 25 13:37 encoder.int8.onnx + -rw-r--r-- 1 fangjun staff 3.8K Oct 25 13:32 export-onnx-rnnt.py + -rw-r--r-- 1 fangjun staff 2.0M Oct 25 13:36 joiner.onnx + -rwxr-xr-x 1 fangjun staff 2.0K Oct 25 13:32 run-rnnt.sh + -rwxr-xr-x 1 fangjun staff 8.7K Oct 25 13:32 test-onnx-rnnt.py + drwxr-xr-x 4 fangjun staff 128B Oct 25 13:37 test_wavs + -rw-r--r-- 1 fangjun staff 5.8K Oct 25 13:36 tokens.txt + +Decode wave files +~~~~~~~~~~~~~~~~~ + +.. hint:: + + It supports decoding only wave files of a single channel with 16-bit + encoded samples, while the sampling rate does not need to be 16 kHz. + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + ./build/bin/sherpa-onnx-offline \ + --encoder=./sherpa-onnx-nemo-transducer-giga-am-russian-2024-10-24/encoder.int8.onnx \ + --decoder=./sherpa-onnx-nemo-transducer-giga-am-russian-2024-10-24/decoder.onnx \ + --joiner=./sherpa-onnx-nemo-transducer-giga-am-russian-2024-10-24/joiner.onnx \ + --tokens=./sherpa-onnx-nemo-transducer-giga-am-russian-2024-10-24/tokens.txt \ + --model-type=nemo_transducer \ + ./sherpa-onnx-nemo-transducer-giga-am-russian-2024-10-24/test_wavs/example.wav + +.. note:: + + Please use ``./build/bin/Release/sherpa-onnx-offline.exe`` for Windows. + +.. caution:: + + If you use Windows and get encoding issues, please run: + + .. code-block:: bash + + CHCP 65001 + + in your commandline. + +You should see the following output: + +.. literalinclude:: ./code-nemo/sherpa-onnx-nemo-transducer-giga-am-russian-2024-10-24.int8.txt + +Speech recognition from a microphone +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + ./build/bin/sherpa-onnx-microphone-offline \ + --encoder=./sherpa-onnx-nemo-transducer-giga-am-russian-2024-10-24/encoder.int8.onnx \ + --decoder=./sherpa-onnx-nemo-transducer-giga-am-russian-2024-10-24/decoder.onnx \ + --joiner=./sherpa-onnx-nemo-transducer-giga-am-russian-2024-10-24/joiner.onnx \ + --tokens=./sherpa-onnx-nemo-transducer-giga-am-russian-2024-10-24/tokens.txt \ + --model-type=nemo_transducer + +Speech recognition from a microphone with VAD +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. 
code-block:: bash + + cd /path/to/sherpa-onnx + + wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx + + ./build/bin/sherpa-onnx-vad-microphone-offline-asr \ + --silero-vad-model=./silero_vad.onnx \ + --encoder=./sherpa-onnx-nemo-transducer-giga-am-russian-2024-10-24/encoder.int8.onnx \ + --decoder=./sherpa-onnx-nemo-transducer-giga-am-russian-2024-10-24/decoder.onnx \ + --joiner=./sherpa-onnx-nemo-transducer-giga-am-russian-2024-10-24/joiner.onnx \ + --tokens=./sherpa-onnx-nemo-transducer-giga-am-russian-2024-10-24/tokens.txt \ + --model-type=nemo_transducer diff --git a/docs/source/onnx/pretrained_models/offline-transducer/zipformer-transducer-models.rst b/docs/source/onnx/pretrained_models/offline-transducer/zipformer-transducer-models.rst new file mode 100644 index 000000000..7ca21428c --- /dev/null +++ b/docs/source/onnx/pretrained_models/offline-transducer/zipformer-transducer-models.rst @@ -0,0 +1,2172 @@ +.. _sherpa_onnx_offline_zipformer_transducer_models: + +Zipformer-transducer-based Models +================================= + +.. hint:: + + Please refer to :ref:`install_sherpa_onnx` to install `sherpa-onnx` + before you read this section. + +sherpa-onnx-zipformer-zh-en-2023-11-22 (Chinese+English, 中英双语) +------------------------------------------------------------------------------------------ + +This model is from ``_. + +See ``_ if you want to learn +how the model is trained. + +Note that this model uses byte-level BPE. + +In the following, we describe how to download it and use it with `sherpa-onnx`_. + +Download the model +~~~~~~~~~~~~~~~~~~ + +Please use the following commands to download it. + +.. code-block:: bash + + wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-zipformer-zh-en-2023-11-22.tar.bz2 + tar xvf sherpa-onnx-zipformer-zh-en-2023-11-22.tar.bz2 + rm sherpa-onnx-zipformer-zh-en-2023-11-22.tar.bz2 + +You should see something like below after downloading:: + + ls -lh sherpa-onnx-zipformer-zh-en-2023-11-22 + total 710824 + -rw-r--r-- 1 fangjun staff 264K Nov 22 2023 bbpe.model + -rw-r--r-- 1 fangjun staff 4.9M Nov 22 2023 decoder-epoch-34-avg-19.onnx + -rw-r--r-- 1 fangjun staff 66M Nov 22 2023 encoder-epoch-34-avg-19.int8.onnx + -rw-r--r-- 1 fangjun staff 248M Nov 22 2023 encoder-epoch-34-avg-19.onnx + -rw-r--r-- 1 fangjun staff 1.0M Nov 22 2023 joiner-epoch-34-avg-19.int8.onnx + -rw-r--r-- 1 fangjun staff 3.9M Nov 22 2023 joiner-epoch-34-avg-19.onnx + drwxr-xr-x 5 fangjun staff 160B Dec 24 15:50 test_wavs + -rw-r--r-- 1 fangjun staff 25K Dec 24 15:49 tokens.txt + +Decode wave files +~~~~~~~~~~~~~~~~~ + +.. hint:: + + It supports decoding only wave files of a single channel with 16-bit + encoded samples, while the sampling rate does not need to be 16 kHz. + +fp32 +^^^^ + +The following code shows how to use ``fp32`` models to decode wave files: + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + ./build/bin/sherpa-onnx-offline \ + --tokens=./sherpa-onnx-zipformer-zh-en-2023-11-22/tokens.txt \ + --encoder=./sherpa-onnx-zipformer-zh-en-2023-11-22/encoder-epoch-34-avg-19.onnx \ + --decoder=./sherpa-onnx-zipformer-zh-en-2023-11-22/decoder-epoch-34-avg-19.onnx \ + --joiner=./sherpa-onnx-zipformer-zh-en-2023-11-22/joiner-epoch-34-avg-19.onnx \ + --num-threads=1 \ + ./sherpa-onnx-zipformer-zh-en-2023-11-22/test_wavs/0.wav + +.. note:: + + Please use ``./build/bin/Release/sherpa-onnx-offline.exe`` for Windows. + +.. caution:: + + If you use Windows and get encoding issues, please run: + + .. 
code-block:: bash + + CHCP 65001 + + in your commandline. + +You should see the following output: + +.. literalinclude:: ./code-zipformer/sherpa-onnx-zipformer-zh-en-2023-11-22.txt + +int8 +^^^^ + +The following code shows how to use ``int8`` models to decode wave files: + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + ./build/bin/sherpa-onnx-offline \ + --tokens=./sherpa-onnx-zipformer-zh-en-2023-11-22/tokens.txt \ + --encoder=./sherpa-onnx-zipformer-zh-en-2023-11-22/encoder-epoch-34-avg-19.int8.onnx \ + --decoder=./sherpa-onnx-zipformer-zh-en-2023-11-22/decoder-epoch-34-avg-19.onnx \ + --joiner=./sherpa-onnx-zipformer-zh-en-2023-11-22/joiner-epoch-34-avg-19.int8.onnx \ + --num-threads=1 \ + ./sherpa-onnx-zipformer-zh-en-2023-11-22/test_wavs/0.wav + +You should see the following output: + +.. literalinclude:: ./code-zipformer/sherpa-onnx-zipformer-zh-en-2023-11-22-int8.txt + +Speech recognition from a microphone +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + ./build/bin/sherpa-onnx-microphone-offline \ + --tokens=./sherpa-onnx-zipformer-zh-en-2023-11-22/tokens.txt \ + --encoder=./sherpa-onnx-zipformer-zh-en-2023-11-22/encoder-epoch-34-avg-19.onnx \ + --decoder=./sherpa-onnx-zipformer-zh-en-2023-11-22/decoder-epoch-34-avg-19.onnx \ + --joiner=./sherpa-onnx-zipformer-zh-en-2023-11-22/joiner-epoch-34-avg-19.onnx + +Speech recognition from a microphone with VAD +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx + + ./build/bin/sherpa-onnx-vad-microphone-offline-asr \ + --silero-vad-model=./silero_vad.onnx \ + --tokens=./sherpa-onnx-zipformer-zh-en-2023-11-22/tokens.txt \ + --encoder=./sherpa-onnx-zipformer-zh-en-2023-11-22/encoder-epoch-34-avg-19.onnx \ + --decoder=./sherpa-onnx-zipformer-zh-en-2023-11-22/decoder-epoch-34-avg-19.onnx \ + --joiner=./sherpa-onnx-zipformer-zh-en-2023-11-22/joiner-epoch-34-avg-19.onnx + + +sherpa-onnx-zipformer-ru-2024-09-18 (Russian, 俄语) +--------------------------------------------------- + +This model is from ``_. + +You can find the export script at ``_ + +In the following, we describe how to download it and use it with `sherpa-onnx`_. + +Download the model +~~~~~~~~~~~~~~~~~~ + +Please use the following commands to download it. + +.. code-block:: bash + + wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-zipformer-ru-2024-09-18.tar.bz2 + tar xvf sherpa-onnx-zipformer-ru-2024-09-18.tar.bz2 + rm sherpa-onnx-zipformer-ru-2024-09-18.tar.bz2 + +You should see something like below after downloading:: + + ls -lh sherpa-onnx-zipformer-ru-2024-09-18 + total 700352 + -rw-r--r-- 1 fangjun staff 240K Sep 18 12:01 bpe.model + -rw-r--r-- 1 fangjun staff 1.2M Sep 18 12:01 decoder.int8.onnx + -rw-r--r-- 1 fangjun staff 2.0M Sep 18 12:01 decoder.onnx + -rw-r--r-- 1 fangjun staff 65M Sep 18 12:01 encoder.int8.onnx + -rw-r--r-- 1 fangjun staff 247M Sep 18 12:01 encoder.onnx + -rw-r--r-- 1 fangjun staff 253K Sep 18 12:01 joiner.int8.onnx + -rw-r--r-- 1 fangjun staff 1.0M Sep 18 12:01 joiner.onnx + drwxr-xr-x 4 fangjun staff 128B Sep 18 12:01 test_wavs + -rw-r--r-- 1 fangjun staff 6.2K Sep 18 12:01 tokens.txt + +Decode wave files +~~~~~~~~~~~~~~~~~ + +.. hint:: + + It supports decoding only wave files of a single channel with 16-bit + encoded samples, while the sampling rate does not need to be 16 kHz. 
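+
+If your files are not already single-channel wave files with 16-bit samples, you
+can convert them first. The command below is only a sketch and assumes ``ffmpeg``
+is installed; it is not part of `sherpa-onnx`_, and ``input.mp3``/``output.wav``
+are hypothetical file names. The sampling rate can be left unchanged, since a
+resampler is created automatically when needed, as the logs above show.
+
+.. code-block:: bash
+
+  # Convert input.mp3 to a single-channel, 16-bit PCM wave file.
+  # -ac 1 selects one audio channel; pcm_s16le produces 16-bit samples.
+  ffmpeg -i input.mp3 -ac 1 -c:a pcm_s16le output.wav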
+ +fp32 +^^^^ + +The following code shows how to use ``fp32`` models to decode wave files: + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + ./build/bin/sherpa-onnx-offline \ + --tokens=./sherpa-onnx-zipformer-ru-2024-09-18/tokens.txt \ + --encoder=./sherpa-onnx-zipformer-ru-2024-09-18/encoder.onnx \ + --decoder=./sherpa-onnx-zipformer-ru-2024-09-18/decoder.onnx \ + --joiner=./sherpa-onnx-zipformer-ru-2024-09-18/joiner.onnx \ + --num-threads=1 \ + ./sherpa-onnx-zipformer-ru-2024-09-18/test_wavs/1.wav + +.. note:: + + Please use ``./build/bin/Release/sherpa-onnx-offline.exe`` for Windows. + +.. caution:: + + If you use Windows and get encoding issues, please run: + + .. code-block:: bash + + CHCP 65001 + + in your commandline. + +You should see the following output: + +.. literalinclude:: ./code-zipformer/sherpa-onnx-zipformer-ru-2024-09-18.txt + +int8 +^^^^ + +The following code shows how to use ``int8`` models to decode wave files: + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + ./build/bin/sherpa-onnx-offline \ + --tokens=./sherpa-onnx-zipformer-ru-2024-09-18/tokens.txt \ + --encoder=./sherpa-onnx-zipformer-ru-2024-09-18/encoder.int8.onnx \ + --decoder=./sherpa-onnx-zipformer-ru-2024-09-18/decoder.onnx \ + --joiner=./sherpa-onnx-zipformer-ru-2024-09-18/joiner.int8.onnx \ + --num-threads=1 \ + ./sherpa-onnx-zipformer-ru-2024-09-18/test_wavs/1.wav + +You should see the following output: + +.. literalinclude:: ./code-zipformer/sherpa-onnx-zipformer-ru-2024-09-18.int8.txt + +Speech recognition from a microphone +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + ./build/bin/sherpa-onnx-microphone-offline \ + --tokens=./sherpa-onnx-zipformer-ru-2024-09-18/tokens.txt \ + --encoder=./sherpa-onnx-zipformer-ru-2024-09-18/encoder.int8.onnx \ + --decoder=./sherpa-onnx-zipformer-ru-2024-09-18/decoder.onnx \ + --joiner=./sherpa-onnx-zipformer-ru-2024-09-18/joiner.int8.onnx + +Speech recognition from a microphone with VAD +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx + + ./build/bin/sherpa-onnx-vad-microphone-offline-asr \ + --silero-vad-model=./silero_vad.onnx \ + --tokens=./sherpa-onnx-zipformer-ru-2024-09-18/tokens.txt \ + --encoder=./sherpa-onnx-zipformer-ru-2024-09-18/encoder.int8.onnx \ + --decoder=./sherpa-onnx-zipformer-ru-2024-09-18/decoder.onnx \ + --joiner=./sherpa-onnx-zipformer-ru-2024-09-18/joiner.int8.onnx + +sherpa-onnx-small-zipformer-ru-2024-09-18 (Russian, 俄语) +--------------------------------------------------------- + +This model is from ``_. + +You can find the export script at ``_ + +In the following, we describe how to download it and use it with `sherpa-onnx`_. + +Download the model +~~~~~~~~~~~~~~~~~~ + +Please use the following commands to download it. + +.. 
code-block:: bash + + wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-small-zipformer-ru-2024-09-18.tar.bz2 + tar xvf sherpa-onnx-small-zipformer-ru-2024-09-18.tar.bz2 + rm sherpa-onnx-small-zipformer-ru-2024-09-18.tar.bz2 + +You should see something like below after downloading:: + + ls -lh sherpa-onnx-small-zipformer-ru-2024-09-18/ + total 257992 + -rw-r--r-- 1 fangjun staff 240K Sep 18 12:02 bpe.model + -rw-r--r-- 1 fangjun staff 1.2M Sep 18 12:02 decoder.int8.onnx + -rw-r--r-- 1 fangjun staff 2.0M Sep 18 12:02 decoder.onnx + -rw-r--r-- 1 fangjun staff 24M Sep 18 12:02 encoder.int8.onnx + -rw-r--r-- 1 fangjun staff 86M Sep 18 12:02 encoder.onnx + -rw-r--r-- 1 fangjun staff 253K Sep 18 12:02 joiner.int8.onnx + -rw-r--r-- 1 fangjun staff 1.0M Sep 18 12:02 joiner.onnx + drwxr-xr-x 4 fangjun staff 128B Sep 18 12:02 test_wavs + -rw-r--r-- 1 fangjun staff 6.2K Sep 18 12:02 tokens.txt + +Decode wave files +~~~~~~~~~~~~~~~~~ + +.. hint:: + + It supports decoding only wave files of a single channel with 16-bit + encoded samples, while the sampling rate does not need to be 16 kHz. + +fp32 +^^^^ + +The following code shows how to use ``fp32`` models to decode wave files: + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + ./build/bin/sherpa-onnx-offline \ + --tokens=./sherpa-onnx-small-zipformer-ru-2024-09-18/tokens.txt \ + --encoder=./sherpa-onnx-small-zipformer-ru-2024-09-18/encoder.onnx \ + --decoder=./sherpa-onnx-small-zipformer-ru-2024-09-18/decoder.onnx \ + --joiner=./sherpa-onnx-small-zipformer-ru-2024-09-18/joiner.onnx \ + --num-threads=1 \ + ./sherpa-onnx-small-zipformer-ru-2024-09-18/test_wavs/1.wav + +.. note:: + + Please use ``./build/bin/Release/sherpa-onnx-offline.exe`` for Windows. + +.. caution:: + + If you use Windows and get encoding issues, please run: + + .. code-block:: bash + + CHCP 65001 + + in your commandline. + +You should see the following output: + +.. literalinclude:: ./code-zipformer/sherpa-onnx-small-zipformer-ru-2024-09-18.txt + +int8 +^^^^ + +The following code shows how to use ``int8`` models to decode wave files: + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + ./build/bin/sherpa-onnx-offline \ + --tokens=./sherpa-onnx-small-zipformer-ru-2024-09-18/tokens.txt \ + --encoder=./sherpa-onnx-small-zipformer-ru-2024-09-18/encoder.int8.onnx \ + --decoder=./sherpa-onnx-small-zipformer-ru-2024-09-18/decoder.onnx \ + --joiner=./sherpa-onnx-small-zipformer-ru-2024-09-18/joiner.int8.onnx \ + --num-threads=1 \ + ./sherpa-onnx-small-zipformer-ru-2024-09-18/test_wavs/1.wav + +You should see the following output: + +.. literalinclude:: ./code-zipformer/sherpa-onnx-small-zipformer-ru-2024-09-18.int8.txt + +Speech recognition from a microphone +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + ./build/bin/sherpa-onnx-microphone-offline \ + --tokens=./sherpa-onnx-small-zipformer-ru-2024-09-18/tokens.txt \ + --encoder=./sherpa-onnx-small-zipformer-ru-2024-09-18/encoder.int8.onnx \ + --decoder=./sherpa-onnx-small-zipformer-ru-2024-09-18/decoder.onnx \ + --joiner=./sherpa-onnx-small-zipformer-ru-2024-09-18/joiner.int8.onnx + +Speech recognition from a microphone with VAD +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. 
code-block:: bash + + cd /path/to/sherpa-onnx + + wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx + + ./build/bin/sherpa-onnx-vad-microphone-offline-asr \ + --silero-vad-model=./silero_vad.onnx \ + --tokens=./sherpa-onnx-small-zipformer-ru-2024-09-18/tokens.txt \ + --encoder=./sherpa-onnx-small-zipformer-ru-2024-09-18/encoder.int8.onnx \ + --decoder=./sherpa-onnx-small-zipformer-ru-2024-09-18/decoder.onnx \ + --joiner=./sherpa-onnx-small-zipformer-ru-2024-09-18/joiner.int8.onnx + +sherpa-onnx-zipformer-ja-reazonspeech-2024-08-01 (Japanese, 日语) +------------------------------------------------------------------ + +This model is from `ReazonSpeech`_ and supports only Japanese. +It is trained on 35k hours of data. + +The code for training the model can be found at +``_ + +The paper about the dataset is ``_ + +In the following, we describe how to download it and use it with `sherpa-onnx`_. + + +.. hint:: + + The original onnx model is from + + ``_ + +Download the model +~~~~~~~~~~~~~~~~~~ + +Please use the following commands to download it. + +.. code-block:: bash + + wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-zipformer-ja-reazonspeech-2024-08-01.tar.bz2 + + tar xvf sherpa-onnx-zipformer-ja-reazonspeech-2024-08-01.tar.bz2 + rm sherpa-onnx-zipformer-ja-reazonspeech-2024-08-01.tar.bz2 + + ls -lh sherpa-onnx-zipformer-ja-reazonspeech-2024-08-01 + +You should see the following output: + +.. code-block:: bash + + -rw-r--r-- 1 fangjun staff 1.2K Aug 1 18:32 README.md + -rw-r--r-- 1 fangjun staff 2.8M Aug 1 18:32 decoder-epoch-99-avg-1.int8.onnx + -rw-r--r-- 1 fangjun staff 11M Aug 1 18:32 decoder-epoch-99-avg-1.onnx + -rw-r--r-- 1 fangjun staff 148M Aug 1 18:32 encoder-epoch-99-avg-1.int8.onnx + -rw-r--r-- 1 fangjun staff 565M Aug 1 18:32 encoder-epoch-99-avg-1.onnx + -rw-r--r-- 1 fangjun staff 2.6M Aug 1 18:32 joiner-epoch-99-avg-1.int8.onnx + -rw-r--r-- 1 fangjun staff 10M Aug 1 18:32 joiner-epoch-99-avg-1.onnx + drwxr-xr-x 8 fangjun staff 256B Aug 1 18:31 test_wavs + -rw-r--r-- 1 fangjun staff 45K Aug 1 18:32 tokens.txt + +Decode wave files +~~~~~~~~~~~~~~~~~ + +.. hint:: + + It supports decoding only wave files of a single channel with 16-bit + encoded samples, while the sampling rate does not need to be 16 kHz. + +fp32 +^^^^ + +The following code shows how to use ``fp32`` models to decode wave files: + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + ./build/bin/sherpa-onnx-offline \ + --tokens=./sherpa-onnx-zipformer-ja-reazonspeech-2024-08-01/tokens.txt \ + --encoder=./sherpa-onnx-zipformer-ja-reazonspeech-2024-08-01/encoder-epoch-99-avg-1.onnx \ + --decoder=./sherpa-onnx-zipformer-ja-reazonspeech-2024-08-01/decoder-epoch-99-avg-1.onnx \ + --joiner=./sherpa-onnx-zipformer-ja-reazonspeech-2024-08-01/joiner-epoch-99-avg-1.onnx \ + --num-threads=1 \ + ./sherpa-onnx-zipformer-ja-reazonspeech-2024-08-01/test_wavs/1.wav + +.. note:: + + Please use ``./build/bin/Release/sherpa-onnx-offline.exe`` for Windows. + +.. caution:: + + If you use Windows and get encoding issues, please run: + + .. code-block:: bash + + CHCP 65001 + + in your commandline. + +You should see the following output: + +.. literalinclude:: ./code-zipformer/sherpa-onnx-zipformer-ja-reazonspeech-2024-08-01.txt + +int8 +^^^^ + +The following code shows how to use ``int8`` models to decode wave files: + +..
code-block:: bash + + cd /path/to/sherpa-onnx + + ./build/bin/sherpa-onnx-offline \ + --tokens=./sherpa-onnx-zipformer-ja-reazonspeech-2024-08-01/tokens.txt \ + --encoder=./sherpa-onnx-zipformer-ja-reazonspeech-2024-08-01/encoder-epoch-99-avg-1.int8.onnx \ + --decoder=./sherpa-onnx-zipformer-ja-reazonspeech-2024-08-01/decoder-epoch-99-avg-1.onnx \ + --joiner=./sherpa-onnx-zipformer-ja-reazonspeech-2024-08-01/joiner-epoch-99-avg-1.int8.onnx \ + --num-threads=1 \ + ./sherpa-onnx-zipformer-ja-reazonspeech-2024-08-01/test_wavs/1.wav + +.. note:: + + Please use ``./build/bin/Release/sherpa-onnx-offline.exe`` for Windows. + +.. caution:: + + If you use Windows and get encoding issues, please run: + + .. code-block:: bash + + CHCP 65001 + + in your commandline. + +You should see the following output: + +.. literalinclude:: ./code-zipformer/sherpa-onnx-zipformer-ja-reazonspeech-2024-08-01-int8.txt + +Speech recognition from a microphone +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + ./build/bin/sherpa-onnx-microphone-offline \ + --tokens=./sherpa-onnx-zipformer-ja-reazonspeech-2024-08-01/tokens.txt \ + --encoder=./sherpa-onnx-zipformer-ja-reazonspeech-2024-08-01/encoder-epoch-99-avg-1.int8.onnx \ + --decoder=./sherpa-onnx-zipformer-ja-reazonspeech-2024-08-01/decoder-epoch-99-avg-1.onnx \ + --joiner=./sherpa-onnx-zipformer-ja-reazonspeech-2024-08-01/joiner-epoch-99-avg-1.int8.onnx + +Speech recognition from a microphone with VAD +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx + + ./build/bin/sherpa-onnx-vad-microphone-offline-asr \ + --silero-vad-model=./silero_vad.onnx \ + --tokens=./sherpa-onnx-zipformer-ja-reazonspeech-2024-08-01/tokens.txt \ + --encoder=./sherpa-onnx-zipformer-ja-reazonspeech-2024-08-01/encoder-epoch-99-avg-1.int8.onnx \ + --decoder=./sherpa-onnx-zipformer-ja-reazonspeech-2024-08-01/decoder-epoch-99-avg-1.onnx \ + --joiner=./sherpa-onnx-zipformer-ja-reazonspeech-2024-08-01/joiner-epoch-99-avg-1.int8.onnx + +sherpa-onnx-zipformer-korean-2024-06-24 (Korean, 韩语) +------------------------------------------------------------ + +PyTorch checkpoints of this model can be found at +``_. + +The training dataset can be found at ``_. + +Paper about the dataset is ``_ + +In the following, we describe how to download it and use it with `sherpa-onnx`_. + +Download the model +~~~~~~~~~~~~~~~~~~ + +Please use the following commands to download it. + +.. code-block:: bash + + wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-zipformer-korean-2024-06-24.tar.bz2 + + tar xf sherpa-onnx-zipformer-korean-2024-06-24.tar.bz2 + rm sherpa-onnx-zipformer-korean-2024-06-24.tar.bz2 + + ls -lh sherpa-onnx-zipformer-korean-2024-06-24 + +You should see the following output: + +.. 
code-block:: bash + + -rw-r--r-- 1 fangjun staff 307K Jun 24 15:33 bpe.model + -rw-r--r-- 1 fangjun staff 2.7M Jun 24 15:33 decoder-epoch-99-avg-1.int8.onnx + -rw-r--r-- 1 fangjun staff 11M Jun 24 15:33 decoder-epoch-99-avg-1.onnx + -rw-r--r-- 1 fangjun staff 68M Jun 24 15:33 encoder-epoch-99-avg-1.int8.onnx + -rw-r--r-- 1 fangjun staff 249M Jun 24 15:33 encoder-epoch-99-avg-1.onnx + -rw-r--r-- 1 fangjun staff 2.5M Jun 24 15:33 joiner-epoch-99-avg-1.int8.onnx + -rw-r--r-- 1 fangjun staff 9.8M Jun 24 15:33 joiner-epoch-99-avg-1.onnx + drwxr-xr-x 7 fangjun staff 224B Jun 24 15:32 test_wavs + -rw-r--r-- 1 fangjun staff 59K Jun 24 15:33 tokens.txt + +Decode wave files +~~~~~~~~~~~~~~~~~ + +.. hint:: + + It supports decoding only wave files of a single channel with 16-bit + encoded samples, while the sampling rate does not need to be 16 kHz. + +fp32 +^^^^ + +The following code shows how to use ``fp32`` models to decode wave files: + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + ./build/bin/sherpa-onnx-offline \ + --tokens=./sherpa-onnx-zipformer-korean-2024-06-24/tokens.txt \ + --encoder=./sherpa-onnx-zipformer-korean-2024-06-24/encoder-epoch-99-avg-1.onnx \ + --decoder=./sherpa-onnx-zipformer-korean-2024-06-24/decoder-epoch-99-avg-1.onnx \ + --joiner=./sherpa-onnx-zipformer-korean-2024-06-24/joiner-epoch-99-avg-1.onnx \ + ./sherpa-onnx-zipformer-korean-2024-06-24/test_wavs/0.wav + +.. note:: + + Please use ``./build/bin/Release/sherpa-onnx-offline.exe`` for Windows. + +.. caution:: + + If you use Windows and get encoding issues, please run: + + .. code-block:: bash + + CHCP 65001 + + in your commandline. + +You should see the following output: + +.. literalinclude:: ./code-zipformer/sherpa-onnx-zipformer-korean-2024-06-24.txt + +int8 +^^^^ + +The following code shows how to use ``int8`` models to decode wave files: + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + ./build/bin/sherpa-onnx-offline \ + --tokens=./sherpa-onnx-zipformer-korean-2024-06-24/tokens.txt \ + --encoder=./sherpa-onnx-zipformer-korean-2024-06-24/encoder-epoch-99-avg-1.int8.onnx \ + --decoder=./sherpa-onnx-zipformer-korean-2024-06-24/decoder-epoch-99-avg-1.onnx \ + --joiner=./sherpa-onnx-zipformer-korean-2024-06-24/joiner-epoch-99-avg-1.int8.onnx \ + ./sherpa-onnx-zipformer-korean-2024-06-24/test_wavs/0.wav + +.. note:: + + Please use ``./build/bin/Release/sherpa-onnx-offline.exe`` for Windows. + +.. caution:: + + If you use Windows and get encoding issues, please run: + + .. code-block:: bash + + CHCP 65001 + + in your commandline. + +You should see the following output: + +.. literalinclude:: ./code-zipformer/sherpa-onnx-zipformer-korean-2024-06-24-int8.txt + +Speech recognition from a microphone +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + ./build/bin/sherpa-onnx-microphone-offline \ + --tokens=./sherpa-onnx-zipformer-korean-2024-06-24/tokens.txt \ + --encoder=./sherpa-onnx-zipformer-korean-2024-06-24/encoder-epoch-99-avg-1.int8.onnx \ + --decoder=./sherpa-onnx-zipformer-korean-2024-06-24/decoder-epoch-99-avg-1.onnx \ + --joiner=./sherpa-onnx-zipformer-korean-2024-06-24/joiner-epoch-99-avg-1.int8.onnx + +sherpa-onnx-zipformer-thai-2024-06-20 (Thai, 泰语) +------------------------------------------------------------ + +PyTorch checkpoints of this model can be found at +``_. + +The training dataset can be found at ``_. + +The paper about the dataset is ``_. + +In the following, we describe how to download it and use it with `sherpa-onnx`_. 
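+If you want to compare the decoding speed of the ``fp32`` and ``int8`` models
+described below, one simple option is to wrap the commands with ``time``.
+This is only a sketch; it assumes you have already downloaded the model as
+shown in the next subsection:
+
+.. code-block:: bash
+
+   cd /path/to/sherpa-onnx
+
+   d=./sherpa-onnx-zipformer-thai-2024-06-20
+
+   # fp32 encoder and joiner
+   time ./build/bin/sherpa-onnx-offline \
+     --tokens=$d/tokens.txt \
+     --encoder=$d/encoder-epoch-12-avg-5.onnx \
+     --decoder=$d/decoder-epoch-12-avg-5.onnx \
+     --joiner=$d/joiner-epoch-12-avg-5.onnx \
+     $d/test_wavs/0.wav
+
+   # int8 encoder and joiner
+   time ./build/bin/sherpa-onnx-offline \
+     --tokens=$d/tokens.txt \
+     --encoder=$d/encoder-epoch-12-avg-5.int8.onnx \
+     --decoder=$d/decoder-epoch-12-avg-5.onnx \
+     --joiner=$d/joiner-epoch-12-avg-5.int8.onnx \
+     $d/test_wavs/0.wav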
+ +Download the model +~~~~~~~~~~~~~~~~~~ + +Please use the following commands to download it. + +.. code-block:: bash + + wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-zipformer-thai-2024-06-20.tar.bz2 + + tar xf sherpa-onnx-zipformer-thai-2024-06-20.tar.bz2 + rm sherpa-onnx-zipformer-thai-2024-06-20.tar.bz2 + + ls -lh sherpa-onnx-zipformer-thai-2024-06-20 + +You should see the following output: + +.. code-block:: bash + + -rw-r--r-- 1 fangjun staff 277K Jun 20 16:47 bpe.model + -rw-r--r-- 1 fangjun staff 1.2M Jun 20 16:47 decoder-epoch-12-avg-5.int8.onnx + -rw-r--r-- 1 fangjun staff 4.9M Jun 20 16:47 decoder-epoch-12-avg-5.onnx + -rw-r--r-- 1 fangjun staff 148M Jun 20 16:47 encoder-epoch-12-avg-5.int8.onnx + -rw-r--r-- 1 fangjun staff 565M Jun 20 16:47 encoder-epoch-12-avg-5.onnx + -rw-r--r-- 1 fangjun staff 1.0M Jun 20 16:47 joiner-epoch-12-avg-5.int8.onnx + -rw-r--r-- 1 fangjun staff 3.9M Jun 20 16:47 joiner-epoch-12-avg-5.onnx + drwxr-xr-x 6 fangjun staff 192B Jun 20 16:46 test_wavs + -rw-r--r-- 1 fangjun staff 38K Jun 20 16:47 tokens.txt + +Decode wave files +~~~~~~~~~~~~~~~~~ + +.. hint:: + + It supports decoding only wave files of a single channel with 16-bit + encoded samples, while the sampling rate does not need to be 16 kHz. + +fp32 +^^^^ + +The following code shows how to use ``fp32`` models to decode wave files: + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + ./build/bin/sherpa-onnx-offline \ + --tokens=./sherpa-onnx-zipformer-thai-2024-06-20/tokens.txt \ + --encoder=./sherpa-onnx-zipformer-thai-2024-06-20/encoder-epoch-12-avg-5.onnx \ + --decoder=./sherpa-onnx-zipformer-thai-2024-06-20/decoder-epoch-12-avg-5.onnx \ + --joiner=./sherpa-onnx-zipformer-thai-2024-06-20/joiner-epoch-12-avg-5.onnx \ + ./sherpa-onnx-zipformer-thai-2024-06-20/test_wavs/0.wav + +.. note:: + + Please use ``./build/bin/Release/sherpa-onnx-offline.exe`` for Windows. + +.. caution:: + + If you use Windows and get encoding issues, please run: + + .. code-block:: bash + + CHCP 65001 + + in your commandline. + +You should see the following output: + +.. literalinclude:: ./code-zipformer/sherpa-onnx-zipformer-thai-2024-06-20.txt + +int8 +^^^^ + +The following code shows how to use ``int8`` models to decode wave files: + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + ./build/bin/sherpa-onnx-offline \ + --tokens=./sherpa-onnx-zipformer-thai-2024-06-20/tokens.txt \ + --encoder=./sherpa-onnx-zipformer-thai-2024-06-20/encoder-epoch-12-avg-5.int8.onnx \ + --decoder=./sherpa-onnx-zipformer-thai-2024-06-20/decoder-epoch-12-avg-5.onnx \ + --joiner=./sherpa-onnx-zipformer-thai-2024-06-20/joiner-epoch-12-avg-5.int8.onnx \ + ./sherpa-onnx-zipformer-thai-2024-06-20/test_wavs/0.wav + +.. note:: + + Please use ``./build/bin/Release/sherpa-onnx-offline.exe`` for Windows. + +.. caution:: + + If you use Windows and get encoding issues, please run: + + .. code-block:: bash + + CHCP 65001 + + in your commandline. + +You should see the following output: + +.. literalinclude:: ./code-zipformer/sherpa-onnx-zipformer-thai-2024-06-20-int8.txt + +Speech recognition from a microphone +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. 
code-block:: bash + + cd /path/to/sherpa-onnx + + ./build/bin/sherpa-onnx-microphone-offline \ + --tokens=./sherpa-onnx-zipformer-thai-2024-06-20/tokens.txt \ + --encoder=./sherpa-onnx-zipformer-thai-2024-06-20/encoder-epoch-12-avg-5.int8.onnx \ + --decoder=./sherpa-onnx-zipformer-thai-2024-06-20/decoder-epoch-12-avg-5.onnx \ + --joiner=./sherpa-onnx-zipformer-thai-2024-06-20/joiner-epoch-12-avg-5.int8.onnx + + +sherpa-onnx-zipformer-cantonese-2024-03-13 (Cantonese, 粤语) +------------------------------------------------------------ + +Training code for this model can be found at +``_. +It supports only Cantonese since it is trained on a ``Cantonese`` dataset. +The paper for the dataset can be found at ``_. + +In the following, we describe how to download it and use it with `sherpa-onnx`_. + +Download the model +~~~~~~~~~~~~~~~~~~ + +Please use the following commands to download it. + +.. code-block:: bash + + wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-zipformer-cantonese-2024-03-13.tar.bz2 + + tar xf sherpa-onnx-zipformer-cantonese-2024-03-13.tar.bz2 + rm sherpa-onnx-zipformer-cantonese-2024-03-13.tar.bz2 + + ls -lh sherpa-onnx-zipformer-cantonese-2024-03-13 + +You should see the following output: + +.. code-block:: bash + + total 340M + -rw-r--r-- 1 1001 127 2.7M Mar 13 09:06 decoder-epoch-45-avg-35.int8.onnx + -rw-r--r-- 1 1001 127 11M Mar 13 09:06 decoder-epoch-45-avg-35.onnx + -rw-r--r-- 1 1001 127 67M Mar 13 09:06 encoder-epoch-45-avg-35.int8.onnx + -rw-r--r-- 1 1001 127 248M Mar 13 09:06 encoder-epoch-45-avg-35.onnx + -rw-r--r-- 1 1001 127 2.4M Mar 13 09:06 joiner-epoch-45-avg-35.int8.onnx + -rw-r--r-- 1 1001 127 9.5M Mar 13 09:06 joiner-epoch-45-avg-35.onnx + drwxr-xr-x 2 1001 127 4.0K Mar 13 09:06 test_wavs + -rw-r--r-- 1 1001 127 42K Mar 13 09:06 tokens.txt + +Decode wave files +~~~~~~~~~~~~~~~~~ + +.. hint:: + + It supports decoding only wave files of a single channel with 16-bit + encoded samples, while the sampling rate does not need to be 16 kHz. + +fp32 +^^^^ + +The following code shows how to use ``fp32`` models to decode wave files: + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + ./build/bin/sherpa-onnx-offline \ + --blank-penalty=1.2 \ + --tokens=./sherpa-onnx-zipformer-cantonese-2024-03-13/tokens.txt \ + --encoder=./sherpa-onnx-zipformer-cantonese-2024-03-13/encoder-epoch-45-avg-35.onnx \ + --decoder=./sherpa-onnx-zipformer-cantonese-2024-03-13/decoder-epoch-45-avg-35.onnx \ + --joiner=./sherpa-onnx-zipformer-cantonese-2024-03-13/joiner-epoch-45-avg-35.onnx \ + ./sherpa-onnx-zipformer-cantonese-2024-03-13/test_wavs/test_wavs_1.wav \ + ./sherpa-onnx-zipformer-cantonese-2024-03-13/test_wavs/test_wavs_2.wav + +.. note:: + + Please use ``./build/bin/Release/sherpa-onnx-offline.exe`` for Windows. + +.. caution:: + + If you use Windows and get encoding issues, please run: + + .. code-block:: bash + + CHCP 65001 + + in your commandline. + +You should see the following output: + +.. literalinclude:: ./code-zipformer/sherpa-onnx-zipformer-cantonese-2024-03-13.txt + +int8 +^^^^ + +The following code shows how to use ``int8`` models to decode wave files: + +..
code-block:: bash + + cd /path/to/sherpa-onnx + + ./build/bin/sherpa-onnx-offline \ + --blank-penalty=1.2 \ + --tokens=./sherpa-onnx-zipformer-cantonese-2024-03-13/tokens.txt \ + --encoder=./sherpa-onnx-zipformer-cantonese-2024-03-13/encoder-epoch-45-avg-35.int8.onnx \ + --decoder=./sherpa-onnx-zipformer-cantonese-2024-03-13/decoder-epoch-45-avg-35.onnx \ + --joiner=./sherpa-onnx-zipformer-cantonese-2024-03-13/joiner-epoch-45-avg-35.int8.onnx \ + ./sherpa-onnx-zipformer-cantonese-2024-03-13/test_wavs/test_wavs_1.wav \ + ./sherpa-onnx-zipformer-cantonese-2024-03-13/test_wavs/test_wavs_2.wav + +.. note:: + + Please use ``./build/bin/Release/sherpa-onnx-offline.exe`` for Windows. + +.. caution:: + + If you use Windows and get encoding issues, please run: + + .. code-block:: bash + + CHCP 65001 + + in your commandline. + +You should see the following output: + +.. literalinclude:: ./code-zipformer/sherpa-onnx-zipformer-cantonese-2024-03-13-int8.txt + +Speech recognition from a microphone +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + ./build/bin/sherpa-onnx-microphone-offline \ + --tokens=./sherpa-onnx-zipformer-cantonese-2024-03-13/tokens.txt \ + --encoder=./sherpa-onnx-zipformer-cantonese-2024-03-13/encoder-epoch-45-avg-35.int8.onnx \ + --decoder=./sherpa-onnx-zipformer-cantonese-2024-03-13/decoder-epoch-45-avg-35.onnx \ + --joiner=./sherpa-onnx-zipformer-cantonese-2024-03-13/joiner-epoch-45-avg-35.int8.onnx + + +sherpa-onnx-zipformer-gigaspeech-2023-12-12 (English) +----------------------------------------------------- + +Training code for this model is ``_. +It supports only English since it is trained on the `GigaSpeech`_ dataset. + +In the following, we describe how to download it and use it with `sherpa-onnx`_. + +Download the model +~~~~~~~~~~~~~~~~~~ + +Please use the following commands to download it. + +.. code-block:: bash + + wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-zipformer-gigaspeech-2023-12-12.tar.bz2 + + tar xf sherpa-onnx-zipformer-gigaspeech-2023-12-12.tar.bz2 + rm sherpa-onnx-zipformer-gigaspeech-2023-12-12.tar.bz2 + ls -lh sherpa-onnx-zipformer-gigaspeech-2023-12-12 + +You should see the following output: + +.. code-block:: bash + + $ ls -lh sherpa-onnx-zipformer-gigaspeech-2023-12-12 + total 656184 + -rw-r--r-- 1 fangjun staff 28B Dec 12 19:00 README.md + -rw-r--r-- 1 fangjun staff 239K Dec 12 19:00 bpe.model + -rw-r--r-- 1 fangjun staff 528K Dec 12 19:00 decoder-epoch-30-avg-1.int8.onnx + -rw-r--r-- 1 fangjun staff 2.0M Dec 12 19:00 decoder-epoch-30-avg-1.onnx + -rw-r--r-- 1 fangjun staff 68M Dec 12 19:00 encoder-epoch-30-avg-1.int8.onnx + -rw-r--r-- 1 fangjun staff 249M Dec 12 19:00 encoder-epoch-30-avg-1.onnx + -rw-r--r-- 1 fangjun staff 253K Dec 12 19:00 joiner-epoch-30-avg-1.int8.onnx + -rw-r--r-- 1 fangjun staff 1.0M Dec 12 19:00 joiner-epoch-30-avg-1.onnx + drwxr-xr-x 5 fangjun staff 160B Dec 12 19:00 test_wavs + -rw-r--r-- 1 fangjun staff 4.9K Dec 12 19:00 tokens.txt + +Decode wave files +~~~~~~~~~~~~~~~~~ + +.. hint:: + + It supports decoding only wave files of a single channel with 16-bit + encoded samples, while the sampling rate does not need to be 16 kHz. + +fp32 +^^^^ + +The following code shows how to use ``fp32`` models to decode wave files: + +.. 
code-block:: bash + + cd /path/to/sherpa-onnx + + ./build/bin/sherpa-onnx-offline \ + --tokens=./sherpa-onnx-zipformer-gigaspeech-2023-12-12/tokens.txt \ + --encoder=./sherpa-onnx-zipformer-gigaspeech-2023-12-12/encoder-epoch-30-avg-1.onnx \ + --decoder=./sherpa-onnx-zipformer-gigaspeech-2023-12-12/decoder-epoch-30-avg-1.onnx \ + --joiner=./sherpa-onnx-zipformer-gigaspeech-2023-12-12/joiner-epoch-30-avg-1.onnx \ + ./sherpa-onnx-zipformer-gigaspeech-2023-12-12/test_wavs/1089-134686-0001.wav \ + ./sherpa-onnx-zipformer-gigaspeech-2023-12-12/test_wavs/1221-135766-0001.wav \ + ./sherpa-onnx-zipformer-gigaspeech-2023-12-12/test_wavs/1221-135766-0002.wav + +.. note:: + + Please use ``./build/bin/Release/sherpa-onnx-offline.exe`` for Windows. + +You should see the following output: + +.. literalinclude:: ./code-zipformer/sherpa-onnx-zipformer-gigaspeech-2023-12-12.txt + +int8 +^^^^ + +The following code shows how to use ``int8`` models to decode wave files: + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + ./build/bin/sherpa-onnx-offline \ + --tokens=./sherpa-onnx-zipformer-gigaspeech-2023-12-12/tokens.txt \ + --encoder=./sherpa-onnx-zipformer-gigaspeech-2023-12-12/encoder-epoch-30-avg-1.int8.onnx \ + --decoder=./sherpa-onnx-zipformer-gigaspeech-2023-12-12/decoder-epoch-30-avg-1.onnx \ + --joiner=./sherpa-onnx-zipformer-gigaspeech-2023-12-12/joiner-epoch-30-avg-1.int8.onnx \ + ./sherpa-onnx-zipformer-gigaspeech-2023-12-12/test_wavs/1089-134686-0001.wav \ + ./sherpa-onnx-zipformer-gigaspeech-2023-12-12/test_wavs/1221-135766-0001.wav \ + ./sherpa-onnx-zipformer-gigaspeech-2023-12-12/test_wavs/1221-135766-0002.wav + +.. note:: + + Please use ``./build/bin/Release/sherpa-onnx-offline.exe`` for Windows. + +You should see the following output: + +.. literalinclude:: ./code-zipformer/sherpa-onnx-zipformer-gigaspeech-2023-12-12-int8.txt + +Speech recognition from a microphone +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + ./build/bin/sherpa-onnx-microphone-offline \ + --tokens=./sherpa-onnx-zipformer-gigaspeech-2023-12-12/tokens.txt \ + --encoder=./sherpa-onnx-zipformer-gigaspeech-2023-12-12/encoder-epoch-30-avg-1.int8.onnx \ + --decoder=./sherpa-onnx-zipformer-gigaspeech-2023-12-12/decoder-epoch-30-avg-1.onnx \ + --joiner=./sherpa-onnx-zipformer-gigaspeech-2023-12-12/joiner-epoch-30-avg-1.int8.onnx + +Speech recognition from a microphone with VAD +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx + + ./build/bin/sherpa-onnx-vad-microphone-offline-asr \ + --silero-vad-model=./silero_vad.onnx \ + --tokens=./sherpa-onnx-zipformer-gigaspeech-2023-12-12/tokens.txt \ + --encoder=./sherpa-onnx-zipformer-gigaspeech-2023-12-12/encoder-epoch-30-avg-1.int8.onnx \ + --decoder=./sherpa-onnx-zipformer-gigaspeech-2023-12-12/decoder-epoch-30-avg-1.onnx \ + --joiner=./sherpa-onnx-zipformer-gigaspeech-2023-12-12/joiner-epoch-30-avg-1.int8.onnx + +zrjin/sherpa-onnx-zipformer-multi-zh-hans-2023-9-2 (Chinese) +------------------------------------------------------------ + +This model is from + +``_ + +which supports Chinese, as it is trained on the datasets used in the `multi-zh_hans `_ recipe. + +If you are interested in how the model is trained, please refer to +``_. + +In the following, we describe how to download it and use it with `sherpa-onnx`_.
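+Since the commands below repeat the model directory many times, you can put it
+into a shell variable and decode all of the bundled test waves in a single
+run. This is only an illustrative sketch and assumes the model has already
+been downloaded as shown in the next subsection:
+
+.. code-block:: bash
+
+   cd /path/to/sherpa-onnx
+
+   d=./sherpa-onnx-zipformer-multi-zh-hans-2023-9-2
+
+   # sherpa-onnx-offline accepts several wave files in one invocation
+   ./build/bin/sherpa-onnx-offline \
+     --tokens=$d/tokens.txt \
+     --encoder=$d/encoder-epoch-20-avg-1.onnx \
+     --decoder=$d/decoder-epoch-20-avg-1.onnx \
+     --joiner=$d/joiner-epoch-20-avg-1.onnx \
+     $d/test_wavs/*.wav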
+ +Download the model +~~~~~~~~~~~~~~~~~~ + +Please use the following commands to download it. + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-zipformer-multi-zh-hans-2023-9-2.tar.bz2 + + tar xvf sherpa-onnx-zipformer-multi-zh-hans-2023-9-2.tar.bz2 + rm sherpa-onnx-zipformer-multi-zh-hans-2023-9-2.tar.bz2 + +Please check that the file sizes of the pre-trained models are correct. See +the file sizes of ``*.onnx`` files below. + +.. code-block:: bash + + sherpa-onnx-zipformer-multi-zh-hans-2023-9-2 zengruijin$ ls -lh *.onnx + -rw-rw-r--@ 1 zengruijin staff 1.2M Sep 18 07:04 decoder-epoch-20-avg-1.int8.onnx + -rw-rw-r--@ 1 zengruijin staff 4.9M Sep 18 07:04 decoder-epoch-20-avg-1.onnx + -rw-rw-r--@ 1 zengruijin staff 66M Sep 18 07:04 encoder-epoch-20-avg-1.int8.onnx + -rw-rw-r--@ 1 zengruijin staff 248M Sep 18 07:05 encoder-epoch-20-avg-1.onnx + -rw-rw-r--@ 1 zengruijin staff 1.0M Sep 18 07:05 joiner-epoch-20-avg-1.int8.onnx + -rw-rw-r--@ 1 zengruijin staff 3.9M Sep 18 07:05 joiner-epoch-20-avg-1.onnx + +Decode wave files +~~~~~~~~~~~~~~~~~ + +.. hint:: + + It supports decoding only wave files of a single channel with 16-bit + encoded samples, while the sampling rate does not need to be 16 kHz. + +fp32 +^^^^ + +The following code shows how to use ``fp32`` models to decode wave files: + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + ./build/bin/sherpa-onnx-offline \ + --tokens=./sherpa-onnx-zipformer-multi-zh-hans-2023-9-2/tokens.txt \ + --encoder=./sherpa-onnx-zipformer-multi-zh-hans-2023-9-2/encoder-epoch-20-avg-1.onnx \ + --decoder=./sherpa-onnx-zipformer-multi-zh-hans-2023-9-2/decoder-epoch-20-avg-1.onnx \ + --joiner=./sherpa-onnx-zipformer-multi-zh-hans-2023-9-2/joiner-epoch-20-avg-1.onnx \ + ./sherpa-onnx-zipformer-multi-zh-hans-2023-9-2/test_wavs/0.wav \ + ./sherpa-onnx-zipformer-multi-zh-hans-2023-9-2/test_wavs/1.wav \ + ./sherpa-onnx-zipformer-multi-zh-hans-2023-9-2/test_wavs/8k.wav + +.. note:: + + Please use ``./build/bin/Release/sherpa-onnx-offline.exe`` for Windows. + +You should see the following output: + +.. literalinclude:: ./code-zipformer/sherpa-onnx-zipformer-multi-zh-hans-2023-9-2.txt + +int8 +^^^^ + +The following code shows how to use ``int8`` models to decode wave files: + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + ./build/bin/sherpa-onnx-offline \ + --tokens=./sherpa-onnx-zipformer-multi-zh-hans-2023-9-2/tokens.txt \ + --encoder=./sherpa-onnx-zipformer-multi-zh-hans-2023-9-2/encoder-epoch-20-avg-1.int8.onnx \ + --decoder=./sherpa-onnx-zipformer-multi-zh-hans-2023-9-2/decoder-epoch-20-avg-1.onnx \ + --joiner=./sherpa-onnx-zipformer-multi-zh-hans-2023-9-2/joiner-epoch-20-avg-1.int8.onnx \ + ./sherpa-onnx-zipformer-multi-zh-hans-2023-9-2/test_wavs/0.wav \ + ./sherpa-onnx-zipformer-multi-zh-hans-2023-9-2/test_wavs/1.wav \ + ./sherpa-onnx-zipformer-multi-zh-hans-2023-9-2/test_wavs/8k.wav + +.. note:: + + Please use ``./build/bin/Release/sherpa-onnx-offline.exe`` for Windows. + +You should see the following output: + +.. literalinclude:: ./code-zipformer/sherpa-onnx-zipformer-multi-zh-hans-2023-9-2-int8.txt + +Speech recognition from a microphone +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. 
code-block:: bash + + cd /path/to/sherpa-onnx + + ./build/bin/sherpa-onnx-microphone-offline \ + --tokens=./sherpa-onnx-zipformer-multi-zh-hans-2023-9-2/tokens.txt \ + --encoder=./sherpa-onnx-zipformer-multi-zh-hans-2023-9-2/encoder-epoch-20-avg-1.onnx \ + --decoder=./sherpa-onnx-zipformer-multi-zh-hans-2023-9-2/decoder-epoch-20-avg-1.onnx \ + --joiner=./sherpa-onnx-zipformer-multi-zh-hans-2023-9-2/joiner-epoch-20-avg-1.onnx + + +yfyeung/icefall-asr-cv-corpus-13.0-2023-03-09-en-pruned-transducer-stateless7-2023-04-17 (English) +-------------------------------------------------------------------------------------------------- + +This model is from + +``_ + +which supports only English as it is trained on the `CommonVoice`_ English dataset. + +If you are interested in how the model is trained, please refer to +``_. + +In the following, we describe how to download it and use it with `sherpa-onnx`_. + +Download the model +~~~~~~~~~~~~~~~~~~ + +Please use the following commands to download it. + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/icefall-asr-cv-corpus-13.0-2023-03-09-en-pruned-transducer-stateless7-2023-04-17.tar.bz2 + + tar xvf icefall-asr-cv-corpus-13.0-2023-03-09-en-pruned-transducer-stateless7-2023-04-17.tar.bz2 + rm icefall-asr-cv-corpus-13.0-2023-03-09-en-pruned-transducer-stateless7-2023-04-17.tar.bz2 + + +Please check that the file sizes of the pre-trained models are correct. See +the file sizes of ``*.onnx`` files below. + +.. code-block:: bash + + icefall-asr-cv-corpus-13.0-2023-03-09-en-pruned-transducer-stateless7-2023-04-17 fangjun$ ls -lh exp/*epoch-60-avg-20*.onnx + -rw-r--r-- 1 fangjun staff 1.2M Jun 27 09:53 exp/decoder-epoch-60-avg-20.int8.onnx + -rw-r--r-- 1 fangjun staff 2.0M Jun 27 09:54 exp/decoder-epoch-60-avg-20.onnx + -rw-r--r-- 1 fangjun staff 121M Jun 27 09:54 exp/encoder-epoch-60-avg-20.int8.onnx + -rw-r--r-- 1 fangjun staff 279M Jun 27 09:55 exp/encoder-epoch-60-avg-20.onnx + -rw-r--r-- 1 fangjun staff 253K Jun 27 09:53 exp/joiner-epoch-60-avg-20.int8.onnx + -rw-r--r-- 1 fangjun staff 1.0M Jun 27 09:53 exp/joiner-epoch-60-avg-20.onnx + +Decode wave files +~~~~~~~~~~~~~~~~~ + +.. hint:: + + It supports decoding only wave files of a single channel with 16-bit + encoded samples, while the sampling rate does not need to be 16 kHz. + +fp32 +^^^^ + +The following code shows how to use ``fp32`` models to decode wave files: + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + ./build/bin/sherpa-onnx-offline \ + --tokens=./icefall-asr-cv-corpus-13.0-2023-03-09-en-pruned-transducer-stateless7-2023-04-17/data/lang_bpe_500/tokens.txt \ + --encoder=./icefall-asr-cv-corpus-13.0-2023-03-09-en-pruned-transducer-stateless7-2023-04-17/exp/encoder-epoch-60-avg-20.onnx \ + --decoder=./icefall-asr-cv-corpus-13.0-2023-03-09-en-pruned-transducer-stateless7-2023-04-17/exp/decoder-epoch-60-avg-20.onnx \ + --joiner=./icefall-asr-cv-corpus-13.0-2023-03-09-en-pruned-transducer-stateless7-2023-04-17/exp/joiner-epoch-60-avg-20.onnx \ + ./icefall-asr-cv-corpus-13.0-2023-03-09-en-pruned-transducer-stateless7-2023-04-17/test_wavs/1089-134686-0001.wav \ + ./icefall-asr-cv-corpus-13.0-2023-03-09-en-pruned-transducer-stateless7-2023-04-17/test_wavs/1221-135766-0001.wav \ + ./icefall-asr-cv-corpus-13.0-2023-03-09-en-pruned-transducer-stateless7-2023-04-17/test_wavs/1221-135766-0002.wav + +.. note:: + + Please use ``./build/bin/Release/sherpa-onnx-offline.exe`` for Windows.
+ +You should see the following output: + +.. literalinclude:: ./code-zipformer/icefall-asr-cv-corpus-13.0-2023-03-09-en-pruned-transducer-stateless7-2023-04-17.txt + +int8 +^^^^ + +The following code shows how to use ``int8`` models to decode wave files: + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + ./build/bin/sherpa-onnx-offline \ + --tokens=./icefall-asr-cv-corpus-13.0-2023-03-09-en-pruned-transducer-stateless7-2023-04-17/data/lang_bpe_500/tokens.txt \ + --encoder=./icefall-asr-cv-corpus-13.0-2023-03-09-en-pruned-transducer-stateless7-2023-04-17/exp/encoder-epoch-60-avg-20.int8.onnx \ + --decoder=./icefall-asr-cv-corpus-13.0-2023-03-09-en-pruned-transducer-stateless7-2023-04-17/exp/decoder-epoch-60-avg-20.onnx \ + --joiner=./icefall-asr-cv-corpus-13.0-2023-03-09-en-pruned-transducer-stateless7-2023-04-17/exp/joiner-epoch-60-avg-20.int8.onnx \ + ./icefall-asr-cv-corpus-13.0-2023-03-09-en-pruned-transducer-stateless7-2023-04-17/test_wavs/1089-134686-0001.wav \ + ./icefall-asr-cv-corpus-13.0-2023-03-09-en-pruned-transducer-stateless7-2023-04-17/test_wavs/1221-135766-0001.wav \ + ./icefall-asr-cv-corpus-13.0-2023-03-09-en-pruned-transducer-stateless7-2023-04-17/test_wavs/1221-135766-0002.wav + +.. note:: + + Please use ``./build/bin/Release/sherpa-onnx-offline.exe`` for Windows. + +You should see the following output: + +.. literalinclude:: ./code-zipformer/icefall-asr-cv-corpus-13.0-2023-03-09-en-pruned-transducer-stateless7-2023-04-17-int8.txt + +Speech recognition from a microphone +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + ./build/bin/sherpa-onnx-microphone-offline \ + --tokens=./icefall-asr-cv-corpus-13.0-2023-03-09-en-pruned-transducer-stateless7-2023-04-17/data/lang_bpe_500/tokens.txt \ + --encoder=./icefall-asr-cv-corpus-13.0-2023-03-09-en-pruned-transducer-stateless7-2023-04-17/exp/encoder-epoch-60-avg-20.onnx \ + --decoder=./icefall-asr-cv-corpus-13.0-2023-03-09-en-pruned-transducer-stateless7-2023-04-17/exp/decoder-epoch-60-avg-20.onnx \ + --joiner=./icefall-asr-cv-corpus-13.0-2023-03-09-en-pruned-transducer-stateless7-2023-04-17/exp/joiner-epoch-60-avg-20.onnx + + +.. _sherpa-onnx-wenetspeech-small: + +k2-fsa/icefall-asr-zipformer-wenetspeech-small (Chinese) +-------------------------------------------------------- + +This model is from + +``_ + +which supports only Chinese as it is trained on the `WenetSpeech`_ corpus. + +In the following, we describe how to download it. + +Download the model +~~~~~~~~~~~~~~~~~~ + +Please use the following commands to download it. + +.. code-block:: bash + + git lfs install + git clone https://huggingface.co/k2-fsa/icefall-asr-zipformer-wenetspeech-small + + +.. _sherpa-onnx-wenetspeech-large: + +k2-fsa/icefall-asr-zipformer-wenetspeech-large (Chinese) +-------------------------------------------------------- + +This model is from + +``_ + +which supports only Chinese as it is trained on the `WenetSpeech`_ corpus. + +In the following, we describe how to download it. + +Download the model +~~~~~~~~~~~~~~~~~~ + +Please use the following commands to download it. + +.. code-block:: bash + + git lfs install + git clone https://huggingface.co/k2-fsa/icefall-asr-zipformer-wenetspeech-large + + +pkufool/icefall-asr-zipformer-wenetspeech-20230615 (Chinese) +------------------------------------------------------------ + +This model is from + +``_ + +which supports only Chinese as it is trained on the `WenetSpeech`_ corpus. 
+ +If you are interested in how the model is trained, please refer to +``_. + +In the following, we describe how to download it and use it with `sherpa-onnx`_. + +Download the model +~~~~~~~~~~~~~~~~~~ + +Please use the following commands to download it. + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/icefall-asr-zipformer-wenetspeech-20230615.tar.bz2 + + tar xvf icefall-asr-zipformer-wenetspeech-20230615.tar.bz2 + rm icefall-asr-zipformer-wenetspeech-20230615.tar.bz2 + +Please check that the file sizes of the pre-trained models are correct. See +the file sizes of ``*.onnx`` files below. + +.. code-block:: bash + + icefall-asr-zipformer-wenetspeech-20230615 fangjun$ ls -lh exp/*.onnx + -rw-r--r-- 1 fangjun staff 11M Jun 26 14:31 exp/decoder-epoch-12-avg-4.int8.onnx + -rw-r--r-- 1 fangjun staff 12M Jun 26 14:31 exp/decoder-epoch-12-avg-4.onnx + -rw-r--r-- 1 fangjun staff 66M Jun 26 14:32 exp/encoder-epoch-12-avg-4.int8.onnx + -rw-r--r-- 1 fangjun staff 248M Jun 26 14:34 exp/encoder-epoch-12-avg-4.onnx + -rw-r--r-- 1 fangjun staff 2.7M Jun 26 14:31 exp/joiner-epoch-12-avg-4.int8.onnx + -rw-r--r-- 1 fangjun staff 11M Jun 26 14:31 exp/joiner-epoch-12-avg-4.onnx + +Decode wave files +~~~~~~~~~~~~~~~~~ + +.. hint:: + + It supports decoding only wave files of a single channel with 16-bit + encoded samples, while the sampling rate does not need to be 16 kHz. + +fp32 +^^^^ + +The following code shows how to use ``fp32`` models to decode wave files: + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + ./build/bin/sherpa-onnx-offline \ + --tokens=./icefall-asr-zipformer-wenetspeech-20230615/data/lang_char/tokens.txt \ + --encoder=./icefall-asr-zipformer-wenetspeech-20230615/exp/encoder-epoch-12-avg-4.onnx \ + --decoder=./icefall-asr-zipformer-wenetspeech-20230615/exp/decoder-epoch-12-avg-4.onnx \ + --joiner=./icefall-asr-zipformer-wenetspeech-20230615/exp/joiner-epoch-12-avg-4.onnx \ + ./icefall-asr-zipformer-wenetspeech-20230615/test_wavs/DEV_T0000000000.wav \ + ./icefall-asr-zipformer-wenetspeech-20230615/test_wavs/DEV_T0000000001.wav \ + ./icefall-asr-zipformer-wenetspeech-20230615/test_wavs/DEV_T0000000002.wav + +.. note:: + + Please use ``./build/bin/Release/sherpa-onnx-offline.exe`` for Windows. + +.. caution:: + + If you use Windows and get encoding issues, please run: + + .. code-block:: bash + + CHCP 65001 + + in your commandline. + +You should see the following output: + +.. literalinclude:: ./code-zipformer/icefall-asr-zipformer-wenetspeech-20230615.txt + +int8 +^^^^ + +The following code shows how to use ``int8`` models to decode wave files: + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + ./build/bin/sherpa-onnx-offline \ + --tokens=./icefall-asr-zipformer-wenetspeech-20230615/data/lang_char/tokens.txt \ + --encoder=./icefall-asr-zipformer-wenetspeech-20230615/exp/encoder-epoch-12-avg-4.int8.onnx \ + --decoder=./icefall-asr-zipformer-wenetspeech-20230615/exp/decoder-epoch-12-avg-4.onnx \ + --joiner=./icefall-asr-zipformer-wenetspeech-20230615/exp/joiner-epoch-12-avg-4.int8.onnx \ + ./icefall-asr-zipformer-wenetspeech-20230615/test_wavs/DEV_T0000000000.wav \ + ./icefall-asr-zipformer-wenetspeech-20230615/test_wavs/DEV_T0000000001.wav \ + ./icefall-asr-zipformer-wenetspeech-20230615/test_wavs/DEV_T0000000002.wav + +.. note:: + + Please use ``./build/bin/Release/sherpa-onnx-offline.exe`` for Windows. + +.. caution:: + + If you use Windows and get encoding issues, please run: + + .. 
code-block:: bash + + CHCP 65001 + + in your commandline. + +You should see the following output: + +.. literalinclude:: ./code-zipformer/icefall-asr-zipformer-wenetspeech-20230615-int8.txt + +Speech recognition from a microphone +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + ./build/bin/sherpa-onnx-microphone-offline \ + --tokens=./icefall-asr-zipformer-wenetspeech-20230615/data/lang_char/tokens.txt \ + --encoder=./icefall-asr-zipformer-wenetspeech-20230615/exp/encoder-epoch-12-avg-4.onnx \ + --decoder=./icefall-asr-zipformer-wenetspeech-20230615/exp/decoder-epoch-12-avg-4.onnx \ + --joiner=./icefall-asr-zipformer-wenetspeech-20230615/exp/joiner-epoch-12-avg-4.onnx + + +csukuangfj/sherpa-onnx-zipformer-large-en-2023-06-26 (English) +-------------------------------------------------------------- + +This model is converted from + +``_ + +which supports only English as it is trained on the `LibriSpeech`_ corpus. + +You can find the training code at + +``_ + +In the following, we describe how to download it and use it with `sherpa-onnx`_. + +Download the model +~~~~~~~~~~~~~~~~~~ + +Please use the following commands to download it. + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-zipformer-large-en-2023-06-26.tar.bz2 + + tar xvf sherpa-onnx-zipformer-large-en-2023-06-26.tar.bz2 + rm sherpa-onnx-zipformer-large-en-2023-06-26.tar.bz2 + +Please check that the file sizes of the pre-trained models are correct. See +the file sizes of ``*.onnx`` files below. + +.. code-block:: bash + + sherpa-onnx-zipformer-large-en-2023-06-26 fangjun$ ls -lh *.onnx + -rw-r--r-- 1 fangjun staff 1.2M Jun 26 13:19 decoder-epoch-99-avg-1.int8.onnx + -rw-r--r-- 1 fangjun staff 2.0M Jun 26 13:19 decoder-epoch-99-avg-1.onnx + -rw-r--r-- 1 fangjun staff 145M Jun 26 13:20 encoder-epoch-99-avg-1.int8.onnx + -rw-r--r-- 1 fangjun staff 564M Jun 26 13:22 encoder-epoch-99-avg-1.onnx + -rw-r--r-- 1 fangjun staff 253K Jun 26 13:19 joiner-epoch-99-avg-1.int8.onnx + -rw-r--r-- 1 fangjun staff 1.0M Jun 26 13:19 joiner-epoch-99-avg-1.onnx + +Decode wave files +~~~~~~~~~~~~~~~~~ + +.. hint:: + + It supports decoding only wave files of a single channel with 16-bit + encoded samples, while the sampling rate does not need to be 16 kHz. + +fp32 +^^^^ + +The following code shows how to use ``fp32`` models to decode wave files: + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + ./build/bin/sherpa-onnx-offline \ + --tokens=./sherpa-onnx-zipformer-large-en-2023-06-26/tokens.txt \ + --encoder=./sherpa-onnx-zipformer-large-en-2023-06-26/encoder-epoch-99-avg-1.onnx \ + --decoder=./sherpa-onnx-zipformer-large-en-2023-06-26/decoder-epoch-99-avg-1.onnx \ + --joiner=./sherpa-onnx-zipformer-large-en-2023-06-26/joiner-epoch-99-avg-1.onnx \ + ./sherpa-onnx-zipformer-large-en-2023-06-26/test_wavs/0.wav \ + ./sherpa-onnx-zipformer-large-en-2023-06-26/test_wavs/1.wav \ + ./sherpa-onnx-zipformer-large-en-2023-06-26/test_wavs/8k.wav + +.. note:: + + Please use ``./build/bin/Release/sherpa-onnx-offline.exe`` for Windows. + +You should see the following output: + +.. literalinclude:: ./code-zipformer/sherpa-onnx-zipformer-large-en-2023-06-26.txt + +int8 +^^^^ + +The following code shows how to use ``int8`` models to decode wave files: + +.. 
code-block:: bash + + cd /path/to/sherpa-onnx + + ./build/bin/sherpa-onnx-offline \ + --tokens=./sherpa-onnx-zipformer-large-en-2023-06-26/tokens.txt \ + --encoder=./sherpa-onnx-zipformer-large-en-2023-06-26/encoder-epoch-99-avg-1.int8.onnx \ + --decoder=./sherpa-onnx-zipformer-large-en-2023-06-26/decoder-epoch-99-avg-1.onnx \ + --joiner=./sherpa-onnx-zipformer-large-en-2023-06-26/joiner-epoch-99-avg-1.int8.onnx \ + ./sherpa-onnx-zipformer-large-en-2023-06-26/test_wavs/0.wav \ + ./sherpa-onnx-zipformer-large-en-2023-06-26/test_wavs/1.wav \ + ./sherpa-onnx-zipformer-large-en-2023-06-26/test_wavs/8k.wav + +.. note:: + + Please use ``./build/bin/Release/sherpa-onnx-offline.exe`` for Windows. + +You should see the following output: + +.. literalinclude:: ./code-zipformer/sherpa-onnx-zipformer-large-en-2023-06-26-int8.txt + +Speech recognition from a microphone +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + ./build/bin/sherpa-onnx-microphone-offline \ + --tokens=./sherpa-onnx-zipformer-large-en-2023-06-26/tokens.txt \ + --encoder=./sherpa-onnx-zipformer-large-en-2023-06-26/encoder-epoch-99-avg-1.onnx \ + --decoder=./sherpa-onnx-zipformer-large-en-2023-06-26/decoder-epoch-99-avg-1.onnx \ + --joiner=./sherpa-onnx-zipformer-large-en-2023-06-26/joiner-epoch-99-avg-1.onnx + +csukuangfj/sherpa-onnx-zipformer-small-en-2023-06-26 (English) +-------------------------------------------------------------- + +This model is converted from + +``_ + +which supports only English as it is trained on the `LibriSpeech`_ corpus. + +You can find the training code at + +``_ + +In the following, we describe how to download it and use it with `sherpa-onnx`_. + +Download the model +~~~~~~~~~~~~~~~~~~ + +Please use the following commands to download it. + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-zipformer-small-en-2023-06-26.tar.bz2 + + tar xvf sherpa-onnx-zipformer-small-en-2023-06-26.tar.bz2 + rm sherpa-onnx-zipformer-small-en-2023-06-26.tar.bz2 + +Please check that the file sizes of the pre-trained models are correct. See +the file sizes of ``*.onnx`` files below. + +.. code-block:: bash + + sherpa-onnx-zipformer-small-en-2023-06-26 fangjun$ ls -lh *.onnx + -rw-r--r-- 1 fangjun staff 1.2M Jun 26 13:04 decoder-epoch-99-avg-1.int8.onnx + -rw-r--r-- 1 fangjun staff 2.0M Jun 26 13:04 decoder-epoch-99-avg-1.onnx + -rw-r--r-- 1 fangjun staff 25M Jun 26 13:04 encoder-epoch-99-avg-1.int8.onnx + -rw-r--r-- 1 fangjun staff 87M Jun 26 13:04 encoder-epoch-99-avg-1.onnx + -rw-r--r-- 1 fangjun staff 253K Jun 26 13:04 joiner-epoch-99-avg-1.int8.onnx + -rw-r--r-- 1 fangjun staff 1.0M Jun 26 13:04 joiner-epoch-99-avg-1.onnx + +Decode wave files +~~~~~~~~~~~~~~~~~ + +.. hint:: + + It supports decoding only wave files of a single channel with 16-bit + encoded samples, while the sampling rate does not need to be 16 kHz. + +fp32 +^^^^ + +The following code shows how to use ``fp32`` models to decode wave files: + +.. 
code-block:: bash + + cd /path/to/sherpa-onnx + + ./build/bin/sherpa-onnx-offline \ + --tokens=./sherpa-onnx-zipformer-small-en-2023-06-26/tokens.txt \ + --encoder=./sherpa-onnx-zipformer-small-en-2023-06-26/encoder-epoch-99-avg-1.onnx \ + --decoder=./sherpa-onnx-zipformer-small-en-2023-06-26/decoder-epoch-99-avg-1.onnx \ + --joiner=./sherpa-onnx-zipformer-small-en-2023-06-26/joiner-epoch-99-avg-1.onnx \ + ./sherpa-onnx-zipformer-small-en-2023-06-26/test_wavs/0.wav \ + ./sherpa-onnx-zipformer-small-en-2023-06-26/test_wavs/1.wav \ + ./sherpa-onnx-zipformer-small-en-2023-06-26/test_wavs/8k.wav + +.. note:: + + Please use ``./build/bin/Release/sherpa-onnx-offline.exe`` for Windows. + +You should see the following output: + +.. literalinclude:: ./code-zipformer/sherpa-onnx-zipformer-small-en-2023-06-26.txt + +int8 +^^^^ + +The following code shows how to use ``int8`` models to decode wave files: + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + ./build/bin/sherpa-onnx-offline \ + --tokens=./sherpa-onnx-zipformer-small-en-2023-06-26/tokens.txt \ + --encoder=./sherpa-onnx-zipformer-small-en-2023-06-26/encoder-epoch-99-avg-1.int8.onnx \ + --decoder=./sherpa-onnx-zipformer-small-en-2023-06-26/decoder-epoch-99-avg-1.onnx \ + --joiner=./sherpa-onnx-zipformer-small-en-2023-06-26/joiner-epoch-99-avg-1.int8.onnx \ + ./sherpa-onnx-zipformer-small-en-2023-06-26/test_wavs/0.wav \ + ./sherpa-onnx-zipformer-small-en-2023-06-26/test_wavs/1.wav \ + ./sherpa-onnx-zipformer-small-en-2023-06-26/test_wavs/8k.wav + +.. note:: + + Please use ``./build/bin/Release/sherpa-onnx-offline.exe`` for Windows. + +You should see the following output: + +.. literalinclude:: ./code-zipformer/sherpa-onnx-zipformer-small-en-2023-06-26-int8.txt + +Speech recognition from a microphone +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + ./build/bin/sherpa-onnx-microphone-offline \ + --tokens=./sherpa-onnx-zipformer-small-en-2023-06-26/tokens.txt \ + --encoder=./sherpa-onnx-zipformer-small-en-2023-06-26/encoder-epoch-99-avg-1.onnx \ + --decoder=./sherpa-onnx-zipformer-small-en-2023-06-26/decoder-epoch-99-avg-1.onnx \ + --joiner=./sherpa-onnx-zipformer-small-en-2023-06-26/joiner-epoch-99-avg-1.onnx + +.. _sherpa-onnx-zipformer-en-2023-06-26-english: + +csukuangfj/sherpa-onnx-zipformer-en-2023-06-26 (English) +-------------------------------------------------------- + +This model is converted from + +``_ + +which supports only English as it is trained on the `LibriSpeech`_ corpus. + +You can find the training code at + +``_ + +In the following, we describe how to download it and use it with `sherpa-onnx`_. + +Download the model +~~~~~~~~~~~~~~~~~~ + +Please use the following commands to download it. + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-zipformer-en-2023-06-26.tar.bz2 + + tar xvf sherpa-onnx-zipformer-en-2023-06-26.tar.bz2 + rm sherpa-onnx-zipformer-en-2023-06-26.tar.bz2 + +Please check that the file sizes of the pre-trained models are correct. See +the file sizes of ``*.onnx`` files below. + +.. 
code-block:: bash + + sherpa-onnx-zipformer-en-2023-06-26 fangjun$ ls -lh *.onnx + -rw-r--r-- 1 fangjun staff 1.2M Jun 26 12:45 decoder-epoch-99-avg-1.int8.onnx + -rw-r--r-- 1 fangjun staff 2.0M Jun 26 12:45 decoder-epoch-99-avg-1.onnx + -rw-r--r-- 1 fangjun staff 66M Jun 26 12:45 encoder-epoch-99-avg-1.int8.onnx + -rw-r--r-- 1 fangjun staff 248M Jun 26 12:46 encoder-epoch-99-avg-1.onnx + -rw-r--r-- 1 fangjun staff 253K Jun 26 12:45 joiner-epoch-99-avg-1.int8.onnx + -rw-r--r-- 1 fangjun staff 1.0M Jun 26 12:45 joiner-epoch-99-avg-1.onnx + +Decode wave files +~~~~~~~~~~~~~~~~~ + +.. hint:: + + It supports decoding only wave files of a single channel with 16-bit + encoded samples, while the sampling rate does not need to be 16 kHz. + +fp32 +^^^^ + +The following code shows how to use ``fp32`` models to decode wave files: + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + ./build/bin/sherpa-onnx-offline \ + --tokens=./sherpa-onnx-zipformer-en-2023-06-26/tokens.txt \ + --encoder=./sherpa-onnx-zipformer-en-2023-06-26/encoder-epoch-99-avg-1.onnx \ + --decoder=./sherpa-onnx-zipformer-en-2023-06-26/decoder-epoch-99-avg-1.onnx \ + --joiner=./sherpa-onnx-zipformer-en-2023-06-26/joiner-epoch-99-avg-1.onnx \ + ./sherpa-onnx-zipformer-en-2023-06-26/test_wavs/0.wav \ + ./sherpa-onnx-zipformer-en-2023-06-26/test_wavs/1.wav \ + ./sherpa-onnx-zipformer-en-2023-06-26/test_wavs/8k.wav + +.. note:: + + Please use ``./build/bin/Release/sherpa-onnx-offline.exe`` for Windows. + +You should see the following output: + +.. literalinclude:: ./code-zipformer/sherpa-onnx-zipformer-en-2023-06-26.txt + +int8 +^^^^ + +The following code shows how to use ``int8`` models to decode wave files: + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + ./build/bin/sherpa-onnx-offline \ + --tokens=./sherpa-onnx-zipformer-en-2023-06-26/tokens.txt \ + --encoder=./sherpa-onnx-zipformer-en-2023-06-26/encoder-epoch-99-avg-1.int8.onnx \ + --decoder=./sherpa-onnx-zipformer-en-2023-06-26/decoder-epoch-99-avg-1.onnx \ + --joiner=./sherpa-onnx-zipformer-en-2023-06-26/joiner-epoch-99-avg-1.int8.onnx \ + ./sherpa-onnx-zipformer-en-2023-06-26/test_wavs/0.wav \ + ./sherpa-onnx-zipformer-en-2023-06-26/test_wavs/1.wav \ + ./sherpa-onnx-zipformer-en-2023-06-26/test_wavs/8k.wav + +.. note:: + + Please use ``./build/bin/Release/sherpa-onnx-offline.exe`` for Windows. + +You should see the following output: + +.. literalinclude:: ./code-zipformer/sherpa-onnx-zipformer-en-2023-06-26-int8.txt + +Speech recognition from a microphone +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + ./build/bin/sherpa-onnx-microphone-offline \ + --tokens=./sherpa-onnx-zipformer-en-2023-06-26/tokens.txt \ + --encoder=./sherpa-onnx-zipformer-en-2023-06-26/encoder-epoch-99-avg-1.onnx \ + --decoder=./sherpa-onnx-zipformer-en-2023-06-26/decoder-epoch-99-avg-1.onnx \ + --joiner=./sherpa-onnx-zipformer-en-2023-06-26/joiner-epoch-99-avg-1.onnx + + +.. _icefall-asr-multidataset-pruned_transducer_stateless7-2023-05-04-english: + +icefall-asr-multidataset-pruned_transducer_stateless7-2023-05-04 (English) +-------------------------------------------------------------------------- + +This model is trained using GigaSpeech + LibriSpeech + Common Voice 13.0 with zipformer + +See ``_ if you are interested in how +it is trained. + +In the following, we describe how to download it and use it with `sherpa-onnx`_. + +Download the model +~~~~~~~~~~~~~~~~~~ + +Please use the following commands to download it. + +.. 
code-block:: bash + + wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/icefall-asr-multidataset-pruned_transducer_stateless7-2023-05-04.tar.bz2 + + tar xvf icefall-asr-multidataset-pruned_transducer_stateless7-2023-05-04.tar.bz2 + rm icefall-asr-multidataset-pruned_transducer_stateless7-2023-05-04.tar.bz2 + +Please check that the file sizes of the pre-trained models are correct. See +the file sizes of ``*.onnx`` files below. + +.. code-block:: bash + + $ ls -lh *.onnx + -rw-r--r-- 1 fangjun staff 1.2M May 15 11:11 decoder-epoch-30-avg-4.int8.onnx + -rw-r--r-- 1 fangjun staff 2.0M May 15 11:11 decoder-epoch-30-avg-4.onnx + -rw-r--r-- 1 fangjun staff 121M May 15 11:12 encoder-epoch-30-avg-4.int8.onnx + -rw-r--r-- 1 fangjun staff 279M May 15 11:13 encoder-epoch-30-avg-4.onnx + -rw-r--r-- 1 fangjun staff 253K May 15 11:11 joiner-epoch-30-avg-4.int8.onnx + -rw-r--r-- 1 fangjun staff 1.0M May 15 11:11 joiner-epoch-30-avg-4.onnx + +Decode wave files +~~~~~~~~~~~~~~~~~ + +.. hint:: + + It supports decoding only wave files of a single channel with 16-bit + encoded samples, while the sampling rate does not need to be 16 kHz. + +fp32 +^^^^ + +The following code shows how to use ``fp32`` models to decode wave files: + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + ./build/bin/sherpa-onnx-offline \ + --tokens=./icefall-asr-multidataset-pruned_transducer_stateless7-2023-05-04/data/lang_bpe_500/tokens.txt \ + --encoder=./icefall-asr-multidataset-pruned_transducer_stateless7-2023-05-04/exp/encoder-epoch-30-avg-4.onnx \ + --decoder=./icefall-asr-multidataset-pruned_transducer_stateless7-2023-05-04/exp/decoder-epoch-30-avg-4.onnx \ + --joiner=./icefall-asr-multidataset-pruned_transducer_stateless7-2023-05-04/exp/joiner-epoch-30-avg-4.onnx \ + ./icefall-asr-multidataset-pruned_transducer_stateless7-2023-05-04/test_wavs/1089-134686-0001.wav \ + ./icefall-asr-multidataset-pruned_transducer_stateless7-2023-05-04/test_wavs/1221-135766-0001.wav \ + ./icefall-asr-multidataset-pruned_transducer_stateless7-2023-05-04/test_wavs/1221-135766-0002.wav + +.. note:: + + Please use ``./build/bin/Release/sherpa-onnx-offline.exe`` for Windows. + +You should see the following output: + +.. literalinclude:: ./code-zipformer/sherpa-onnx-zipformer-multi-dataset-2023-05-04.txt + +int8 +^^^^ + +The following code shows how to use ``int8`` models to decode wave files: + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + ./build/bin/sherpa-onnx-offline \ + --tokens=./icefall-asr-multidataset-pruned_transducer_stateless7-2023-05-04/data/lang_bpe_500/tokens.txt \ + --encoder=./icefall-asr-multidataset-pruned_transducer_stateless7-2023-05-04/exp/encoder-epoch-30-avg-4.int8.onnx \ + --decoder=./icefall-asr-multidataset-pruned_transducer_stateless7-2023-05-04/exp/decoder-epoch-30-avg-4.onnx \ + --joiner=./icefall-asr-multidataset-pruned_transducer_stateless7-2023-05-04/exp/joiner-epoch-30-avg-4.int8.onnx \ + ./icefall-asr-multidataset-pruned_transducer_stateless7-2023-05-04/test_wavs/1089-134686-0001.wav \ + ./icefall-asr-multidataset-pruned_transducer_stateless7-2023-05-04/test_wavs/1221-135766-0001.wav \ + ./icefall-asr-multidataset-pruned_transducer_stateless7-2023-05-04/test_wavs/1221-135766-0002.wav + +.. note:: + + Please use ``./build/bin/Release/sherpa-onnx-offline.exe`` for Windows. + +You should see the following output: + +.. 
literalinclude:: ./code-zipformer/sherpa-onnx-zipformer-multi-dataset-2023-05-04-int8.txt + +Speech recognition from a microphone +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + ./build/bin/sherpa-onnx-microphone-offline \ + --tokens=./icefall-asr-multidataset-pruned_transducer_stateless7-2023-05-04/data/lang_bpe_500/tokens.txt \ + --encoder=./icefall-asr-multidataset-pruned_transducer_stateless7-2023-05-04/exp/encoder-epoch-30-avg-4.onnx \ + --decoder=./icefall-asr-multidataset-pruned_transducer_stateless7-2023-05-04/exp/decoder-epoch-30-avg-4.onnx \ + --joiner=./icefall-asr-multidataset-pruned_transducer_stateless7-2023-05-04/exp/joiner-epoch-30-avg-4.onnx + +.. _sherpa_onnx_zipformer_en_2023_04_01: + +csukuangfj/sherpa-onnx-zipformer-en-2023-04-01 (English) +-------------------------------------------------------- + +This model is converted from + +``_ + +which supports only English as it is trained on the `LibriSpeech`_ and `GigaSpeech`_ corpus. + +You can find the training code at + +``_ + +In the following, we describe how to download it and use it with `sherpa-onnx`_. + +Download the model +~~~~~~~~~~~~~~~~~~ + +Please use the following commands to download it. + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-zipformer-en-2023-04-01.tar.bz2 + + tar xvf sherpa-onnx-zipformer-en-2023-04-01.tar.bz2 + rm sherpa-onnx-zipformer-en-2023-04-01.tar.bz2 + +Please check that the file sizes of the pre-trained models are correct. See +the file sizes of ``*.onnx`` files below. + +.. code-block:: bash + + sherpa-onnx-zipformer-en-2023-04-01$ ls -lh *.onnx + -rw-r--r-- 1 kuangfangjun root 1.3M Apr 1 14:34 decoder-epoch-99-avg-1.int8.onnx + -rw-r--r-- 1 kuangfangjun root 2.0M Apr 1 14:34 decoder-epoch-99-avg-1.onnx + -rw-r--r-- 1 kuangfangjun root 180M Apr 1 14:34 encoder-epoch-99-avg-1.int8.onnx + -rw-r--r-- 1 kuangfangjun root 338M Apr 1 14:34 encoder-epoch-99-avg-1.onnx + -rw-r--r-- 1 kuangfangjun root 254K Apr 1 14:34 joiner-epoch-99-avg-1.int8.onnx + -rw-r--r-- 1 kuangfangjun root 1003K Apr 1 14:34 joiner-epoch-99-avg-1.onnx + +Decode wave files +~~~~~~~~~~~~~~~~~ + +.. hint:: + + It supports decoding only wave files of a single channel with 16-bit + encoded samples, while the sampling rate does not need to be 16 kHz. + +fp32 +^^^^ + +The following code shows how to use ``fp32`` models to decode wave files: + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + ./build/bin/sherpa-onnx-offline \ + --tokens=./sherpa-onnx-zipformer-en-2023-04-01/tokens.txt \ + --encoder=./sherpa-onnx-zipformer-en-2023-04-01/encoder-epoch-99-avg-1.onnx \ + --decoder=./sherpa-onnx-zipformer-en-2023-04-01/decoder-epoch-99-avg-1.onnx \ + --joiner=./sherpa-onnx-zipformer-en-2023-04-01/joiner-epoch-99-avg-1.onnx \ + ./sherpa-onnx-zipformer-en-2023-04-01/test_wavs/0.wav \ + ./sherpa-onnx-zipformer-en-2023-04-01/test_wavs/1.wav \ + ./sherpa-onnx-zipformer-en-2023-04-01/test_wavs/8k.wav + +.. note:: + + Please use ``./build/bin/Release/sherpa-onnx-offline.exe`` for Windows. + +You should see the following output: + +.. literalinclude:: ./code-zipformer/sherpa-onnx-zipformer-en-2023-04-01.txt + +int8 +^^^^ + +The following code shows how to use ``int8`` models to decode wave files: + +.. 
code-block:: bash + + cd /path/to/sherpa-onnx + + ./build/bin/sherpa-onnx-offline \ + --tokens=./sherpa-onnx-zipformer-en-2023-04-01/tokens.txt \ + --encoder=./sherpa-onnx-zipformer-en-2023-04-01/encoder-epoch-99-avg-1.int8.onnx \ + --decoder=./sherpa-onnx-zipformer-en-2023-04-01/decoder-epoch-99-avg-1.onnx \ + --joiner=./sherpa-onnx-zipformer-en-2023-04-01/joiner-epoch-99-avg-1.int8.onnx \ + ./sherpa-onnx-zipformer-en-2023-04-01/test_wavs/0.wav \ + ./sherpa-onnx-zipformer-en-2023-04-01/test_wavs/1.wav \ + ./sherpa-onnx-zipformer-en-2023-04-01/test_wavs/8k.wav + +.. note:: + + Please use ``./build/bin/Release/sherpa-onnx-offline.exe`` for Windows. + +You should see the following output: + +.. literalinclude:: ./code-zipformer/sherpa-onnx-zipformer-en-2023-04-01-int8.txt + +Speech recognition from a microphone +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + ./build/bin/sherpa-onnx-microphone-offline \ + --tokens=./sherpa-onnx-zipformer-en-2023-04-01/tokens.txt \ + --encoder=./sherpa-onnx-zipformer-en-2023-04-01/encoder-epoch-99-avg-1.onnx \ + --decoder=./sherpa-onnx-zipformer-en-2023-04-01/decoder-epoch-99-avg-1.onnx \ + --joiner=./sherpa-onnx-zipformer-en-2023-04-01/joiner-epoch-99-avg-1.onnx + +.. _sherpa_onnx_zipformer_en_2023_03_30: + +csukuangfj/sherpa-onnx-zipformer-en-2023-03-30 (English) +-------------------------------------------------------- + +This model is converted from + +``_ + +which supports only English as it is trained on the `LibriSpeech`_ corpus. + +You can find the training code at + +``_ + +In the following, we describe how to download it and use it with `sherpa-onnx`_. + +Download the model +~~~~~~~~~~~~~~~~~~ + +Please use the following commands to download it. + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-zipformer-en-2023-03-30.tar.bz2 + + tar xvf sherpa-onnx-zipformer-en-2023-03-30.tar.bz2 + rm sherpa-onnx-zipformer-en-2023-03-30.tar.bz2 + +Please check that the file sizes of the pre-trained models are correct. See +the file sizes of ``*.onnx`` files below. + +.. code-block:: bash + + sherpa-onnx-zipformer-en-2023-03-30$ ls -lh *.onnx + -rw-r--r-- 1 kuangfangjun root 1.3M Mar 31 00:37 decoder-epoch-99-avg-1.int8.onnx + -rw-r--r-- 1 kuangfangjun root 2.0M Mar 30 20:10 decoder-epoch-99-avg-1.onnx + -rw-r--r-- 1 kuangfangjun root 180M Mar 31 00:37 encoder-epoch-99-avg-1.int8.onnx + -rw-r--r-- 1 kuangfangjun root 338M Mar 30 20:10 encoder-epoch-99-avg-1.onnx + -rw-r--r-- 1 kuangfangjun root 254K Mar 31 00:37 joiner-epoch-99-avg-1.int8.onnx + -rw-r--r-- 1 kuangfangjun root 1003K Mar 30 20:10 joiner-epoch-99-avg-1.onnx + +Decode wave files +~~~~~~~~~~~~~~~~~ + +.. hint:: + + It supports decoding only wave files of a single channel with 16-bit + encoded samples, while the sampling rate does not need to be 16 kHz. + +fp32 +^^^^ + +The following code shows how to use ``fp32`` models to decode wave files: + +.. 
code-block:: bash + + cd /path/to/sherpa-onnx + + ./build/bin/sherpa-onnx-offline \ + --tokens=./sherpa-onnx-zipformer-en-2023-03-30/tokens.txt \ + --encoder=./sherpa-onnx-zipformer-en-2023-03-30/encoder-epoch-99-avg-1.onnx \ + --decoder=./sherpa-onnx-zipformer-en-2023-03-30/decoder-epoch-99-avg-1.onnx \ + --joiner=./sherpa-onnx-zipformer-en-2023-03-30/joiner-epoch-99-avg-1.onnx \ + ./sherpa-onnx-zipformer-en-2023-03-30/test_wavs/0.wav \ + ./sherpa-onnx-zipformer-en-2023-03-30/test_wavs/1.wav \ + ./sherpa-onnx-zipformer-en-2023-03-30/test_wavs/8k.wav + +.. note:: + + Please use ``./build/bin/Release/sherpa-onnx-offline.exe`` for Windows. + +You should see the following output: + +.. literalinclude:: ./code-zipformer/sherpa-onnx-zipformer-en-2023-03-30.txt + +int8 +^^^^ + +The following code shows how to use ``int8`` models to decode wave files: + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + ./build/bin/sherpa-onnx-offline \ + --tokens=./sherpa-onnx-zipformer-en-2023-03-30/tokens.txt \ + --encoder=./sherpa-onnx-zipformer-en-2023-03-30/encoder-epoch-99-avg-1.int8.onnx \ + --decoder=./sherpa-onnx-zipformer-en-2023-03-30/decoder-epoch-99-avg-1.onnx \ + --joiner=./sherpa-onnx-zipformer-en-2023-03-30/joiner-epoch-99-avg-1.int8.onnx \ + ./sherpa-onnx-zipformer-en-2023-03-30/test_wavs/0.wav \ + ./sherpa-onnx-zipformer-en-2023-03-30/test_wavs/1.wav \ + ./sherpa-onnx-zipformer-en-2023-03-30/test_wavs/8k.wav + +.. note:: + + Please use ``./build/bin/Release/sherpa-onnx-offline.exe`` for Windows. + +You should see the following output: + +.. literalinclude:: ./code-zipformer/sherpa-onnx-zipformer-en-2023-03-30-int8.txt + +Speech recognition from a microphone +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + ./build/bin/sherpa-onnx-microphone-offline \ + --tokens=./sherpa-onnx-zipformer-en-2023-03-30/tokens.txt \ + --encoder=./sherpa-onnx-zipformer-en-2023-03-30/encoder-epoch-99-avg-1.onnx \ + --decoder=./sherpa-onnx-zipformer-en-2023-03-30/decoder-epoch-99-avg-1.onnx \ + --joiner=./sherpa-onnx-zipformer-en-2023-03-30/joiner-epoch-99-avg-1.onnx diff --git a/docs/source/onnx/pretrained_models/online-ctc/code-zipformer/sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-2023-12-13.int8.txt b/docs/source/onnx/pretrained_models/online-ctc/code-zipformer/sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-2023-12-13.int8.txt new file mode 100644 index 000000000..c91183270 --- /dev/null +++ b/docs/source/onnx/pretrained_models/online-ctc/code-zipformer/sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-2023-12-13.int8.txt @@ -0,0 +1,8 @@ +/Users/fangjun/open-source/sherpa-onnx/sherpa-onnx/csrc/parse-options.cc:Read:361 ./build/bin/sherpa-onnx --zipformer2-ctc-model=./sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-2023-12-13/ctc-epoch-20-avg-1-chunk-16-left-128.int8.onnx --tokens=./sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-2023-12-13/tokens.txt ./sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-2023-12-13/test_wavs/DEV_T0000000000.wav + +OnlineRecognizerConfig(feat_config=FeatureExtractorConfig(sampling_rate=16000, feature_dim=80), model_config=OnlineModelConfig(transducer=OnlineTransducerModelConfig(encoder="", decoder="", joiner=""), paraformer=OnlineParaformerModelConfig(encoder="", decoder=""), wenet_ctc=OnlineWenetCtcModelConfig(model="", chunk_size=16, num_left_chunks=4), zipformer2_ctc=OnlineZipformer2CtcModelConfig(model="./sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-2023-12-13/ctc-epoch-20-avg-1-chunk-16-left-128.int8.onnx"), 
tokens="./sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-2023-12-13/tokens.txt", num_threads=1, debug=False, provider="cpu", model_type=""), lm_config=OnlineLMConfig(model="", scale=0.5), endpoint_config=EndpointConfig(rule1=EndpointRule(must_contain_nonsilence=False, min_trailing_silence=2.4, min_utterance_length=0), rule2=EndpointRule(must_contain_nonsilence=True, min_trailing_silence=1.2, min_utterance_length=0), rule3=EndpointRule(must_contain_nonsilence=False, min_trailing_silence=0, min_utterance_length=20)), enable_endpoint=True, max_active_paths=4, hotwords_score=1.5, hotwords_file="", decoding_method="greedy_search") +./sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-2023-12-13/test_wavs/DEV_T0000000000.wav +Elapsed seconds: 0.44, Real time factor (RTF): 0.078 + 对我做了介绍那么我想说的是大家如果对我的研究感兴趣 +{"is_final":false, "segment":0, "start_time":0.00, "text": " 对我做了介绍那么我想说的是大家如果对我的研究感兴趣", "timestamps": [0.00, 0.52, 0.76, 0.84, 1.04, 1.24, 1.96, 2.04, 2.24, 2.36, 2.56, 2.68, 2.80, 3.28, 3.40, 3.60, 3.72, 3.84, 3.96, 4.04, 4.16, 4.28, 4.36, 4.60, 4.84], "tokens":[" 对", "我", "做", "了", "介", "绍", "那", "么", "我", "想", "说", "的", "是", "大", "家", "如", "果", "对", "我", "的", "研", "究", "感", "兴", "趣"]} + diff --git a/docs/source/onnx/pretrained_models/online-ctc/code-zipformer/sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-2023-12-13.txt b/docs/source/onnx/pretrained_models/online-ctc/code-zipformer/sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-2023-12-13.txt new file mode 100644 index 000000000..0a24dbb6e --- /dev/null +++ b/docs/source/onnx/pretrained_models/online-ctc/code-zipformer/sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-2023-12-13.txt @@ -0,0 +1,8 @@ +/Users/fangjun/open-source/sherpa-onnx/sherpa-onnx/csrc/parse-options.cc:Read:361 ./build/bin/sherpa-onnx --zipformer2-ctc-model=./sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-2023-12-13/ctc-epoch-20-avg-1-chunk-16-left-128.onnx --tokens=./sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-2023-12-13/tokens.txt ./sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-2023-12-13/test_wavs/DEV_T0000000000.wav + +OnlineRecognizerConfig(feat_config=FeatureExtractorConfig(sampling_rate=16000, feature_dim=80), model_config=OnlineModelConfig(transducer=OnlineTransducerModelConfig(encoder="", decoder="", joiner=""), paraformer=OnlineParaformerModelConfig(encoder="", decoder=""), wenet_ctc=OnlineWenetCtcModelConfig(model="", chunk_size=16, num_left_chunks=4), zipformer2_ctc=OnlineZipformer2CtcModelConfig(model="./sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-2023-12-13/ctc-epoch-20-avg-1-chunk-16-left-128.onnx"), tokens="./sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-2023-12-13/tokens.txt", num_threads=1, debug=False, provider="cpu", model_type=""), lm_config=OnlineLMConfig(model="", scale=0.5), endpoint_config=EndpointConfig(rule1=EndpointRule(must_contain_nonsilence=False, min_trailing_silence=2.4, min_utterance_length=0), rule2=EndpointRule(must_contain_nonsilence=True, min_trailing_silence=1.2, min_utterance_length=0), rule3=EndpointRule(must_contain_nonsilence=False, min_trailing_silence=0, min_utterance_length=20)), enable_endpoint=True, max_active_paths=4, hotwords_score=1.5, hotwords_file="", decoding_method="greedy_search") +./sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-2023-12-13/test_wavs/DEV_T0000000000.wav +Elapsed seconds: 0.66, Real time factor (RTF): 0.12 + 对我做了介绍那么我想说的是大家如果对我的研究感兴趣 +{"is_final":false, "segment":0, "start_time":0.00, "text": " 对我做了介绍那么我想说的是大家如果对我的研究感兴趣", "timestamps": [0.00, 0.52, 0.76, 0.84, 1.08, 
1.24, 1.96, 2.04, 2.24, 2.36, 2.56, 2.68, 2.80, 3.28, 3.40, 3.60, 3.72, 3.84, 3.96, 4.04, 4.16, 4.28, 4.36, 4.60, 4.80], "tokens":[" 对", "我", "做", "了", "介", "绍", "那", "么", "我", "想", "说", "的", "是", "大", "家", "如", "果", "对", "我", "的", "研", "究", "感", "兴", "趣"]} + diff --git a/docs/source/onnx/pretrained_models/online-ctc/index.rst b/docs/source/onnx/pretrained_models/online-ctc/index.rst new file mode 100644 index 000000000..081164c43 --- /dev/null +++ b/docs/source/onnx/pretrained_models/online-ctc/index.rst @@ -0,0 +1,11 @@ +.. _onnx_online_ctc_models: + +Online CTC models +================= + +This section lists available online CTC models. + +.. toctree:: + :maxdepth: 5 + + zipformer-ctc-models diff --git a/docs/source/onnx/pretrained_models/online-ctc/zipformer-ctc-models.rst b/docs/source/onnx/pretrained_models/online-ctc/zipformer-ctc-models.rst new file mode 100644 index 000000000..d59c3298d --- /dev/null +++ b/docs/source/onnx/pretrained_models/online-ctc/zipformer-ctc-models.rst @@ -0,0 +1,137 @@ +.. _sherpa_onnx_zipformer_ctc_models: + +Zipformer-CTC-based Models +========================== + +.. hint:: + + Please refer to :ref:`install_sherpa_onnx` to install `sherpa-onnx`_ + before you read this section. + +sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-2023-12-13 (Chinese) +---------------------------------------------------------------------- + +Training code for this model can be found at ``_. +It supports only Chinese. + +Please refer to ``_ +for the detailed information about the training data. In total, there are 14k hours of training data. + +In the following, we describe how to download it and use it with `sherpa-onnx`_. + +Download the model +~~~~~~~~~~~~~~~~~~ + +Please use the following commands to download it. + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-2023-12-13.tar.bz2 + + tar xvf sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-2023-12-13.tar.bz2 + rm sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-2023-12-13.tar.bz2 + ls -lh sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-2023-12-13 + +The output is given below: + +.. code-block:: + + $ ls -lh sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-2023-12-13 + total 654136 + -rw-r--r--@ 1 fangjun staff 28B Dec 13 16:19 README.md + -rw-r--r--@ 1 fangjun staff 258K Dec 13 16:19 bpe.model + -rw-r--r--@ 1 fangjun staff 68M Dec 13 16:19 ctc-epoch-20-avg-1-chunk-16-left-128.int8.onnx + -rw-r--r--@ 1 fangjun staff 252M Dec 13 16:19 ctc-epoch-20-avg-1-chunk-16-left-128.onnx + drwxr-xr-x@ 8 fangjun staff 256B Dec 13 16:19 test_wavs + -rw-r--r--@ 1 fangjun staff 18K Dec 13 16:19 tokens.txt + +Decode a single wave file +~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. hint:: + + It supports decoding only wave files of a single channel with 16-bit + encoded samples, while the sampling rate does not need to be 16 kHz. + +fp32 +^^^^ + +The following code shows how to use ``fp32`` models to decode a wave file: + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + ./build/bin/sherpa-onnx \ + --zipformer2-ctc-model=./sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-2023-12-13/ctc-epoch-20-avg-1-chunk-16-left-128.onnx \ + --tokens=./sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-2023-12-13/tokens.txt \ + ./sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-2023-12-13/test_wavs/DEV_T0000000000.wav + +.. note:: + + Please use ``./build/bin/Release/sherpa-onnx.exe`` for Windows. + +.. 
caution:: + + If you use Windows and get encoding issues, please run: + + .. code-block:: bash + + CHCP 65001 + + in your commandline. + +You should see the following output: + +.. literalinclude:: ./code-zipformer/sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-2023-12-13.txt + +int8 +^^^^ + +The following code shows how to use ``int8`` models to decode a wave file: + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + ./build/bin/sherpa-onnx \ + --zipformer2-ctc-model=./sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-2023-12-13/ctc-epoch-20-avg-1-chunk-16-left-128.int8.onnx \ + --tokens=./sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-2023-12-13/tokens.txt \ + ./sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-2023-12-13/test_wavs/DEV_T0000000000.wav + +.. note:: + + Please use ``./build/bin/Release/sherpa-onnx.exe`` for Windows. + +.. caution:: + + If you use Windows and get encoding issues, please run: + + .. code-block:: bash + + CHCP 65001 + + in your commandline. + +You should see the following output: + +.. literalinclude:: ./code-zipformer/sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-2023-12-13.int8.txt + +Real-time speech recognition from a microphone +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + ./build/bin/sherpa-onnx-microphone \ + --zipformer2-ctc-model=./sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-2023-12-13/ctc-epoch-20-avg-1-chunk-16-left-128.onnx \ + --tokens=./sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-2023-12-13/tokens.txt + +.. hint:: + + If your system is Linux (including embedded Linux), you can also use + :ref:`sherpa-onnx-alsa` to do real-time speech recognition with your + microphone if ``sherpa-onnx-microphone`` does not work for you. diff --git a/docs/source/onnx/pretrained_models/online-paraformer/code-paraformer/sherpa-onnx-streaming-paraformer-bilingual-zh-en.int8.txt b/docs/source/onnx/pretrained_models/online-paraformer/code-paraformer/sherpa-onnx-streaming-paraformer-bilingual-zh-en.int8.txt new file mode 100644 index 000000000..ccd8d5bef --- /dev/null +++ b/docs/source/onnx/pretrained_models/online-paraformer/code-paraformer/sherpa-onnx-streaming-paraformer-bilingual-zh-en.int8.txt @@ -0,0 +1,8 @@ +/Users/fangjun/open-source/sherpa-onnx/sherpa-onnx/csrc/parse-options.cc:Read:361 ./build/bin/sherpa-onnx --tokens=./sherpa-onnx-streaming-paraformer-bilingual-zh-en/tokens.txt --paraformer-encoder=./sherpa-onnx-streaming-paraformer-bilingual-zh-en/encoder.int8.onnx --paraformer-decoder=./sherpa-onnx-streaming-paraformer-bilingual-zh-en/decoder.int8.onnx ./sherpa-onnx-streaming-paraformer-bilingual-zh-en/test_wavs/0.wav + +OnlineRecognizerConfig(feat_config=FeatureExtractorConfig(sampling_rate=16000, feature_dim=80), model_config=OnlineModelConfig(transducer=OnlineTransducerModelConfig(encoder="", decoder="", joiner=""), paraformer=OnlineParaformerModelConfig(encoder="./sherpa-onnx-streaming-paraformer-bilingual-zh-en/encoder.int8.onnx", decoder="./sherpa-onnx-streaming-paraformer-bilingual-zh-en/decoder.int8.onnx"), tokens="./sherpa-onnx-streaming-paraformer-bilingual-zh-en/tokens.txt", num_threads=1, debug=False, provider="cpu", model_type=""), lm_config=OnlineLMConfig(model="", scale=0.5), endpoint_config=EndpointConfig(rule1=EndpointRule(must_contain_nonsilence=False, min_trailing_silence=2.4, min_utterance_length=0), rule2=EndpointRule(must_contain_nonsilence=True, min_trailing_silence=1.2, min_utterance_length=0), rule3=EndpointRule(must_contain_nonsilence=False, 
min_trailing_silence=0, min_utterance_length=20)), enable_endpoint=True, max_active_paths=4, context_score=1.5, decoding_method="greedy_search") +./sherpa-onnx-streaming-paraformer-bilingual-zh-en/test_wavs/0.wav +Elapsed seconds: 1.6, Real time factor (RTF): 0.15 +昨天是 monday today day is 零八二 the day after tomorrow 是星期三 +{"is_final":false,"segment":0,"start_time":0.0,"text":"昨天是 monday today day is 零八二 the day after tomorrow 是星期三","timestamps":"[]","tokens":["昨","天","是","mon@@","day","today","day","is","零","八","二","the","day","after","tom@@","or@@","row","是","星","期","三"]} + diff --git a/docs/source/onnx/pretrained_models/online-paraformer/code-paraformer/sherpa-onnx-streaming-paraformer-bilingual-zh-en.txt b/docs/source/onnx/pretrained_models/online-paraformer/code-paraformer/sherpa-onnx-streaming-paraformer-bilingual-zh-en.txt new file mode 100644 index 000000000..6176e69ff --- /dev/null +++ b/docs/source/onnx/pretrained_models/online-paraformer/code-paraformer/sherpa-onnx-streaming-paraformer-bilingual-zh-en.txt @@ -0,0 +1,8 @@ +/Users/fangjun/open-source/sherpa-onnx/sherpa-onnx/csrc/parse-options.cc:Read:361 ./build/bin/sherpa-onnx --tokens=./sherpa-onnx-streaming-paraformer-bilingual-zh-en/tokens.txt --paraformer-encoder=./sherpa-onnx-streaming-paraformer-bilingual-zh-en/encoder.onnx --paraformer-decoder=./sherpa-onnx-streaming-paraformer-bilingual-zh-en/decoder.onnx ./sherpa-onnx-streaming-paraformer-bilingual-zh-en/test_wavs/0.wav + +OnlineRecognizerConfig(feat_config=FeatureExtractorConfig(sampling_rate=16000, feature_dim=80), model_config=OnlineModelConfig(transducer=OnlineTransducerModelConfig(encoder="", decoder="", joiner=""), paraformer=OnlineParaformerModelConfig(encoder="./sherpa-onnx-streaming-paraformer-bilingual-zh-en/encoder.onnx", decoder="./sherpa-onnx-streaming-paraformer-bilingual-zh-en/decoder.onnx"), tokens="./sherpa-onnx-streaming-paraformer-bilingual-zh-en/tokens.txt", num_threads=1, debug=False, provider="cpu", model_type=""), lm_config=OnlineLMConfig(model="", scale=0.5), endpoint_config=EndpointConfig(rule1=EndpointRule(must_contain_nonsilence=False, min_trailing_silence=2.4, min_utterance_length=0), rule2=EndpointRule(must_contain_nonsilence=True, min_trailing_silence=1.2, min_utterance_length=0), rule3=EndpointRule(must_contain_nonsilence=False, min_trailing_silence=0, min_utterance_length=20)), enable_endpoint=True, max_active_paths=4, context_score=1.5, decoding_method="greedy_search") +./sherpa-onnx-streaming-paraformer-bilingual-zh-en/test_wavs/0.wav +Elapsed seconds: 2.2, Real time factor (RTF): 0.21 +昨天是 monday today day is 零八二 the day after tomorrow 是星期三 +{"is_final":false,"segment":0,"start_time":0.0,"text":"昨天是 monday today day is 零八二 the day after tomorrow 是星期三","timestamps":"[]","tokens":["昨","天","是","mon@@","day","today","day","is","零","八","二","the","day","after","tom@@","or@@","row","是","星","期","三"]} + diff --git a/docs/source/onnx/pretrained_models/online-paraformer/code-paraformer/sherpa-onnx-streaming-paraformer-trilingual-zh-cantonese-en.int8.txt b/docs/source/onnx/pretrained_models/online-paraformer/code-paraformer/sherpa-onnx-streaming-paraformer-trilingual-zh-cantonese-en.int8.txt new file mode 100644 index 000000000..812b066f8 --- /dev/null +++ b/docs/source/onnx/pretrained_models/online-paraformer/code-paraformer/sherpa-onnx-streaming-paraformer-trilingual-zh-cantonese-en.int8.txt @@ -0,0 +1,8 @@ +/Users/fangjun/open-source/sherpa-onnx/sherpa-onnx/csrc/parse-options.cc:Read:361 ./build/bin/sherpa-onnx 
--tokens=./sherpa-onnx-streaming-paraformer-trilingual-zh-cantonese-en/tokens.txt --paraformer-encoder=./sherpa-onnx-streaming-paraformer-trilingual-zh-cantonese-en/encoder.int8.onnx --paraformer-decoder=./sherpa-onnx-streaming-paraformer-trilingual-zh-cantonese-en/decoder.int8.onnx ./sherpa-onnx-streaming-paraformer-trilingual-zh-cantonese-en/test_wavs/1.wav + +OnlineRecognizerConfig(feat_config=FeatureExtractorConfig(sampling_rate=16000, feature_dim=80), model_config=OnlineModelConfig(transducer=OnlineTransducerModelConfig(encoder="", decoder="", joiner=""), paraformer=OnlineParaformerModelConfig(encoder="./sherpa-onnx-streaming-paraformer-trilingual-zh-cantonese-en/encoder.int8.onnx", decoder="./sherpa-onnx-streaming-paraformer-trilingual-zh-cantonese-en/decoder.int8.onnx"), wenet_ctc=OnlineWenetCtcModelConfig(model="", chunk_size=16, num_left_chunks=4), zipformer2_ctc=OnlineZipformer2CtcModelConfig(model=""), tokens="./sherpa-onnx-streaming-paraformer-trilingual-zh-cantonese-en/tokens.txt", num_threads=1, debug=False, provider="cpu", model_type=""), lm_config=OnlineLMConfig(model="", scale=0.5), endpoint_config=EndpointConfig(rule1=EndpointRule(must_contain_nonsilence=False, min_trailing_silence=2.4, min_utterance_length=0), rule2=EndpointRule(must_contain_nonsilence=True, min_trailing_silence=1.2, min_utterance_length=0), rule3=EndpointRule(must_contain_nonsilence=False, min_trailing_silence=0, min_utterance_length=20)), enable_endpoint=True, max_active_paths=4, hotwords_score=1.5, hotwords_file="", decoding_method="greedy_search", blank_penalty=0) +./sherpa-onnx-streaming-paraformer-trilingual-zh-cantonese-en/test_wavs/1.wav +Elapsed seconds: 0.84, Real time factor (RTF): 0.14 +有无人知道湾仔活道系点去 +{ "text": "有无人知道湾仔活道系点去", "tokens": [ "有", "无", "人", "知", "道", "湾", "仔", "活", "道", "系", "点", "去" ], "timestamps": [ ], "ys_probs": [ ], "lm_probs": [ ], "context_scores": [ ], "segment": 0, "start_time": 0.00, "is_final": false} + diff --git a/docs/source/onnx/pretrained_models/online-paraformer/code-paraformer/sherpa-onnx-streaming-paraformer-trilingual-zh-cantonese-en.txt b/docs/source/onnx/pretrained_models/online-paraformer/code-paraformer/sherpa-onnx-streaming-paraformer-trilingual-zh-cantonese-en.txt new file mode 100644 index 000000000..1da84b806 --- /dev/null +++ b/docs/source/onnx/pretrained_models/online-paraformer/code-paraformer/sherpa-onnx-streaming-paraformer-trilingual-zh-cantonese-en.txt @@ -0,0 +1,8 @@ +/Users/fangjun/open-source/sherpa-onnx/sherpa-onnx/csrc/parse-options.cc:Read:361 ./build/bin/sherpa-onnx --tokens=./sherpa-onnx-streaming-paraformer-trilingual-zh-cantonese-en/tokens.txt --paraformer-encoder=./sherpa-onnx-streaming-paraformer-trilingual-zh-cantonese-en/encoder.int8.onnx --paraformer-decoder=./sherpa-onnx-streaming-paraformer-trilingual-zh-cantonese-en/decoder.int8.onnx ./sherpa-onnx-streaming-paraformer-trilingual-zh-cantonese-en/test_wavs/1.wav + +OnlineRecognizerConfig(feat_config=FeatureExtractorConfig(sampling_rate=16000, feature_dim=80), model_config=OnlineModelConfig(transducer=OnlineTransducerModelConfig(encoder="", decoder="", joiner=""), paraformer=OnlineParaformerModelConfig(encoder="./sherpa-onnx-streaming-paraformer-trilingual-zh-cantonese-en/encoder.int8.onnx", decoder="./sherpa-onnx-streaming-paraformer-trilingual-zh-cantonese-en/decoder.int8.onnx"), wenet_ctc=OnlineWenetCtcModelConfig(model="", chunk_size=16, num_left_chunks=4), zipformer2_ctc=OnlineZipformer2CtcModelConfig(model=""), 
tokens="./sherpa-onnx-streaming-paraformer-trilingual-zh-cantonese-en/tokens.txt", num_threads=1, debug=False, provider="cpu", model_type=""), lm_config=OnlineLMConfig(model="", scale=0.5), endpoint_config=EndpointConfig(rule1=EndpointRule(must_contain_nonsilence=False, min_trailing_silence=2.4, min_utterance_length=0), rule2=EndpointRule(must_contain_nonsilence=True, min_trailing_silence=1.2, min_utterance_length=0), rule3=EndpointRule(must_contain_nonsilence=False, min_trailing_silence=0, min_utterance_length=20)), enable_endpoint=True, max_active_paths=4, hotwords_score=1.5, hotwords_file="", decoding_method="greedy_search", blank_penalty=0) +./sherpa-onnx-streaming-paraformer-trilingual-zh-cantonese-en/test_wavs/1.wav +Elapsed seconds: 0.98, Real time factor (RTF): 0.16 +有无人知道湾仔活道系点去 +{ "text": "有无人知道湾仔活道系点去", "tokens": [ "有", "无", "人", "知", "道", "湾", "仔", "活", "道", "系", "点", "去" ], "timestamps": [ ], "ys_probs": [ ], "lm_probs": [ ], "context_scores": [ ], "segment": 0, "start_time": 0.00, "is_final": false} + diff --git a/docs/source/onnx/pretrained_models/online-paraformer/index.rst b/docs/source/onnx/pretrained_models/online-paraformer/index.rst new file mode 100644 index 000000000..f92c180e1 --- /dev/null +++ b/docs/source/onnx/pretrained_models/online-paraformer/index.rst @@ -0,0 +1,10 @@ +Online paraformer models +======================== + +This section lists available online paraformer models. + +.. toctree:: + :maxdepth: 5 + + paraformer-models + diff --git a/docs/source/onnx/pretrained_models/online-paraformer/paraformer-models.rst b/docs/source/onnx/pretrained_models/online-paraformer/paraformer-models.rst new file mode 100644 index 000000000..1192df332 --- /dev/null +++ b/docs/source/onnx/pretrained_models/online-paraformer/paraformer-models.rst @@ -0,0 +1,278 @@ +Paraformer models +================= + +.. hint:: + + Please refer to :ref:`install_sherpa_onnx` to install `sherpa-onnx`_ + before you read this section. + +.. _sherpa_onnx_online_paraformer_bilingual_zh_en: + +csukuangfj/sherpa-onnx-streaming-paraformer-bilingual-zh-en (Chinese + English) +------------------------------------------------------------------------------- + +.. note:: + + This model does not support timestamps. It is a bilingual model, supporting + both Chinese and English. (支持普通话、河南话、天津话、四川话等方言) + +This model is converted from + +``_ + +The code for converting can be found at + +``_ + + +In the following, we describe how to download it and use it with `sherpa-onnx`_. + +Download the model +~~~~~~~~~~~~~~~~~~ + +Please use the following commands to download it. + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-streaming-paraformer-bilingual-zh-en.tar.bz2 + + tar xvf sherpa-onnx-streaming-paraformer-bilingual-zh-en.tar.bz2 + +Please check that the file sizes of the pre-trained models are correct. See +the file sizes of ``*.onnx`` files below. + +.. code-block:: bash + + sherpa-onnx-streaming-paraformer-bilingual-zh-en fangjun$ ls -lh *.onnx + -rw-r--r-- 1 fangjun staff 68M Aug 14 09:53 decoder.int8.onnx + -rw-r--r-- 1 fangjun staff 218M Aug 14 09:55 decoder.onnx + -rw-r--r-- 1 fangjun staff 158M Aug 14 09:54 encoder.int8.onnx + -rw-r--r-- 1 fangjun staff 607M Aug 14 09:57 encoder.onnx + +Decode a single wave file +~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. hint:: + + It supports decoding only wave files of a single channel with 16-bit + encoded samples, while the sampling rate does not need to be 16 kHz. 
+ +fp32 +^^^^ + +The following code shows how to use ``fp32`` models to decode a wave file: + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + ./build/bin/sherpa-onnx \ + --tokens=./sherpa-onnx-streaming-paraformer-bilingual-zh-en/tokens.txt \ + --paraformer-encoder=./sherpa-onnx-streaming-paraformer-bilingual-zh-en/encoder.onnx \ + --paraformer-decoder=./sherpa-onnx-streaming-paraformer-bilingual-zh-en/decoder.onnx \ + ./sherpa-onnx-streaming-paraformer-bilingual-zh-en/test_wavs/0.wav + +.. note:: + + Please use ``./build/bin/Release/sherpa-onnx.exe`` for Windows. + +.. caution:: + + If you use Windows and get encoding issues, please run: + + .. code-block:: bash + + CHCP 65001 + + in your commandline. + +You should see the following output: + +.. literalinclude:: ./code-paraformer/sherpa-onnx-streaming-paraformer-bilingual-zh-en.txt + +int8 +^^^^ + +The following code shows how to use ``int8`` models to decode a wave file: + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + ./build/bin/sherpa-onnx \ + --tokens=./sherpa-onnx-streaming-paraformer-bilingual-zh-en/tokens.txt \ + --paraformer-encoder=./sherpa-onnx-streaming-paraformer-bilingual-zh-en/encoder.int8.onnx \ + --paraformer-decoder=./sherpa-onnx-streaming-paraformer-bilingual-zh-en/decoder.int8.onnx \ + ./sherpa-onnx-streaming-paraformer-bilingual-zh-en/test_wavs/0.wav + +.. note:: + + Please use ``./build/bin/Release/sherpa-onnx.exe`` for Windows. + +.. caution:: + + If you use Windows and get encoding issues, please run: + + .. code-block:: bash + + CHCP 65001 + + in your commandline. + +You should see the following output: + +.. literalinclude:: ./code-paraformer/sherpa-onnx-streaming-paraformer-bilingual-zh-en.int8.txt + +Real-time speech recognition from a microphone +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + ./build/bin/sherpa-onnx-microphone \ + --tokens=./sherpa-onnx-streaming-paraformer-bilingual-zh-en/tokens.txt \ + --paraformer-encoder=./sherpa-onnx-streaming-paraformer-bilingual-zh-en/encoder.int8.onnx \ + --paraformer-decoder=./sherpa-onnx-streaming-paraformer-bilingual-zh-en/decoder.int8.onnx + +.. hint:: + + If your system is Linux (including embedded Linux), you can also use + :ref:`sherpa-onnx-alsa` to do real-time speech recognition with your + microphone if ``sherpa-onnx-microphone`` does not work for you. + +.. _sherpa_onnx_online_paraformer_trilingual_zh_yue_en: + +csukuangfj/sherpa-onnx-streaming-paraformer-trilingual-zh-cantonese-en (Chinese + Cantonese + English) +------------------------------------------------------------------------------------------------------ + +.. note:: + + This model does not support timestamps. It is a trilingual model, supporting + Chinese, Cantonese, and English (including Mandarin dialects such as those of Henan, Tianjin, and Sichuan). + +This model is converted from + +``_ + +You can find the conversion code after downloading and unzipping the model. + +In the following, we describe how to download it and use it with `sherpa-onnx`_. + +Download the model +~~~~~~~~~~~~~~~~~~ + +Please use the following commands to download it. + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-streaming-paraformer-trilingual-zh-cantonese-en.tar.bz2 + + tar xvf sherpa-onnx-streaming-paraformer-trilingual-zh-cantonese-en.tar.bz2 + +Please check that the file sizes of the pre-trained models are correct. See +the file sizes of ``*.onnx`` files below. + +.. 
code-block:: bash + + sherpa-onnx-streaming-paraformer-trilingual-zh-cantonese-en fangjun$ ls -lh *.onnx + -rw-r--r-- 1 fangjun staff 69M Feb 29 19:44 decoder.int8.onnx + -rw-r--r-- 1 fangjun staff 218M Feb 29 19:44 decoder.onnx + -rw-r--r-- 1 fangjun staff 159M Feb 29 19:44 encoder.int8.onnx + -rw-r--r-- 1 fangjun staff 607M Feb 29 19:44 encoder.onnx + +Decode a single wave file +~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. hint:: + + It supports decoding only wave files of a single channel with 16-bit + encoded samples, while the sampling rate does not need to be 16 kHz. + +fp32 +^^^^ + +The following code shows how to use ``fp32`` models to decode a wave file: + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + ./build/bin/sherpa-onnx \ + --tokens=./sherpa-onnx-streaming-paraformer-trilingual-zh-cantonese-en/tokens.txt \ + --paraformer-encoder=./sherpa-onnx-streaming-paraformer-trilingual-zh-cantonese-en/encoder.onnx \ + --paraformer-decoder=./sherpa-onnx-streaming-paraformer-trilingual-zh-cantonese-en/decoder.onnx \ + ./sherpa-onnx-streaming-paraformer-trilingual-zh-cantonese-en/test_wavs/1.wav + +.. note:: + + Please use ``./build/bin/Release/sherpa-onnx.exe`` for Windows. + +.. caution:: + + If you use Windows and get encoding issues, please run: + + .. code-block:: bash + + CHCP 65001 + + in your commandline. + +You should see the following output: + +.. literalinclude:: ./code-paraformer/sherpa-onnx-streaming-paraformer-trilingual-zh-cantonese-en.txt + +int8 +^^^^ + +The following code shows how to use ``int8`` models to decode a wave file: + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + ./build/bin/sherpa-onnx \ + --tokens=./sherpa-onnx-streaming-paraformer-trilingual-zh-cantonese-en/tokens.txt \ + --paraformer-encoder=./sherpa-onnx-streaming-paraformer-trilingual-zh-cantonese-en/encoder.int8.onnx \ + --paraformer-decoder=./sherpa-onnx-streaming-paraformer-trilingual-zh-cantonese-en/decoder.int8.onnx \ + ./sherpa-onnx-streaming-paraformer-trilingual-zh-cantonese-en/test_wavs/1.wav + +.. note:: + + Please use ``./build/bin/Release/sherpa-onnx.exe`` for Windows. + +.. caution:: + + If you use Windows and get encoding issues, please run: + + .. code-block:: bash + + CHCP 65001 + + in your commandline. + +You should see the following output: + +.. literalinclude:: ./code-paraformer/sherpa-onnx-streaming-paraformer-trilingual-zh-cantonese-en.int8.txt + +Real-time speech recognition from a microphone +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + ./build/bin/sherpa-onnx-microphone \ + --tokens=./sherpa-onnx-streaming-paraformer-trilingual-zh-cantonese-en/tokens.txt \ + --paraformer-encoder=./sherpa-onnx-streaming-paraformer-trilingual-zh-cantonese-en/encoder.int8.onnx \ + --paraformer-decoder=./sherpa-onnx-streaming-paraformer-trilingual-zh-cantonese-en/decoder.int8.onnx + +.. hint:: + + If your system is Linux (including embedded Linux), you can also use + :ref:`sherpa-onnx-alsa` to do real-time speech recognition with your + microphone if ``sherpa-onnx-microphone`` does not work for you. 
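The decoding commands above process a single test wave file at a time. If you want to
run the trilingual ``int8`` model over every file in its ``test_wavs`` directory, a
plain shell loop is enough. The loop below is only a sketch and assumes the directory
layout produced by the download step above:

.. code-block:: bash

   cd /path/to/sherpa-onnx

   # Decode each test wave file with the trilingual int8 model, one file per run.
   for wav in ./sherpa-onnx-streaming-paraformer-trilingual-zh-cantonese-en/test_wavs/*.wav; do
     ./build/bin/sherpa-onnx \
       --tokens=./sherpa-onnx-streaming-paraformer-trilingual-zh-cantonese-en/tokens.txt \
       --paraformer-encoder=./sherpa-onnx-streaming-paraformer-trilingual-zh-cantonese-en/encoder.int8.onnx \
       --paraformer-decoder=./sherpa-onnx-streaming-paraformer-trilingual-zh-cantonese-en/decoder.int8.onnx \
       "$wav"
   done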
diff --git a/docs/source/onnx/pretrained_models/online-transducer/code-conformer/sherpa-onnx-streaming-conformer-zh-2023-05-23-int8.txt b/docs/source/onnx/pretrained_models/online-transducer/code-conformer/sherpa-onnx-streaming-conformer-zh-2023-05-23-int8.txt new file mode 100644 index 000000000..6222a1f91 --- /dev/null +++ b/docs/source/onnx/pretrained_models/online-transducer/code-conformer/sherpa-onnx-streaming-conformer-zh-2023-05-23-int8.txt @@ -0,0 +1,12 @@ +OnlineRecognizerConfig(feat_config=FeatureExtractorConfig(sampling_rate=16000, feature_dim=80), model_config=OnlineTransducerModelConfig(encoder_filename="./sherpa-onnx-streaming-conformer-zh-2023-05-23/encoder-epoch-99-avg-1.int8.onnx", decoder_filename="./sherpa-onnx-streaming-conformer-zh-2023-05-23/decoder-epoch-99-avg-1.onnx", joiner_filename="./sherpa-onnx-streaming-conformer-zh-2023-05-23/joiner-epoch-99-avg-1.int8.onnx", tokens="./sherpa-onnx-streaming-conformer-zh-2023-05-23/tokens.txt", num_threads=2, debug=False), lm_config=OnlineLMConfig(model="", scale=0.5), endpoint_config=EndpointConfig(rule1=EndpointRule(must_contain_nonsilence=False, min_trailing_silence=2.4, min_utterance_length=0), rule2=EndpointRule(must_contain_nonsilence=True, min_trailing_silence=1.2, min_utterance_length=0), rule3=EndpointRule(must_contain_nonsilence=False, min_trailing_silence=0, min_utterance_length=20)), enable_endpoint=True, max_active_paths=4, decoding_method="greedy_search") +sampling rate of input file: 16000 +wav filename: ./sherpa-onnx-streaming-conformer-zh-2023-05-23/test_wavs/0.wav +wav duration (s): 5.611 +Started +Done! +Recognition result for ./sherpa-onnx-streaming-conformer-zh-2023-05-23/test_wavs/0.wav: +{"is_final":false,"segment":0,"start_time":0.0,"text":"对我做了介绍那么我想说的是大家如果对我的研究感兴趣呢","timestamps":"[0.00, 0.48, 0.72, 0.88, 1.08, 1.24, 2.00, 2.04, 2.16, 2.28, 2.56, 2.72, 2.92, 3.36, 3.44, 3.60, 3.72, 3.84, 3.92, 4.04, 4.16, 4.28, 4.48, 4.60, 4.84, 5.16]","tokens":["对","我","做","了","介","绍","那","么","我","想","说","的","是","大","家","如","果","对","我","的","研","究","感","兴","趣","呢"]} +num threads: 2 +decoding method: greedy_search +Elapsed seconds: 0.493 s +Real time factor (RTF): 0.493 / 5.611 = 0.088 diff --git a/docs/source/onnx/pretrained_models/online-transducer/code-conformer/sherpa-onnx-streaming-conformer-zh-2023-05-23.txt b/docs/source/onnx/pretrained_models/online-transducer/code-conformer/sherpa-onnx-streaming-conformer-zh-2023-05-23.txt new file mode 100644 index 000000000..f89f8b440 --- /dev/null +++ b/docs/source/onnx/pretrained_models/online-transducer/code-conformer/sherpa-onnx-streaming-conformer-zh-2023-05-23.txt @@ -0,0 +1,12 @@ +OnlineRecognizerConfig(feat_config=FeatureExtractorConfig(sampling_rate=16000, feature_dim=80), model_config=OnlineTransducerModelConfig(encoder_filename="./sherpa-onnx-streaming-conformer-zh-2023-05-23/encoder-epoch-99-avg-1.onnx", decoder_filename="./sherpa-onnx-streaming-conformer-zh-2023-05-23/decoder-epoch-99-avg-1.onnx", joiner_filename="./sherpa-onnx-streaming-conformer-zh-2023-05-23/joiner-epoch-99-avg-1.onnx", tokens="./sherpa-onnx-streaming-conformer-zh-2023-05-23/tokens.txt", num_threads=2, debug=False), lm_config=OnlineLMConfig(model="", scale=0.5), endpoint_config=EndpointConfig(rule1=EndpointRule(must_contain_nonsilence=False, min_trailing_silence=2.4, min_utterance_length=0), rule2=EndpointRule(must_contain_nonsilence=True, min_trailing_silence=1.2, min_utterance_length=0), rule3=EndpointRule(must_contain_nonsilence=False, min_trailing_silence=0, min_utterance_length=20)), 
enable_endpoint=True, max_active_paths=4, decoding_method="greedy_search") +sampling rate of input file: 16000 +wav filename: ./sherpa-onnx-streaming-conformer-zh-2023-05-23/test_wavs/0.wav +wav duration (s): 5.611 +Started +Done! +Recognition result for ./sherpa-onnx-streaming-conformer-zh-2023-05-23/test_wavs/0.wav: +{"is_final":false,"segment":0,"start_time":0.0,"text":"对我做了介绍那么我想说的是大家如果对我的研究感兴趣呢","timestamps":"[0.00, 0.48, 0.76, 0.88, 1.08, 1.24, 2.00, 2.04, 2.16, 2.36, 2.56, 2.72, 2.92, 3.36, 3.44, 3.64, 3.72, 3.84, 3.96, 4.04, 4.16, 4.28, 4.48, 4.64, 4.84, 5.16]","tokens":["对","我","做","了","介","绍","那","么","我","想","说","的","是","大","家","如","果","对","我","的","研","究","感","兴","趣","呢"]} +num threads: 2 +decoding method: greedy_search +Elapsed seconds: 0.559 s +Real time factor (RTF): 0.559 / 5.611 = 0.100 diff --git a/docs/source/onnx/pretrained_models/online-transducer/code-lstm/sherpa-onnx-lstm-en-2023-02-17-int8.txt b/docs/source/onnx/pretrained_models/online-transducer/code-lstm/sherpa-onnx-lstm-en-2023-02-17-int8.txt new file mode 100644 index 000000000..f0745d63a --- /dev/null +++ b/docs/source/onnx/pretrained_models/online-transducer/code-lstm/sherpa-onnx-lstm-en-2023-02-17-int8.txt @@ -0,0 +1,14 @@ +OnlineRecognizerConfig(feat_config=FeatureExtractorConfig(sampling_rate=16000, feature_dim=80), model_config=OnlineTransducerModelConfig(encoder_filename="./sherpa-onnx-lstm-en-2023-02-17/encoder-epoch-99-avg-1.int8.onnx", decoder_filename="./sherpa-onnx-lstm-en-2023-02-17/decoder-epoch-99-avg-1.onnx", joiner_filename="./sherpa-onnx-lstm-en-2023-02-17/joiner-epoch-99-avg-1.int8.onnx", tokens="./sherpa-onnx-lstm-en-2023-02-17/tokens.txt", num_threads=2, debug=False), endpoint_config=EndpointConfig(rule1=EndpointRule(must_contain_nonsilence=False, min_trailing_silence=2.4, min_utterance_length=0), rule2=EndpointRule(must_contain_nonsilence=True, min_trailing_silence=1.2, min_utterance_length=0), rule3=EndpointRule(must_contain_nonsilence=False, min_trailing_silence=0, min_utterance_length=20)), enable_endpoint=True, max_active_paths=4, decoding_method="greedy_search") +2023-03-31 22:55:46.608941959 [E:onnxruntime:, env.cc:251 ThreadMain] pthread_setaffinity_np failed for thread: 578689, index: 16, mask: {17, 53, }, error code: 22 error msg: Invalid argument. Specify the number of threads explicitly so the affinity is not set. +2023-03-31 22:55:46.608939862 [E:onnxruntime:, env.cc:251 ThreadMain] pthread_setaffinity_np failed for thread: 578688, index: 15, mask: {16, 52, }, error code: 22 error msg: Invalid argument. Specify the number of threads explicitly so the affinity is not set. +sampling rate of input file: 16000 +wav filename: ./sherpa-onnx-lstm-en-2023-02-17/test_wavs/0.wav +wav duration (s): 6.625 +Started +Done! 
+Recognition result for ./sherpa-onnx-lstm-en-2023-02-17/test_wavs/0.wav: + AFTER EARLY NIGHTFALL THE YELLOW LAMPS WOULD LIGHT UP HERE AND THERE THE SQUALID QUARTER OF THE BROTHELS +num threads: 2 +decoding method: greedy_search +Elapsed seconds: 1.009 s +Real time factor (RTF): 1.009 / 6.625 = 0.152 diff --git a/docs/source/onnx/pretrained_models/online-transducer/code-lstm/sherpa-onnx-lstm-en-2023-02-17.txt b/docs/source/onnx/pretrained_models/online-transducer/code-lstm/sherpa-onnx-lstm-en-2023-02-17.txt new file mode 100644 index 000000000..228a2c597 --- /dev/null +++ b/docs/source/onnx/pretrained_models/online-transducer/code-lstm/sherpa-onnx-lstm-en-2023-02-17.txt @@ -0,0 +1,15 @@ +OnlineRecognizerConfig(feat_config=FeatureExtractorConfig(sampling_rate=16000, feature_dim=80), model_config=OnlineTransducerModelConfig(encoder_filename="./sherpa-onnx-lstm-en-2023-02-17/encoder-epoch-99-avg-1.onnx", decoder_filename="./sherpa-onnx-lstm-en-2023-02-17/decoder-epoch-99-avg-1.onnx", joiner_filename="./sherpa-onnx-lstm-en-2023-02-17/joiner-epoch-99-avg-1.onnx", tokens="./sherpa-onnx-lstm-en-2023-02-17/tokens.txt", num_threads=2, debug=False), endpoint_config=EndpointConfig(rule1=EndpointRule(must_contain_nonsilence=False, min_trailing_silence=2.4, min_utterance_length=0), rule2=EndpointRule(must_contain_nonsilence=True, min_trailing_silence=1.2, min_utterance_length=0), rule3=EndpointRule(must_contain_nonsilence=False, min_trailing_silence=0, min_utterance_length=20)), enable_endpoint=True, +max_active_paths=4, decoding_method="greedy_search") +2023-03-31 22:53:22.120185169 [E:onnxruntime:, env.cc:251 ThreadMain] pthread_setaffinity_np failed for thread: 576406, index: 16, mask: {17, 53, }, error code: 22 error msg: Invalid argument. Specify the number of threads explicitly so the affinity is not set. +2023-03-31 22:53:22.120183162 [E:onnxruntime:, env.cc:251 ThreadMain] pthread_setaffinity_np failed for thread: 576405, index: 15, mask: {16, 52, }, error code: 22 error msg: Invalid argument. Specify the number of threads explicitly so the affinity is not set. +sampling rate of input file: 16000 +wav filename: ./sherpa-onnx-lstm-en-2023-02-17/test_wavs/0.wav +wav duration (s): 6.625 +Started +Done! 
+Recognition result for ./sherpa-onnx-lstm-en-2023-02-17/test_wavs/0.wav: + AFTER EARLY NIGHTFALL THE YELLOW LAMPS WOULD LIGHT UP HERE AND THERE THE SQUALID QUARTER OF THE BROTHELS +num threads: 2 +decoding method: greedy_search +Elapsed seconds: 2.927 s +Real time factor (RTF): 2.927 / 6.625 = 0.442 diff --git a/docs/source/onnx/pretrained_models/online-transducer/code-lstm/sherpa-onnx-lstm-zh-2023-02-20-int8.txt b/docs/source/onnx/pretrained_models/online-transducer/code-lstm/sherpa-onnx-lstm-zh-2023-02-20-int8.txt new file mode 100644 index 000000000..a1a118cc1 --- /dev/null +++ b/docs/source/onnx/pretrained_models/online-transducer/code-lstm/sherpa-onnx-lstm-zh-2023-02-20-int8.txt @@ -0,0 +1,14 @@ +OnlineRecognizerConfig(feat_config=FeatureExtractorConfig(sampling_rate=16000, feature_dim=80), model_config=OnlineTransducerModelConfig(encoder_filename="./sherpa-onnx-lstm-zh-2023-02-20/encoder-epoch-11-avg-1.int8.onnx", decoder_filename="./sherpa-onnx-lstm-zh-2023-02-20/decoder-epoch-11-avg-1.onnx", joiner_filename="./sherpa-onnx-lstm-zh-2023-02-20/joiner-epoch-11-avg-1.int8.onnx", tokens="./sherpa-onnx-lstm-zh-2023-02-20/tokens.txt", num_threads=2, debug=False), endpoint_config=EndpointConfig(rule1=EndpointRule(must_contain_nonsilence=False, min_trailing_silence=2.4, min_utterance_length=0), rule2=EndpointRule(must_contain_nonsilence=True, min_trailing_silence=1.2, min_utterance_length=0), rule3=EndpointRule(must_contain_nonsilence=False, min_trailing_silence=0, min_utterance_length=20)), enable_endpoint=True, max_active_paths=4, decoding_method="greedy_search") +2023-03-31 23:01:05.737519659 [E:onnxruntime:, env.cc:251 ThreadMain] pthread_setaffinity_np failed for thread: 578880, index: 15, mask: {16, 52, }, error code: 22 error msg: Invalid argument. Specify the number of threads explicitly so the affinity is not set. +2023-03-31 23:01:05.737521655 [E:onnxruntime:, env.cc:251 ThreadMain] pthread_setaffinity_np failed for thread: 578881, index: 16, mask: {17, 53, }, error code: 22 error msg: Invalid argument. Specify the number of threads explicitly so the affinity is not set. +sampling rate of input file: 16000 +wav filename: ./sherpa-onnx-lstm-zh-2023-02-20/test_wavs/0.wav +wav duration (s): 5.611 +Started +Done! 
+Recognition result for ./sherpa-onnx-lstm-zh-2023-02-20/test_wavs/0.wav: +对我做了介绍那么我想说的是大家如果对我的研究感兴趣 +num threads: 2 +decoding method: greedy_search +Elapsed seconds: 1.091 s +Real time factor (RTF): 1.091 / 5.611 = 0.194 diff --git a/docs/source/onnx/pretrained_models/online-transducer/code-lstm/sherpa-onnx-lstm-zh-2023-02-20.txt b/docs/source/onnx/pretrained_models/online-transducer/code-lstm/sherpa-onnx-lstm-zh-2023-02-20.txt new file mode 100644 index 000000000..2ae8f14ac --- /dev/null +++ b/docs/source/onnx/pretrained_models/online-transducer/code-lstm/sherpa-onnx-lstm-zh-2023-02-20.txt @@ -0,0 +1,14 @@ +OnlineRecognizerConfig(feat_config=FeatureExtractorConfig(sampling_rate=16000, feature_dim=80), model_config=OnlineTransducerModelConfig(encoder_filename="./sherpa-onnx-lstm-zh-2023-02-20/encoder-epoch-11-avg-1.onnx", decoder_filename="./sherpa-onnx-lstm-zh-2023-02-20/decoder-epoch-11-avg-1.onnx", joiner_filename="./sherpa-onnx-lstm-zh-2023-02-20/joiner-epoch-11-avg-1.onnx", tokens="./sherpa-onnx-lstm-zh-2023-02-20/tokens.txt", num_threads=2, debug=False), endpoint_config=EndpointConfig(rule1=EndpointRule(must_contain_nonsilence=False, min_trailing_silence=2.4, min_utterance_length=0), rule2=EndpointRule(must_contain_nonsilence=True, min_trailing_silence=1.2, min_utterance_length=0), rule3=EndpointRule(must_contain_nonsilence=False, min_trailing_silence=0, min_utterance_length=20)), enable_endpoint=True, max_active_paths=4, decoding_method="greedy_search") +2023-03-31 22:58:59.348229346 [E:onnxruntime:, env.cc:251 ThreadMain] pthread_setaffinity_np failed for thread: 578800, index: 15, mask: {16, 52, }, error code: 22 error msg: Invalid argument. Specify the number of threads explicitly so the affinity is not set. +2023-03-31 22:58:59.348231417 [E:onnxruntime:, env.cc:251 ThreadMain] pthread_setaffinity_np failed for thread: 578801, index: 16, mask: {17, 53, }, error code: 22 error msg: Invalid argument. Specify the number of threads explicitly so the affinity is not set. +sampling rate of input file: 16000 +wav filename: ./sherpa-onnx-lstm-zh-2023-02-20/test_wavs/0.wav +wav duration (s): 5.611 +Started +Done! 
+Recognition result for ./sherpa-onnx-lstm-zh-2023-02-20/test_wavs/0.wav: +对我做了介绍那么我想说的是大家如果对我的研究感兴趣 +num threads: 2 +decoding method: greedy_search +Elapsed seconds: 3.030 s +Real time factor (RTF): 3.030 / 5.611 = 0.540 diff --git a/docs/source/onnx/pretrained_models/online-transducer/code-zipformer/icefall-asr-zipformer-streaming-wenetspeech-20230615-int8.txt b/docs/source/onnx/pretrained_models/online-transducer/code-zipformer/icefall-asr-zipformer-streaming-wenetspeech-20230615-int8.txt new file mode 100644 index 000000000..51b4f7e6e --- /dev/null +++ b/docs/source/onnx/pretrained_models/online-transducer/code-zipformer/icefall-asr-zipformer-streaming-wenetspeech-20230615-int8.txt @@ -0,0 +1,8 @@ +/Users/fangjun/open-source/sherpa-onnx/sherpa-onnx/csrc/parse-options.cc:Read:361 ./build/bin/sherpa-onnx --tokens=./icefall-asr-zipformer-streaming-wenetspeech-20230615/data/lang_char/tokens.txt --encoder=./icefall-asr-zipformer-streaming-wenetspeech-20230615/exp/encoder-epoch-12-avg-4-chunk-16-left-128.int8.onnx --decoder=./icefall-asr-zipformer-streaming-wenetspeech-20230615/exp/decoder-epoch-12-avg-4-chunk-16-left-128.onnx --joiner=./icefall-asr-zipformer-streaming-wenetspeech-20230615/exp/joiner-epoch-12-avg-4-chunk-16-left-128.int8.onnx ./icefall-asr-zipformer-streaming-wenetspeech-20230615/test_wavs/DEV_T0000000000.wav + +OnlineRecognizerConfig(feat_config=FeatureExtractorConfig(sampling_rate=16000, feature_dim=80), model_config=OnlineTransducerModelConfig(encoder_filename="./icefall-asr-zipformer-streaming-wenetspeech-20230615/exp/encoder-epoch-12-avg-4-chunk-16-left-128.int8.onnx", decoder_filename="./icefall-asr-zipformer-streaming-wenetspeech-20230615/exp/decoder-epoch-12-avg-4-chunk-16-left-128.onnx", joiner_filename="./icefall-asr-zipformer-streaming-wenetspeech-20230615/exp/joiner-epoch-12-avg-4-chunk-16-left-128.int8.onnx", tokens="./icefall-asr-zipformer-streaming-wenetspeech-20230615/data/lang_char/tokens.txt", num_threads=2, provider="cpu", debug=False), lm_config=OnlineLMConfig(model="", scale=0.5), endpoint_config=EndpointConfig(rule1=EndpointRule(must_contain_nonsilence=False, min_trailing_silence=2.4, min_utterance_length=0), rule2=EndpointRule(must_contain_nonsilence=True, min_trailing_silence=1.2, min_utterance_length=0), rule3=EndpointRule(must_contain_nonsilence=False, min_trailing_silence=0, min_utterance_length=20)), enable_endpoint=True, max_active_paths=4, decoding_method="greedy_search") +./icefall-asr-zipformer-streaming-wenetspeech-20230615/test_wavs/DEV_T0000000000.wav +Elapsed seconds: 0.38, Real time factor (RTF): 0.068 +对我做了介绍啊那么我想说的是呢大家如果对我的研究感兴趣呢 +{"is_final":false,"segment":0,"start_time":0.0,"text":"对我做了介绍啊那么我想说的是呢大家如果对我的研究感兴趣呢","timestamps":"[0.40, 0.52, 0.72, 0.84, 1.08, 1.24, 1.48, 1.92, 2.00, 2.24, 2.32, 2.48, 2.68, 2.80, 3.00, 3.28, 3.36, 3.60, 3.72, 3.84, 3.92, 4.00, 4.16, 4.28, 4.36, 4.64, 4.68, 5.00]","tokens":["对","我","做","了","介","绍","啊","那","么","我","想","说","的","是","呢","大","家","如","果","对","我","的","研","究","感","兴","趣","呢"]} + diff --git a/docs/source/onnx/pretrained_models/online-transducer/code-zipformer/icefall-asr-zipformer-streaming-wenetspeech-20230615.txt b/docs/source/onnx/pretrained_models/online-transducer/code-zipformer/icefall-asr-zipformer-streaming-wenetspeech-20230615.txt new file mode 100644 index 000000000..08e9cf458 --- /dev/null +++ b/docs/source/onnx/pretrained_models/online-transducer/code-zipformer/icefall-asr-zipformer-streaming-wenetspeech-20230615.txt @@ -0,0 +1,8 @@ 
+/Users/fangjun/open-source/sherpa-onnx/sherpa-onnx/csrc/parse-options.cc:Read:361 ./build/bin/sherpa-onnx --tokens=./icefall-asr-zipformer-streaming-wenetspeech-20230615/data/lang_char/tokens.txt --encoder=./icefall-asr-zipformer-streaming-wenetspeech-20230615/exp/encoder-epoch-12-avg-4-chunk-16-left-128.onnx --decoder=./icefall-asr-zipformer-streaming-wenetspeech-20230615/exp/decoder-epoch-12-avg-4-chunk-16-left-128.onnx --joiner=./icefall-asr-zipformer-streaming-wenetspeech-20230615/exp/joiner-epoch-12-avg-4-chunk-16-left-128.onnx ./icefall-asr-zipformer-streaming-wenetspeech-20230615/test_wavs/DEV_T0000000000.wav + +OnlineRecognizerConfig(feat_config=FeatureExtractorConfig(sampling_rate=16000, feature_dim=80), model_config=OnlineTransducerModelConfig(encoder_filename="./icefall-asr-zipformer-streaming-wenetspeech-20230615/exp/encoder-epoch-12-avg-4-chunk-16-left-128.onnx", decoder_filename="./icefall-asr-zipformer-streaming-wenetspeech-20230615/exp/decoder-epoch-12-avg-4-chunk-16-left-128.onnx", joiner_filename="./icefall-asr-zipformer-streaming-wenetspeech-20230615/exp/joiner-epoch-12-avg-4-chunk-16-left-128.onnx", tokens="./icefall-asr-zipformer-streaming-wenetspeech-20230615/data/lang_char/tokens.txt", num_threads=2, provider="cpu", debug=False), lm_config=OnlineLMConfig(model="", scale=0.5), endpoint_config=EndpointConfig(rule1=EndpointRule(must_contain_nonsilence=False, min_trailing_silence=2.4, min_utterance_length=0), rule2=EndpointRule(must_contain_nonsilence=True, min_trailing_silence=1.2, min_utterance_length=0), rule3=EndpointRule(must_contain_nonsilence=False, min_trailing_silence=0, min_utterance_length=20)), enable_endpoint=True, max_active_paths=4, decoding_method="greedy_search") +./icefall-asr-zipformer-streaming-wenetspeech-20230615/test_wavs/DEV_T0000000000.wav +Elapsed seconds: 0.56, Real time factor (RTF): 0.1 +对我做了介绍那么我想说的是大家如果对我的研究感兴趣 +{"is_final":false,"segment":0,"start_time":0.0,"text":"对我做了介绍那么我想说的是大家如果对我的研究感兴趣","timestamps":"[0.36, 0.64, 0.76, 0.88, 1.08, 1.24, 1.96, 2.04, 2.24, 2.36, 2.56, 2.68, 2.92, 3.32, 3.40, 3.64, 3.72, 3.84, 3.96, 4.04, 4.16, 4.32, 4.48, 4.64, 4.72]","tokens":["对","我","做","了","介","绍","那","么","我","想","说","的","是","大","家","如","果","对","我","的","研","究","感","兴","趣"]} + diff --git a/docs/source/onnx/pretrained_models/online-transducer/code-zipformer/sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20-int8.txt b/docs/source/onnx/pretrained_models/online-transducer/code-zipformer/sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20-int8.txt new file mode 100644 index 000000000..fc2a80e0e --- /dev/null +++ b/docs/source/onnx/pretrained_models/online-transducer/code-zipformer/sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20-int8.txt @@ -0,0 +1,14 @@ +OnlineRecognizerConfig(feat_config=FeatureExtractorConfig(sampling_rate=16000, feature_dim=80), model_config=OnlineTransducerModelConfig(encoder_filename="./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/encoder-epoch-99-avg-1.int8.onnx", decoder_filename="./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/decoder-epoch-99-avg-1.onnx", joiner_filename="./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/joiner-epoch-99-avg-1.int8.onnx", tokens="./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/tokens.txt", num_threads=2, debug=False), endpoint_config=EndpointConfig(rule1=EndpointRule(must_contain_nonsilence=False, min_trailing_silence=2.4, min_utterance_length=0), rule2=EndpointRule(must_contain_nonsilence=True, 
min_trailing_silence=1.2, min_utterance_length=0), rule3=EndpointRule(must_contain_nonsilence=False, min_trailing_silence=0, min_utterance_length=20)), enable_endpoint=True, max_active_paths=4, decoding_method="greedy_search") +2023-04-01 06:24:10.503505750 [E:onnxruntime:, env.cc:251 ThreadMain] pthread_setaffinity_np failed for thread: 604982, index: 16, mask: {17, 53, }, error code: 22 error msg: Invalid argument. Specify the number of threads explicitly so the affinity is not set. +2023-04-01 06:24:10.503503942 [E:onnxruntime:, env.cc:251 ThreadMain] pthread_setaffinity_np failed for thread: 604981, index: 15, mask: {16, 52, }, error code: 22 error msg: Invalid argument. Specify the number of threads explicitly so the affinity is not set. +sampling rate of input file: 16000 +wav filename: ./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/test_wavs/1.wav +wav duration (s): 5.100 +Started +Done! +Recognition result for ./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/test_wavs/1.wav: +这是第一种第二种叫呃与 ALWAYS ALWAYS什么意思啊 +num threads: 2 +decoding method: greedy_search +Elapsed seconds: 0.551 s +Real time factor (RTF): 0.551 / 5.100 = 0.108 diff --git a/docs/source/onnx/pretrained_models/online-transducer/code-zipformer/sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20.txt b/docs/source/onnx/pretrained_models/online-transducer/code-zipformer/sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20.txt new file mode 100644 index 000000000..b78ef8bb9 --- /dev/null +++ b/docs/source/onnx/pretrained_models/online-transducer/code-zipformer/sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20.txt @@ -0,0 +1,14 @@ +OnlineRecognizerConfig(feat_config=FeatureExtractorConfig(sampling_rate=16000, feature_dim=80), model_config=OnlineTransducerModelConfig(encoder_filename="./sherpa-onnx-streaming-zipformer-en-2023-02-21/encoder-epoch-99-avg-1.onnx", decoder_filename="./sherpa-onnx-streaming-zipformer-en-2023-02-21/decoder-epoch-99-avg-1.onnx", joiner_filename="./sherpa-onnx-streaming-zipformer-en-2023-02-21/joiner-epoch-99-avg-1.onnx", tokens="./sherpa-onnx-streaming-zipformer-en-2023-02-21/tokens.txt", num_threads=2, debug=False), endpoint_config=EndpointConfig(rule1=EndpointRule(must_contain_nonsilence=False, min_trailing_silence=2.4, min_utterance_length=0), rule2=EndpointRule(must_contain_nonsilence=True, min_trailing_silence=1.2, min_utterance_length=0), rule3=EndpointRule(must_contain_nonsilence=False, min_trailing_silence=0, min_utterance_length=20)), enable_endpoint=True, max_active_paths=4, decoding_method="greedy_search") +2023-04-01 06:22:23.030317206 [E:onnxruntime:, env.cc:251 ThreadMain] pthread_setaffinity_np failed for thread: 604942, index: 16, mask: {17, 53, }, error code: 22 error msg: Invalid argument. Specify the number of threads explicitly so the affinity is not set. +2023-04-01 06:22:23.030315351 [E:onnxruntime:, env.cc:251 ThreadMain] pthread_setaffinity_np failed for thread: 604941, index: 15, mask: {16, 52, }, error code: 22 error msg: Invalid argument. Specify the number of threads explicitly so the affinity is not set. +sampling rate of input file: 16000 +wav filename: ./sherpa-onnx-streaming-zipformer-en-2023-02-21/test_wavs/0.wav +wav duration (s): 6.625 +Started +Done! 
+Recognition result for ./sherpa-onnx-streaming-zipformer-en-2023-02-21/test_wavs/0.wav: + AFTER EARLY NIGHTFALL THE YELLOW LAMPS WOULD LIGHT UP HERE AND THERE THE SQUALID QUARTER OF THE BROTHELS +num threads: 2 +decoding method: greedy_search +Elapsed seconds: 0.815 s +Real time factor (RTF): 0.815 / 6.625 = 0.123 diff --git a/docs/source/onnx/pretrained_models/online-transducer/code-zipformer/sherpa-onnx-streaming-zipformer-en-2023-02-21-int8.txt b/docs/source/onnx/pretrained_models/online-transducer/code-zipformer/sherpa-onnx-streaming-zipformer-en-2023-02-21-int8.txt new file mode 100644 index 000000000..a943a8bad --- /dev/null +++ b/docs/source/onnx/pretrained_models/online-transducer/code-zipformer/sherpa-onnx-streaming-zipformer-en-2023-02-21-int8.txt @@ -0,0 +1,14 @@ +OnlineRecognizerConfig(feat_config=FeatureExtractorConfig(sampling_rate=16000, feature_dim=80), model_config=OnlineTransducerModelConfig(encoder_filename="./sherpa-onnx-streaming-zipformer-en-2023-02-21/encoder-epoch-99-avg-1.int8.onnx", decoder_filename="./sherpa-onnx-streaming-zipformer-en-2023-02-21/decoder-epoch-99-avg-1.onnx", joiner_filename="./sherpa-onnx-streaming-zipformer-en-2023-02-21/joiner-epoch-99-avg-1.int8.onnx", tokens="./sherpa-onnx-streaming-zipformer-en-2023-02-21/tokens.txt", num_threads=2, debug=False), endpoint_config=EndpointConfig(rule1=EndpointRule(must_contain_nonsilence=False, min_trailing_silence=2.4, min_utterance_length=0), rule2=EndpointRule(must_contain_nonsilence=True, min_trailing_silence=1.2, min_utterance_length=0), rule3=EndpointRule(must_contain_nonsilence=False, min_trailing_silence=0, min_utterance_length=20)), enable_endpoint=True, max_active_paths=4, decoding_method="greedy_search") +2023-04-01 06:18:47.466564998 [E:onnxruntime:, env.cc:251 ThreadMain] pthread_setaffinity_np failed for thread: 604880, index: 15, mask: {16, 52, }, error code: 22 error msg: Invalid argument. Specify the number of threads explicitly so the affinity is not set. +2023-04-01 06:18:47.466566863 [E:onnxruntime:, env.cc:251 ThreadMain] pthread_setaffinity_np failed for thread: 604881, index: 16, mask: {17, 53, }, error code: 22 error msg: Invalid argument. Specify the number of threads explicitly so the affinity is not set. +sampling rate of input file: 16000 +wav filename: ./sherpa-onnx-streaming-zipformer-en-2023-02-21/test_wavs/0.wav +wav duration (s): 6.625 +Started +Done! 
+Recognition result for ./sherpa-onnx-streaming-zipformer-en-2023-02-21/test_wavs/0.wav: + AFTER EARLY NIGHTFALL THE YELLOW LAMPS WOULD LIGHT UP HERE AND THERE THE SQUALID QUARTER OF THE BROTHELS +num threads: 2 +decoding method: greedy_search +Elapsed seconds: 0.633 s +Real time factor (RTF): 0.633 / 6.625 = 0.096 diff --git a/docs/source/onnx/pretrained_models/online-transducer/code-zipformer/sherpa-onnx-streaming-zipformer-en-2023-02-21.txt b/docs/source/onnx/pretrained_models/online-transducer/code-zipformer/sherpa-onnx-streaming-zipformer-en-2023-02-21.txt new file mode 100644 index 000000000..90cf81ab6 --- /dev/null +++ b/docs/source/onnx/pretrained_models/online-transducer/code-zipformer/sherpa-onnx-streaming-zipformer-en-2023-02-21.txt @@ -0,0 +1,14 @@ +OnlineRecognizerConfig(feat_config=FeatureExtractorConfig(sampling_rate=16000, feature_dim=80), model_config=OnlineTransducerModelConfig(encoder_filename="./sherpa-onnx-streaming-zipformer-en-2023-02-21/encoder-epoch-99-avg-1.onnx", decoder_filename="./sherpa-onnx-streaming-zipformer-en-2023-02-21/decoder-epoch-99-avg-1.onnx", joiner_filename="./sherpa-onnx-streaming-zipformer-en-2023-02-21/joiner-epoch-99-avg-1.onnx", tokens="./sherpa-onnx-streaming-zipformer-en-2023-02-21/tokens.txt", num_threads=2, debug=False), endpoint_config=EndpointConfig(rule1=EndpointRule(must_contain_nonsilence=False, min_trailing_silence=2.4, min_utterance_length=0), rule2=EndpointRule(must_contain_nonsilence=True, min_trailing_silence=1.2, min_utterance_length=0), rule3=EndpointRule(must_contain_nonsilence=False, min_trailing_silence=0, min_utterance_length=20)), enable_endpoint=True, max_active_paths=4, decoding_method="greedy_search") +2023-04-01 06:16:29.128344485 [E:onnxruntime:, env.cc:251 ThreadMain] pthread_setaffinity_np failed for thread: 604840, index: 15, mask: {16, 52, }, error code: 22 error msg: Invalid argument. Specify the number of threads explicitly so the affinity is not set. +2023-04-01 06:16:29.128346568 [E:onnxruntime:, env.cc:251 ThreadMain] pthread_setaffinity_np failed for thread: 604841, index: 16, mask: {17, 53, }, error code: 22 error msg: Invalid argument. Specify the number of threads explicitly so the affinity is not set. +sampling rate of input file: 16000 +wav filename: ./sherpa-onnx-streaming-zipformer-en-2023-02-21/test_wavs/0.wav +wav duration (s): 6.625 +Started +Done! 
+Recognition result for ./sherpa-onnx-streaming-zipformer-en-2023-02-21/test_wavs/0.wav: + AFTER EARLY NIGHTFALL THE YELLOW LAMPS WOULD LIGHT UP HERE AND THERE THE SQUALID QUARTER OF THE BROTHELS +num threads: 2 +decoding method: greedy_search +Elapsed seconds: 0.825 s +Real time factor (RTF): 0.825 / 6.625 = 0.125 diff --git a/docs/source/onnx/pretrained_models/online-transducer/code-zipformer/sherpa-onnx-streaming-zipformer-en-2023-06-21-int8.txt b/docs/source/onnx/pretrained_models/online-transducer/code-zipformer/sherpa-onnx-streaming-zipformer-en-2023-06-21-int8.txt new file mode 100644 index 000000000..5fecf5aea --- /dev/null +++ b/docs/source/onnx/pretrained_models/online-transducer/code-zipformer/sherpa-onnx-streaming-zipformer-en-2023-06-21-int8.txt @@ -0,0 +1,8 @@ +/Users/fangjun/open-source/sherpa-onnx/sherpa-onnx/csrc/parse-options.cc:Read:361 ./build/bin/sherpa-onnx --tokens=./sherpa-onnx-streaming-zipformer-en-2023-06-21/tokens.txt --encoder=./sherpa-onnx-streaming-zipformer-en-2023-06-21/encoder-epoch-99-avg-1.int8.onnx --decoder=./sherpa-onnx-streaming-zipformer-en-2023-06-21/decoder-epoch-99-avg-1.onnx --joiner=./sherpa-onnx-streaming-zipformer-en-2023-06-21/joiner-epoch-99-avg-1.int8.onnx ./sherpa-onnx-streaming-zipformer-en-2023-06-21/test_wavs/0.wav + +OnlineRecognizerConfig(feat_config=FeatureExtractorConfig(sampling_rate=16000, feature_dim=80), model_config=OnlineTransducerModelConfig(encoder_filename="./sherpa-onnx-streaming-zipformer-en-2023-06-21/encoder-epoch-99-avg-1.int8.onnx", decoder_filename="./sherpa-onnx-streaming-zipformer-en-2023-06-21/decoder-epoch-99-avg-1.onnx", joiner_filename="./sherpa-onnx-streaming-zipformer-en-2023-06-21/joiner-epoch-99-avg-1.int8.onnx", tokens="./sherpa-onnx-streaming-zipformer-en-2023-06-21/tokens.txt", num_threads=2, provider="cpu", debug=False), lm_config=OnlineLMConfig(model="", scale=0.5), endpoint_config=EndpointConfig(rule1=EndpointRule(must_contain_nonsilence=False, min_trailing_silence=2.4, min_utterance_length=0), rule2=EndpointRule(must_contain_nonsilence=True, min_trailing_silence=1.2, min_utterance_length=0), rule3=EndpointRule(must_contain_nonsilence=False, min_trailing_silence=0, min_utterance_length=20)), enable_endpoint=True, max_active_paths=4, decoding_method="greedy_search") +./sherpa-onnx-streaming-zipformer-en-2023-06-21/test_wavs/0.wav +Elapsed seconds: 0.41, Real time factor (RTF): 0.062 + AFTER EARLY NIGHTFALL THE YELLOW LAMPS WOULD LIGHT UP HERE AND THERE THE SQUALID QUARTER OF THE BROTHELS +{"is_final":false,"segment":0,"start_time":0.0,"text":" AFTER EARLY NIGHTFALL THE YELLOW LAMPS WOULD LIGHT UP HERE AND THERE THE SQUALID QUARTER OF THE BROTHELS","timestamps":"[0.64, 1.00, 1.12, 1.20, 1.60, 1.76, 1.80, 1.96, 2.08, 2.24, 2.36, 2.40, 2.60, 2.72, 2.80, 2.88, 3.00, 3.20, 3.44, 3.68, 3.76, 3.96, 4.24, 4.52, 4.72, 4.76, 4.88, 5.04, 5.24, 5.28, 5.36, 5.48, 5.64, 5.76, 5.92, 5.96, 6.04, 6.24, 6.36]","tokens":[" AFTER"," E","AR","LY"," NIGHT","F","A","LL"," THE"," YE","LL","OW"," LA","M","P","S"," WOULD"," LIGHT"," UP"," HE","RE"," AND"," THERE"," THE"," S","QUA","LI","D"," ","QUA","R","TER"," OF"," THE"," B","RO","TH","EL","S"]} + diff --git a/docs/source/onnx/pretrained_models/online-transducer/code-zipformer/sherpa-onnx-streaming-zipformer-en-2023-06-21.txt b/docs/source/onnx/pretrained_models/online-transducer/code-zipformer/sherpa-onnx-streaming-zipformer-en-2023-06-21.txt new file mode 100644 index 000000000..94db7b9fe --- /dev/null +++ 
b/docs/source/onnx/pretrained_models/online-transducer/code-zipformer/sherpa-onnx-streaming-zipformer-en-2023-06-21.txt @@ -0,0 +1,8 @@ +/Users/fangjun/open-source/sherpa-onnx/sherpa-onnx/csrc/parse-options.cc:Read:361 ./build/bin/sherpa-onnx --tokens=./sherpa-onnx-streaming-zipformer-en-2023-06-21/tokens.txt --encoder=./sherpa-onnx-streaming-zipformer-en-2023-06-21/encoder-epoch-99-avg-1.onnx --decoder=./sherpa-onnx-streaming-zipformer-en-2023-06-21/decoder-epoch-99-avg-1.onnx --joiner=./sherpa-onnx-streaming-zipformer-en-2023-06-21/joiner-epoch-99-avg-1.onnx ./sherpa-onnx-streaming-zipformer-en-2023-06-21/test_wavs/0.wav + +OnlineRecognizerConfig(feat_config=FeatureExtractorConfig(sampling_rate=16000, feature_dim=80), model_config=OnlineTransducerModelConfig(encoder_filename="./sherpa-onnx-streaming-zipformer-en-2023-06-21/encoder-epoch-99-avg-1.onnx", decoder_filename="./sherpa-onnx-streaming-zipformer-en-2023-06-21/decoder-epoch-99-avg-1.onnx", joiner_filename="./sherpa-onnx-streaming-zipformer-en-2023-06-21/joiner-epoch-99-avg-1.onnx", tokens="./sherpa-onnx-streaming-zipformer-en-2023-06-21/tokens.txt", num_threads=2, provider="cpu", debug=False), lm_config=OnlineLMConfig(model="", scale=0.5), endpoint_config=EndpointConfig(rule1=EndpointRule(must_contain_nonsilence=False, min_trailing_silence=2.4, min_utterance_length=0), rule2=EndpointRule(must_contain_nonsilence=True, min_trailing_silence=1.2, min_utterance_length=0), rule3=EndpointRule(must_contain_nonsilence=False, min_trailing_silence=0, min_utterance_length=20)), enable_endpoint=True, max_active_paths=4, decoding_method="greedy_search") +./sherpa-onnx-streaming-zipformer-en-2023-06-21/test_wavs/0.wav +Elapsed seconds: 0.5, Real time factor (RTF): 0.076 + AFTER EARLY NIGHTFALL THE YELLOW LAMPS WOULD LIGHT UP HERE AND THERE THE SQUALID QUARTER OF THE BROTHELS +{"is_final":false,"segment":0,"start_time":0.0,"text":" AFTER EARLY NIGHTFALL THE YELLOW LAMPS WOULD LIGHT UP HERE AND THERE THE SQUALID QUARTER OF THE BROTHELS","timestamps":"[0.64, 1.00, 1.12, 1.20, 1.60, 1.76, 1.84, 1.96, 2.08, 2.24, 2.36, 2.40, 2.60, 2.72, 2.80, 2.88, 3.00, 3.20, 3.44, 3.68, 3.76, 3.96, 4.24, 4.52, 4.72, 4.76, 4.88, 5.04, 5.24, 5.28, 5.36, 5.48, 5.64, 5.76, 5.92, 5.96, 6.04, 6.24, 6.36]","tokens":[" AFTER"," E","AR","LY"," NIGHT","F","A","LL"," THE"," YE","LL","OW"," LA","M","P","S"," WOULD"," LIGHT"," UP"," HE","RE"," AND"," THERE"," THE"," S","QUA","LI","D"," ","QUA","R","TER"," OF"," THE"," B","RO","TH","EL","S"]} + diff --git a/docs/source/onnx/pretrained_models/online-transducer/code-zipformer/sherpa-onnx-streaming-zipformer-en-2023-06-26-int8.txt b/docs/source/onnx/pretrained_models/online-transducer/code-zipformer/sherpa-onnx-streaming-zipformer-en-2023-06-26-int8.txt new file mode 100644 index 000000000..23db462d4 --- /dev/null +++ b/docs/source/onnx/pretrained_models/online-transducer/code-zipformer/sherpa-onnx-streaming-zipformer-en-2023-06-26-int8.txt @@ -0,0 +1,8 @@ +/Users/fangjun/open-source/sherpa-onnx/sherpa-onnx/csrc/parse-options.cc:Read:361 ./build/bin/sherpa-onnx --tokens=./sherpa-onnx-streaming-zipformer-en-2023-06-26/tokens.txt --encoder=./sherpa-onnx-streaming-zipformer-en-2023-06-26/encoder-epoch-99-avg-1-chunk-16-left-128.int8.onnx --decoder=./sherpa-onnx-streaming-zipformer-en-2023-06-26/decoder-epoch-99-avg-1-chunk-16-left-128.onnx --joiner=./sherpa-onnx-streaming-zipformer-en-2023-06-26/joiner-epoch-99-avg-1-chunk-16-left-128.int8.onnx ./sherpa-onnx-streaming-zipformer-en-2023-06-26/test_wavs/0.wav + 
+OnlineRecognizerConfig(feat_config=FeatureExtractorConfig(sampling_rate=16000, feature_dim=80), model_config=OnlineTransducerModelConfig(encoder_filename="./sherpa-onnx-streaming-zipformer-en-2023-06-26/encoder-epoch-99-avg-1-chunk-16-left-128.int8.onnx", decoder_filename="./sherpa-onnx-streaming-zipformer-en-2023-06-26/decoder-epoch-99-avg-1-chunk-16-left-128.onnx", joiner_filename="./sherpa-onnx-streaming-zipformer-en-2023-06-26/joiner-epoch-99-avg-1-chunk-16-left-128.int8.onnx", tokens="./sherpa-onnx-streaming-zipformer-en-2023-06-26/tokens.txt", num_threads=2, provider="cpu", debug=False), lm_config=OnlineLMConfig(model="", scale=0.5), endpoint_config=EndpointConfig(rule1=EndpointRule(must_contain_nonsilence=False, min_trailing_silence=2.4, min_utterance_length=0), rule2=EndpointRule(must_contain_nonsilence=True, min_trailing_silence=1.2, min_utterance_length=0), rule3=EndpointRule(must_contain_nonsilence=False, min_trailing_silence=0, min_utterance_length=20)), enable_endpoint=True, max_active_paths=4, decoding_method="greedy_search") +./sherpa-onnx-streaming-zipformer-en-2023-06-26/test_wavs/0.wav +Elapsed seconds: 0.41, Real time factor (RTF): 0.062 + AFTER EARLY NIGHTFALL THE YELLOW LAMPS WOULD LIGHT UP HERE AND THERE THE SQUALID QUARTER OF THE BROTHELS +{"is_final":false,"segment":0,"start_time":0.0,"text":" AFTER EARLY NIGHTFALL THE YELLOW LAMPS WOULD LIGHT UP HERE AND THERE THE SQUALID QUARTER OF THE BROTHELS","timestamps":"[0.68, 1.04, 1.16, 1.24, 1.60, 1.76, 1.80, 1.92, 2.04, 2.24, 2.32, 2.36, 2.52, 2.68, 2.72, 2.80, 2.92, 3.12, 3.40, 3.64, 3.76, 3.92, 4.12, 4.48, 4.68, 4.72, 4.84, 5.00, 5.20, 5.24, 5.36, 5.44, 5.64, 5.76, 5.92, 5.96, 6.08, 6.24, 6.52]","tokens":[" AFTER"," E","AR","LY"," NIGHT","F","A","LL"," THE"," YE","LL","OW"," LA","M","P","S"," WOULD"," LIGHT"," UP"," HE","RE"," AND"," THERE"," THE"," S","QUA","LI","D"," ","QUA","R","TER"," OF"," THE"," B","RO","TH","EL","S"]} + diff --git a/docs/source/onnx/pretrained_models/online-transducer/code-zipformer/sherpa-onnx-streaming-zipformer-en-2023-06-26.txt b/docs/source/onnx/pretrained_models/online-transducer/code-zipformer/sherpa-onnx-streaming-zipformer-en-2023-06-26.txt new file mode 100644 index 000000000..47331f756 --- /dev/null +++ b/docs/source/onnx/pretrained_models/online-transducer/code-zipformer/sherpa-onnx-streaming-zipformer-en-2023-06-26.txt @@ -0,0 +1,8 @@ +/Users/fangjun/open-source/sherpa-onnx/sherpa-onnx/csrc/parse-options.cc:Read:361 ./build/bin/sherpa-onnx --tokens=./sherpa-onnx-streaming-zipformer-en-2023-06-26/tokens.txt --encoder=./sherpa-onnx-streaming-zipformer-en-2023-06-26/encoder-epoch-99-avg-1-chunk-16-left-128.onnx --decoder=./sherpa-onnx-streaming-zipformer-en-2023-06-26/decoder-epoch-99-avg-1-chunk-16-left-128.onnx --joiner=./sherpa-onnx-streaming-zipformer-en-2023-06-26/joiner-epoch-99-avg-1-chunk-16-left-128.onnx ./sherpa-onnx-streaming-zipformer-en-2023-06-26/test_wavs/0.wav + +OnlineRecognizerConfig(feat_config=FeatureExtractorConfig(sampling_rate=16000, feature_dim=80), model_config=OnlineTransducerModelConfig(encoder_filename="./sherpa-onnx-streaming-zipformer-en-2023-06-26/encoder-epoch-99-avg-1-chunk-16-left-128.onnx", decoder_filename="./sherpa-onnx-streaming-zipformer-en-2023-06-26/decoder-epoch-99-avg-1-chunk-16-left-128.onnx", joiner_filename="./sherpa-onnx-streaming-zipformer-en-2023-06-26/joiner-epoch-99-avg-1-chunk-16-left-128.onnx", tokens="./sherpa-onnx-streaming-zipformer-en-2023-06-26/tokens.txt", num_threads=2, provider="cpu", debug=False), 
lm_config=OnlineLMConfig(model="", scale=0.5), endpoint_config=EndpointConfig(rule1=EndpointRule(must_contain_nonsilence=False, min_trailing_silence=2.4, min_utterance_length=0), rule2=EndpointRule(must_contain_nonsilence=True, min_trailing_silence=1.2, min_utterance_length=0), rule3=EndpointRule(must_contain_nonsilence=False, min_trailing_silence=0, min_utterance_length=20)), enable_endpoint=True, max_active_paths=4, decoding_method="greedy_search") +./sherpa-onnx-streaming-zipformer-en-2023-06-26/test_wavs/0.wav +Elapsed seconds: 0.51, Real time factor (RTF): 0.077 + AFTER EARLY NIGHTFALL THE YELLOW LAMPS WOULD LIGHT UP HERE AND THERE THE SQUALID QUARTER OF THE BROTHELS +{"is_final":false,"segment":0,"start_time":0.0,"text":" AFTER EARLY NIGHTFALL THE YELLOW LAMPS WOULD LIGHT UP HERE AND THERE THE SQUALID QUARTER OF THE BROTHELS","timestamps":"[0.68, 1.04, 1.16, 1.24, 1.60, 1.76, 1.80, 1.92, 2.04, 2.24, 2.32, 2.36, 2.52, 2.68, 2.72, 2.80, 2.92, 3.12, 3.40, 3.64, 3.76, 3.92, 4.12, 4.48, 4.68, 4.72, 4.84, 5.00, 5.20, 5.24, 5.36, 5.40, 5.64, 5.76, 5.92, 5.96, 6.08, 6.24, 6.52]","tokens":[" AFTER"," E","AR","LY"," NIGHT","F","A","LL"," THE"," YE","LL","OW"," LA","M","P","S"," WOULD"," LIGHT"," UP"," HE","RE"," AND"," THERE"," THE"," S","QUA","LI","D"," ","QUA","R","TER"," OF"," THE"," B","RO","TH","EL","S"]} + diff --git a/docs/source/onnx/pretrained_models/online-transducer/code-zipformer/sherpa-onnx-streaming-zipformer-en-20M-2023-02-17-int8.txt b/docs/source/onnx/pretrained_models/online-transducer/code-zipformer/sherpa-onnx-streaming-zipformer-en-20M-2023-02-17-int8.txt new file mode 100644 index 000000000..ef4659e6d --- /dev/null +++ b/docs/source/onnx/pretrained_models/online-transducer/code-zipformer/sherpa-onnx-streaming-zipformer-en-20M-2023-02-17-int8.txt @@ -0,0 +1,8 @@ +/Users/fangjun/open-source/sherpa-onnx/sherpa-onnx/csrc/parse-options.cc:Read:361 ./build/bin/sherpa-onnx --tokens=./sherpa-onnx-streaming-zipformer-en-20M-2023-02-17/tokens.txt --encoder=./sherpa-onnx-streaming-zipformer-en-20M-2023-02-17/encoder-epoch-99-avg-1.int8.onnx --decoder=./sherpa-onnx-streaming-zipformer-en-20M-2023-02-17/decoder-epoch-99-avg-1.onnx --joiner=./sherpa-onnx-streaming-zipformer-en-20M-2023-02-17/joiner-epoch-99-avg-1.int8.onnx ./sherpa-onnx-streaming-zipformer-en-20M-2023-02-17/test_wavs/0.wav + +OnlineRecognizerConfig(feat_config=FeatureExtractorConfig(sampling_rate=16000, feature_dim=80), model_config=OnlineModelConfig(transducer=OnlineTransducerModelConfig(encoder="./sherpa-onnx-streaming-zipformer-en-20M-2023-02-17/encoder-epoch-99-avg-1.int8.onnx", decoder="./sherpa-onnx-streaming-zipformer-en-20M-2023-02-17/decoder-epoch-99-avg-1.onnx", joiner="./sherpa-onnx-streaming-zipformer-en-20M-2023-02-17/joiner-epoch-99-avg-1.int8.onnx"), paraformer=OnlineParaformerModelConfig(encoder="", decoder=""), tokens="./sherpa-onnx-streaming-zipformer-en-20M-2023-02-17/tokens.txt", num_threads=1, debug=False, provider="cpu", model_type=""), lm_config=OnlineLMConfig(model="", scale=0.5), endpoint_config=EndpointConfig(rule1=EndpointRule(must_contain_nonsilence=False, min_trailing_silence=2.4, min_utterance_length=0), rule2=EndpointRule(must_contain_nonsilence=True, min_trailing_silence=1.2, min_utterance_length=0), rule3=EndpointRule(must_contain_nonsilence=False, min_trailing_silence=0, min_utterance_length=20)), enable_endpoint=True, max_active_paths=4, context_score=1.5, decoding_method="greedy_search") +./sherpa-onnx-streaming-zipformer-en-20M-2023-02-17/test_wavs/0.wav +Elapsed seconds: 0.25, Real 
time factor (RTF): 0.038 + THE YELLOW LAMPS WOULD LIGHT UP HERE AND THERE THE SQUALID QUARTER OF THE BRAFFLS +{"is_final":false,"segment":0,"start_time":0.0,"text":" THE YELLOW LAMPS WOULD LIGHT UP HERE AND THERE THE SQUALID QUARTER OF THE BRAFFLS","timestamps":"[2.04, 2.20, 2.28, 2.36, 2.52, 2.64, 2.68, 2.76, 2.92, 3.08, 3.40, 3.60, 3.72, 3.88, 4.12, 4.48, 4.64, 4.68, 4.84, 4.96, 5.16, 5.20, 5.32, 5.36, 5.60, 5.72, 5.92, 5.96, 6.08, 6.24, 6.36]","tokens":[" THE"," YE","LL","OW"," LA","M","P","S"," WOULD"," LIGHT"," UP"," HE","RE"," AND"," THERE"," THE"," S","QUA","LI","D"," ","QUA","R","TER"," OF"," THE"," B","RA","FF","L","S"]} + diff --git a/docs/source/onnx/pretrained_models/online-transducer/code-zipformer/sherpa-onnx-streaming-zipformer-en-20M-2023-02-17.txt b/docs/source/onnx/pretrained_models/online-transducer/code-zipformer/sherpa-onnx-streaming-zipformer-en-20M-2023-02-17.txt new file mode 100644 index 000000000..29acb6895 --- /dev/null +++ b/docs/source/onnx/pretrained_models/online-transducer/code-zipformer/sherpa-onnx-streaming-zipformer-en-20M-2023-02-17.txt @@ -0,0 +1,8 @@ +/Users/fangjun/open-source/sherpa-onnx/sherpa-onnx/csrc/parse-options.cc:Read:361 ./build/bin/sherpa-onnx --tokens=./sherpa-onnx-streaming-zipformer-en-20M-2023-02-17/tokens.txt --encoder=./sherpa-onnx-streaming-zipformer-en-20M-2023-02-17/encoder-epoch-99-avg-1.onnx --decoder=./sherpa-onnx-streaming-zipformer-en-20M-2023-02-17/decoder-epoch-99-avg-1.onnx --joiner=./sherpa-onnx-streaming-zipformer-en-20M-2023-02-17/joiner-epoch-99-avg-1.onnx ./sherpa-onnx-streaming-zipformer-en-20M-2023-02-17/test_wavs/0.wav + +OnlineRecognizerConfig(feat_config=FeatureExtractorConfig(sampling_rate=16000, feature_dim=80), model_config=OnlineModelConfig(transducer=OnlineTransducerModelConfig(encoder="./sherpa-onnx-streaming-zipformer-en-20M-2023-02-17/encoder-epoch-99-avg-1.onnx", decoder="./sherpa-onnx-streaming-zipformer-en-20M-2023-02-17/decoder-epoch-99-avg-1.onnx", joiner="./sherpa-onnx-streaming-zipformer-en-20M-2023-02-17/joiner-epoch-99-avg-1.onnx"), paraformer=OnlineParaformerModelConfig(encoder="", decoder=""), tokens="./sherpa-onnx-streaming-zipformer-en-20M-2023-02-17/tokens.txt", num_threads=1, debug=False, provider="cpu", model_type=""), lm_config=OnlineLMConfig(model="", scale=0.5), endpoint_config=EndpointConfig(rule1=EndpointRule(must_contain_nonsilence=False, min_trailing_silence=2.4, min_utterance_length=0), rule2=EndpointRule(must_contain_nonsilence=True, min_trailing_silence=1.2, min_utterance_length=0), rule3=EndpointRule(must_contain_nonsilence=False, min_trailing_silence=0, min_utterance_length=20)), enable_endpoint=True, max_active_paths=4, context_score=1.5, decoding_method="greedy_search") +./sherpa-onnx-streaming-zipformer-en-20M-2023-02-17/test_wavs/0.wav +Elapsed seconds: 0.32, Real time factor (RTF): 0.049 + THE YELLOW LAMPS WOULD LIGHT UP HERE AND THERE THE SQUALID QUARTER OF THE BRAFFLELS +{"is_final":false,"segment":0,"start_time":0.0,"text":" THE YELLOW LAMPS WOULD LIGHT UP HERE AND THERE THE SQUALID QUARTER OF THE BRAFFLELS","timestamps":"[2.04, 2.16, 2.28, 2.36, 2.52, 2.64, 2.68, 2.76, 2.92, 3.08, 3.40, 3.60, 3.72, 3.88, 4.12, 4.48, 4.64, 4.68, 4.84, 4.96, 5.16, 5.20, 5.32, 5.36, 5.60, 5.72, 5.92, 5.96, 6.08, 6.24, 6.36, 6.52]","tokens":[" THE"," YE","LL","OW"," LA","M","P","S"," WOULD"," LIGHT"," UP"," HE","RE"," AND"," THERE"," THE"," S","QUA","LI","D"," ","QUA","R","TER"," OF"," THE"," B","RA","FF","L","EL","S"]} + diff --git 
a/docs/source/onnx/pretrained_models/online-transducer/code-zipformer/sherpa-onnx-streaming-zipformer-fr-2023-04-14-int8.txt b/docs/source/onnx/pretrained_models/online-transducer/code-zipformer/sherpa-onnx-streaming-zipformer-fr-2023-04-14-int8.txt new file mode 100644 index 000000000..b52844bdc --- /dev/null +++ b/docs/source/onnx/pretrained_models/online-transducer/code-zipformer/sherpa-onnx-streaming-zipformer-fr-2023-04-14-int8.txt @@ -0,0 +1,12 @@ +OnlineRecognizerConfig(feat_config=FeatureExtractorConfig(sampling_rate=16000, feature_dim=80), model_config=OnlineTransducerModelConfig(encoder_filename="./sherpa-onnx-streaming-zipformer-fr-2023-04-14/encoder-epoch-29-avg-9-with-averaged-model.int8.onnx", decoder_filename="./sherpa-onnx-streaming-zipformer-fr-2023-04-14/decoder-epoch-29-avg-9-with-averaged-model.onnx", joiner_filename="./sherpa-onnx-streaming-zipformer-fr-2023-04-14/joiner-epoch-29-avg-9-with-averaged-model.int8.onnx", tokens="./sherpa-onnx-streaming-zipformer-fr-2023-04-14/tokens.txt", num_threads=2, debug=False), endpoint_config=EndpointConfig(rule1=EndpointRule(must_contain_nonsilence=False, min_trailing_silence=2.4, min_utterance_length=0), rule2=EndpointRule(must_contain_nonsilence=True, min_trailing_silence=1.2, min_utterance_length=0), rule3=EndpointRule(must_contain_nonsilence=False, min_trailing_silence=0, min_utterance_length=20)), enable_endpoint=True, max_active_paths=4, decoding_method="greedy_search") +sampling rate of input file: 16000 +wav filename: ./sherpa-onnx-streaming-zipformer-fr-2023-04-14/test_wavs/common_voice_fr_19364697.wav +wav duration (s): 7.128 +Started +Done! +Recognition result for ./sherpa-onnx-streaming-zipformer-fr-2023-04-14/test_wavs/common_voice_fr_19364697.wav: + CE SITE CONTIENT QUATRE TOMBEAUX DE LA DYNASTIE ASHÉMÉNIDE ET SEPT DES SASSANDIDES +num threads: 2 +decoding method: greedy_search +Elapsed seconds: 0.485 s +Real time factor (RTF): 0.485 / 7.128 = 0.068 diff --git a/docs/source/onnx/pretrained_models/online-transducer/code-zipformer/sherpa-onnx-streaming-zipformer-fr-2023-04-14.txt b/docs/source/onnx/pretrained_models/online-transducer/code-zipformer/sherpa-onnx-streaming-zipformer-fr-2023-04-14.txt new file mode 100644 index 000000000..dc0f919d5 --- /dev/null +++ b/docs/source/onnx/pretrained_models/online-transducer/code-zipformer/sherpa-onnx-streaming-zipformer-fr-2023-04-14.txt @@ -0,0 +1,12 @@ +OnlineRecognizerConfig(feat_config=FeatureExtractorConfig(sampling_rate=16000, feature_dim=80), model_config=OnlineTransducerModelConfig(encoder_filename="./sherpa-onnx-streaming-zipformer-fr-2023-04-14/encoder-epoch-29-avg-9-with-averaged-model.onnx", decoder_filename="./sherpa-onnx-streaming-zipformer-fr-2023-04-14/decoder-epoch-29-avg-9-with-averaged-model.onnx", joiner_filename="./sherpa-onnx-streaming-zipformer-fr-2023-04-14/joiner-epoch-29-avg-9-with-averaged-model.onnx", tokens="./sherpa-onnx-streaming-zipformer-fr-2023-04-14/tokens.txt", num_threads=2, debug=False), endpoint_config=EndpointConfig(rule1=EndpointRule(must_contain_nonsilence=False, min_trailing_silence=2.4, min_utterance_length=0), rule2=EndpointRule(must_contain_nonsilence=True, min_trailing_silence=1.2, min_utterance_length=0), rule3=EndpointRule(must_contain_nonsilence=False, min_trailing_silence=0, min_utterance_length=20)), enable_endpoint=True, max_active_paths=4, decoding_method="greedy_search") +sampling rate of input file: 16000 +wav filename: ./sherpa-onnx-streaming-zipformer-fr-2023-04-14/test_wavs/common_voice_fr_19364697.wav +wav duration 
(s): 7.128 +Started +Done! +Recognition result for ./sherpa-onnx-streaming-zipformer-fr-2023-04-14/test_wavs/common_voice_fr_19364697.wav: + CE SITE CONTIENT QUATRE TOMBEAUX DE LA DYNASTIE ASHÉMÉNIDE ET SEPT DES SASSANDIDES +num threads: 2 +decoding method: greedy_search +Elapsed seconds: 0.458 s +Real time factor (RTF): 0.458 / 7.128 = 0.064 diff --git a/docs/source/onnx/pretrained_models/online-transducer/code-zipformer/sherpa-onnx-streaming-zipformer-korean-2024-06-16-int8.txt b/docs/source/onnx/pretrained_models/online-transducer/code-zipformer/sherpa-onnx-streaming-zipformer-korean-2024-06-16-int8.txt new file mode 100644 index 000000000..4b03056e3 --- /dev/null +++ b/docs/source/onnx/pretrained_models/online-transducer/code-zipformer/sherpa-onnx-streaming-zipformer-korean-2024-06-16-int8.txt @@ -0,0 +1,8 @@ +/Users/fangjun/open-source/sherpa-onnx/sherpa-onnx/csrc/parse-options.cc:Read:361 ./build/bin/sherpa-onnx --tokens=./sherpa-onnx-streaming-zipformer-korean-2024-06-16/tokens.txt --encoder=./sherpa-onnx-streaming-zipformer-korean-2024-06-16/encoder-epoch-99-avg-1.int8.onnx --decoder=./sherpa-onnx-streaming-zipformer-korean-2024-06-16/decoder-epoch-99-avg-1.onnx --joiner=./sherpa-onnx-streaming-zipformer-korean-2024-06-16/joiner-epoch-99-avg-1.int8.onnx ./sherpa-onnx-streaming-zipformer-korean-2024-06-16/test_wavs/0.wav + +OnlineRecognizerConfig(feat_config=FeatureExtractorConfig(sampling_rate=16000, feature_dim=80, low_freq=20, high_freq=-400, dither=0), model_config=OnlineModelConfig(transducer=OnlineTransducerModelConfig(encoder="./sherpa-onnx-streaming-zipformer-korean-2024-06-16/encoder-epoch-99-avg-1.int8.onnx", decoder="./sherpa-onnx-streaming-zipformer-korean-2024-06-16/decoder-epoch-99-avg-1.onnx", joiner="./sherpa-onnx-streaming-zipformer-korean-2024-06-16/joiner-epoch-99-avg-1.int8.onnx"), paraformer=OnlineParaformerModelConfig(encoder="", decoder=""), wenet_ctc=OnlineWenetCtcModelConfig(model="", chunk_size=16, num_left_chunks=4), zipformer2_ctc=OnlineZipformer2CtcModelConfig(model=""), nemo_ctc=OnlineNeMoCtcModelConfig(model=""), tokens="./sherpa-onnx-streaming-zipformer-korean-2024-06-16/tokens.txt", num_threads=1, warm_up=0, debug=False, provider="cpu", model_type="", modeling_unit="cjkchar", bpe_vocab=""), lm_config=OnlineLMConfig(model="", scale=0.5), endpoint_config=EndpointConfig(rule1=EndpointRule(must_contain_nonsilence=False, min_trailing_silence=2.4, min_utterance_length=0), rule2=EndpointRule(must_contain_nonsilence=True, min_trailing_silence=1.2, min_utterance_length=0), rule3=EndpointRule(must_contain_nonsilence=False, min_trailing_silence=0, min_utterance_length=20)), ctc_fst_decoder_config=OnlineCtcFstDecoderConfig(graph="", max_active=3000), enable_endpoint=True, max_active_paths=4, hotwords_score=1.5, hotwords_file="", decoding_method="greedy_search", blank_penalty=0, temperature_scale=2) +./sherpa-onnx-streaming-zipformer-korean-2024-06-16/test_wavs/0.wav +Elapsed seconds: 0.37, Real time factor (RTF): 0.1 + 걔는 괜찮은 척하려구 애 쓰는 거 같았다 +{ "text": " 걔는 괜찮은 척하려구 애 쓰는 거 같았다", "tokens": [" 걔는", " 괜찮은", " 척", "하", "려", "구", " 애", " 쓰는", " 거", " 같", "았", "다"], "timestamps": [0.52, 0.96, 1.28, 1.44, 1.52, 1.84, 2.28, 2.48, 2.88, 3.04, 3.20, 3.44], "ys_probs": [-1.750286, -0.241571, -0.621155, -1.862032, -1.977561, -0.789718, -1.002497, -1.627276, -0.554654, -0.443969, -0.852731, -0.218611], "lm_probs": [], "context_scores": [], "segment": 0, "words": [], "start_time": 0.00, "is_final": false} + diff --git 
a/docs/source/onnx/pretrained_models/online-transducer/code-zipformer/sherpa-onnx-streaming-zipformer-korean-2024-06-16.txt b/docs/source/onnx/pretrained_models/online-transducer/code-zipformer/sherpa-onnx-streaming-zipformer-korean-2024-06-16.txt new file mode 100644 index 000000000..becd8d482 --- /dev/null +++ b/docs/source/onnx/pretrained_models/online-transducer/code-zipformer/sherpa-onnx-streaming-zipformer-korean-2024-06-16.txt @@ -0,0 +1,8 @@ +/Users/fangjun/open-source/sherpa-onnx/sherpa-onnx/csrc/parse-options.cc:Read:361 ./build/bin/sherpa-onnx --tokens=./sherpa-onnx-streaming-zipformer-korean-2024-06-16/tokens.txt --encoder=./sherpa-onnx-streaming-zipformer-korean-2024-06-16/encoder-epoch-99-avg-1.onnx --decoder=./sherpa-onnx-streaming-zipformer-korean-2024-06-16/decoder-epoch-99-avg-1.onnx --joiner=./sherpa-onnx-streaming-zipformer-korean-2024-06-16/joiner-epoch-99-avg-1.onnx ./sherpa-onnx-streaming-zipformer-korean-2024-06-16/test_wavs/0.wav + +OnlineRecognizerConfig(feat_config=FeatureExtractorConfig(sampling_rate=16000, feature_dim=80, low_freq=20, high_freq=-400, dither=0), model_config=OnlineModelConfig(transducer=OnlineTransducerModelConfig(encoder="./sherpa-onnx-streaming-zipformer-korean-2024-06-16/encoder-epoch-99-avg-1.onnx", decoder="./sherpa-onnx-streaming-zipformer-korean-2024-06-16/decoder-epoch-99-avg-1.onnx", joiner="./sherpa-onnx-streaming-zipformer-korean-2024-06-16/joiner-epoch-99-avg-1.onnx"), paraformer=OnlineParaformerModelConfig(encoder="", decoder=""), wenet_ctc=OnlineWenetCtcModelConfig(model="", chunk_size=16, num_left_chunks=4), zipformer2_ctc=OnlineZipformer2CtcModelConfig(model=""), nemo_ctc=OnlineNeMoCtcModelConfig(model=""), tokens="./sherpa-onnx-streaming-zipformer-korean-2024-06-16/tokens.txt", num_threads=1, warm_up=0, debug=False, provider="cpu", model_type="", modeling_unit="cjkchar", bpe_vocab=""), lm_config=OnlineLMConfig(model="", scale=0.5), endpoint_config=EndpointConfig(rule1=EndpointRule(must_contain_nonsilence=False, min_trailing_silence=2.4, min_utterance_length=0), rule2=EndpointRule(must_contain_nonsilence=True, min_trailing_silence=1.2, min_utterance_length=0), rule3=EndpointRule(must_contain_nonsilence=False, min_trailing_silence=0, min_utterance_length=20)), ctc_fst_decoder_config=OnlineCtcFstDecoderConfig(graph="", max_active=3000), enable_endpoint=True, max_active_paths=4, hotwords_score=1.5, hotwords_file="", decoding_method="greedy_search", blank_penalty=0, temperature_scale=2) +./sherpa-onnx-streaming-zipformer-korean-2024-06-16/test_wavs/0.wav +Elapsed seconds: 0.56, Real time factor (RTF): 0.16 + 걔는 괜찮은 척하려구 애 쓰는 거 같았다 +{ "text": " 걔는 괜찮은 척하려구 애 쓰는 거 같았다", "tokens": [" 걔는", " 괜찮은", " 척", "하", "려", "구", " 애", " 쓰는", " 거", " 같", "았", "다"], "timestamps": [0.52, 0.96, 1.28, 1.44, 1.52, 1.84, 2.28, 2.48, 2.88, 3.04, 3.20, 3.44], "ys_probs": [-1.701665, -0.208116, -0.527190, -1.777411, -1.853504, -0.768175, -1.029222, -1.657714, -0.514807, -0.360788, -0.842238, -0.218511], "lm_probs": [], "context_scores": [], "segment": 0, "words": [], "start_time": 0.00, "is_final": false} + diff --git a/docs/source/onnx/pretrained_models/online-transducer/code-zipformer/sherpa-onnx-streaming-zipformer-multi-zh-hans-2023-12-12-int8.txt b/docs/source/onnx/pretrained_models/online-transducer/code-zipformer/sherpa-onnx-streaming-zipformer-multi-zh-hans-2023-12-12-int8.txt new file mode 100644 index 000000000..da09a3e4f --- /dev/null +++ 
b/docs/source/onnx/pretrained_models/online-transducer/code-zipformer/sherpa-onnx-streaming-zipformer-multi-zh-hans-2023-12-12-int8.txt @@ -0,0 +1,8 @@ +/Users/fangjun/open-source/sherpa-onnx/sherpa-onnx/csrc/parse-options.cc:Read:361 ./build/bin/sherpa-onnx --tokens=./sherpa-onnx-streaming-zipformer-multi-zh-hans-2023-12-12/tokens.txt --encoder=./sherpa-onnx-streaming-zipformer-multi-zh-hans-2023-12-12/encoder-epoch-20-avg-1-chunk-16-left-128.int8.onnx --decoder=./sherpa-onnx-streaming-zipformer-multi-zh-hans-2023-12-12/decoder-epoch-20-avg-1-chunk-16-left-128.onnx --joiner=./sherpa-onnx-streaming-zipformer-multi-zh-hans-2023-12-12/joiner-epoch-20-avg-1-chunk-16-left-128.int8.onnx ./sherpa-onnx-streaming-zipformer-multi-zh-hans-2023-12-12/test_wavs/DEV_T0000000000.wav + +OnlineRecognizerConfig(feat_config=FeatureExtractorConfig(sampling_rate=16000, feature_dim=80), model_config=OnlineModelConfig(transducer=OnlineTransducerModelConfig(encoder="./sherpa-onnx-streaming-zipformer-multi-zh-hans-2023-12-12/encoder-epoch-20-avg-1-chunk-16-left-128.int8.onnx", decoder="./sherpa-onnx-streaming-zipformer-multi-zh-hans-2023-12-12/decoder-epoch-20-avg-1-chunk-16-left-128.onnx", joiner="./sherpa-onnx-streaming-zipformer-multi-zh-hans-2023-12-12/joiner-epoch-20-avg-1-chunk-16-left-128.int8.onnx"), paraformer=OnlineParaformerModelConfig(encoder="", decoder=""), wenet_ctc=OnlineWenetCtcModelConfig(model="", chunk_size=16, num_left_chunks=4), tokens="./sherpa-onnx-streaming-zipformer-multi-zh-hans-2023-12-12/tokens.txt", num_threads=1, debug=False, provider="cpu", model_type=""), lm_config=OnlineLMConfig(model="", scale=0.5), endpoint_config=EndpointConfig(rule1=EndpointRule(must_contain_nonsilence=False, min_trailing_silence=2.4, min_utterance_length=0), rule2=EndpointRule(must_contain_nonsilence=True, min_trailing_silence=1.2, min_utterance_length=0), rule3=EndpointRule(must_contain_nonsilence=False, min_trailing_silence=0, min_utterance_length=20)), enable_endpoint=True, max_active_paths=4, hotwords_score=1.5, hotwords_file="", decoding_method="greedy_search") +./sherpa-onnx-streaming-zipformer-multi-zh-hans-2023-12-12/test_wavs/DEV_T0000000000.wav +Elapsed seconds: 0.5, Real time factor (RTF): 0.088 + 对我做了介绍那么我想说的是大家如果对我的研究感兴趣 +{"is_final":false, "segment":0, "start_time":0.00, "text": " 对我做了介绍那么我想说的是大家如果对我的研究感兴趣", "timestamps": [0.32, 0.64, 0.76, 0.84, 1.04, 1.24, 1.96, 2.04, 2.24, 2.36, 2.56, 2.68, 2.88, 3.28, 3.40, 3.60, 3.72, 3.84, 3.96, 4.04, 4.16, 4.28, 4.36, 4.60, 4.72], "tokens":[" 对", "我", "做", "了", "介", "绍", "那", "么", "我", "想", "说", "的", "是", "大", "家", "如", "果", "对", "我", "的", "研", "究", "感", "兴", "趣"]} + diff --git a/docs/source/onnx/pretrained_models/online-transducer/code-zipformer/sherpa-onnx-streaming-zipformer-multi-zh-hans-2023-12-12.txt b/docs/source/onnx/pretrained_models/online-transducer/code-zipformer/sherpa-onnx-streaming-zipformer-multi-zh-hans-2023-12-12.txt new file mode 100644 index 000000000..62e0520e9 --- /dev/null +++ b/docs/source/onnx/pretrained_models/online-transducer/code-zipformer/sherpa-onnx-streaming-zipformer-multi-zh-hans-2023-12-12.txt @@ -0,0 +1,8 @@ +/Users/fangjun/open-source/sherpa-onnx/sherpa-onnx/csrc/parse-options.cc:Read:361 ./build/bin/sherpa-onnx --tokens=./sherpa-onnx-streaming-zipformer-multi-zh-hans-2023-12-12/tokens.txt --encoder=./sherpa-onnx-streaming-zipformer-multi-zh-hans-2023-12-12/encoder-epoch-20-avg-1-chunk-16-left-128.onnx --decoder=./sherpa-onnx-streaming-zipformer-multi-zh-hans-2023-12-12/decoder-epoch-20-avg-1-chunk-16-left-128.onnx 
--joiner=./sherpa-onnx-streaming-zipformer-multi-zh-hans-2023-12-12/joiner-epoch-20-avg-1-chunk-16-left-128.onnx ./sherpa-onnx-streaming-zipformer-multi-zh-hans-2023-12-12/test_wavs/DEV_T0000000000.wav + +OnlineRecognizerConfig(feat_config=FeatureExtractorConfig(sampling_rate=16000, feature_dim=80), model_config=OnlineModelConfig(transducer=OnlineTransducerModelConfig(encoder="./sherpa-onnx-streaming-zipformer-multi-zh-hans-2023-12-12/encoder-epoch-20-avg-1-chunk-16-left-128.onnx", decoder="./sherpa-onnx-streaming-zipformer-multi-zh-hans-2023-12-12/decoder-epoch-20-avg-1-chunk-16-left-128.onnx", joiner="./sherpa-onnx-streaming-zipformer-multi-zh-hans-2023-12-12/joiner-epoch-20-avg-1-chunk-16-left-128.onnx"), paraformer=OnlineParaformerModelConfig(encoder="", decoder=""), wenet_ctc=OnlineWenetCtcModelConfig(model="", chunk_size=16, num_left_chunks=4), tokens="./sherpa-onnx-streaming-zipformer-multi-zh-hans-2023-12-12/tokens.txt", num_threads=1, debug=False, provider="cpu", model_type=""), lm_config=OnlineLMConfig(model="", scale=0.5), endpoint_config=EndpointConfig(rule1=EndpointRule(must_contain_nonsilence=False, min_trailing_silence=2.4, min_utterance_length=0), rule2=EndpointRule(must_contain_nonsilence=True, min_trailing_silence=1.2, min_utterance_length=0), rule3=EndpointRule(must_contain_nonsilence=False, min_trailing_silence=0, min_utterance_length=20)), enable_endpoint=True, max_active_paths=4, hotwords_score=1.5, hotwords_file="", decoding_method="greedy_search") +./sherpa-onnx-streaming-zipformer-multi-zh-hans-2023-12-12/test_wavs/DEV_T0000000000.wav +Elapsed seconds: 0.65, Real time factor (RTF): 0.12 + 对我做了介绍那么我想说的是大家如果对我的研究感兴趣 +{"is_final":false, "segment":0, "start_time":0.00, "text": " 对我做了介绍那么我想说的是大家如果对我的研究感兴趣", "timestamps": [0.32, 0.64, 0.76, 0.84, 1.08, 1.24, 1.96, 2.04, 2.24, 2.36, 2.56, 2.68, 2.80, 3.28, 3.40, 3.60, 3.72, 3.84, 3.96, 4.04, 4.16, 4.28, 4.36, 4.60, 4.72], "tokens":[" 对", "我", "做", "了", "介", "绍", "那", "么", "我", "想", "说", "的", "是", "大", "家", "如", "果", "对", "我", "的", "研", "究", "感", "兴", "趣"]} + diff --git a/docs/source/onnx/pretrained_models/online-transducer/code-zipformer/sherpa-onnx-streaming-zipformer-small-bilingual-zh-en-2023-02-16.int8.txt b/docs/source/onnx/pretrained_models/online-transducer/code-zipformer/sherpa-onnx-streaming-zipformer-small-bilingual-zh-en-2023-02-16.int8.txt new file mode 100644 index 000000000..09c6d783e --- /dev/null +++ b/docs/source/onnx/pretrained_models/online-transducer/code-zipformer/sherpa-onnx-streaming-zipformer-small-bilingual-zh-en-2023-02-16.int8.txt @@ -0,0 +1,7 @@ +/project/sherpa-onnx/csrc/parse-options.cc:Read:361 sherpa-onnx --tokens=./sherpa-onnx-streaming-zipformer-small-bilingual-zh-en-2023-02-16/tokens.txt --encoder=./sherpa-onnx-streaming-zipformer-small-bilingual-zh-en-2023-02-16/encoder-epoch-99-avg-1.int8.onnx --decoder=./sherpa-onnx-streaming-zipformer-small-bilingual-zh-en-2023-02-16/decoder-epoch-99-avg-1.onnx --joiner=./sherpa-onnx-streaming-zipformer-small-bilingual-zh-en-2023-02-16/joiner-epoch-99-avg-1.int8.onnx ./sherpa-onnx-streaming-zipformer-small-bilingual-zh-en-2023-02-16/test_wavs/0.wav + +OnlineRecognizerConfig(feat_config=FeatureExtractorConfig(sampling_rate=16000, feature_dim=80), model_config=OnlineModelConfig(transducer=OnlineTransducerModelConfig(encoder="./sherpa-onnx-streaming-zipformer-small-bilingual-zh-en-2023-02-16/encoder-epoch-99-avg-1.int8.onnx", decoder="./sherpa-onnx-streaming-zipformer-small-bilingual-zh-en-2023-02-16/decoder-epoch-99-avg-1.onnx", 
joiner="./sherpa-onnx-streaming-zipformer-small-bilingual-zh-en-2023-02-16/joiner-epoch-99-avg-1.int8.onnx"), paraformer=OnlineParaformerModelConfig(encoder="", decoder=""), wenet_ctc=OnlineWenetCtcModelConfig(model="", chunk_size=16, num_left_chunks=4), zipformer2_ctc=OnlineZipformer2CtcModelConfig(model=""), tokens="./sherpa-onnx-streaming-zipformer-small-bilingual-zh-en-2023-02-16/tokens.txt", num_threads=1, debug=False, provider="cpu", model_type=""), lm_config=OnlineLMConfig(model="", scale=0.5), endpoint_config=EndpointConfig(rule1=EndpointRule(must_contain_nonsilence=False, min_trailing_silence=2.4, min_utterance_length=0), rule2=EndpointRule(must_contain_nonsilence=True, min_trailing_silence=1.2, min_utterance_length=0), rule3=EndpointRule(must_contain_nonsilence=False, min_trailing_silence=0, min_utterance_length=20)), enable_endpoint=True, max_active_paths=4, hotwords_score=1.5, hotwords_file="", decoding_method="greedy_search", blank_penalty=0) +./sherpa-onnx-streaming-zipformer-small-bilingual-zh-en-2023-02-16/test_wavs/0.wav +Elapsed seconds: 0.69, Real time factor (RTF): 0.069 +昨天是 MONDAY TODAY IS THEY AFTER TOMORROW是星期三 +{ "text": "昨天是 MONDAY TODAY IS THEY AFTER TOMORROW是星期三", "tokens": [ "昨", "天", "是", " MO", "N", "DAY", " TO", "DAY", " IS", " THEY", " AFTER", " TO", "M", "OR", "ROW", "是", "星", "期", "三" ], "timestamps": [ 0.64, 1.08, 1.64, 2.08, 2.20, 2.36, 4.20, 4.36, 5.12, 7.16, 7.44, 8.00, 8.12, 8.20, 8.40, 9.04, 9.44, 9.64, 9.88 ], "ys_probs": [ -0.000305, -0.152557, -0.007835, -0.156221, -0.622139, -0.081843, -1.140152, -0.418322, -0.198410, -0.939461, -0.224989, -0.052963, -0.098366, -0.081665, -0.453255, -0.335670, -0.039482, -0.381765, -0.192475 ], "lm_probs": [ ], "context_scores": [ ], "segment": 0, "start_time": 0.00, "is_final": false} diff --git a/docs/source/onnx/pretrained_models/online-transducer/code-zipformer/sherpa-onnx-streaming-zipformer-small-bilingual-zh-en-2023-02-16.txt b/docs/source/onnx/pretrained_models/online-transducer/code-zipformer/sherpa-onnx-streaming-zipformer-small-bilingual-zh-en-2023-02-16.txt new file mode 100644 index 000000000..d0ef8b613 --- /dev/null +++ b/docs/source/onnx/pretrained_models/online-transducer/code-zipformer/sherpa-onnx-streaming-zipformer-small-bilingual-zh-en-2023-02-16.txt @@ -0,0 +1,7 @@ +/project/sherpa-onnx/csrc/parse-options.cc:Read:361 sherpa-onnx --tokens=./sherpa-onnx-streaming-zipformer-small-bilingual-zh-en-2023-02-16/tokens.txt --encoder=./sherpa-onnx-streaming-zipformer-small-bilingual-zh-en-2023-02-16/encoder-epoch-99-avg-1.onnx --decoder=./sherpa-onnx-streaming-zipformer-small-bilingual-zh-en-2023-02-16/decoder-epoch-99-avg-1.onnx --joiner=./sherpa-onnx-streaming-zipformer-small-bilingual-zh-en-2023-02-16/joiner-epoch-99-avg-1.onnx ./sherpa-onnx-streaming-zipformer-small-bilingual-zh-en-2023-02-16/test_wavs/0.wav + +OnlineRecognizerConfig(feat_config=FeatureExtractorConfig(sampling_rate=16000, feature_dim=80), model_config=OnlineModelConfig(transducer=OnlineTransducerModelConfig(encoder="./sherpa-onnx-streaming-zipformer-small-bilingual-zh-en-2023-02-16/encoder-epoch-99-avg-1.onnx", decoder="./sherpa-onnx-streaming-zipformer-small-bilingual-zh-en-2023-02-16/decoder-epoch-99-avg-1.onnx", joiner="./sherpa-onnx-streaming-zipformer-small-bilingual-zh-en-2023-02-16/joiner-epoch-99-avg-1.onnx"), paraformer=OnlineParaformerModelConfig(encoder="", decoder=""), wenet_ctc=OnlineWenetCtcModelConfig(model="", chunk_size=16, num_left_chunks=4), zipformer2_ctc=OnlineZipformer2CtcModelConfig(model=""), 
tokens="./sherpa-onnx-streaming-zipformer-small-bilingual-zh-en-2023-02-16/tokens.txt", num_threads=1, debug=False, provider="cpu", model_type=""), lm_config=OnlineLMConfig(model="", scale=0.5), endpoint_config=EndpointConfig(rule1=EndpointRule(must_contain_nonsilence=False, min_trailing_silence=2.4, min_utterance_length=0), rule2=EndpointRule(must_contain_nonsilence=True, min_trailing_silence=1.2, min_utterance_length=0), rule3=EndpointRule(must_contain_nonsilence=False, min_trailing_silence=0, min_utterance_length=20)), enable_endpoint=True, max_active_paths=4, hotwords_score=1.5, hotwords_file="", decoding_method="greedy_search", blank_penalty=0) +./sherpa-onnx-streaming-zipformer-small-bilingual-zh-en-2023-02-16/test_wavs/0.wav +Elapsed seconds: 1, Real time factor (RTF): 0.1 +昨天是 MONDAY TODAY IS THEY AFTER TOMORROW是星期三 +{ "text": "昨天是 MONDAY TODAY IS THEY AFTER TOMORROW是星期三", "tokens": [ "昨", "天", "是", " MO", "N", "DAY", " TO", "DAY", " IS", " THEY", " AFTER", " TO", "M", "OR", "ROW", "是", "星", "期", "三" ], "timestamps": [ 0.64, 1.08, 1.64, 2.08, 2.20, 2.36, 4.16, 4.36, 5.12, 7.16, 7.44, 8.00, 8.12, 8.20, 8.44, 9.08, 9.44, 9.64, 9.88 ], "ys_probs": [ -0.000507, -0.056152, -0.007374, -0.213242, -0.362640, -0.117561, -1.036179, -0.219900, -0.150360, -0.734749, -0.113281, -0.060974, -0.117775, -0.361603, -0.039993, -0.217766, -0.042011, -0.108857, -0.135108 ], "lm_probs": [ ], "context_scores": [ ], "segment": 0, "start_time": 0.00, "is_final": false} diff --git a/docs/source/onnx/pretrained_models/online-transducer/code-zipformer/sherpa-onnx-streaming-zipformer-zh-14M-2023-02-23-int8.txt b/docs/source/onnx/pretrained_models/online-transducer/code-zipformer/sherpa-onnx-streaming-zipformer-zh-14M-2023-02-23-int8.txt new file mode 100644 index 000000000..7431b6dd1 --- /dev/null +++ b/docs/source/onnx/pretrained_models/online-transducer/code-zipformer/sherpa-onnx-streaming-zipformer-zh-14M-2023-02-23-int8.txt @@ -0,0 +1,8 @@ +/Users/fangjun/open-source/sherpa-onnx/sherpa-onnx/csrc/parse-options.cc:Read:361 ./build/bin/sherpa-onnx --tokens=./sherpa-onnx-streaming-zipformer-zh-14M-2023-02-23/tokens.txt --encoder=./sherpa-onnx-streaming-zipformer-zh-14M-2023-02-23/encoder-epoch-99-avg-1.int8.onnx --decoder=./sherpa-onnx-streaming-zipformer-zh-14M-2023-02-23/decoder-epoch-99-avg-1.onnx --joiner=./sherpa-onnx-streaming-zipformer-zh-14M-2023-02-23/joiner-epoch-99-avg-1.int8.onnx ./sherpa-onnx-streaming-zipformer-zh-14M-2023-02-23/test_wavs/0.wav + +OnlineRecognizerConfig(feat_config=FeatureExtractorConfig(sampling_rate=16000, feature_dim=80), model_config=OnlineModelConfig(transducer=OnlineTransducerModelConfig(encoder="./sherpa-onnx-streaming-zipformer-zh-14M-2023-02-23/encoder-epoch-99-avg-1.int8.onnx", decoder="./sherpa-onnx-streaming-zipformer-zh-14M-2023-02-23/decoder-epoch-99-avg-1.onnx", joiner="./sherpa-onnx-streaming-zipformer-zh-14M-2023-02-23/joiner-epoch-99-avg-1.int8.onnx"), paraformer=OnlineParaformerModelConfig(encoder="", decoder=""), tokens="./sherpa-onnx-streaming-zipformer-zh-14M-2023-02-23/tokens.txt", num_threads=1, debug=False, provider="cpu", model_type=""), lm_config=OnlineLMConfig(model="", scale=0.5), endpoint_config=EndpointConfig(rule1=EndpointRule(must_contain_nonsilence=False, min_trailing_silence=2.4, min_utterance_length=0), rule2=EndpointRule(must_contain_nonsilence=True, min_trailing_silence=1.2, min_utterance_length=0), rule3=EndpointRule(must_contain_nonsilence=False, min_trailing_silence=0, min_utterance_length=20)), enable_endpoint=True, max_active_paths=4, 
context_score=1.5, decoding_method="greedy_search") +./sherpa-onnx-streaming-zipformer-zh-14M-2023-02-23/test_wavs/0.wav +Elapsed seconds: 0.16, Real time factor (RTF): 0.028 +对我做了介绍那么我想说的是大家如果对我的研究感兴趣 +{"is_final":false,"segment":0,"start_time":0.0,"text":"对我做了介绍那么我想说的是大家如果对我的研究感兴趣","timestamps":"[0.32, 0.64, 0.76, 0.96, 1.08, 1.16, 1.92, 2.04, 2.24, 2.36, 2.56, 2.68, 2.76, 3.36, 3.52, 3.64, 3.72, 3.84, 3.92, 4.00, 4.08, 4.24, 4.48, 4.56, 4.72]","tokens":["对","我","做","了","介","绍","那","么","我","想","说","的","是","大","家","如","果","对","我","的","研","究","感","兴","趣"]} + diff --git a/docs/source/onnx/pretrained_models/online-transducer/code-zipformer/sherpa-onnx-streaming-zipformer-zh-14M-2023-02-23.txt b/docs/source/onnx/pretrained_models/online-transducer/code-zipformer/sherpa-onnx-streaming-zipformer-zh-14M-2023-02-23.txt new file mode 100644 index 000000000..bd35cd40e --- /dev/null +++ b/docs/source/onnx/pretrained_models/online-transducer/code-zipformer/sherpa-onnx-streaming-zipformer-zh-14M-2023-02-23.txt @@ -0,0 +1,8 @@ +/Users/fangjun/open-source/sherpa-onnx/sherpa-onnx/csrc/parse-options.cc:Read:361 ./build/bin/sherpa-onnx --tokens=./sherpa-onnx-streaming-zipformer-zh-14M-2023-02-23/tokens.txt --encoder=./sherpa-onnx-streaming-zipformer-zh-14M-2023-02-23/encoder-epoch-99-avg-1.onnx --decoder=./sherpa-onnx-streaming-zipformer-zh-14M-2023-02-23/decoder-epoch-99-avg-1.onnx --joiner=./sherpa-onnx-streaming-zipformer-zh-14M-2023-02-23/joiner-epoch-99-avg-1.onnx ./sherpa-onnx-streaming-zipformer-zh-14M-2023-02-23/test_wavs/0.wav + +OnlineRecognizerConfig(feat_config=FeatureExtractorConfig(sampling_rate=16000, feature_dim=80), model_config=OnlineModelConfig(transducer=OnlineTransducerModelConfig(encoder="./sherpa-onnx-streaming-zipformer-zh-14M-2023-02-23/encoder-epoch-99-avg-1.onnx", decoder="./sherpa-onnx-streaming-zipformer-zh-14M-2023-02-23/decoder-epoch-99-avg-1.onnx", joiner="./sherpa-onnx-streaming-zipformer-zh-14M-2023-02-23/joiner-epoch-99-avg-1.onnx"), paraformer=OnlineParaformerModelConfig(encoder="", decoder=""), tokens="./sherpa-onnx-streaming-zipformer-zh-14M-2023-02-23/tokens.txt", num_threads=1, debug=False, provider="cpu", model_type=""), lm_config=OnlineLMConfig(model="", scale=0.5), endpoint_config=EndpointConfig(rule1=EndpointRule(must_contain_nonsilence=False, min_trailing_silence=2.4, min_utterance_length=0), rule2=EndpointRule(must_contain_nonsilence=True, min_trailing_silence=1.2, min_utterance_length=0), rule3=EndpointRule(must_contain_nonsilence=False, min_trailing_silence=0, min_utterance_length=20)), enable_endpoint=True, max_active_paths=4, context_score=1.5, decoding_method="greedy_search") +./sherpa-onnx-streaming-zipformer-zh-14M-2023-02-23/test_wavs/0.wav +Elapsed seconds: 0.21, Real time factor (RTF): 0.038 +对我做了介绍那么我想说的是大家如果对我的研究感兴趣 +{"is_final":false,"segment":0,"start_time":0.0,"text":"对我做了介绍那么我想说的是大家如果对我的研究感兴趣","timestamps":"[0.32, 0.64, 0.76, 0.96, 1.08, 1.16, 1.96, 2.04, 2.24, 2.36, 2.56, 2.68, 2.80, 3.36, 3.52, 3.64, 3.72, 3.84, 3.92, 4.00, 4.08, 4.24, 4.48, 4.56, 4.72]","tokens":["对","我","做","了","介","绍","那","么","我","想","说","的","是","大","家","如","果","对","我","的","研","究","感","兴","趣"]} + diff --git a/docs/source/onnx/pretrained_models/online-transducer/conformer-transducer-models.rst b/docs/source/onnx/pretrained_models/online-transducer/conformer-transducer-models.rst new file mode 100644 index 000000000..d08c56181 --- /dev/null +++ b/docs/source/onnx/pretrained_models/online-transducer/conformer-transducer-models.rst @@ -0,0 +1,147 @@ +.. 
_sherpa_onnx_streaming_conformer_transducer_models: + +Conformer-transducer-based Models +================================= + +.. hint:: + + Please refer to :ref:`install_sherpa_onnx` to install `sherpa-onnx`_ + before you read this section. + +csukuangfj/sherpa-onnx-streaming-conformer-zh-2023-05-23 (Chinese) +------------------------------------------------------------------ + +This model is converted from + +``_ + + +which supports only Chinese as it is trained on the `WenetSpeech`_ corpus. + +You can find the training code at + +``_ + +In the following, we describe how to download it and use it with `sherpa-onnx`_. + +Download the model +~~~~~~~~~~~~~~~~~~ + +Please use the following commands to download it. + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-streaming-conformer-zh-2023-05-23.tar.bz2 + + tar xvf sherpa-onnx-streaming-conformer-zh-2023-05-23.tar.bz2 + rm sherpa-onnx-streaming-conformer-zh-2023-05-23.tar.bz2 + +Please check that the file sizes of the pre-trained models are correct. See +the file sizes of ``*.onnx`` files below. + +.. code-block:: bash + + sherpa-onnx-streaming-conformer-zh-2023-05-23 fangjun$ ls -lh *.onnx + -rw-r--r-- 1 fangjun staff 11M May 23 14:44 decoder-epoch-99-avg-1.int8.onnx + -rw-r--r-- 1 fangjun staff 12M May 23 14:44 decoder-epoch-99-avg-1.onnx + -rw-r--r-- 1 fangjun staff 160M May 23 14:46 encoder-epoch-99-avg-1.int8.onnx + -rw-r--r-- 1 fangjun staff 345M May 23 14:47 encoder-epoch-99-avg-1.onnx + -rw-r--r-- 1 fangjun staff 2.7M May 23 14:44 joiner-epoch-99-avg-1.int8.onnx + -rw-r--r-- 1 fangjun staff 11M May 23 14:44 joiner-epoch-99-avg-1.onnx + +Decode a single wave file +~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. hint:: + + It supports decoding only wave files of a single channel with 16-bit + encoded samples, while the sampling rate does not need to be 16 kHz. + +fp32 +^^^^ + +The following code shows how to use ``fp32`` models to decode a wave file: + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + ./build/bin/sherpa-onnx \ + --tokens=./sherpa-onnx-streaming-conformer-zh-2023-05-23/tokens.txt \ + --encoder=./sherpa-onnx-streaming-conformer-zh-2023-05-23/encoder-epoch-99-avg-1.onnx \ + --decoder=./sherpa-onnx-streaming-conformer-zh-2023-05-23/decoder-epoch-99-avg-1.onnx \ + --joiner=./sherpa-onnx-streaming-conformer-zh-2023-05-23/joiner-epoch-99-avg-1.onnx \ + ./sherpa-onnx-streaming-conformer-zh-2023-05-23/test_wavs/0.wav + +.. note:: + + Please use ``./build/bin/Release/sherpa-onnx.exe`` for Windows. + +You should see the following output: + +.. literalinclude:: ./code-conformer/sherpa-onnx-streaming-conformer-zh-2023-05-23.txt + +.. caution:: + + If you use Windows and get encoding issues, please run: + + .. code-block:: bash + + CHCP 65001 + + in your commandline. + +int8 +^^^^ + +The following code shows how to use ``int8`` models to decode a wave file: + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + ./build/bin/sherpa-onnx \ + --tokens=./sherpa-onnx-streaming-conformer-zh-2023-05-23/tokens.txt \ + --encoder=./sherpa-onnx-streaming-conformer-zh-2023-05-23/encoder-epoch-99-avg-1.int8.onnx \ + --decoder=./sherpa-onnx-streaming-conformer-zh-2023-05-23/decoder-epoch-99-avg-1.onnx \ + --joiner=./sherpa-onnx-streaming-conformer-zh-2023-05-23/joiner-epoch-99-avg-1.int8.onnx \ + ./sherpa-onnx-streaming-conformer-zh-2023-05-23/test_wavs/0.wav + +.. note:: + + Please use ``./build/bin/Release/sherpa-onnx.exe`` for Windows. 
+
+You should see the following output:
+
+.. literalinclude:: ./code-conformer/sherpa-onnx-streaming-conformer-zh-2023-05-23-int8.txt
+
+.. caution::
+
+   If you use Windows and get encoding issues, please run:
+
+   .. code-block:: bash
+
+      CHCP 65001
+
+   in your commandline.
+
+Real-time speech recognition from a microphone
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. code-block:: bash
+
+   cd /path/to/sherpa-onnx
+
+   ./build/bin/sherpa-onnx-microphone \
+     --tokens=./sherpa-onnx-streaming-conformer-zh-2023-05-23/tokens.txt \
+     --encoder=./sherpa-onnx-streaming-conformer-zh-2023-05-23/encoder-epoch-99-avg-1.onnx \
+     --decoder=./sherpa-onnx-streaming-conformer-zh-2023-05-23/decoder-epoch-99-avg-1.onnx \
+     --joiner=./sherpa-onnx-streaming-conformer-zh-2023-05-23/joiner-epoch-99-avg-1.onnx
+
+.. hint::
+
+   If your system is Linux (including embedded Linux), you can also use
+   :ref:`sherpa-onnx-alsa` to do real-time speech recognition with your
+   microphone if ``sherpa-onnx-microphone`` does not work for you.
diff --git a/docs/source/onnx/pretrained_models/online-transducer/index.rst b/docs/source/onnx/pretrained_models/online-transducer/index.rst
new file mode 100644
index 000000000..4c73cc011
--- /dev/null
+++ b/docs/source/onnx/pretrained_models/online-transducer/index.rst
@@ -0,0 +1,13 @@
+.. _onnx_online_transducer_models:
+
+Online transducer models
+========================
+
+This section lists available online transducer models.
+
+.. toctree::
+   :maxdepth: 5
+
+   zipformer-transducer-models
+   conformer-transducer-models
+   lstm-transducer-models
diff --git a/docs/source/onnx/pretrained_models/online-transducer/lstm-transducer-models.rst b/docs/source/onnx/pretrained_models/online-transducer/lstm-transducer-models.rst
new file mode 100644
index 000000000..f2db3b45a
--- /dev/null
+++ b/docs/source/onnx/pretrained_models/online-transducer/lstm-transducer-models.rst
@@ -0,0 +1,257 @@
+LSTM-transducer-based Models
+============================
+
+.. hint::
+
+   Please refer to :ref:`install_sherpa_onnx` to install `sherpa-onnx`_
+   before you read this section.
+
+csukuangfj/sherpa-onnx-lstm-en-2023-02-17 (English)
+---------------------------------------------------
+
+This model is trained using the `GigaSpeech`_ and the `LibriSpeech`_ datasets.
+
+Please see ``_ for how the model
+is trained.
+
+You can find the training code at
+
+``_
+
+In the following, we describe how to download it and use it with `sherpa-onnx`_.
+
+Download the model
+~~~~~~~~~~~~~~~~~~
+
+Please use the following commands to download it.
+
+.. code-block:: bash
+
+   cd /path/to/sherpa-onnx
+
+   wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-lstm-en-2023-02-17.tar.bz2
+
+   tar xvf sherpa-onnx-lstm-en-2023-02-17.tar.bz2
+   rm sherpa-onnx-lstm-en-2023-02-17.tar.bz2
+
+Please check that the file sizes of the pre-trained models are correct. See
+the file sizes of ``*.onnx`` files below.
+
+.. 
code-block:: bash + + sherpa-onnx-lstm-en-2023-02-17$ ls -lh *.onnx + -rw-r--r-- 1 kuangfangjun root 1.3M Mar 31 22:41 decoder-epoch-99-avg-1.int8.onnx + -rw-r--r-- 1 kuangfangjun root 2.0M Mar 31 22:41 decoder-epoch-99-avg-1.onnx + -rw-r--r-- 1 kuangfangjun root 80M Mar 31 22:41 encoder-epoch-99-avg-1.int8.onnx + -rw-r--r-- 1 kuangfangjun root 319M Mar 31 22:41 encoder-epoch-99-avg-1.onnx + -rw-r--r-- 1 kuangfangjun root 254K Mar 31 22:41 joiner-epoch-99-avg-1.int8.onnx + -rw-r--r-- 1 kuangfangjun root 1003K Mar 31 22:41 joiner-epoch-99-avg-1.onnx + +Decode a single wave file +~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. hint:: + + It supports decoding only wave files of a single channel with 16-bit + encoded samples, while the sampling rate does not need to be 16 kHz. + +fp32 +^^^^ + +The following code shows how to use ``fp32`` models to decode a wave file: + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + ./build/bin/sherpa-onnx \ + --tokens=./sherpa-onnx-lstm-en-2023-02-17/tokens.txt \ + --encoder=./sherpa-onnx-lstm-en-2023-02-17/encoder-epoch-99-avg-1.onnx \ + --decoder=./sherpa-onnx-lstm-en-2023-02-17/decoder-epoch-99-avg-1.onnx \ + --joiner=./sherpa-onnx-lstm-en-2023-02-17/joiner-epoch-99-avg-1.onnx \ + ./sherpa-onnx-lstm-en-2023-02-17/test_wavs/0.wav + +.. note:: + + Please use ``./build/bin/Release/sherpa-onnx.exe`` for Windows. + +You should see the following output: + +.. literalinclude:: ./code-lstm/sherpa-onnx-lstm-en-2023-02-17.txt + +int8 +^^^^ + +The following code shows how to use ``int8`` models to decode a wave file: + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + ./build/bin/sherpa-onnx \ + --tokens=./sherpa-onnx-lstm-en-2023-02-17/tokens.txt \ + --encoder=./sherpa-onnx-lstm-en-2023-02-17/encoder-epoch-99-avg-1.int8.onnx \ + --decoder=./sherpa-onnx-lstm-en-2023-02-17/decoder-epoch-99-avg-1.onnx \ + --joiner=./sherpa-onnx-lstm-en-2023-02-17/joiner-epoch-99-avg-1.int8.onnx \ + ./sherpa-onnx-lstm-en-2023-02-17/test_wavs/0.wav + +.. note:: + + Please use ``./build/bin/Release/sherpa-onnx.exe`` for Windows. + +You should see the following output: + +.. literalinclude:: ./code-lstm/sherpa-onnx-lstm-en-2023-02-17-int8.txt + +Real-time speech recognition from a microphone +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + ./build/bin/sherpa-onnx-microphone \ + --tokens=./sherpa-onnx-lstm-en-2023-02-17/tokens.txt \ + --encoder=./sherpa-onnx-lstm-en-2023-02-17/encoder-epoch-99-avg-1.onnx \ + --decoder=./sherpa-onnx-lstm-en-2023-02-17/decoder-epoch-99-avg-1.onnx \ + --joiner=./sherpa-onnx-lstm-en-2023-02-17/joiner-epoch-99-avg-1.onnx + +.. hint:: + + If your system is Linux (including embedded Linux), you can also use + :ref:`sherpa-onnx-alsa` to do real-time speech recognition with your + microphone if ``sherpa-onnx-microphone`` does not work for you. + + +csukuangfj/sherpa-onnx-lstm-zh-2023-02-20 (Chinese) +--------------------------------------------------- + +This is a model trained using the `WenetSpeech`_ dataset. + +Please see ``_ for how the model +is trained. + +In the following, we describe how to download it and use it with `sherpa-onnx`_. + +Download the model +~~~~~~~~~~~~~~~~~~ + +Please use the following commands to download it. + +.. 
code-block:: bash + + cd /path/to/sherpa-onnx + + wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-lstm-zh-2023-02-20.tar.bz2 + + tar xvf sherpa-onnx-lstm-zh-2023-02-20.tar.bz2 + rm sherpa-onnx-lstm-zh-2023-02-20.tar.bz2 + + +Please check that the file sizes of the pre-trained models are correct. See +the file sizes of ``*.onnx`` files below. + +.. code-block:: bash + + sherpa-onnx-lstm-zh-2023-02-20$ ls -lh *.onnx + -rw-r--r-- 1 kuangfangjun root 12M Mar 31 20:55 decoder-epoch-11-avg-1.int8.onnx + -rw-r--r-- 1 kuangfangjun root 12M Mar 31 20:55 decoder-epoch-11-avg-1.onnx + -rw-r--r-- 1 kuangfangjun root 80M Mar 31 20:55 encoder-epoch-11-avg-1.int8.onnx + -rw-r--r-- 1 kuangfangjun root 319M Mar 31 20:55 encoder-epoch-11-avg-1.onnx + -rw-r--r-- 1 kuangfangjun root 2.8M Mar 31 20:55 joiner-epoch-11-avg-1.int8.onnx + -rw-r--r-- 1 kuangfangjun root 11M Mar 31 20:55 joiner-epoch-11-avg-1.onnx + +Decode a single wave file +~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. hint:: + + It supports decoding only wave files of a single channel with 16-bit + encoded samples, while the sampling rate does not need to be 16 kHz. + +fp32 +^^^^ + +The following code shows how to use ``fp32`` models to decode a wave file: + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + ./build/bin/sherpa-onnx \ + --tokens=./sherpa-onnx-lstm-zh-2023-02-20/tokens.txt \ + --encoder=./sherpa-onnx-lstm-zh-2023-02-20/encoder-epoch-11-avg-1.onnx \ + --decoder=./sherpa-onnx-lstm-zh-2023-02-20/decoder-epoch-11-avg-1.onnx \ + --joiner=./sherpa-onnx-lstm-zh-2023-02-20/joiner-epoch-11-avg-1.onnx \ + ./sherpa-onnx-lstm-zh-2023-02-20/test_wavs/0.wav + +.. note:: + + Please use ``./build/bin/Release/sherpa-onnx.exe`` for Windows. + +.. caution:: + + If you use Windows and get encoding issues, please run: + + .. code-block:: bash + + CHCP 65001 + + in your commandline. + +You should see the following output: + +.. literalinclude:: ./code-lstm/sherpa-onnx-lstm-zh-2023-02-20.txt + +int8 +^^^^ + +The following code shows how to use ``int8`` models to decode a wave file: + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + ./build/bin/sherpa-onnx \ + --tokens=./sherpa-onnx-lstm-zh-2023-02-20/tokens.txt \ + --encoder=./sherpa-onnx-lstm-zh-2023-02-20/encoder-epoch-11-avg-1.int8.onnx \ + --decoder=./sherpa-onnx-lstm-zh-2023-02-20/decoder-epoch-11-avg-1.onnx \ + --joiner=./sherpa-onnx-lstm-zh-2023-02-20/joiner-epoch-11-avg-1.int8.onnx \ + ./sherpa-onnx-lstm-zh-2023-02-20/test_wavs/0.wav + +.. note:: + + Please use ``./build/bin/Release/sherpa-onnx.exe`` for Windows. + +.. caution:: + + If you use Windows and get encoding issues, please run: + + .. code-block:: bash + + CHCP 65001 + + in your commandline. + +You should see the following output: + +.. literalinclude:: ./code-lstm/sherpa-onnx-lstm-zh-2023-02-20-int8.txt + +Real-time speech recognition from a microphone +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + ./build/bin/sherpa-onnx-microphone \ + --tokens=./sherpa-onnx-lstm-zh-2023-02-20/tokens.txt \ + --encoder=./sherpa-onnx-lstm-zh-2023-02-20/encoder-epoch-11-avg-1.onnx \ + --decoder=./sherpa-onnx-lstm-zh-2023-02-20/decoder-epoch-11-avg-1.onnx \ + --joiner=./sherpa-onnx-lstm-zh-2023-02-20/joiner-epoch-11-avg-1.onnx + +.. hint:: + + If your system is Linux (including embedded Linux), you can also use + :ref:`sherpa-onnx-alsa` to do real-time speech recognition with your + microphone if ``sherpa-onnx-microphone`` does not work for you. 
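+
+Decode multiple wave files
+~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+If you want to decode several recordings in one go, a plain shell loop over the
+files is enough. The following is a minimal sketch; the directory ``my-wavs``
+is a hypothetical folder of 16-bit, single-channel ``*.wav`` files, and it
+reuses the ``int8`` models downloaded above.
+
+.. code-block:: bash
+
+   cd /path/to/sherpa-onnx
+
+   # Run the decoder once per file; each invocation prints its own
+   # recognition result and timing information.
+   for wav in my-wavs/*.wav; do
+     ./build/bin/sherpa-onnx \
+       --tokens=./sherpa-onnx-lstm-zh-2023-02-20/tokens.txt \
+       --encoder=./sherpa-onnx-lstm-zh-2023-02-20/encoder-epoch-11-avg-1.int8.onnx \
+       --decoder=./sherpa-onnx-lstm-zh-2023-02-20/decoder-epoch-11-avg-1.onnx \
+       --joiner=./sherpa-onnx-lstm-zh-2023-02-20/joiner-epoch-11-avg-1.int8.onnx \
+       "$wav"
+   done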
diff --git a/docs/source/onnx/pretrained_models/online-transducer/zipformer-transducer-models.rst b/docs/source/onnx/pretrained_models/online-transducer/zipformer-transducer-models.rst new file mode 100644 index 000000000..c9708d780 --- /dev/null +++ b/docs/source/onnx/pretrained_models/online-transducer/zipformer-transducer-models.rst @@ -0,0 +1,1530 @@ +.. _sherpa_onnx_zipformer_transducer_models: + +Zipformer-transducer-based Models +================================= + +.. hint:: + + Please refer to :ref:`install_sherpa_onnx` to install `sherpa-onnx`_ + before you read this section. + +sherpa-onnx-streaming-zipformer-korean-2024-06-16 (Korean) +---------------------------------------------------------- + +Training code for this model can be found at ``_. +It supports only Korean. + +PyTorch checkpoint with optimizer state can be found at +``_ + +In the following, we describe how to download it and use it with `sherpa-onnx`_. + +Download the model +~~~~~~~~~~~~~~~~~~ + +Please use the following commands to download it. + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-streaming-zipformer-korean-2024-06-16.tar.bz2 + + tar xvf sherpa-onnx-streaming-zipformer-korean-2024-06-16.tar.bz2 + rm sherpa-onnx-streaming-zipformer-korean-2024-06-16.tar.bz2 + ls -lh sherpa-onnx-streaming-zipformer-korean-2024-06-16 + +The output is given below: + +.. code-block:: + + $ ls -lh sherpa-onnx-streaming-zipformer-korean-2024-06-16 + + total 907104 + -rw-r--r-- 1 fangjun staff 307K Jun 16 17:36 bpe.model + -rw-r--r-- 1 fangjun staff 2.7M Jun 16 17:36 decoder-epoch-99-avg-1.int8.onnx + -rw-r--r-- 1 fangjun staff 11M Jun 16 17:36 decoder-epoch-99-avg-1.onnx + -rw-r--r-- 1 fangjun staff 121M Jun 16 17:36 encoder-epoch-99-avg-1.int8.onnx + -rw-r--r-- 1 fangjun staff 279M Jun 16 17:36 encoder-epoch-99-avg-1.onnx + -rw-r--r-- 1 fangjun staff 2.5M Jun 16 17:36 joiner-epoch-99-avg-1.int8.onnx + -rw-r--r-- 1 fangjun staff 9.8M Jun 16 17:36 joiner-epoch-99-avg-1.onnx + drwxr-xr-x 7 fangjun staff 224B Jun 16 17:36 test_wavs + -rw-r--r-- 1 fangjun staff 59K Jun 16 17:36 tokens.txt + +Decode a single wave file +~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. hint:: + + It supports decoding only wave files of a single channel with 16-bit + encoded samples, while the sampling rate does not need to be 16 kHz. + +fp32 +^^^^ + +The following code shows how to use ``fp32`` models to decode a wave file: + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + ./build/bin/sherpa-onnx \ + --tokens=./sherpa-onnx-streaming-zipformer-korean-2024-06-16/tokens.txt \ + --encoder=./sherpa-onnx-streaming-zipformer-korean-2024-06-16/encoder-epoch-99-avg-1.onnx \ + --decoder=./sherpa-onnx-streaming-zipformer-korean-2024-06-16/decoder-epoch-99-avg-1.onnx \ + --joiner=./sherpa-onnx-streaming-zipformer-korean-2024-06-16/joiner-epoch-99-avg-1.onnx \ + ./sherpa-onnx-streaming-zipformer-korean-2024-06-16/test_wavs/0.wav + +.. note:: + + Please use ``./build/bin/Release/sherpa-onnx.exe`` for Windows. + +.. caution:: + + If you use Windows and get encoding issues, please run: + + .. code-block:: bash + + CHCP 65001 + + in your commandline. + +You should see the following output: + +.. literalinclude:: ./code-zipformer/sherpa-onnx-streaming-zipformer-korean-2024-06-16.txt + +int8 +^^^^ + +The following code shows how to use ``int8`` models to decode a wave file: + +.. 
code-block:: bash + + cd /path/to/sherpa-onnx + + ./build/bin/sherpa-onnx \ + --tokens=./sherpa-onnx-streaming-zipformer-korean-2024-06-16/tokens.txt \ + --encoder=./sherpa-onnx-streaming-zipformer-korean-2024-06-16/encoder-epoch-99-avg-1.int8.onnx \ + --decoder=./sherpa-onnx-streaming-zipformer-korean-2024-06-16/decoder-epoch-99-avg-1.onnx \ + --joiner=./sherpa-onnx-streaming-zipformer-korean-2024-06-16/joiner-epoch-99-avg-1.int8.onnx \ + ./sherpa-onnx-streaming-zipformer-korean-2024-06-16/test_wavs/0.wav + +.. note:: + + Please use ``./build/bin/Release/sherpa-onnx.exe`` for Windows. + +.. caution:: + + If you use Windows and get encoding issues, please run: + + .. code-block:: bash + + CHCP 65001 + + in your commandline. + +You should see the following output: + +.. literalinclude:: ./code-zipformer/sherpa-onnx-streaming-zipformer-korean-2024-06-16-int8.txt + +Real-time speech recognition from a microphone +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + ./build/bin/sherpa-onnx-microphone \ + --tokens=./sherpa-onnx-streaming-zipformer-korean-2024-06-16/tokens.txt \ + --encoder=./sherpa-onnx-streaming-zipformer-korean-2024-06-16/encoder-epoch-99-avg-1.int8.onnx \ + --decoder=./sherpa-onnx-streaming-zipformer-korean-2024-06-16/decoder-epoch-99-avg-1.onnx \ + --joiner=./sherpa-onnx-streaming-zipformer-korean-2024-06-16/joiner-epoch-99-avg-1.int8.onnx + +.. hint:: + + If your system is Linux (including embedded Linux), you can also use + :ref:`sherpa-onnx-alsa` to do real-time speech recognition with your + microphone if ``sherpa-onnx-microphone`` does not work for you. + + +sherpa-onnx-streaming-zipformer-multi-zh-hans-2023-12-12 (Chinese) +------------------------------------------------------------------ + +Training code for this model can be found at ``_. +It supports only Chinese. + +Please refer to ``_ +for the detailed information about the training data. In total, there are 14k hours of training data. + +In the following, we describe how to download it and use it with `sherpa-onnx`_. + +Download the model +~~~~~~~~~~~~~~~~~~ + +Please use the following commands to download it. + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-streaming-zipformer-multi-zh-hans-2023-12-12.tar.bz2 + + tar xf sherpa-onnx-streaming-zipformer-multi-zh-hans-2023-12-12.tar.bz2 + rm sherpa-onnx-streaming-zipformer-multi-zh-hans-2023-12-12.tar.bz2 + ls -lh sherpa-onnx-streaming-zipformer-multi-zh-hans-2023-12-12 + +The output is given below: + +.. 
code-block:: + + $ ls -lh sherpa-onnx-streaming-zipformer-multi-zh-hans-2023-12-12 + total 668864 + -rw-r--r-- 1 fangjun staff 28B Dec 12 18:59 README.md + -rw-r--r-- 1 fangjun staff 131B Dec 12 18:59 bpe.model + -rw-r--r-- 1 fangjun staff 1.2M Dec 12 18:59 decoder-epoch-20-avg-1-chunk-16-left-128.int8.onnx + -rw-r--r-- 1 fangjun staff 4.9M Dec 12 18:59 decoder-epoch-20-avg-1-chunk-16-left-128.onnx + -rw-r--r-- 1 fangjun staff 67M Dec 12 18:59 encoder-epoch-20-avg-1-chunk-16-left-128.int8.onnx + -rw-r--r-- 1 fangjun staff 249M Dec 12 18:59 encoder-epoch-20-avg-1-chunk-16-left-128.onnx + -rw-r--r-- 1 fangjun staff 1.0M Dec 12 18:59 joiner-epoch-20-avg-1-chunk-16-left-128.int8.onnx + -rw-r--r-- 1 fangjun staff 3.9M Dec 12 18:59 joiner-epoch-20-avg-1-chunk-16-left-128.onnx + drwxr-xr-x 8 fangjun staff 256B Dec 12 18:59 test_wavs + -rw-r--r-- 1 fangjun staff 18K Dec 12 18:59 tokens.txt + +Decode a single wave file +~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. hint:: + + It supports decoding only wave files of a single channel with 16-bit + encoded samples, while the sampling rate does not need to be 16 kHz. + +fp32 +^^^^ + +The following code shows how to use ``fp32`` models to decode a wave file: + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + ./build/bin/sherpa-onnx \ + --tokens=./sherpa-onnx-streaming-zipformer-multi-zh-hans-2023-12-12/tokens.txt \ + --encoder=./sherpa-onnx-streaming-zipformer-multi-zh-hans-2023-12-12/encoder-epoch-20-avg-1-chunk-16-left-128.onnx \ + --decoder=./sherpa-onnx-streaming-zipformer-multi-zh-hans-2023-12-12/decoder-epoch-20-avg-1-chunk-16-left-128.onnx \ + --joiner=./sherpa-onnx-streaming-zipformer-multi-zh-hans-2023-12-12/joiner-epoch-20-avg-1-chunk-16-left-128.onnx \ + ./sherpa-onnx-streaming-zipformer-multi-zh-hans-2023-12-12/test_wavs/DEV_T0000000000.wav + +.. note:: + + Please use ``./build/bin/Release/sherpa-onnx.exe`` for Windows. + +.. caution:: + + If you use Windows and get encoding issues, please run: + + .. code-block:: bash + + CHCP 65001 + + in your commandline. + +You should see the following output: + +.. literalinclude:: ./code-zipformer/sherpa-onnx-streaming-zipformer-multi-zh-hans-2023-12-12.txt + +int8 +^^^^ + +The following code shows how to use ``int8`` models to decode a wave file: + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + ./build/bin/sherpa-onnx \ + --tokens=./sherpa-onnx-streaming-zipformer-multi-zh-hans-2023-12-12/tokens.txt \ + --encoder=./sherpa-onnx-streaming-zipformer-multi-zh-hans-2023-12-12/encoder-epoch-20-avg-1-chunk-16-left-128.int8.onnx \ + --decoder=./sherpa-onnx-streaming-zipformer-multi-zh-hans-2023-12-12/decoder-epoch-20-avg-1-chunk-16-left-128.onnx \ + --joiner=./sherpa-onnx-streaming-zipformer-multi-zh-hans-2023-12-12/joiner-epoch-20-avg-1-chunk-16-left-128.int8.onnx \ + ./sherpa-onnx-streaming-zipformer-multi-zh-hans-2023-12-12/test_wavs/DEV_T0000000000.wav + +.. note:: + + Please use ``./build/bin/Release/sherpa-onnx.exe`` for Windows. + +.. caution:: + + If you use Windows and get encoding issues, please run: + + .. code-block:: bash + + CHCP 65001 + + in your commandline. + +You should see the following output: + +.. literalinclude:: ./code-zipformer/sherpa-onnx-streaming-zipformer-multi-zh-hans-2023-12-12-int8.txt + +Real-time speech recognition from a microphone +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. 
code-block:: bash + + cd /path/to/sherpa-onnx + + ./build/bin/sherpa-onnx-microphone \ + --tokens=./sherpa-onnx-streaming-zipformer-multi-zh-hans-2023-12-12/tokens.txt \ + --encoder=./sherpa-onnx-streaming-zipformer-multi-zh-hans-2023-12-12/encoder-epoch-20-avg-1-chunk-16-left-128.int8.onnx \ + --decoder=./sherpa-onnx-streaming-zipformer-multi-zh-hans-2023-12-12/decoder-epoch-20-avg-1-chunk-16-left-128.onnx \ + --joiner=./sherpa-onnx-streaming-zipformer-multi-zh-hans-2023-12-12/joiner-epoch-20-avg-1-chunk-16-left-128.int8.onnx + +.. hint:: + + If your system is Linux (including embedded Linux), you can also use + :ref:`sherpa-onnx-alsa` to do real-time speech recognition with your + microphone if ``sherpa-onnx-microphone`` does not work for you. + + +.. _sherpa-onnx-wenetspeech-small-streaming: + +k2-fsa/icefall-asr-zipformer-wenetspeech-streaming-small (Chinese) +------------------------------------------------------------------ + +This model is from + +``_ + +which supports only Chinese as it is trained on the `WenetSpeech`_ corpus. + +In the following, we describe how to download it. + +Download the model +~~~~~~~~~~~~~~~~~~ + +Please use the following commands to download it. + +.. code-block:: bash + + git lfs install + git clone https://huggingface.co/k2-fsa/icefall-asr-zipformer-wenetspeech-streaming-small + + +.. _sherpa-onnx-wenetspeech-large-streaming: + +k2-fsa/icefall-asr-zipformer-wenetspeech-streaming-large (Chinese) +------------------------------------------------------------------ + +This model is from + +``_ + +which supports only Chinese as it is trained on the `WenetSpeech`_ corpus. + +In the following, we describe how to download it. + +Download the model +~~~~~~~~~~~~~~~~~~ + +Please use the following commands to download it. + +.. code-block:: bash + + git lfs install + git clone https://huggingface.co/k2-fsa/icefall-asr-zipformer-wenetspeech-streaming-large + + +.. _sherpa-onnx-wenetspeech-2023-06-15-streaming: + +pkufool/icefall-asr-zipformer-streaming-wenetspeech-20230615 (Chinese) +---------------------------------------------------------------------- + +This model is from + +``_ + +which supports only Chinese as it is trained on the `WenetSpeech`_ corpus. + +If you are interested in how the model is trained, please refer to +``_. + +In the following, we describe how to download it and use it with `sherpa-onnx`_. + +Download the model +~~~~~~~~~~~~~~~~~~ + +Please use the following commands to download it. + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/icefall-asr-zipformer-streaming-wenetspeech-20230615.tar.bz2 + + tar xvf icefall-asr-zipformer-streaming-wenetspeech-20230615.tar.bz2 + rm icefall-asr-zipformer-streaming-wenetspeech-20230615.tar.bz2 + +Please check that the file sizes of the pre-trained models are correct. See +the file sizes of ``*.onnx`` files below. + +.. 
code-block:: bash + + icefall-asr-zipformer-streaming-wenetspeech-20230615 fangjun$ ls -lh exp/*chunk-16-left-128.*onnx + -rw-r--r-- 1 fangjun staff 11M Jun 26 15:42 exp/decoder-epoch-12-avg-4-chunk-16-left-128.int8.onnx + -rw-r--r-- 1 fangjun staff 12M Jun 26 15:42 exp/decoder-epoch-12-avg-4-chunk-16-left-128.onnx + -rw-r--r-- 1 fangjun staff 68M Jun 26 15:42 exp/encoder-epoch-12-avg-4-chunk-16-left-128.int8.onnx + -rw-r--r-- 1 fangjun staff 250M Jun 26 15:43 exp/encoder-epoch-12-avg-4-chunk-16-left-128.onnx + -rw-r--r-- 1 fangjun staff 2.7M Jun 26 15:42 exp/joiner-epoch-12-avg-4-chunk-16-left-128.int8.onnx + -rw-r--r-- 1 fangjun staff 11M Jun 26 15:42 exp/joiner-epoch-12-avg-4-chunk-16-left-128.onnx + +Decode a single wave file +~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. hint:: + + It supports decoding only wave files of a single channel with 16-bit + encoded samples, while the sampling rate does not need to be 16 kHz. + +fp32 +^^^^ + +The following code shows how to use ``fp32`` models to decode a wave file: + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + ./build/bin/sherpa-onnx \ + --tokens=./icefall-asr-zipformer-streaming-wenetspeech-20230615/data/lang_char/tokens.txt \ + --encoder=./icefall-asr-zipformer-streaming-wenetspeech-20230615/exp/encoder-epoch-12-avg-4-chunk-16-left-128.onnx \ + --decoder=./icefall-asr-zipformer-streaming-wenetspeech-20230615/exp/decoder-epoch-12-avg-4-chunk-16-left-128.onnx \ + --joiner=./icefall-asr-zipformer-streaming-wenetspeech-20230615/exp/joiner-epoch-12-avg-4-chunk-16-left-128.onnx \ + ./icefall-asr-zipformer-streaming-wenetspeech-20230615/test_wavs/DEV_T0000000000.wav + +.. note:: + + Please use ``./build/bin/Release/sherpa-onnx.exe`` for Windows. + +.. caution:: + + If you use Windows and get encoding issues, please run: + + .. code-block:: bash + + CHCP 65001 + + in your commandline. + +You should see the following output: + +.. literalinclude:: ./code-zipformer/icefall-asr-zipformer-streaming-wenetspeech-20230615.txt + +int8 +^^^^ + +The following code shows how to use ``int8`` models to decode a wave file: + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + ./build/bin/sherpa-onnx \ + --tokens=./icefall-asr-zipformer-streaming-wenetspeech-20230615/data/lang_char/tokens.txt \ + --encoder=./icefall-asr-zipformer-streaming-wenetspeech-20230615/exp/encoder-epoch-12-avg-4-chunk-16-left-128.int8.onnx \ + --decoder=./icefall-asr-zipformer-streaming-wenetspeech-20230615/exp/decoder-epoch-12-avg-4-chunk-16-left-128.onnx \ + --joiner=./icefall-asr-zipformer-streaming-wenetspeech-20230615/exp/joiner-epoch-12-avg-4-chunk-16-left-128.int8.onnx \ + ./icefall-asr-zipformer-streaming-wenetspeech-20230615/test_wavs/DEV_T0000000000.wav + +.. note:: + + Please use ``./build/bin/Release/sherpa-onnx.exe`` for Windows. + +.. caution:: + + If you use Windows and get encoding issues, please run: + + .. code-block:: bash + + CHCP 65001 + + in your commandline. + +You should see the following output: + +.. literalinclude:: ./code-zipformer/icefall-asr-zipformer-streaming-wenetspeech-20230615-int8.txt + +Real-time speech recognition from a microphone +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. 
code-block:: bash + + cd /path/to/sherpa-onnx + + ./build/bin/sherpa-onnx-microphone \ + --tokens=./icefall-asr-zipformer-streaming-wenetspeech-20230615/data/lang_char/tokens.txt \ + --encoder=./icefall-asr-zipformer-streaming-wenetspeech-20230615/exp/encoder-epoch-12-avg-4-chunk-16-left-128.int8.onnx \ + --decoder=./icefall-asr-zipformer-streaming-wenetspeech-20230615/exp/decoder-epoch-12-avg-4-chunk-16-left-128.onnx \ + --joiner=./icefall-asr-zipformer-streaming-wenetspeech-20230615/exp/joiner-epoch-12-avg-4-chunk-16-left-128.int8.onnx + +.. hint:: + + If your system is Linux (including embedded Linux), you can also use + :ref:`sherpa-onnx-alsa` to do real-time speech recognition with your + microphone if ``sherpa-onnx-microphone`` does not work for you. + + +.. _sherpa-onnx-streaming-zipformer-en-2023-06-26-english: + +csukuangfj/sherpa-onnx-streaming-zipformer-en-2023-06-26 (English) +------------------------------------------------------------------ + +This model is converted from + +``_ + +which supports only English as it is trained on the `LibriSpeech`_ corpus. + +If you are interested in how the model is trained, please refer to +``_. + +In the following, we describe how to download it and use it with `sherpa-onnx`_. + +Download the model +~~~~~~~~~~~~~~~~~~ + +Please use the following commands to download it. + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-streaming-zipformer-en-2023-06-26.tar.bz2 + + tar xvf sherpa-onnx-streaming-zipformer-en-2023-06-26.tar.bz2 + rm sherpa-onnx-streaming-zipformer-en-2023-06-26.tar.bz2 + +Please check that the file sizes of the pre-trained models are correct. See +the file sizes below. + +.. code-block:: bash + + -rw-r--r-- 1 1001 127 240K Apr 23 06:45 bpe.model + -rw-r--r-- 1 1001 127 1.3M Apr 23 06:45 decoder-epoch-99-avg-1-chunk-16-left-128.int8.onnx + -rw-r--r-- 1 1001 127 2.0M Apr 23 06:45 decoder-epoch-99-avg-1-chunk-16-left-128.onnx + -rw-r--r-- 1 1001 127 68M Apr 23 06:45 encoder-epoch-99-avg-1-chunk-16-left-128.int8.onnx + -rw-r--r-- 1 1001 127 250M Apr 23 06:45 encoder-epoch-99-avg-1-chunk-16-left-128.onnx + -rwxr-xr-x 1 1001 127 814 Apr 23 06:45 export-onnx-zipformer-online.sh + -rw-r--r-- 1 1001 127 254K Apr 23 06:45 joiner-epoch-99-avg-1-chunk-16-left-128.int8.onnx + -rw-r--r-- 1 1001 127 1003K Apr 23 06:45 joiner-epoch-99-avg-1-chunk-16-left-128.onnx + -rw-r--r-- 1 1001 127 216 Apr 23 06:45 README.md + drwxr-xr-x 2 1001 127 4.0K Apr 23 06:45 test_wavs + -rw-r--r-- 1 1001 127 5.0K Apr 23 06:45 tokens.txt + +Decode a single wave file +~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. hint:: + + It supports decoding only wave files of a single channel with 16-bit + encoded samples, while the sampling rate does not need to be 16 kHz. + +fp32 +^^^^ + +The following code shows how to use ``fp32`` models to decode a wave file: + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + ./build/bin/sherpa-onnx \ + --tokens=./sherpa-onnx-streaming-zipformer-en-2023-06-26/tokens.txt \ + --encoder=./sherpa-onnx-streaming-zipformer-en-2023-06-26/encoder-epoch-99-avg-1-chunk-16-left-128.onnx \ + --decoder=./sherpa-onnx-streaming-zipformer-en-2023-06-26/decoder-epoch-99-avg-1-chunk-16-left-128.onnx \ + --joiner=./sherpa-onnx-streaming-zipformer-en-2023-06-26/joiner-epoch-99-avg-1-chunk-16-left-128.onnx \ + ./sherpa-onnx-streaming-zipformer-en-2023-06-26/test_wavs/0.wav + +.. note:: + + Please use ``./build/bin/Release/sherpa-onnx.exe`` for Windows. 
+ +You should see the following output: + +.. literalinclude:: ./code-zipformer/sherpa-onnx-streaming-zipformer-en-2023-06-26.txt + +int8 +^^^^ + +The following code shows how to use ``int8`` models to decode a wave file: + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + ./build/bin/sherpa-onnx \ + --tokens=./sherpa-onnx-streaming-zipformer-en-2023-06-26/tokens.txt \ + --encoder=./sherpa-onnx-streaming-zipformer-en-2023-06-26/encoder-epoch-99-avg-1-chunk-16-left-128.int8.onnx \ + --decoder=./sherpa-onnx-streaming-zipformer-en-2023-06-26/decoder-epoch-99-avg-1-chunk-16-left-128.onnx \ + --joiner=./sherpa-onnx-streaming-zipformer-en-2023-06-26/joiner-epoch-99-avg-1-chunk-16-left-128.int8.onnx \ + ./sherpa-onnx-streaming-zipformer-en-2023-06-26/test_wavs/0.wav + +.. note:: + + Please use ``./build/bin/Release/sherpa-onnx.exe`` for Windows. + +You should see the following output: + +.. literalinclude:: ./code-zipformer/sherpa-onnx-streaming-zipformer-en-2023-06-26-int8.txt + +Real-time speech recognition from a microphone +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + ./build/bin/sherpa-onnx-microphone \ + --tokens=./sherpa-onnx-streaming-zipformer-en-2023-06-26/tokens.txt \ + --encoder=./sherpa-onnx-streaming-zipformer-en-2023-06-26/encoder-epoch-99-avg-1-chunk-16-left-128.onnx \ + --decoder=./sherpa-onnx-streaming-zipformer-en-2023-06-26/decoder-epoch-99-avg-1-chunk-16-left-128.onnx \ + --joiner=./sherpa-onnx-streaming-zipformer-en-2023-06-26/joiner-epoch-99-avg-1-chunk-16-left-128.onnx + +.. hint:: + + If your system is Linux (including embedded Linux), you can also use + :ref:`sherpa-onnx-alsa` to do real-time speech recognition with your + microphone if ``sherpa-onnx-microphone`` does not work for you. + + +csukuangfj/sherpa-onnx-streaming-zipformer-en-2023-06-21 (English) +------------------------------------------------------------------ + +This model is converted from + +``_ + +which supports only English as it is trained on the `LibriSpeech`_ and `GigaSpeech`_ corpus. + +If you are interested in how the model is trained, please refer to +``_. + +In the following, we describe how to download it and use it with `sherpa-onnx`_. + +Download the model +~~~~~~~~~~~~~~~~~~ + +Please use the following commands to download it. + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-streaming-zipformer-en-2023-06-21.tar.bz2 + + tar xvf sherpa-onnx-streaming-zipformer-en-2023-06-21.tar.bz2 + rm sherpa-onnx-streaming-zipformer-en-2023-06-21.tar.bz2 + +Please check that the file sizes of the pre-trained models are correct. See +the file sizes of ``*.onnx`` files below. + +.. code-block:: bash + + sherpa-onnx-streaming-zipformer-en-2023-06-21 fangjun$ ls -lh *.onnx + -rw-r--r-- 1 fangjun staff 1.2M Jun 21 15:34 decoder-epoch-99-avg-1.int8.onnx + -rw-r--r-- 1 fangjun staff 2.0M Jun 21 15:34 decoder-epoch-99-avg-1.onnx + -rw-r--r-- 1 fangjun staff 179M Jun 21 15:36 encoder-epoch-99-avg-1.int8.onnx + -rw-r--r-- 1 fangjun staff 337M Jun 21 15:37 encoder-epoch-99-avg-1.onnx + -rw-r--r-- 1 fangjun staff 253K Jun 21 15:34 joiner-epoch-99-avg-1.int8.onnx + -rw-r--r-- 1 fangjun staff 1.0M Jun 21 15:34 joiner-epoch-99-avg-1.onnx + +Decode a single wave file +~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. hint:: + + It supports decoding only wave files of a single channel with 16-bit + encoded samples, while the sampling rate does not need to be 16 kHz. 
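+
+If your audio is not already a 16-bit, single-channel wave file, convert it
+first. The following is a minimal sketch that assumes ``ffmpeg`` is installed
+(it is not part of `sherpa-onnx`_) and uses a hypothetical input file
+``input.mp3``; resampling to 16 kHz is optional, since the sampling rate does
+not need to be 16 kHz.
+
+.. code-block:: bash
+
+   # -ac 1: single channel; -acodec pcm_s16le: 16-bit samples; -ar 16000: optional resampling
+   ffmpeg -i input.mp3 -ac 1 -ar 16000 -acodec pcm_s16le output.wav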
+ +fp32 +^^^^ + +The following code shows how to use ``fp32`` models to decode a wave file: + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + ./build/bin/sherpa-onnx \ + --tokens=./sherpa-onnx-streaming-zipformer-en-2023-06-21/tokens.txt \ + --encoder=./sherpa-onnx-streaming-zipformer-en-2023-06-21/encoder-epoch-99-avg-1.onnx \ + --decoder=./sherpa-onnx-streaming-zipformer-en-2023-06-21/decoder-epoch-99-avg-1.onnx \ + --joiner=./sherpa-onnx-streaming-zipformer-en-2023-06-21/joiner-epoch-99-avg-1.onnx \ + ./sherpa-onnx-streaming-zipformer-en-2023-06-21/test_wavs/0.wav + +.. note:: + + Please use ``./build/bin/Release/sherpa-onnx.exe`` for Windows. + +You should see the following output: + +.. literalinclude:: ./code-zipformer/sherpa-onnx-streaming-zipformer-en-2023-06-21.txt + +int8 +^^^^ + +The following code shows how to use ``int8`` models to decode a wave file: + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + ./build/bin/sherpa-onnx \ + --tokens=./sherpa-onnx-streaming-zipformer-en-2023-06-21/tokens.txt \ + --encoder=./sherpa-onnx-streaming-zipformer-en-2023-06-21/encoder-epoch-99-avg-1.int8.onnx \ + --decoder=./sherpa-onnx-streaming-zipformer-en-2023-06-21/decoder-epoch-99-avg-1.onnx \ + --joiner=./sherpa-onnx-streaming-zipformer-en-2023-06-21/joiner-epoch-99-avg-1.int8.onnx \ + ./sherpa-onnx-streaming-zipformer-en-2023-06-21/test_wavs/0.wav + +.. note:: + + Please use ``./build/bin/Release/sherpa-onnx.exe`` for Windows. + +You should see the following output: + +.. literalinclude:: ./code-zipformer/sherpa-onnx-streaming-zipformer-en-2023-06-21-int8.txt + +Real-time speech recognition from a microphone +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + ./build/bin/sherpa-onnx-microphone \ + --tokens=./sherpa-onnx-streaming-zipformer-en-2023-06-21/tokens.txt \ + --encoder=./sherpa-onnx-streaming-zipformer-en-2023-06-21/encoder-epoch-99-avg-1.onnx \ + --decoder=./sherpa-onnx-streaming-zipformer-en-2023-06-21/decoder-epoch-99-avg-1.onnx \ + --joiner=./sherpa-onnx-streaming-zipformer-en-2023-06-21/joiner-epoch-99-avg-1.onnx + +.. hint:: + + If your system is Linux (including embedded Linux), you can also use + :ref:`sherpa-onnx-alsa` to do real-time speech recognition with your + microphone if ``sherpa-onnx-microphone`` does not work for you. + + +csukuangfj/sherpa-onnx-streaming-zipformer-en-2023-02-21 (English) +------------------------------------------------------------------ + +This model is converted from + +``_ + +which supports only English as it is trained on the `LibriSpeech`_ corpus. + +You can find the training code at + +``_ + +In the following, we describe how to download it and use it with `sherpa-onnx`_. + +Download the model +~~~~~~~~~~~~~~~~~~ + +Please use the following commands to download it. + +.. tabs:: + + .. tab:: GitHub + + .. code-block:: bash + + cd /path/to/sherpa-onnx + + wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-streaming-zipformer-en-2023-02-21.tar.bz2 + + tar xvf sherpa-onnx-streaming-zipformer-en-2023-02-21.tar.bz2 + rm sherpa-onnx-streaming-zipformer-en-2023-02-21.tar.bz2 + + .. tab:: ModelScope + + .. code-block:: bash + + cd /path/to/sherpa-onnx + + GIT_LFS_SKIP_SMUDGE=1 git clone https://www.modelscope.cn/pkufool/sherpa-onnx-streaming-zipformer-en-2023-02-21.git + cd sherpa-onnx-streaming-zipformer-en-2023-02-21 + git lfs pull --include "*.onnx" + +Please check that the file sizes of the pre-trained models are correct. 
See +the file sizes of ``*.onnx`` files below. + +.. code-block:: bash + + sherpa-onnx-streaming-zipformer-en-2023-02-21$ ls -lh *.onnx + -rw-r--r-- 1 kuangfangjun root 1.3M Mar 31 23:06 decoder-epoch-99-avg-1.int8.onnx + -rw-r--r-- 1 kuangfangjun root 2.0M Feb 21 20:51 decoder-epoch-99-avg-1.onnx + -rw-r--r-- 1 kuangfangjun root 180M Mar 31 23:07 encoder-epoch-99-avg-1.int8.onnx + -rw-r--r-- 1 kuangfangjun root 338M Feb 21 20:51 encoder-epoch-99-avg-1.onnx + -rw-r--r-- 1 kuangfangjun root 254K Mar 31 23:06 joiner-epoch-99-avg-1.int8.onnx + -rw-r--r-- 1 kuangfangjun root 1003K Feb 21 20:51 joiner-epoch-99-avg-1.onnx + +Decode a single wave file +~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. hint:: + + It supports decoding only wave files of a single channel with 16-bit + encoded samples, while the sampling rate does not need to be 16 kHz. + +fp32 +^^^^ + +The following code shows how to use ``fp32`` models to decode a wave file: + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + ./build/bin/sherpa-onnx \ + --tokens=./sherpa-onnx-streaming-zipformer-en-2023-02-21/tokens.txt \ + --encoder=./sherpa-onnx-streaming-zipformer-en-2023-02-21/encoder-epoch-99-avg-1.onnx \ + --decoder=./sherpa-onnx-streaming-zipformer-en-2023-02-21/decoder-epoch-99-avg-1.onnx \ + --joiner=./sherpa-onnx-streaming-zipformer-en-2023-02-21/joiner-epoch-99-avg-1.onnx \ + ./sherpa-onnx-streaming-zipformer-en-2023-02-21/test_wavs/0.wav + +.. note:: + + Please use ``./build/bin/Release/sherpa-onnx.exe`` for Windows. + +You should see the following output: + +.. literalinclude:: ./code-zipformer/sherpa-onnx-streaming-zipformer-en-2023-02-21.txt + +int8 +^^^^ + +The following code shows how to use ``int8`` models to decode a wave file: + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + ./build/bin/sherpa-onnx \ + --tokens=./sherpa-onnx-streaming-zipformer-en-2023-02-21/tokens.txt \ + --encoder=./sherpa-onnx-streaming-zipformer-en-2023-02-21/encoder-epoch-99-avg-1.int8.onnx \ + --decoder=./sherpa-onnx-streaming-zipformer-en-2023-02-21/decoder-epoch-99-avg-1.onnx \ + --joiner=./sherpa-onnx-streaming-zipformer-en-2023-02-21/joiner-epoch-99-avg-1.int8.onnx \ + ./sherpa-onnx-streaming-zipformer-en-2023-02-21/test_wavs/0.wav + +.. note:: + + Please use ``./build/bin/Release/sherpa-onnx.exe`` for Windows. + +You should see the following output: + +.. literalinclude:: ./code-zipformer/sherpa-onnx-streaming-zipformer-en-2023-02-21-int8.txt + +Real-time speech recognition from a microphone +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + ./build/bin/sherpa-onnx-microphone \ + --tokens=./sherpa-onnx-streaming-zipformer-en-2023-02-21/tokens.txt \ + --encoder=./sherpa-onnx-streaming-zipformer-en-2023-02-21/encoder-epoch-99-avg-1.onnx \ + --decoder=./sherpa-onnx-streaming-zipformer-en-2023-02-21/decoder-epoch-99-avg-1.onnx \ + --joiner=./sherpa-onnx-streaming-zipformer-en-2023-02-21/joiner-epoch-99-avg-1.onnx + +.. hint:: + + If your system is Linux (including embedded Linux), you can also use + :ref:`sherpa-onnx-alsa` to do real-time speech recognition with your + microphone if ``sherpa-onnx-microphone`` does not work for you. + + +.. _sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20: + +csukuangfj/sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20 (Bilingual, Chinese + English) +---------------------------------------------------------------------------------------------------- + +This model is converted from + +``_ + +which supports both Chinese and English. 
The model is contributed by the community
+and is trained on tens of thousands of hours of an internal dataset.
+
+In the following, we describe how to download it and use it with `sherpa-onnx`_.
+
+Download the model
+~~~~~~~~~~~~~~~~~~
+
+Please use the following commands to download it.
+
+.. code-block:: bash
+
+   cd /path/to/sherpa-onnx
+
+   wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20.tar.bz2
+
+   tar xvf sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20.tar.bz2
+   rm sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20.tar.bz2
+
+Please check that the file sizes of the pre-trained models are correct. See
+the file sizes of ``*.onnx`` files below.
+
+.. code-block:: bash
+
+   sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20$ ls -lh *.onnx
+   -rw-r--r-- 1 kuangfangjun root 13M Mar 31 21:11 decoder-epoch-99-avg-1.int8.onnx
+   -rw-r--r-- 1 kuangfangjun root 14M Feb 20 20:13 decoder-epoch-99-avg-1.onnx
+   -rw-r--r-- 1 kuangfangjun root 174M Mar 31 21:11 encoder-epoch-99-avg-1.int8.onnx
+   -rw-r--r-- 1 kuangfangjun root 315M Feb 20 20:13 encoder-epoch-99-avg-1.onnx
+   -rw-r--r-- 1 kuangfangjun root 3.1M Mar 31 21:11 joiner-epoch-99-avg-1.int8.onnx
+   -rw-r--r-- 1 kuangfangjun root 13M Feb 20 20:13 joiner-epoch-99-avg-1.onnx
+
+
+Decode a single wave file
+~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. hint::
+
+   It supports decoding only wave files of a single channel with 16-bit
+   encoded samples, while the sampling rate does not need to be 16 kHz.
+
+fp32
+^^^^
+
+The following code shows how to use ``fp32`` models to decode a wave file:
+
+.. code-block:: bash
+
+   cd /path/to/sherpa-onnx
+
+   ./build/bin/sherpa-onnx \
+     --tokens=./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/tokens.txt \
+     --encoder=./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/encoder-epoch-99-avg-1.onnx \
+     --decoder=./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/decoder-epoch-99-avg-1.onnx \
+     --joiner=./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/joiner-epoch-99-avg-1.onnx \
+     ./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/test_wavs/1.wav
+
+.. note::
+
+   Please use ``./build/bin/Release/sherpa-onnx.exe`` for Windows.
+
+.. caution::
+
+   If you use Windows and get encoding issues, please run:
+
+   .. code-block:: bash
+
+      CHCP 65001
+
+   in your commandline.
+
+You should see the following output:
+
+.. literalinclude:: ./code-zipformer/sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20.txt
+
+int8
+^^^^
+
+The following code shows how to use ``int8`` models to decode a wave file:
+
+.. code-block:: bash
+
+   cd /path/to/sherpa-onnx
+
+   ./build/bin/sherpa-onnx \
+     --tokens=./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/tokens.txt \
+     --encoder=./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/encoder-epoch-99-avg-1.int8.onnx \
+     --decoder=./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/decoder-epoch-99-avg-1.onnx \
+     --joiner=./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/joiner-epoch-99-avg-1.int8.onnx \
+     ./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/test_wavs/1.wav
+
+.. note::
+
+   Please use ``./build/bin/Release/sherpa-onnx.exe`` for Windows.
+
+.. caution::
+
+   If you use Windows and get encoding issues, please run:
+
+   .. code-block:: bash
+
+      CHCP 65001
+
+   in your commandline.
+
+You should see the following output:
+
+.. 
literalinclude:: ./code-zipformer/sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20-int8.txt + +Real-time speech recognition from a microphone +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + ./build/bin/sherpa-onnx-microphone \ + --tokens=./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/tokens.txt \ + --encoder=./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/encoder-epoch-99-avg-1.onnx \ + --decoder=./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/decoder-epoch-99-avg-1.onnx \ + --joiner=./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/joiner-epoch-99-avg-1.onnx + +.. hint:: + + If your system is Linux (including embedded Linux), you can also use + :ref:`sherpa-onnx-alsa` to do real-time speech recognition with your + microphone if ``sherpa-onnx-microphone`` does not work for you. + + + +.. _sherpa_onnx_streaming_zipformer_fr_2023_04_14: + +shaojieli/sherpa-onnx-streaming-zipformer-fr-2023-04-14 (French) +---------------------------------------------------------------- + +This model is converted from + +``_ + +which supports only French as it is trained on the `CommonVoice`_ corpus. +In the following, we describe how to download it and use it with `sherpa-onnx`_. + +Download the model +~~~~~~~~~~~~~~~~~~ + +Please use the following commands to download it. + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-streaming-zipformer-fr-2023-04-14.tar.bz2 + + tar xvf sherpa-onnx-streaming-zipformer-fr-2023-04-14.tar.bz2 + rm sherpa-onnx-streaming-zipformer-fr-2023-04-14.tar.bz2 + +Please check that the file sizes of the pre-trained models are correct. See +the file sizes of ``*.onnx`` files below. + +.. code-block:: bash + + sherpa-onnx-streaming-zipformer-fr-2023-04-14 shaojieli$ ls -lh *.bin + + -rw-r--r-- 1 lishaojie Students 1.3M 4月 14 14:09 decoder-epoch-29-avg-9-with-averaged-model.int8.onnx + -rw-r--r-- 1 lishaojie Students 2.0M 4月 14 14:09 decoder-epoch-29-avg-9-with-averaged-model.onnx + -rw-r--r-- 1 lishaojie Students 121M 4月 14 14:09 encoder-epoch-29-avg-9-with-averaged-model.int8.onnx + -rw-r--r-- 1 lishaojie Students 279M 4月 14 14:09 encoder-epoch-29-avg-9-with-averaged-model.onnx + -rw-r--r-- 1 lishaojie Students 254K 4月 14 14:09 joiner-epoch-29-avg-9-with-averaged-model.int8.onnx + -rw-r--r-- 1 lishaojie Students 1003K 4月 14 14:09 joiner-epoch-29-avg-9-with-averaged-model.onnx + +Decode a single wave file +~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. hint:: + + It supports decoding only wave files of a single channel with 16-bit + encoded samples, while the sampling rate does not need to be 16 kHz. + +fp32 +^^^^ + +The following code shows how to use ``fp32`` models to decode a wave file: + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + ./build/bin/sherpa-onnx \ + --tokens=./sherpa-onnx-streaming-zipformer-fr-2023-04-14/tokens.txt \ + --encoder=./sherpa-onnx-streaming-zipformer-fr-2023-04-14/encoder-epoch-29-avg-9-with-averaged-model.onnx \ + --decoder=./sherpa-onnx-streaming-zipformer-fr-2023-04-14/decoder-epoch-29-avg-9-with-averaged-model.onnx \ + --joiner=./sherpa-onnx-streaming-zipformer-fr-2023-04-14/joiner-epoch-29-avg-9-with-averaged-model.onnx \ + ./sherpa-onnx-streaming-zipformer-fr-2023-04-14/test_wavs/common_voice_fr_19364697.wav + +.. note:: + + Please use ``./build/bin/Release/sherpa-onnx.exe`` for Windows. + +.. 
caution::
+
+   If you use Windows and get encoding issues, please run:
+
+   .. code-block:: bash
+
+      CHCP 65001
+
+   in your commandline.
+
+You should see the following output:
+
+.. literalinclude:: ./code-zipformer/sherpa-onnx-streaming-zipformer-fr-2023-04-14.txt
+
+int8
+^^^^
+
+The following code shows how to use ``int8`` models to decode a wave file:
+
+.. code-block:: bash
+
+   cd /path/to/sherpa-onnx
+
+   ./build/bin/sherpa-onnx \
+     --tokens=./sherpa-onnx-streaming-zipformer-fr-2023-04-14/tokens.txt \
+     --encoder=./sherpa-onnx-streaming-zipformer-fr-2023-04-14/encoder-epoch-29-avg-9-with-averaged-model.int8.onnx \
+     --decoder=./sherpa-onnx-streaming-zipformer-fr-2023-04-14/decoder-epoch-29-avg-9-with-averaged-model.onnx \
+     --joiner=./sherpa-onnx-streaming-zipformer-fr-2023-04-14/joiner-epoch-29-avg-9-with-averaged-model.int8.onnx \
+     ./sherpa-onnx-streaming-zipformer-fr-2023-04-14/test_wavs/common_voice_fr_19364697.wav
+
+.. note::
+
+   Please use ``./build/bin/Release/sherpa-onnx.exe`` for Windows.
+
+.. caution::
+
+   If you use Windows and get encoding issues, please run:
+
+   .. code-block:: bash
+
+      CHCP 65001
+
+   in your commandline.
+
+You should see the following output:
+
+.. literalinclude:: ./code-zipformer/sherpa-onnx-streaming-zipformer-fr-2023-04-14-int8.txt
+
+Real-time speech recognition from a microphone
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. code-block:: bash
+
+   cd /path/to/sherpa-onnx
+   ./build/bin/sherpa-onnx-microphone \
+     --tokens=./sherpa-onnx-streaming-zipformer-fr-2023-04-14/tokens.txt \
+     --encoder=./sherpa-onnx-streaming-zipformer-fr-2023-04-14/encoder-epoch-29-avg-9-with-averaged-model.onnx \
+     --decoder=./sherpa-onnx-streaming-zipformer-fr-2023-04-14/decoder-epoch-29-avg-9-with-averaged-model.onnx \
+     --joiner=./sherpa-onnx-streaming-zipformer-fr-2023-04-14/joiner-epoch-29-avg-9-with-averaged-model.onnx
+
+.. hint::
+
+   If your system is Linux (including embedded Linux), you can also use
+   :ref:`sherpa-onnx-alsa` to do real-time speech recognition with your
+   microphone if ``sherpa-onnx-microphone`` does not work for you.
+
+.. _sherpa_onnx_streaming_zipformer_small_bilingual_zh_en_2023_02_16:
+
+sherpa-onnx-streaming-zipformer-small-bilingual-zh-en-2023-02-16 (Bilingual, Chinese + English)
+-------------------------------------------------------------------------------------------------
+
+.. hint::
+
+   It is a small model.
+
+This model is converted from
+
+``_
+
+which supports both Chinese and English. The model is contributed by the community
+and is trained on tens of thousands of hours of an internal dataset.
+
+In the following, we describe how to download it and use it with `sherpa-onnx`_.
+
+Download the model
+~~~~~~~~~~~~~~~~~~
+
+Please use the following commands to download it.
+
+.. code-block:: bash
+
+   cd /path/to/sherpa-onnx
+
+   wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-streaming-zipformer-small-bilingual-zh-en-2023-02-16.tar.bz2
+
+   tar xf sherpa-onnx-streaming-zipformer-small-bilingual-zh-en-2023-02-16.tar.bz2
+   rm sherpa-onnx-streaming-zipformer-small-bilingual-zh-en-2023-02-16.tar.bz2
+
+Please check that the file sizes of the pre-trained models are correct. See
+the file sizes of ``*.onnx`` files below.
+
+.. 
code-block:: bash + + sherpa-onnx-streaming-zipformer-small-bilingual-zh-en-2023-02-16 fangjun$ ls -lh *.onnx + total 158M + drwxr-xr-x 2 1001 127 4.0K Mar 20 13:11 64 + drwxr-xr-x 2 1001 127 4.0K Mar 20 13:11 96 + -rw-r--r-- 1 1001 127 240K Mar 20 13:11 bpe.model + -rw-r--r-- 1 1001 127 3.4M Mar 20 13:11 decoder-epoch-99-avg-1.int8.onnx + -rw-r--r-- 1 1001 127 14M Mar 20 13:11 decoder-epoch-99-avg-1.onnx + -rw-r--r-- 1 1001 127 41M Mar 20 13:11 encoder-epoch-99-avg-1.int8.onnx + -rw-r--r-- 1 1001 127 85M Mar 20 13:11 encoder-epoch-99-avg-1.onnx + -rw-r--r-- 1 1001 127 3.1M Mar 20 13:11 joiner-epoch-99-avg-1.int8.onnx + -rw-r--r-- 1 1001 127 13M Mar 20 13:11 joiner-epoch-99-avg-1.onnx + drwxr-xr-x 2 1001 127 4.0K Mar 20 13:11 test_wavs + -rw-r--r-- 1 1001 127 55K Mar 20 13:11 tokens.txt + +.. hint:: + + There are two sub-folders in the model directory: ``64`` and ``96``. + The number represents chunk size. The larger the number, the lower the RTF. + The default chunk size is 32. + +Decode a single wave file +~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. hint:: + + It supports decoding only wave files of a single channel with 16-bit + encoded samples, while the sampling rate does not need to be 16 kHz. + +fp32 +^^^^ + +The following code shows how to use ``fp32`` models to decode a wave file: + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + ./build/bin/sherpa-onnx \ + --tokens=./sherpa-onnx-streaming-zipformer-small-bilingual-zh-en-2023-02-16/tokens.txt \ + --encoder=./sherpa-onnx-streaming-zipformer-small-bilingual-zh-en-2023-02-16/encoder-epoch-99-avg-1.onnx \ + --decoder=./sherpa-onnx-streaming-zipformer-small-bilingual-zh-en-2023-02-16/decoder-epoch-99-avg-1.onnx \ + --joiner=./sherpa-onnx-streaming-zipformer-small-bilingual-zh-en-2023-02-16/joiner-epoch-99-avg-1.onnx \ + ./sherpa-onnx-streaming-zipformer-small-bilingual-zh-en-2023-02-16/test_wavs/0.wav + +.. note:: + + Please use ``./build/bin/Release/sherpa-onnx.exe`` for Windows. + +.. caution:: + + If you use Windows and get encoding issues, please run: + + .. code-block:: bash + + CHCP 65001 + + in your commandline. + +You should see the following output: + +.. literalinclude:: ./code-zipformer/sherpa-onnx-streaming-zipformer-small-bilingual-zh-en-2023-02-16.txt + +int8 +^^^^ + +The following code shows how to use ``int8`` models to decode a wave file: + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + ./build/bin/sherpa-onnx \ + --tokens=./sherpa-onnx-streaming-zipformer-small-bilingual-zh-en-2023-02-16/tokens.txt \ + --encoder=./sherpa-onnx-streaming-zipformer-small-bilingual-zh-en-2023-02-16/encoder-epoch-99-avg-1.int8.onnx \ + --decoder=./sherpa-onnx-streaming-zipformer-small-bilingual-zh-en-2023-02-16/decoder-epoch-99-avg-1.onnx \ + --joiner=./sherpa-onnx-streaming-zipformer-small-bilingual-zh-en-2023-02-16/joiner-epoch-99-avg-1.int8.onnx \ + ./sherpa-onnx-streaming-zipformer-small-bilingual-zh-en-2023-02-16/test_wavs/0.wav + +.. note:: + + Please use ``./build/bin/Release/sherpa-onnx.exe`` for Windows. + +.. caution:: + + If you use Windows and get encoding issues, please run: + + .. code-block:: bash + + CHCP 65001 + + in your commandline. + +You should see the following output: + +.. literalinclude:: ./code-zipformer/sherpa-onnx-streaming-zipformer-small-bilingual-zh-en-2023-02-16.int8.txt + +Real-time speech recognition from a microphone +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. 
code-block:: bash + + cd /path/to/sherpa-onnx + + ./build/bin/sherpa-onnx-microphone \ + --tokens=./sherpa-onnx-streaming-zipformer-small-bilingual-zh-en-2023-02-16/tokens.txt \ + --encoder=./sherpa-onnx-streaming-zipformer-small-bilingual-zh-en-2023-02-16/encoder-epoch-99-avg-1.int8.onnx \ + --decoder=./sherpa-onnx-streaming-zipformer-small-bilingual-zh-en-2023-02-16/decoder-epoch-99-avg-1.onnx \ + --joiner=./sherpa-onnx-streaming-zipformer-small-bilingual-zh-en-2023-02-16/joiner-epoch-99-avg-1.int8.onnx + +.. hint:: + + If your system is Linux (including embedded Linux), you can also use + :ref:`sherpa-onnx-alsa` to do real-time speech recognition with your + microphone if ``sherpa-onnx-microphone`` does not work for you. + +.. _sherpa_onnx_streaming_zipformer_zh_14M_2023_02_23: + +csukuangfj/sherpa-onnx-streaming-zipformer-zh-14M-2023-02-23 (Chinese) +---------------------------------------------------------------------- + +.. hint:: + + It is a small model. + +This model is from + +``_ + +which supports only Chinese as it is trained on the `WenetSpeech`_ corpus. + +In the following, we describe how to download it and use it with `sherpa-onnx`_. + +Download the model +~~~~~~~~~~~~~~~~~~ + +Please use the following commands to download it. + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-streaming-zipformer-zh-14M-2023-02-23.tar.bz2 + + tar xvf sherpa-onnx-streaming-zipformer-zh-14M-2023-02-23.tar.bz2 + rm sherpa-onnx-streaming-zipformer-zh-14M-2023-02-23.tar.bz2 + +Please check that the file sizes of the pre-trained models are correct. See +the file sizes of ``*.onnx`` files below. + +.. code-block:: bash + + sherpa-onnx-streaming-zipformer-zh-14M-2023-02-23 fangjun$ ls -lh *.onnx + -rw-r--r-- 1 fangjun staff 1.8M Sep 10 15:31 decoder-epoch-99-avg-1.int8.onnx + -rw-r--r-- 1 fangjun staff 7.2M Sep 10 15:31 decoder-epoch-99-avg-1.onnx + -rw-r--r-- 1 fangjun staff 21M Sep 10 15:31 encoder-epoch-99-avg-1.int8.onnx + -rw-r--r-- 1 fangjun staff 39M Sep 10 15:31 encoder-epoch-99-avg-1.onnx + -rw-r--r-- 1 fangjun staff 1.7M Sep 10 15:31 joiner-epoch-99-avg-1.int8.onnx + -rw-r--r-- 1 fangjun staff 6.8M Sep 10 15:31 joiner-epoch-99-avg-1.onnx + +Decode a single wave file +~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. hint:: + + It supports decoding only wave files of a single channel with 16-bit + encoded samples, while the sampling rate does not need to be 16 kHz. + +fp32 +^^^^ + +The following code shows how to use ``fp32`` models to decode a wave file: + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + ./build/bin/sherpa-onnx \ + --tokens=./sherpa-onnx-streaming-zipformer-zh-14M-2023-02-23/tokens.txt \ + --encoder=./sherpa-onnx-streaming-zipformer-zh-14M-2023-02-23/encoder-epoch-99-avg-1.onnx \ + --decoder=./sherpa-onnx-streaming-zipformer-zh-14M-2023-02-23/decoder-epoch-99-avg-1.onnx \ + --joiner=./sherpa-onnx-streaming-zipformer-zh-14M-2023-02-23/joiner-epoch-99-avg-1.onnx \ + ./sherpa-onnx-streaming-zipformer-zh-14M-2023-02-23/test_wavs/0.wav + +.. note:: + + Please use ``./build/bin/Release/sherpa-onnx.exe`` for Windows. + +.. caution:: + + If you use Windows and get encoding issues, please run: + + .. code-block:: bash + + CHCP 65001 + + in your commandline. + +You should see the following output: + +.. literalinclude:: ./code-zipformer/sherpa-onnx-streaming-zipformer-zh-14M-2023-02-23.txt + +int8 +^^^^ + +The following code shows how to use ``int8`` models to decode a wave file: + +.. 
code-block:: bash + + cd /path/to/sherpa-onnx + + ./build/bin/sherpa-onnx \ + --tokens=./sherpa-onnx-streaming-zipformer-zh-14M-2023-02-23/tokens.txt \ + --encoder=./sherpa-onnx-streaming-zipformer-zh-14M-2023-02-23/encoder-epoch-99-avg-1.int8.onnx \ + --decoder=./sherpa-onnx-streaming-zipformer-zh-14M-2023-02-23/decoder-epoch-99-avg-1.onnx \ + --joiner=./sherpa-onnx-streaming-zipformer-zh-14M-2023-02-23/joiner-epoch-99-avg-1.int8.onnx \ + ./sherpa-onnx-streaming-zipformer-zh-14M-2023-02-23/test_wavs/0.wav + +.. note:: + + Please use ``./build/bin/Release/sherpa-onnx.exe`` for Windows. + +.. caution:: + + If you use Windows and get encoding issues, please run: + + .. code-block:: bash + + CHCP 65001 + + in your commandline. + +You should see the following output: + +.. literalinclude:: ./code-zipformer/sherpa-onnx-streaming-zipformer-zh-14M-2023-02-23-int8.txt + +Real-time speech recognition from a microphone +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + ./build/bin/sherpa-onnx-microphone \ + --tokens=./sherpa-onnx-streaming-zipformer-zh-14M-2023-02-23/tokens.txt \ + --encoder=./sherpa-onnx-streaming-zipformer-zh-14M-2023-02-23/encoder-epoch-99-avg-1.onnx \ + --decoder=./sherpa-onnx-streaming-zipformer-zh-14M-2023-02-23/decoder-epoch-99-avg-1.onnx \ + --joiner=./sherpa-onnx-streaming-zipformer-zh-14M-2023-02-23/joiner-epoch-99-avg-1.onnx + +.. hint:: + + If your system is Linux (including embedded Linux), you can also use + :ref:`sherpa-onnx-alsa` to do real-time speech recognition with your + microphone if ``sherpa-onnx-microphone`` does not work for you. + + +.. _sherpa_onnx_streaming_zipformer_en_20M_2023_02_17: + +csukuangfj/sherpa-onnx-streaming-zipformer-en-20M-2023-02-17 (English) +----------------------------------------------------------------------- + +.. hint:: + + It is a small model. + +This model is from + +``_ + +which supports only English as it is trained on the `LibriSpeech`_ corpus. + +In the following, we describe how to download it and use it with `sherpa-onnx`_. + +Download the model +~~~~~~~~~~~~~~~~~~ + +Please use the following commands to download it. + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-streaming-zipformer-en-20M-2023-02-17.tar.bz2 + + tar xvf sherpa-onnx-streaming-zipformer-en-20M-2023-02-17.tar.bz2 + rm sherpa-onnx-streaming-zipformer-en-20M-2023-02-17.tar.bz2 + +Please check that the file sizes of the pre-trained models are correct. See +the file sizes of ``*.onnx`` files below. + +.. code-block:: bash + + sherpa-onnx-streaming-zipformer-en-20M-2023-02-17 fangjun$ ls -lh *.onnx + -rw-r--r-- 1 fangjun staff 527K Sep 10 17:06 decoder-epoch-99-avg-1.int8.onnx + -rw-r--r-- 1 fangjun staff 2.0M Sep 10 17:06 decoder-epoch-99-avg-1.onnx + -rw-r--r-- 1 fangjun staff 41M Sep 10 17:06 encoder-epoch-99-avg-1.int8.onnx + -rw-r--r-- 1 fangjun staff 85M Sep 10 17:06 encoder-epoch-99-avg-1.onnx + -rw-r--r-- 1 fangjun staff 253K Sep 10 17:06 joiner-epoch-99-avg-1.int8.onnx + -rw-r--r-- 1 fangjun staff 1.0M Sep 10 17:06 joiner-epoch-99-avg-1.onnx + +Decode a single wave file +~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. hint:: + + It supports decoding only wave files of a single channel with 16-bit + encoded samples, while the sampling rate does not need to be 16 kHz. + +fp32 +^^^^ + +The following code shows how to use ``fp32`` models to decode a wave file: + +.. 
code-block:: bash + + cd /path/to/sherpa-onnx + + ./build/bin/sherpa-onnx \ + --tokens=./sherpa-onnx-streaming-zipformer-en-20M-2023-02-17/tokens.txt \ + --encoder=./sherpa-onnx-streaming-zipformer-en-20M-2023-02-17/encoder-epoch-99-avg-1.onnx \ + --decoder=./sherpa-onnx-streaming-zipformer-en-20M-2023-02-17/decoder-epoch-99-avg-1.onnx \ + --joiner=./sherpa-onnx-streaming-zipformer-en-20M-2023-02-17/joiner-epoch-99-avg-1.onnx \ + ./sherpa-onnx-streaming-zipformer-en-20M-2023-02-17/test_wavs/0.wav + +.. note:: + + Please use ``./build/bin/Release/sherpa-onnx.exe`` for Windows. + +You should see the following output: + +.. literalinclude:: ./code-zipformer/sherpa-onnx-streaming-zipformer-en-20M-2023-02-17.txt + +int8 +^^^^ + +The following code shows how to use ``int8`` models to decode a wave file: + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + ./build/bin/sherpa-onnx \ + --tokens=./sherpa-onnx-streaming-zipformer-en-20M-2023-02-17/tokens.txt \ + --encoder=./sherpa-onnx-streaming-zipformer-en-20M-2023-02-17/encoder-epoch-99-avg-1.int8.onnx \ + --decoder=./sherpa-onnx-streaming-zipformer-en-20M-2023-02-17/decoder-epoch-99-avg-1.onnx \ + --joiner=./sherpa-onnx-streaming-zipformer-en-20M-2023-02-17/joiner-epoch-99-avg-1.int8.onnx \ + ./sherpa-onnx-streaming-zipformer-en-20M-2023-02-17/test_wavs/0.wav + +.. note:: + + Please use ``./build/bin/Release/sherpa-onnx.exe`` for Windows. + +You should see the following output: + +.. literalinclude:: ./code-zipformer/sherpa-onnx-streaming-zipformer-en-20M-2023-02-17-int8.txt + +Real-time speech recognition from a microphone +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + ./build/bin/sherpa-onnx-microphone \ + --tokens=./sherpa-onnx-streaming-zipformer-en-20M-2023-02-17/tokens.txt \ + --encoder=./sherpa-onnx-streaming-zipformer-en-20M-2023-02-17/encoder-epoch-99-avg-1.onnx \ + --decoder=./sherpa-onnx-streaming-zipformer-en-20M-2023-02-17/decoder-epoch-99-avg-1.onnx \ + --joiner=./sherpa-onnx-streaming-zipformer-en-20M-2023-02-17/joiner-epoch-99-avg-1.onnx + +.. hint:: + + If your system is Linux (including embedded Linux), you can also use + :ref:`sherpa-onnx-alsa` to do real-time speech recognition with your + microphone if ``sherpa-onnx-microphone`` does not work for you. diff --git a/docs/source/onnx/pretrained_models/small-online-models.rst b/docs/source/onnx/pretrained_models/small-online-models.rst new file mode 100644 index 000000000..1f3792a74 --- /dev/null +++ b/docs/source/onnx/pretrained_models/small-online-models.rst @@ -0,0 +1,24 @@ +Small models +============ + +In this section, we list online/streaming models with fewer parameters +that are suitable for resource constrained embedded systems. + +.. hint:: + + You can use them as a first pass model in a two-pass system, where + the second pass uses a non-streaming model. + +.. hint:: + + If you are using Raspberry Pi 4, this section is not so helpful for you + since all models in `sherpa-onnx`_ are able to run in real-time on it. + + This page is especially useful for systems with less resource than + Raspberry Pi 4. 
+ + +- :ref:`sherpa_onnx_streaming_zipformer_zh_14M_2023_02_23` +- :ref:`sherpa_onnx_streaming_zipformer_en_20M_2023_02_17` +- :ref:`sherpa_onnx_streaming_zipformer_small_bilingual_zh_en_2023_02_16` + diff --git a/docs/source/onnx/pretrained_models/telespeech/code/sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04.txt b/docs/source/onnx/pretrained_models/telespeech/code/sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04.txt new file mode 100644 index 000000000..d7054be37 --- /dev/null +++ b/docs/source/onnx/pretrained_models/telespeech/code/sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04.txt @@ -0,0 +1,20 @@ +/Users/fangjun/open-source/sherpa-onnx/sherpa-onnx/csrc/parse-options.cc:Read:361 ./build/bin/sherpa-onnx-offline --telespeech-ctc=./sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04/model.int8.onnx --tokens=./sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04/tokens.txt --model-type=telespeech_ctc --num-threads=1 ./sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04/test_wavs/3-sichuan.wav ./sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04/test_wavs/4-tianjin.wav ./sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04/test_wavs/5-henan.wav + +OfflineRecognizerConfig(feat_config=FeatureExtractorConfig(sampling_rate=16000, feature_dim=80, low_freq=20, high_freq=-400, dither=0), model_config=OfflineModelConfig(transducer=OfflineTransducerModelConfig(encoder_filename="", decoder_filename="", joiner_filename=""), paraformer=OfflineParaformerModelConfig(model=""), nemo_ctc=OfflineNemoEncDecCtcModelConfig(model=""), whisper=OfflineWhisperModelConfig(encoder="", decoder="", language="", task="transcribe", tail_paddings=-1), tdnn=OfflineTdnnModelConfig(model=""), zipformer_ctc=OfflineZipformerCtcModelConfig(model=""), wenet_ctc=OfflineWenetCtcModelConfig(model=""), telespeech_ctc="./sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04/model.int8.onnx", tokens="./sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04/tokens.txt", num_threads=1, debug=False, provider="cpu", model_type="telespeech_ctc", modeling_unit="cjkchar", bpe_vocab=""), lm_config=OfflineLMConfig(model="", scale=0.5), ctc_fst_decoder_config=OfflineCtcFstDecoderConfig(graph="", max_active=3000), decoding_method="greedy_search", max_active_paths=4, hotwords_file="", hotwords_score=1.5, blank_penalty=0) +Creating recognizer ... +Started +Done! 
+ +./sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04/test_wavs/3-sichuan.wav +{"text": "自己就是在那个在那个就是在情界里面就是感觉演得特别好就是好像很真实样知道吧", "timestamps": [0.08, 0.36, 0.52, 0.72, 0.92, 1.16, 1.36, 1.88, 2.20, 2.36, 3.16, 3.28, 3.40, 3.60, 3.80, 3.92, 4.08, 4.24, 4.40, 4.56, 4.76, 5.16, 5.32, 5.44, 5.64, 5.76, 5.88, 6.04, 6.16, 6.28, 6.40, 6.60, 6.88, 7.12, 7.40, 7.52, 7.64], "tokens":["自", "己", "就", "是", "在", "那", "个", "在", "那", "个", "就", "是", "在", "情", "界", "里", "面", "就", "是", "感", "觉", "演", "得", "特", "别", "好", "就", "是", "好", "像", "很", "真", "实", "样", "知", "道", "吧"]} +---- +./sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04/test_wavs/4-tianjin.wav +{"text": "他就每个人手法就这意思法律意识太单薄了而且就是也不顾及到别人的感受", "timestamps": [0.36, 0.56, 1.04, 1.16, 1.24, 1.64, 1.88, 2.24, 2.40, 2.60, 2.80, 3.12, 3.32, 3.64, 3.80, 3.96, 4.16, 4.44, 4.68, 4.80, 5.00, 5.16, 5.28, 6.12, 6.28, 6.44, 6.60, 6.72, 6.88, 7.04, 7.12, 7.32, 7.52], "tokens":["他", "就", "每", "个", "人", "手", "法", "就", "这", "意", "思", "法", "律", "意", "识", "太", "单", "薄", "了", "而", "且", "就", "是", "也", "不", "顾", "及", "到", "别", "人", "的", "感", "受"]} +---- +./sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04/test_wavs/5-henan.wav +{"text": "他这个管一都通到有时候都通到七八层楼高然它这个管箱就可交到那个那珠子上", "timestamps": [0.04, 0.12, 0.24, 0.40, 1.00, 1.24, 1.44, 1.68, 2.32, 2.48, 2.60, 2.64, 2.80, 3.00, 3.16, 3.32, 3.52, 3.68, 3.92, 5.00, 5.16, 5.28, 5.32, 5.44, 5.84, 6.00, 6.12, 6.48, 6.68, 6.84, 7.00, 7.16, 7.32, 7.56, 7.68], "tokens":["他", "这", "个", "管", "一", "都", "通", "到", "有", "时", "候", "都", "通", "到", "七", "八", "层", "楼", "高", "然", "它", "这", "个", "管", "箱", "就", "可", "交", "到", "那", "个", "那", "珠", "子", "上"]} +---- +num threads: 1 +decoding method: greedy_search +Elapsed seconds: 3.406 s +Real time factor (RTF): 3.406 / 23.634 = 0.144 diff --git a/docs/source/onnx/pretrained_models/telespeech/how-to-export.rst b/docs/source/onnx/pretrained_models/telespeech/how-to-export.rst new file mode 100644 index 000000000..c0eea4801 --- /dev/null +++ b/docs/source/onnx/pretrained_models/telespeech/how-to-export.rst @@ -0,0 +1,26 @@ +How to export models from Tele-AI/TeleSpeech-ASR to sherpa-onnx +=============================================================== + +This section describes how to export CTC models from `Tele-AI/TeleSpeech-ASR` to `sherpa-onnx`_. + +Step 1: Export model.onnx +------------------------- + +The first step is to obtain ``model.onnx``. + +Please see ``_ +for details. + +Step 2: Add metadata +-------------------- + +To be usable in `sherpa-onnx`_, we have to use `add-metadata.py `_ to add metadata to ``model.onnx``. + +Please see ``_ +for details. + + +Step 3: Obtain tokens.txt +------------------------- + +Please also see ``_ diff --git a/docs/source/onnx/pretrained_models/telespeech/index.rst b/docs/source/onnx/pretrained_models/telespeech/index.rst new file mode 100644 index 000000000..b23c36dde --- /dev/null +++ b/docs/source/onnx/pretrained_models/telespeech/index.rst @@ -0,0 +1,17 @@ +TeleSpeech +========== + +This page lists all offline CTC models from ``_. + +.. hint:: + + Please see the license at `TeleSpeech模型社区许可协议.pdf `_ + + Models from ``_ are subject to the above license + if you want to use them for commercial purpose. + +.. 
toctree:: + :maxdepth: 5 + + how-to-export + models diff --git a/docs/source/onnx/pretrained_models/telespeech/models.rst b/docs/source/onnx/pretrained_models/telespeech/models.rst new file mode 100644 index 000000000..f43e78a4c --- /dev/null +++ b/docs/source/onnx/pretrained_models/telespeech/models.rst @@ -0,0 +1,75 @@ +Models +====== + +sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04 (支持非常多种方言) +------------------------------------------------------------------------------------- + +.. hint:: + + 这个模型支持很多种方言。 + +Please use the following commands to download it. + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04.tar.bz2 + + tar xvf sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04.tar.bz2 + rm sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04.tar.bz2 + +Please check that the file sizes of the pre-trained models are correct. See +the file sizes of ``*.onnx`` files below. + +.. code-block:: bash + + $ ls -lh *.onnx + -rw-r--r-- 1 fangjun staff 325M Jun 4 11:56 model.int8.onnx + +Decode wave files +~~~~~~~~~~~~~~~~~ + +.. hint:: + + It supports decoding only wave files of a single channel with 16-bit + encoded samples, while the sampling rate does not need to be 16 kHz. + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + ./build/bin/sherpa-onnx-offline \ + --telespeech-ctc=./sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04/model.int8.onnx \ + --tokens=./sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04/tokens.txt \ + --model-type=telespeech_ctc \ + --num-threads=1 \ + ./sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04/test_wavs/3-sichuan.wav \ + ./sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04/test_wavs/4-tianjin.wav \ + ./sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04/test_wavs/5-henan.wav + +.. note:: + + Please use ``./build/bin/Release/sherpa-onnx-offline.exe`` for Windows. + +.. caution:: + + If you use Windows and get encoding issues, please run: + + .. code-block:: bash + + CHCP 65001 + + in your commandline. + +You should see the following output: + +.. literalinclude:: ./code/sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04.txt + +.. note:: + + The ``feature_dim=80`` is incorrect in the above logs. The actual value is 40. + +.. hint:: + + There is also a float32 model. Please see ``_ diff --git a/docs/source/onnx/pretrained_models/wenet/all-models.rst b/docs/source/onnx/pretrained_models/wenet/all-models.rst new file mode 100644 index 000000000..33cd7b8e8 --- /dev/null +++ b/docs/source/onnx/pretrained_models/wenet/all-models.rst @@ -0,0 +1,36 @@ +All models from WeNet +===================== + +``_ +lists all pre-trained models from `WeNet`_ and we have converted all of them +to `sherpa-onnx`_ using the following script: + + ``_. + +We have uploaded the exported models to huggingface and you can find them from +the following figure: + + .. figure:: ./pic/wenet-models-onnx-list.jpg + :alt: All pretrained models from `WeNet` + :width: 600 + + All pre-trained models from `WeNet`_. + +To make it easier to copy the links, we list them below: + + - ``_ + - ``_ + - ``_ + - ``_ + - ``_ + - ``_ + +Colab +----- + +We provide a colab notebook +|Sherpa-onnx wenet ctc colab notebook| +for you to try the exported `WeNet`_ models with `sherpa-onnx`_. + +.. 
|Sherpa-onnx wenet ctc colab notebook| image:: https://colab.research.google.com/assets/colab-badge.svg + :target: https://github.com/k2-fsa/colab/blob/master/sherpa-onnx/sherpa_onnx_with_models_from_wenet.ipynb diff --git a/docs/source/onnx/pretrained_models/wenet/how-to-export.rst b/docs/source/onnx/pretrained_models/wenet/how-to-export.rst new file mode 100644 index 000000000..0591545ba --- /dev/null +++ b/docs/source/onnx/pretrained_models/wenet/how-to-export.rst @@ -0,0 +1,99 @@ +How to export models from WeNet to sherpa-onnx +============================================== + +Suppose you have the following files from `WeNet`_: + + - ``final.pt`` + - ``train.yaml`` + - ``global_cmvn`` + - ``units.txt`` + +We describe below how to use scripts from `sherpa-onnx`_ to export your files. + +.. hint:: + + Both streaming and non-streaming models are supported. + +Export for non-streaming inference +---------------------------------- + +You can use the following script + + ``_ + +to export your model to `sherpa-onnx`_. After running it, you should get two files: + + - ``model.onnx`` + - ``model.int8.onnx``. + +Next, we rename ``units.txt`` to ``tokens.txt`` to follow the convention used in `sherpa-onnx`_: + +.. code-block:: bash + + mv units.txt tokens.txt + +Now you can use the following command for speech recognition with the exported models: + +.. code-block:: bash + + # with float32 models + ./build/bin/sherpa-onnx-offline \ + --wenet-ctc-model=./model.onnx \ + --tokens=./tokens.txt \ + /path/to/some.wav + + # with int8 models + ./build/bin/sherpa-onnx-offline \ + --wenet-ctc-model=./model.int8.onnx \ + --tokens=./tokens.txt \ + /path/to/some.wav + +Export for streaming inference +------------------------------ + +You can use the following script + + ``_ + +to export your model to `sherpa-onnx`_. After running it, you should get two files: + + - ``model-streaming.onnx`` + - ``model-streaming.int8.onnx``. + +Next, we rename ``units.txt`` to ``tokens.txt`` to follow the convention used in `sherpa-onnx`_: + +.. code-block:: bash + + mv units.txt tokens.txt + +Now you can use the following command for speech recognition with the exported models: + +.. code-block:: bash + + # with float32 models + ./build/bin/sherpa-onnx \ + --wenet-ctc-model=./model-streaming.onnx \ + --tokens=./tokens.txt \ + /path/to/some.wav + + # with int8 models + ./build/bin/sherpa-onnx \ + --wenet-ctc-model=./model-streaming.int8.onnx \ + --tokens=./tokens.txt \ + /path/to/some.wav + +FAQs +---- + +sherpa-onnx/csrc/online-wenet-ctc-model.cc:Init:144 head does not exist in the metadata +----------------------------------------------------------------------------------------- + +.. code-block:: + + /Users/fangjun/open-source/sherpa-onnx/sherpa-onnx/csrc/online-wenet-ctc-model.cc:Init:144 head does not exist in the metadata + +To fix the above error, please check the following two items: + + - Make sure you are using ``model-streaming.onnx`` or ``model-streaming.int8.onnx``. The executable + you are running requires a streaming model as input. + - Make sure you use the script from `sherpa-onnx`_ to export your model. diff --git a/docs/source/onnx/pretrained_models/wenet/index.rst b/docs/source/onnx/pretrained_models/wenet/index.rst new file mode 100644 index 000000000..49e18edb1 --- /dev/null +++ b/docs/source/onnx/pretrained_models/wenet/index.rst @@ -0,0 +1,12 @@ +WeNet +===== + +This page lists all CTC models from `WeNet`_. + + +..
toctree:: + :maxdepth: 5 + + how-to-export + all-models + diff --git a/docs/source/onnx/pretrained_models/wenet/pic/wenet-models-onnx-list.jpg b/docs/source/onnx/pretrained_models/wenet/pic/wenet-models-onnx-list.jpg new file mode 100644 index 000000000..dc1eb1a1d Binary files /dev/null and b/docs/source/onnx/pretrained_models/wenet/pic/wenet-models-onnx-list.jpg differ diff --git a/docs/source/onnx/pretrained_models/whisper/colab.rst b/docs/source/onnx/pretrained_models/whisper/colab.rst new file mode 100644 index 000000000..69c118e5d --- /dev/null +++ b/docs/source/onnx/pretrained_models/whisper/colab.rst @@ -0,0 +1,30 @@ +colab +===== + +Non-large models +----------------- + +We provide a colab notebook +|Sherpa-onnx offline recognition with whisper python api colab notebook| +for you to try `Whisper`_ models with `sherpa-onnx`_ step by step. + +.. |Sherpa-onnx offline recognition with whisper python api colab notebook| image:: https://colab.research.google.com/assets/colab-badge.svg + :target: https://github.com/k2-fsa/colab/blob/master/sherpa-onnx/sherpa_onnx_whisper_models.ipynb + +.. image:: ./pic/whisper-colab.png + :alt: screenshot of using whisper with sherpa-onnx in colab + :width: 600 + +Large models +------------ + +For ``large`` models of whisper, please see the following colab notebook +|sherpa-onnx with whisper large-v3 colab notebook|. +It walks you step by step to try the exported large-v3 onnx model with `sherpa-onnx`_ +on CPU as well as on GPU. + +You will find the RTF on GPU (Tesla T4) is less than 1. + +.. |sherpa-onnx with whisper large-v3 colab notebook| image:: https://colab.research.google.com/assets/colab-badge.svg + :target: https://github.com/k2-fsa/colab/blob/master/sherpa-onnx/sherpa_onnx_whisper_large_v3.ipynb + diff --git a/docs/source/onnx/pretrained_models/whisper/export-onnx.rst b/docs/source/onnx/pretrained_models/whisper/export-onnx.rst new file mode 100644 index 000000000..628418047 --- /dev/null +++ b/docs/source/onnx/pretrained_models/whisper/export-onnx.rst @@ -0,0 +1,224 @@ +Export Whisper to ONNX +====================== + +This section describes how to export `Whisper`_ models to `onnx`_. + + +Available models +---------------- + +Note that we have already exported `Whisper`_ models to `onnx`_ and they are available +from the following huggingface repositories: + +.. list-table:: + + * - Model type + - Huggingface repo + - 国内用户 + * - ``tiny.en`` + - ``_ + - `点此 `_ + * - ``base.en`` + - ``_ + - `点此 `_ + * - ``small.en`` + - ``_ + - `点此 `_ + * - ``distil-small.en`` + - ``_ + - `点此 `_ + * - ``medium.en`` + - ``_ + - `点此 `_ + * - ``distil-medium.en`` + - ``_ + - `点此 `_ + * - ``tiny`` + - ``_ + - `点此 `_ + * - ``base`` + - ``_ + - `点此 `_ + * - ``small`` + - ``_ + - `点此 `_ + * - ``medium`` + - ``_ + - `点此 `_ + * - ``large`` + - ``_ + - `点此 `_ + * - ``large-v1`` + - ``_ + - `点此 `_ + * - ``large-v2`` + - ``_ + - `点此 `_ + * - ``large-v3`` + - ``_ + - `点此 `_ + * - ``turbo`` + - ``_ + - `点此 `_ + * - ``distil-large-v2`` + - ``_ + - `点此 `_ + * - ``medium-aishell`` + - ``_ + - `点此 `_ + +.. note:: + + You can also download them from + + ``_ + + Models end with ``.en`` support only English and all + other models are multilingual. + + +If you want to export the models by yourself or/and want to learn how the models +are exported, please read below. + +Export to onnx +-------------- + +We use + + ``_ + +to export `Whisper`_ models to `onnx`_. + +First, let us install dependencies and download the export script + +.. 
code-block:: bash + + pip install torch openai-whisper onnxruntime onnx librosa soundfile + + git clone https://github.com/k2-fsa/sherpa-onnx/ + cd sherpa-onnx/scripts/whisper + python3 ./export-onnx.py --help + +It will print the following message: + +.. code-block:: bash + + usage: export-onnx.py [-h] --model + {tiny,tiny.en,base,base.en,small,small.en,medium,medium.en,large,large-v1,large-v2,large-v3,distil-medium.en,distil-small.en,distil-large-v2,medium-aishell} + + optional arguments: + -h, --help show this help message and exit + --model {tiny,tiny.en,base,base.en,small,small.en,medium,medium.en,large,large-v1,large-v2,large-v3,distil-medium.en,distil-small.en,distil-large-v2,medium-aishell} + + +Example 1: Export tiny.en +^^^^^^^^^^^^^^^^^^^^^^^^^ + +To export ``tiny.en``, we can use: + +.. code-block:: bash + + python3 ./export-onnx.py --model tiny.en + +It will generate the following files: + +.. code-block:: bash + + (py38) fangjuns-MacBook-Pro:whisper fangjun$ ls -lh tiny.en-* + -rw-r--r-- 1 fangjun staff 105M Aug 7 15:43 tiny.en-decoder.int8.onnx + -rw-r--r-- 1 fangjun staff 185M Aug 7 15:43 tiny.en-decoder.onnx + -rw-r--r-- 1 fangjun staff 12M Aug 7 15:43 tiny.en-encoder.int8.onnx + -rw-r--r-- 1 fangjun staff 36M Aug 7 15:43 tiny.en-encoder.onnx + -rw-r--r-- 1 fangjun staff 816K Aug 7 15:43 tiny.en-tokens.txt + +``tiny.en-encoder.onnx`` is the encoder model and ``tiny.en-decoder.onnx`` is the +decoder model. + +``tiny.en-encoder.int8.onnx`` is the quantized encoder model and ``tiny.en-decoder.onnx`` is the +quantized decoder model. + +``tiny.en-tokens.txt`` contains the token table, which maps an integer to a token and vice versa. + +To check whether the exported model works correctly, we can use + ``_ + +We use ``_ +as the test wave. + +.. code-block:: bash + + pip install kaldi-native-fbank + wget https://huggingface.co/csukuangfj/sherpa-onnx-whisper-tiny.en/resolve/main/test_wavs/0.wav + + python3 ./test.py \ + --encoder ./tiny.en-encoder.onnx \ + --decoder ./tiny.en-decoder.onnx \ + --tokens ./tiny.en-tokens.txt \ + ./0.wav + + +To test ``int8`` quantized models, we can use: + +.. code-block:: bash + + python3 ./test.py \ + --encoder ./tiny.en-encoder.int8.onnx \ + --decoder ./tiny.en-decoder.int8.onnx \ + --tokens ./tiny.en-tokens.txt \ + ./0.wav + +Example 2: Export large-v3 +^^^^^^^^^^^^^^^^^^^^^^^^^^ + +To export ``large-v3``, we can use: + +.. code-block:: bash + + python3 ./export-onnx.py --model large-v3 + +It will generate the following files: + +.. code-block:: bash + + (py38) fangjuns-MacBook-Pro:whisper fangjun$ ls -lh large-v3-* + -rw-r--r-- 1 fangjun staff 2.7M Jul 12 20:38 large-v3-decoder.onnx + -rw-r--r-- 1 fangjun staff 3.0G Jul 12 20:38 large-v3-decoder.weights + -rw-r--r-- 1 fangjun staff 744K Jul 12 20:35 large-v3-encoder.onnx + -rw-r--r-- 1 fangjun staff 2.8G Jul 12 20:35 large-v3-encoder.weights + -rw-r--r-- 1 fangjun staff 798K Jul 12 20:32 large-v3-tokens.txt + +``large-v3-encoder.onnx`` is the encoder model and ``large-v3-decoder.onnx`` is the +decoder model. + +Note that for ``large`` models, there will also be two additional ``weights`` files. + +``large-v3-tokens.txt`` contains the token table, which maps an integer to a token and vice versa. + +To check whether the exported model works correctly, we can use + ``_ + +We use ``_ +as the test wave. + +.. 
code-block:: bash + + pip install kaldi-native-fbank + wget https://huggingface.co/csukuangfj/sherpa-onnx-whisper-tiny.en/resolve/main/test_wavs/0.wav + + python3 ./test.py \ + --encoder ./large-v3-encoder.onnx \ + --decoder ./large-v3-decoder.onnx \ + --tokens ./large-v3-tokens.txt \ + ./0.wav + +.. hint:: + + We provide a colab notebook + |sherpa-onnx with whisper large-v3 colab notebook| + for you to try the exported large-v3 onnx model with sherpa-onnx + on CPU as well as on GPU. + + You will find the RTF on GPU (Tesla T4) is less than 1. + +.. |sherpa-onnx with whisper large-v3 colab notebook| image:: https://colab.research.google.com/assets/colab-badge.svg + :target: https://github.com/k2-fsa/colab/blob/master/sherpa-onnx/sherpa_onnx_whisper_large_v3.ipynb diff --git a/docs/source/onnx/pretrained_models/whisper/huggingface.rst b/docs/source/onnx/pretrained_models/whisper/huggingface.rst new file mode 100644 index 000000000..89eef2458 --- /dev/null +++ b/docs/source/onnx/pretrained_models/whisper/huggingface.rst @@ -0,0 +1,13 @@ +Huggingface space +================= + +You can try `Whisper`_ models from within your browser without installing anything. + +Please visit + + ``_ + + +.. image:: ./pic/whisper-hf.png + :alt: screenshot of using whisper with sherpa-onnx + :width: 600 diff --git a/docs/source/onnx/pretrained_models/whisper/index.rst b/docs/source/onnx/pretrained_models/whisper/index.rst new file mode 100644 index 000000000..95afc5b4a --- /dev/null +++ b/docs/source/onnx/pretrained_models/whisper/index.rst @@ -0,0 +1,14 @@ +Whisper +======= + +This section describes how to use models from `Whisper`_ with `sherpa-onnx`_ +for non-streaming speech recognition. + +.. toctree:: + :maxdepth: 5 + + ./export-onnx.rst + ./tiny.en.rst + ./large-v3.rst + ./colab.rst + ./huggingface.rst diff --git a/docs/source/onnx/pretrained_models/whisper/large-v3.rst b/docs/source/onnx/pretrained_models/whisper/large-v3.rst new file mode 100644 index 000000000..779619601 --- /dev/null +++ b/docs/source/onnx/pretrained_models/whisper/large-v3.rst @@ -0,0 +1,280 @@ +.. _whisper_large_v3_sherpa_onnx: + +large-v3 +======== + +Before we start, let us +follow :ref:`install_sherpa_onnx_on_linux` +to build a CUDA-enabled version of `sherpa-onnx`_. + +In the following, we assume you have run + +.. code-block:: bash + + cd /content + + git clone https://github.com/k2-fsa/sherpa-onnx + cd sherpa-onnx + + mkdir -p build + cd build + cmake \ + -DBUILD_SHARED_LIBS=ON \ + -DSHERPA_ONNX_ENABLE_GPU=ON .. + + make -j2 sherpa-onnx-offline + +You can use the following commands to download the exported `onnx`_ models of ``large-v3``: + +.. hint:: + + Please replace ``large-v3`` with + ``large``, ``large-v1``, ``large-v2``, and ``distil-large-v2`` + if you want to try a different type of model. + +.. code-block:: bash + + cd /content + + git lfs install + git clone https://huggingface.co/csukuangfj/sherpa-onnx-whisper-large-v3 + + ls -lh sherpa-onnx-whisper-large-v3 + +The logs of the above commands are given below: + +.. code-block:: + + Git LFS initialized. + Cloning into 'sherpa-onnx-whisper-large-v3'... + remote: Enumerating objects: 26, done. + remote: Counting objects: 100% (22/22), done. + remote: Compressing objects: 100% (21/21), done. + remote: Total 26 (delta 2), reused 0 (delta 0), pack-reused 4 (from 1) + Unpacking objects: 100% (26/26), 1.00 MiB | 9.10 MiB/s, done. + Filtering content: 100% (6/6), 7.40 GiB | 34.50 MiB/s, done. 
+ total 7.5G + -rw-r--r-- 1 root root 962M Jul 13 14:19 large-v3-decoder.int8.onnx + -rw-r--r-- 1 root root 2.8M Jul 13 14:18 large-v3-decoder.onnx + -rw-r--r-- 1 root root 3.0G Jul 13 14:22 large-v3-decoder.weights + -rw-r--r-- 1 root root 732M Jul 13 14:19 large-v3-encoder.int8.onnx + -rw-r--r-- 1 root root 745K Jul 13 14:18 large-v3-encoder.onnx + -rw-r--r-- 1 root root 2.8G Jul 13 14:21 large-v3-encoder.weights + -rw-r--r-- 1 root root 798K Jul 13 14:18 large-v3-tokens.txt + drwxr-xr-x 2 root root 4.0K Jul 13 14:18 test_wavs + +.. caution:: + + Please remember to run ``git lfs install`` before you run ``git clone``. + If you have any issues about ``git lfs install``, please follow + ``_ to install ``git-lfs``. + +.. caution:: + + Please check the file sizes are correct before proceeding. Otherwise, you would be ``SAD`` later. + +.. caution:: + + Please check the file sizes are correct before proceeding. Otherwise, you would be ``SAD`` later. + +.. caution:: + + Please check the file sizes are correct before proceeding. Otherwise, you would be ``SAD`` later. + +Run with CPU (float32) +---------------------- + +.. code-block:: bash + + cd /content + + exe=$PWD/sherpa-onnx/build/bin/sherpa-onnx-offline + + cd sherpa-onnx-whisper-large-v3 + + time $exe \ + --whisper-encoder=./large-v3-encoder.onnx \ + --whisper-decoder=./large-v3-decoder.onnx \ + --tokens=./large-v3-tokens.txt \ + --num-threads=2 \ + ./test_wavs/0.wav + +The logs are given below:: + + /content/sherpa-onnx/sherpa-onnx/csrc/parse-options.cc:Read:375 /content/sherpa-onnx/build/bin/sherpa-onnx-offline --whisper-encoder=./large-v3-encoder.onnx --whisper-decoder=./large-v3-decoder.onnx --tokens=./large-v3-tokens.txt --num-threads=2 ./test_wavs/0.wav + + OfflineRecognizerConfig(feat_config=FeatureExtractorConfig(sampling_rate=16000, feature_dim=80, low_freq=20, high_freq=-400, dither=0), model_config=OfflineModelConfig(transducer=OfflineTransducerModelConfig(encoder_filename="", decoder_filename="", joiner_filename=""), paraformer=OfflineParaformerModelConfig(model=""), nemo_ctc=OfflineNemoEncDecCtcModelConfig(model=""), whisper=OfflineWhisperModelConfig(encoder="./large-v3-encoder.onnx", decoder="./large-v3-decoder.onnx", language="", task="transcribe", tail_paddings=-1), tdnn=OfflineTdnnModelConfig(model=""), zipformer_ctc=OfflineZipformerCtcModelConfig(model=""), wenet_ctc=OfflineWenetCtcModelConfig(model=""), telespeech_ctc="", tokens="./large-v3-tokens.txt", num_threads=2, debug=False, provider="cpu", model_type="", modeling_unit="cjkchar", bpe_vocab=""), lm_config=OfflineLMConfig(model="", scale=0.5), ctc_fst_decoder_config=OfflineCtcFstDecoderConfig(graph="", max_active=3000), decoding_method="greedy_search", max_active_paths=4, hotwords_file="", hotwords_score=1.5, blank_penalty=0, rule_fsts="", rule_fars="") + Creating recognizer ... + Started + Done! + + ./test_wavs/0.wav + {"text": " after early nightfall the yellow lamps would light up here and there the squalid quarter of the brothels", "timestamps": [], "tokens":[" after", " early", " night", "fall", " the", " yellow", " lamps", " would", " light", " up", " here", " and", " there", " the", " squ", "alid", " quarter", " of", " the", " broth", "els"], "words": []} + ---- + num threads: 2 + decoding method: greedy_search + Elapsed seconds: 54.070 s + Real time factor (RTF): 54.070 / 6.625 = 8.162 + + real 1m32.107s + user 1m39.877s + sys 0m10.405s + +Run with CPU (int8) +------------------- + +.. 
code-block:: bash + + cd /content + + exe=$PWD/sherpa-onnx/build/bin/sherpa-onnx-offline + + cd sherpa-onnx-whisper-large-v3 + + time $exe \ + --whisper-encoder=./large-v3-encoder.int8.onnx \ + --whisper-decoder=./large-v3-decoder.int8.onnx \ + --tokens=./large-v3-tokens.txt \ + --num-threads=2 \ + ./test_wavs/0.wav + +The logs are given below:: + + /content/sherpa-onnx/sherpa-onnx/csrc/parse-options.cc:Read:375 /content/sherpa-onnx/build/bin/sherpa-onnx-offline --whisper-encoder=./large-v3-encoder.int8.onnx --whisper-decoder=./large-v3-decoder.int8.onnx --tokens=./large-v3-tokens.txt --num-threads=2 ./test_wavs/0.wav + + OfflineRecognizerConfig(feat_config=FeatureExtractorConfig(sampling_rate=16000, feature_dim=80, low_freq=20, high_freq=-400, dither=0), model_config=OfflineModelConfig(transducer=OfflineTransducerModelConfig(encoder_filename="", decoder_filename="", joiner_filename=""), paraformer=OfflineParaformerModelConfig(model=""), nemo_ctc=OfflineNemoEncDecCtcModelConfig(model=""), whisper=OfflineWhisperModelConfig(encoder="./large-v3-encoder.int8.onnx", decoder="./large-v3-decoder.int8.onnx", language="", task="transcribe", tail_paddings=-1), tdnn=OfflineTdnnModelConfig(model=""), zipformer_ctc=OfflineZipformerCtcModelConfig(model=""), wenet_ctc=OfflineWenetCtcModelConfig(model=""), telespeech_ctc="", tokens="./large-v3-tokens.txt", num_threads=2, debug=False, provider="cpu", model_type="", modeling_unit="cjkchar", bpe_vocab=""), lm_config=OfflineLMConfig(model="", scale=0.5), ctc_fst_decoder_config=OfflineCtcFstDecoderConfig(graph="", max_active=3000), decoding_method="greedy_search", max_active_paths=4, hotwords_file="", hotwords_score=1.5, blank_penalty=0, rule_fsts="", rule_fars="") + Creating recognizer ... + Started + Done! + + ./test_wavs/0.wav + {"text": " after early nightfall the yellow lamps would light up here and there the squalid quarter of the brothels", "timestamps": [], "tokens":[" after", " early", " night", "fall", " the", " yellow", " lamps", " would", " light", " up", " here", " and", " there", " the", " squ", "alid", " quarter", " of", " the", " broth", "els"], "words": []} + ---- + num threads: 2 + decoding method: greedy_search + Elapsed seconds: 49.991 s + Real time factor (RTF): 49.991 / 6.625 = 7.546 + + real 1m15.555s + user 1m41.488s + sys 0m9.156s + + +Run with GPU (float32) +---------------------- + +.. 
code-block:: bash + + cd /content + exe=$PWD/sherpa-onnx/build/bin/sherpa-onnx-offline + + cd sherpa-onnx-whisper-large-v3 + + time $exe \ + --whisper-encoder=./large-v3-encoder.onnx \ + --whisper-decoder=./large-v3-decoder.onnx \ + --tokens=./large-v3-tokens.txt \ + --provider=cuda \ + --num-threads=2 \ + ./test_wavs/0.wav + +The logs are given below:: + + /content/sherpa-onnx/sherpa-onnx/csrc/parse-options.cc:Read:375 /content/sherpa-onnx/build/bin/sherpa-onnx-offline --whisper-encoder=./large-v3-encoder.onnx --whisper-decoder=./large-v3-decoder.onnx --tokens=./large-v3-tokens.txt --provider=cuda --num-threads=2 ./test_wavs/0.wav + + OfflineRecognizerConfig(feat_config=FeatureExtractorConfig(sampling_rate=16000, feature_dim=80, low_freq=20, high_freq=-400, dither=0), model_config=OfflineModelConfig(transducer=OfflineTransducerModelConfig(encoder_filename="", decoder_filename="", joiner_filename=""), paraformer=OfflineParaformerModelConfig(model=""), nemo_ctc=OfflineNemoEncDecCtcModelConfig(model=""), whisper=OfflineWhisperModelConfig(encoder="./large-v3-encoder.onnx", decoder="./large-v3-decoder.onnx", language="", task="transcribe", tail_paddings=-1), tdnn=OfflineTdnnModelConfig(model=""), zipformer_ctc=OfflineZipformerCtcModelConfig(model=""), wenet_ctc=OfflineWenetCtcModelConfig(model=""), telespeech_ctc="", tokens="./large-v3-tokens.txt", num_threads=2, debug=False, provider="cuda", model_type="", modeling_unit="cjkchar", bpe_vocab=""), lm_config=OfflineLMConfig(model="", scale=0.5), ctc_fst_decoder_config=OfflineCtcFstDecoderConfig(graph="", max_active=3000), decoding_method="greedy_search", max_active_paths=4, hotwords_file="", hotwords_score=1.5, blank_penalty=0, rule_fsts="", rule_fars="") + Creating recognizer ... + Started + Done! + + ./test_wavs/0.wav + {"text": " after early nightfall the yellow lamps would light up here and there the squalid quarter of the brothels", "timestamps": [], "tokens":[" after", " early", " night", "fall", " the", " yellow", " lamps", " would", " light", " up", " here", " and", " there", " the", " squ", "alid", " quarter", " of", " the", " broth", "els"], "words": []} + ---- + num threads: 2 + decoding method: greedy_search + Elapsed seconds: 5.910 s + Real time factor (RTF): 5.910 / 6.625 = 0.892 + + real 0m26.996s + user 0m12.854s + sys 0m4.486s + +.. note:: + + The above command is run within a colab notebook using Tesla T4 GPU. + You can see the RTF is less than 1. + + If you has some more performant GPU, you would get an even lower RTF. + +Run with GPU (int8) +------------------- + +.. 
code-block:: bash + + cd /content + exe=$PWD/sherpa-onnx/build/bin/sherpa-onnx-offline + + cd sherpa-onnx-whisper-large-v3 + + time $exe \ + --whisper-encoder=./large-v3-encoder.int8.onnx \ + --whisper-decoder=./large-v3-decoder.int8.onnx \ + --tokens=./large-v3-tokens.txt \ + --provider=cuda \ + --num-threads=2 \ + ./test_wavs/0.wav + +The logs are given below:: + + /content/sherpa-onnx/sherpa-onnx/csrc/parse-options.cc:Read:375 /content/sherpa-onnx/build/bin/sherpa-onnx-offline --whisper-encoder=./large-v3-encoder.int8.onnx --whisper-decoder=./large-v3-decoder.int8.onnx --tokens=./large-v3-tokens.txt --provider=cuda --num-threads=2 ./test_wavs/0.wav + + OfflineRecognizerConfig(feat_config=FeatureExtractorConfig(sampling_rate=16000, feature_dim=80, low_freq=20, high_freq=-400, dither=0), model_config=OfflineModelConfig(transducer=OfflineTransducerModelConfig(encoder_filename="", decoder_filename="", joiner_filename=""), paraformer=OfflineParaformerModelConfig(model=""), nemo_ctc=OfflineNemoEncDecCtcModelConfig(model=""), whisper=OfflineWhisperModelConfig(encoder="./large-v3-encoder.int8.onnx", decoder="./large-v3-decoder.int8.onnx", language="", task="transcribe", tail_paddings=-1), tdnn=OfflineTdnnModelConfig(model=""), zipformer_ctc=OfflineZipformerCtcModelConfig(model=""), wenet_ctc=OfflineWenetCtcModelConfig(model=""), telespeech_ctc="", tokens="./large-v3-tokens.txt", num_threads=2, debug=False, provider="cuda", model_type="", modeling_unit="cjkchar", bpe_vocab=""), lm_config=OfflineLMConfig(model="", scale=0.5), ctc_fst_decoder_config=OfflineCtcFstDecoderConfig(graph="", max_active=3000), decoding_method="greedy_search", max_active_paths=4, hotwords_file="", hotwords_score=1.5, blank_penalty=0, rule_fsts="", rule_fars="") + Creating recognizer ... + Started + Done! + + ./test_wavs/0.wav + {"text": " after early nightfall the yellow lamps would light up here and there the squalid quarter of the brothels", "timestamps": [], "tokens":[" after", " early", " night", "fall", " the", " yellow", " lamps", " would", " light", " up", " here", " and", " there", " the", " squ", "alid", " quarter", " of", " the", " broth", "els"], "words": []} + ---- + num threads: 2 + decoding method: greedy_search + Elapsed seconds: 19.190 s + Real time factor (RTF): 19.190 / 6.625 = 2.897 + + real 0m46.850s + user 0m50.007s + sys 0m8.013s + +Fix issues about running on GPU +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +If you get errors like below:: + + what(): /onnxruntime_src/onnxruntime/core/session/provider_bridge_ort.cc:1426 + onnxruntime::Provider& onnxruntime::ProviderLibrary::Get() + [ONNXRuntimeError] : 1 : FAIL : + Failed to load library libonnxruntime_providers_cuda.so with error: + libcublasLt.so.11: cannot open shared object file: No such file or directory + +please follow ``_ +to install CUDA toolkit. + +To determine which version of CUDA toolkit to install, please read +``_ +to figure it out. + +For instance, if onnxruntime v1.18.1 is used in `sherpa-onnx`_, we have to install +CUDA 11.8 according to ``_ + +colab +----- + +Please see the following colab notebook +|sherpa-onnx with whisper large-v3 colab notebook|. + +It walks you step by step to try the exported large-v3 onnx model with `sherpa-onnx`_ +on CPU as well as on GPU. + +.. 
|sherpa-onnx with whisper large-v3 colab notebook| image:: https://colab.research.google.com/assets/colab-badge.svg + :target: https://github.com/k2-fsa/colab/blob/master/sherpa-onnx/sherpa_onnx_whisper_large_v3.ipynb + diff --git a/docs/source/onnx/pretrained_models/whisper/pic/whisper-colab.png b/docs/source/onnx/pretrained_models/whisper/pic/whisper-colab.png new file mode 100644 index 000000000..c6ee8b69b Binary files /dev/null and b/docs/source/onnx/pretrained_models/whisper/pic/whisper-colab.png differ diff --git a/docs/source/onnx/pretrained_models/whisper/pic/whisper-hf.png b/docs/source/onnx/pretrained_models/whisper/pic/whisper-hf.png new file mode 100644 index 000000000..ff5162887 Binary files /dev/null and b/docs/source/onnx/pretrained_models/whisper/pic/whisper-hf.png differ diff --git a/docs/source/onnx/pretrained_models/whisper/tiny.en.rst b/docs/source/onnx/pretrained_models/whisper/tiny.en.rst new file mode 100644 index 000000000..858d92880 --- /dev/null +++ b/docs/source/onnx/pretrained_models/whisper/tiny.en.rst @@ -0,0 +1,130 @@ +.. _whisper_tiny_en_sherpa_onnx: + +tiny.en +======= + +You can use the following commands to download the exported `onnx`_ models of ``tiny.en``: + +.. hint:: + + Please replace ``tiny.en`` with + ``base.en``, ``small.en``, ``medium.en``, ``distil-small.en``, + ``tiny``, ``base``, ``small``, and ``medium`` + if you want to try a different type of model. + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-whisper-tiny.en.tar.bz2 + tar xvf sherpa-onnx-whisper-tiny.en.tar.bz2 + +Please check that the file sizes of the downloaded models are correct. See +the file size of ``*.onnx`` files below. + +.. code-block:: bash + + (py38) fangjuns-MacBook-Pro:sherpa-onnx-whisper-tiny.en fangjun$ ls -lh *.onnx + -rw-r--r-- 1 fangjun staff 105M Aug 7 16:22 tiny.en-decoder.int8.onnx + -rw-r--r-- 1 fangjun staff 185M Aug 7 16:23 tiny.en-decoder.onnx + -rw-r--r-- 1 fangjun staff 12M Aug 7 16:22 tiny.en-encoder.int8.onnx + -rw-r--r-- 1 fangjun staff 36M Aug 7 16:22 tiny.en-encoder.onnx + +To use the downloaded files to decode waves, please run: + +.. hint:: + + Please first follow :ref:`install_sherpa_onnx` to build `sherpa-onnx`_ + before you continue. + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + ./build/bin/sherpa-onnx-offline \ + --whisper-encoder=./sherpa-onnx-whisper-tiny.en/tiny.en-encoder.onnx \ + --whisper-decoder=./sherpa-onnx-whisper-tiny.en/tiny.en-decoder.onnx \ + --tokens=./sherpa-onnx-whisper-tiny.en/tiny.en-tokens.txt \ + ./sherpa-onnx-whisper-tiny.en/test_wavs/0.wav \ + ./sherpa-onnx-whisper-tiny.en/test_wavs/1.wav \ + ./sherpa-onnx-whisper-tiny.en/test_wavs/8k.wav + +To use ``int8`` quantized models, please use: + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + ./build/bin/sherpa-onnx-offline \ + --whisper-encoder=./sherpa-onnx-whisper-tiny.en/tiny.en-encoder.int8.onnx \ + --whisper-decoder=./sherpa-onnx-whisper-tiny.en/tiny.en-decoder.int8.onnx \ + --tokens=./sherpa-onnx-whisper-tiny.en/tiny.en-tokens.txt \ + ./sherpa-onnx-whisper-tiny.en/test_wavs/0.wav \ + ./sherpa-onnx-whisper-tiny.en/test_wavs/1.wav \ + ./sherpa-onnx-whisper-tiny.en/test_wavs/8k.wav + +Real-time factor (RTF) on Raspberry Pi 4 Model B +------------------------------------------------ + +One of the test command is given below: + +.. 
code-block:: bash + + ./sherpa-onnx-offline \ + --num-threads=1 \ + --whisper-encoder=./sherpa-onnx-whisper-tiny.en/tiny.en-encoder.onnx \ + --whisper-decoder=./sherpa-onnx-whisper-tiny.en/tiny.en-decoder.onnx \ + --tokens=./sherpa-onnx-whisper-tiny.en/tiny.en-tokens.txt \ + ./sherpa-onnx-whisper-tiny.en/test_wavs/1.wav + +And its output is: + +.. code-block:: bash + + /root/fangjun/open-source/sherpa-onnx/sherpa-onnx/csrc/parse-options.cc:Read:361 ./sherpa-onnx-offline --num-threads=1 --whisper-encoder=./sherpa-onnx-whisper-tiny.en/tiny.en-encoder.onnx --whisper-decoder=./sherpa-onnx-whisper-tiny.en/tiny.en-decoder.onnx --tokens=./sherpa-onnx-whisper-tiny.en/tiny.en-tokens.txt ./sherpa-onnx-whisper-tiny.en/test_wavs/1.wav + + OfflineRecognizerConfig(feat_config=OfflineFeatureExtractorConfig(sampling_rate=16000, feature_dim=80), model_config=OfflineModelConfig(transducer=OfflineTransducerModelConfig(encoder_filename="", decoder_filename="", joiner_filename=""), paraformer=OfflineParaformerModelConfig(model=""), nemo_ctc=OfflineNemoEncDecCtcModelConfig(model=""), whisper=OfflineWhisperModelConfig(encoder="./sherpa-onnx-whisper-tiny.en/tiny.en-encoder.onnx", decoder="./sherpa-onnx-whisper-tiny.en/tiny.en-decoder.onnx"), tokens="./sherpa-onnx-whisper-tiny.en/tiny.en-tokens.txt", num_threads=1, debug=False, provider="cpu", model_type=""), lm_config=OfflineLMConfig(model="", scale=0.5), decoding_method="greedy_search", max_active_paths=4, context_score=1.5) + Creating recognizer ... + Started + Done! + + ./sherpa-onnx-whisper-tiny.en/test_wavs/1.wav + {"text":" God, as a direct consequence of the sin which man thus punished, had given her a lovely child, whose place was on that same dishonored bosom to connect her parent forever with the race and descent of mortals, and to be finally a blessed soul in heaven.","timestamps":"[]","tokens":[" God",","," as"," a"," direct"," consequence"," of"," the"," sin"," which"," man"," thus"," punished",","," had"," given"," her"," a"," lovely"," child",","," whose"," place"," was"," on"," that"," same"," dishon","ored"," bos","om"," to"," connect"," her"," parent"," forever"," with"," the"," race"," and"," descent"," of"," mortals",","," and"," to"," be"," finally"," a"," blessed"," soul"," in"," heaven","."]} + ---- + num threads: 1 + decoding method: greedy_search + Elapsed seconds: 11.454 s + Real time factor (RTF): 11.454 / 16.715 = 0.685 + +The following table compares the RTF between different number of threads and types of `onnx`_ models: + + +.. list-table:: + + * - Model type + - Number of threads + - RTF + * - float32 + - 1 + - 0.685 + * - float32 + - 2 + - 0.559 + * - float32 + - 3 + - 0.526 + * - float32 + - 4 + - 0.520 + * - int8 + - 1 + - 0.547 + * - int8 + - 2 + - 0.431 + * - int8 + - 3 + - 0.398 + * - int8 + - 4 + - 0.386 diff --git a/docs/source/onnx/punctuation/index.rst b/docs/source/onnx/punctuation/index.rst new file mode 100644 index 000000000..ad5b80aab --- /dev/null +++ b/docs/source/onnx/punctuation/index.rst @@ -0,0 +1,16 @@ +Punctuation +=========== + +This section introduces the models that `sherpa-onnx`_ supports for adding +punctuations to text. + +.. hint:: + + After getting text from speech using speech-to-text, you can use models + from this section to add punctuations to text. + + +.. 
toctree:: + :maxdepth: 5 + + ./pretrained_models.rst diff --git a/docs/source/onnx/punctuation/pretrained_models.rst b/docs/source/onnx/punctuation/pretrained_models.rst new file mode 100644 index 000000000..0fa71de41 --- /dev/null +++ b/docs/source/onnx/punctuation/pretrained_models.rst @@ -0,0 +1,139 @@ +Pre-trained models +================== + +This section lists pre-trained models for adding punctuations to text. + +You can find all models at the following URL: + + ``_ + +sherpa-onnx-punct-ct-transformer-zh-en-vocab272727-2024-04-12 +------------------------------------------------------------- + +This model is converted from + + ``_ + +and it supports both Chinese and English. + +.. hint:: + + If you want to know how the model is converted to `sherpa-onnx`_, please download + it and you can find related scripts in the downloaded model directory. + +In the following, we describe how to download and use it with `sherpa-onnx`_. + +Download the model +^^^^^^^^^^^^^^^^^^ + +Please use the following commands to download it:: + + wget https://github.com/k2-fsa/sherpa-onnx/releases/download/punctuation-models/sherpa-onnx-punct-ct-transformer-zh-en-vocab272727-2024-04-12.tar.bz2 + + tar xvf sherpa-onnx-punct-ct-transformer-zh-en-vocab272727-2024-04-12.tar.bz2 + rm sherpa-onnx-punct-ct-transformer-zh-en-vocab272727-2024-04-12.tar.bz2 + +You will find the following files after unzipping:: + + -rw-r--r-- 1 fangjun staff 1.4K Apr 12 12:32 README.md + -rwxr-xr-x 1 fangjun staff 1.6K Apr 12 14:40 add-model-metadata.py + -rw-r--r-- 1 fangjun staff 810B Apr 12 11:56 config.yaml + -rw-r--r-- 1 fangjun staff 42B Apr 12 11:45 configuration.json + -rw-r--r-- 1 fangjun staff 281M Apr 12 14:40 model.onnx + -rwxr-xr-x 1 fangjun staff 745B Apr 12 11:53 show-model-input-output.py + -rwxr-xr-x 1 fangjun staff 4.9K Apr 13 18:45 test.py + -rw-r--r-- 1 fangjun staff 4.0M Apr 12 11:56 tokens.json + +Only ``model.onnx`` is needed in `sherpa-onnx`_. All other files are for your information about +how the model is converted to `sherpa-onnx`_. + +C++ binary examples +^^^^^^^^^^^^^^^^^^^ + +After installing `sherpa-onnx`_, you can use the following command to add punctuations to text:: + + ./bin/sherpa-onnx-offline-punctuation \ + --ct-transformer=./sherpa-onnx-punct-ct-transformer-zh-en-vocab272727-2024-04-12/model.onnx \ + "我们都是木头人不会说话不会动" + +The output is given below:: + + /Users/fangjun/open-source/sherpa-onnx/sherpa-onnx/csrc/parse-options.cc:Read:361 ./bin/sherpa-onnx-offline-punctuation --ct-transformer=./sherpa-onnx-punct-ct-transformer-zh-en-vocab272727-2024-04-12/model.onnx '我们都是木头人不会说话不会动' + + OfflinePunctuationConfig(model=OfflinePunctuationModelConfig(ct_transformer="./sherpa-onnx-punct-ct-transformer-zh-en-vocab272727-2024-04-12/model.onnx", num_threads=1, debug=False, provider="cpu")) + Creating OfflinePunctuation ... 
+ Started + Done + Num threads: 1 + Elapsed seconds: 0.007 s + Input text: 我们都是木头人不会说话不会动 + Output text: 我们都是木头人,不会说话不会动。 + +The second example is for text containing both Chinese and English:: + + ./bin/sherpa-onnx-offline-punctuation \ + --ct-transformer=./sherpa-onnx-punct-ct-transformer-zh-en-vocab272727-2024-04-12/model.onnx \ + "这是一个测试你好吗How are you我很好thank you are you ok谢谢你" + +Its output is given below:: + + /Users/fangjun/open-source/sherpa-onnx/sherpa-onnx/csrc/parse-options.cc:Read:361 ./bin/sherpa-onnx-offline-punctuation --ct-transformer=./sherpa-onnx-punct-ct-transformer-zh-en-vocab272727-2024-04-12/model.onnx '这是一个测试你好吗How are you我很好thank you are you ok谢谢你' + + OfflinePunctuationConfig(model=OfflinePunctuationModelConfig(ct_transformer="./sherpa-onnx-punct-ct-transformer-zh-en-vocab272727-2024-04-12/model.onnx", num_threads=1, debug=False, provider="cpu")) + Creating OfflinePunctuation ... + Started + Done + Num threads: 1 + Elapsed seconds: 0.005 s + Input text: 这是一个测试你好吗How are you我很好thank you are you ok谢谢你 + Output text: 这是一个测试,你好吗?How are you?我很好?thank you,are you ok,谢谢你。 + +The last example is for text containing only English:: + + ./bin/sherpa-onnx-offline-punctuation \ + --ct-transformer=./sherpa-onnx-punct-ct-transformer-zh-en-vocab272727-2024-04-12/model.onnx \ + "The African blogosphere is rapidly expanding bringing more voices online in the form of commentaries opinions analyses rants and poetry" + +Its output is given below:: + + /Users/fangjun/open-source/sherpa-onnx/sherpa-onnx/csrc/parse-options.cc:Read:361 ./bin/sherpa-onnx-offline-punctuation --ct-transformer=./sherpa-onnx-punct-ct-transformer-zh-en-vocab272727-2024-04-12/model.onnx 'The African blogosphere is rapidly expanding bringing more voices online in the form of commentaries opinions analyses rants and poetry' + + OfflinePunctuationConfig(model=OfflinePunctuationModelConfig(ct_transformer="./sherpa-onnx-punct-ct-transformer-zh-en-vocab272727-2024-04-12/model.onnx", num_threads=1, debug=False, provider="cpu")) + Creating OfflinePunctuation ... + Started + Done + Num threads: 1 + Elapsed seconds: 0.003 s + Input text: The African blogosphere is rapidly expanding bringing more voices online in the form of commentaries opinions analyses rants and poetry + Output text: The African blogosphere is rapidly expanding,bringing more voices online in the form of commentaries,opinions,analyses,rants and poetry。 + +Python API examples +^^^^^^^^^^^^^^^^^^^ + +Please see + + ``_ + +Huggingface space examples +^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Please see + + - ``_ + - ``_ + +.. hint:: + + For Chinese users, please visit the following mirrors: + + - ``_ + - ``_ + +Video demos +^^^^^^^^^^^ + +The following `video `_ is in Chinese. + +.. raw:: html + + diff --git a/docs/source/onnx/python/code/decode-files/non-streaming-paraformer-2023-03-28.txt b/docs/source/onnx/python/code/decode-files/non-streaming-paraformer-2023-03-28.txt new file mode 100644 index 000000000..0d1f320fc --- /dev/null +++ b/docs/source/onnx/python/code/decode-files/non-streaming-paraformer-2023-03-28.txt @@ -0,0 +1,23 @@ +Creating a resampler: + in_sample_rate: 8000 + output_sample_rate: 16000 + +Started! +Done! 
+./sherpa-onnx-paraformer-zh-2023-03-28/test_wavs/0.wav +对我做了介绍啊那么我想说的是呢大家如果对我的研究感兴趣呢你 +---------- +./sherpa-onnx-paraformer-zh-2023-03-28/test_wavs/1.wav +重点呢想谈三个问题首先呢就是这一轮全球金融动荡的表现 +---------- +./sherpa-onnx-paraformer-zh-2023-03-28/test_wavs/2.wav +深入的分析这一次全球金融动荡背后的根源 +---------- +./sherpa-onnx-paraformer-zh-2023-03-28/test_wavs/8k.wav +甚至出现交易几乎停滞的情况 +---------- +num_threads: 1 +decoding_method: greedy_search +Wave duration: 4.204 s +Elapsed time: 1.663 s +Real time factor (RTF): 1.663/4.204 = 0.396 diff --git a/docs/source/onnx/python/code/decode-files/non-streaming-transducer-zipformer-2023-04-01.txt b/docs/source/onnx/python/code/decode-files/non-streaming-transducer-zipformer-2023-04-01.txt new file mode 100644 index 000000000..9967d6400 --- /dev/null +++ b/docs/source/onnx/python/code/decode-files/non-streaming-transducer-zipformer-2023-04-01.txt @@ -0,0 +1,20 @@ +Creating a resampler: + in_sample_rate: 8000 + output_sample_rate: 16000 + +Started! +Done! +./sherpa-onnx-zipformer-en-2023-04-01/test_wavs/0.wav + AFTER EARLY NIGHTFALL THE YELLOW LAMPS WOULD LIGHT UP HERE AND THERE THE SQUALID QUARTER OF THE BROTHELS +---------- +./sherpa-onnx-zipformer-en-2023-04-01/test_wavs/1.wav + GOD AS A DIRECT CONSEQUENCE OF THE SIN WHICH MAN THUS PUNISHED HAD GIVEN HER A LOVELY CHILD WHOSE PLACE WAS ON THAT SAME DISHONOURED BOSOM TO CONNECT HER PARENT FOR EVER WITH THE RACE AND DESCENT OF MORTALS AND TO BE FINALLY A BLESSED SOUL IN HEAVEN +---------- +./sherpa-onnx-zipformer-en-2023-04-01/test_wavs/8k.wav + YET THESE THOUGHTS AFFECTED HESTER PRYNNE LESS WITH HOPE THAN APPREHENSION +---------- +num_threads: 1 +decoding_method: greedy_search +Wave duration: 4.825 s +Elapsed time: 2.567 s +Real time factor (RTF): 2.567/4.825 = 0.532 diff --git a/docs/source/onnx/python/code/decode-files/streaming-transducer-bilingual-zh-en-2023-02-20.txt b/docs/source/onnx/python/code/decode-files/streaming-transducer-bilingual-zh-en-2023-02-20.txt new file mode 100644 index 000000000..6f59ef7d4 --- /dev/null +++ b/docs/source/onnx/python/code/decode-files/streaming-transducer-bilingual-zh-en-2023-02-20.txt @@ -0,0 +1,26 @@ +Creating a resampler: + in_sample_rate: 8000 + output_sample_rate: 16000 + +Started! +Done! +./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/test_wavs/0.wav +昨天是 MONDAY TODAY IS LIBR THE DAY AFTER TOMORROW是星期三 +---------- +./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/test_wavs/1.wav +这是第一种第二种叫呃与 ALWAYS ALWAYS什么意思啊 +---------- +./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/test_wavs/2.wav +这个是频繁的啊不认识记下来 FREQUENTLY频繁的 +---------- +./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/test_wavs/3.wav +第一句是个什么时态加了 ES是一般现在时对后面还有时态写上 +---------- +./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/test_wavs/8k.wav +嗯太不要准时 IN TIME是及时叫他总是准时教他的作业那用一般现在时是没有什么感情色彩的陈述一个事实下一句话为什么要用现在进行时它的意思并不是说说他现在正在教他的 +---------- +num_threads: 1 +decoding_method: greedy_search +Wave duration: 17.640 s +Elapsed time: 3.907 s +Real time factor (RTF): 3.907/17.640 = 0.221 diff --git a/docs/source/onnx/python/decode-files.rst b/docs/source/onnx/python/decode-files.rst new file mode 100644 index 000000000..e2fb391c7 --- /dev/null +++ b/docs/source/onnx/python/decode-files.rst @@ -0,0 +1,105 @@ +Decode files +============ + +In this section, we demonstrate how to use the Python API of `sherpa-onnx`_ +to decode files. + +.. 
hint:: + + We only support WAVE files of single channel and each sample should have + 16-bit, while the sample rate of the file can be arbitrary and it does + not need to be 16 kHz + + +Streaming zipformer +-------------------- + +We use :ref:`sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20` as +an example below. + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + python3 ./python-api-examples/online-decode-files.py \ + --tokens=./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/tokens.txt \ + --encoder=./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/encoder-epoch-99-avg-1.onnx \ + --decoder=./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/decoder-epoch-99-avg-1.onnx \ + --joiner=./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/joiner-epoch-99-avg-1.onnx \ + ./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/test_wavs/0.wav \ + ./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/test_wavs/1.wav \ + ./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/test_wavs/2.wav \ + ./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/test_wavs/3.wav \ + ./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/test_wavs/8k.wav + +.. hint:: + + ``online-decode-files.py`` is from ``_ + +.. note:: + + You can replace ``encoder-epoch-99-avg-1.onnx`` with ``encoder-epoch-99-avg-1.int8.onnx`` + to use ``int8`` models for decoding. + +The output is given below: + +.. literalinclude:: ./code/decode-files/streaming-transducer-bilingual-zh-en-2023-02-20.txt + +Non-streaming zipformer +----------------------- + +We use :ref:`sherpa_onnx_zipformer_en_2023_04_01` as +an example below. + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + python3 ./python-api-examples/offline-decode-files.py \ + --tokens=./sherpa-onnx-zipformer-en-2023-04-01/tokens.txt \ + --encoder=./sherpa-onnx-zipformer-en-2023-04-01/encoder-epoch-99-avg-1.onnx \ + --decoder=./sherpa-onnx-zipformer-en-2023-04-01/decoder-epoch-99-avg-1.onnx \ + --joiner=./sherpa-onnx-zipformer-en-2023-04-01/joiner-epoch-99-avg-1.onnx \ + ./sherpa-onnx-zipformer-en-2023-04-01/test_wavs/0.wav \ + ./sherpa-onnx-zipformer-en-2023-04-01/test_wavs/1.wav \ + ./sherpa-onnx-zipformer-en-2023-04-01/test_wavs/8k.wav + +.. hint:: + + ``offline-decode-files.py`` is from ``_ + +.. note:: + + You can replace ``encoder-epoch-99-avg-1.onnx`` with ``encoder-epoch-99-avg-1.int8.onnx`` + to use ``int8`` models for decoding. + +The output is given below: + +.. literalinclude:: ./code/decode-files/non-streaming-transducer-zipformer-2023-04-01.txt + +Non-streaming paraformer +------------------------ + +We use :ref:`sherpa_onnx_offline_paraformer_zh_2023_03_28_chinese` as +an example below. + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + python3 ./python-api-examples/offline-decode-files.py \ + --tokens=./sherpa-onnx-paraformer-zh-2023-03-28/tokens.txt \ + --paraformer=./sherpa-onnx-paraformer-zh-2023-03-28/model.onnx \ + ./sherpa-onnx-paraformer-zh-2023-03-28/test_wavs/0.wav \ + ./sherpa-onnx-paraformer-zh-2023-03-28/test_wavs/1.wav \ + ./sherpa-onnx-paraformer-zh-2023-03-28/test_wavs/2.wav \ + ./sherpa-onnx-paraformer-zh-2023-03-28/test_wavs/8k.wav + +.. note:: + + You can replace ``model.onnx`` with ``model.int8.onnx`` + to use ``int8`` models for decoding. + +The output is given below: + +.. 
literalinclude:: ./code/decode-files/non-streaming-paraformer-2023-03-28.txt diff --git a/docs/source/onnx/python/index.rst b/docs/source/onnx/python/index.rst new file mode 100644 index 000000000..5e3f223df --- /dev/null +++ b/docs/source/onnx/python/index.rst @@ -0,0 +1,14 @@ +Python +====== + +In this section, we describe how to install the Python package `sherpa-onnx`_. + +.. toctree:: + :maxdepth: 5 + + ./install.rst + ./decode-files.rst + ./real-time-speech-recongition-from-a-microphone.rst + ./speech-recognition-from-urls.rst + ./streaming-websocket-server.rst + ./non-streaming-websocket-server.rst diff --git a/docs/source/onnx/python/install.rst b/docs/source/onnx/python/install.rst new file mode 100644 index 000000000..6bb67fde4 --- /dev/null +++ b/docs/source/onnx/python/install.rst @@ -0,0 +1,210 @@ +.. _install_sherpa_onnx_python: + +Install the Python Package +========================== + +You can select one of the following methods to install the Python package. + +Method 1 (From pre-compiled wheels, CPU only) +--------------------------------------------- + +.. hint:: + + This method supports the following platfroms: + + - Linux (``x64``, ``aarch64``, ``armv7l``), + - macOS (``x64``, ``arm64``) + - Windows (``x64``, ``x86``) + + Note that this method installs a CPU-only version of `sherpa-onnx`_. + +.. code-block:: bash + + pip install sherpa-onnx + +To check you have installed `sherpa-onnx`_ successfully, please run + +.. code-block:: bash + + python3 -c "import sherpa_onnx; print(sherpa_onnx.__file__)" + + which sherpa-onnx + sherpa-onnx --help + + ls -lh $(dirname $(which sherpa-onnx))/sherpa-onnx* + +.. hint:: + + You can find previous releases at + ``_ + + For Chinese users and users who have no access to huggingface, please visit + ``_. + + You can use:: + + pip install sherpa-onnx -f https://k2-fsa.github.io/sherpa/onnx/cpu.html + + or:: + + # For Chinese uers + pip install sherpa-onnx -f https://k2-fsa.github.io/sherpa/onnx/cpu-cn.html + +Method 2 (From pre-compiled wheels, CPU + CUDA) +------------------------------------------------ + +.. note:: + + This method installs a version of `sherpa-onnx`_ supporting both ``CUDA`` + and ``CPU``. You need to pass the argument ``provider=cuda`` to use + NVIDIA GPU, which always uses GPU 0. Otherwise, it uses ``CPU`` by default. + + Please use the environment variable ``CUDA_VISIBLE_DEVICES`` to control + which GPU is mapped to GPU 0. + + By default, ``provider`` is set to ``cpu``. + + Remeber to follow ``_ + to install CUDA 11.8. + + If you have issues about installing CUDA 11.8, please have a look at + ``_. + + Note that you don't need to have ``sudo`` permission to install CUDA 11.8 + +This approach supports only Linux x64 and Windows x64. + +Please use the following command to install CUDA-enabled `sherpa-onnx`_:: + + # We use 1.10.16 here for demonstration. 
+ # + # Please visit https://k2-fsa.github.io/sherpa/onnx/cuda.html + # to find available versions + + pip install sherpa-onnx==1.10.16+cuda -f https://k2-fsa.github.io/sherpa/onnx/cuda.html + + # For Chinese users, please use + # pip install sherpa-onnx==1.10.16+cuda -f https://k2-fsa.github.io/sherpa/onnx/cuda-cn.html + +The installation logs are given below:: + + Looking in links: https://k2-fsa.github.io/sherpa/onnx/cuda.html + Collecting sherpa-onnx==1.10.16+cuda + Downloading https://huggingface.co/csukuangfj/sherpa-onnx-wheels/resolve/main/cuda/1.10.16/sherpa_onnx-1.10.16%2Bcuda-cp310-cp310-linux_x86_64.whl (183.3 MB) + ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 183.3/183.3 MB 4.4 MB/s eta 0:00:00 + Installing collected packages: sherpa-onnx + Successfully installed sherpa-onnx-1.10.16+cuda + +To check that you have installed `sherpa-onnx`_ successfully, please run:: + + python3 -c "import sherpa_onnx; print(sherpa_onnx.__version__)" + +which should print something like below:: + + 1.10.16+cuda + + + +Method 3 (From source) +---------------------- + +.. tabs:: + + .. tab:: CPU + + .. code-block:: bash + + git clone https://github.com/k2-fsa/sherpa-onnx + cd sherpa-onnx + python3 setup.py install + + .. tab:: Nvidia GPU (CUDA) + + .. code-block:: bash + + git clone https://github.com/k2-fsa/sherpa-onnx + export SHERPA_ONNX_CMAKE_ARGS="-DSHERPA_ONNX_ENABLE_GPU=ON" + cd sherpa-onnx + python3 setup.py install + +Method 4 (For developers) +------------------------- + +.. tabs:: + + .. tab:: CPU + + .. code-block:: bash + + git clone https://github.com/k2-fsa/sherpa-onnx + cd sherpa-onnx + mkdir build + cd build + + cmake \ + -DSHERPA_ONNX_ENABLE_PYTHON=ON \ + -DBUILD_SHARED_LIBS=ON \ + -DSHERPA_ONNX_ENABLE_CHECK=OFF \ + -DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \ + -DSHERPA_ONNX_ENABLE_C_API=OFF \ + -DSHERPA_ONNX_ENABLE_WEBSOCKET=OFF \ + .. + + make -j + export PYTHONPATH=$PWD/../sherpa-onnx/python/:$PWD/lib:$PYTHONPATH + + .. tab:: Nvidia GPU (CUDA) + + .. code-block:: bash + + git clone https://github.com/k2-fsa/sherpa-onnx + cd sherpa-onnx + mkdir build + cd build + + cmake \ + -DSHERPA_ONNX_ENABLE_PYTHON=ON \ + -DBUILD_SHARED_LIBS=ON \ + -DSHERPA_ONNX_ENABLE_CHECK=OFF \ + -DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \ + -DSHERPA_ONNX_ENABLE_C_API=OFF \ + -DSHERPA_ONNX_ENABLE_WEBSOCKET=OFF \ + -DSHERPA_ONNX_ENABLE_GPU=ON \ + .. + + make -j + export PYTHONPATH=$PWD/../sherpa-onnx/python/:$PWD/lib:$PYTHONPATH + + .. hint:: + + You need to install CUDA toolkit. Otherwise, you would get + errors at runtime. + + You can refer to ``_ + to install CUDA toolkit. + + +Check your installation +----------------------- + +To check that `sherpa-onnx`_ has been successfully installed, please use: + +.. code-block:: bash + + python3 -c "import sherpa_onnx; print(sherpa_onnx.__file__)" + +It should print some output like below: + +.. code-block:: bash + + /Users/fangjun/py38/lib/python3.8/site-packages/sherpa_onnx/__init__.py + +Please refer to: + + ``_ + +for usages. + +Please refer to :ref:`sherpa-onnx-pre-trained-models` for a list of pre-trained +models. + diff --git a/docs/source/onnx/python/non-streaming-websocket-server.rst b/docs/source/onnx/python/non-streaming-websocket-server.rst new file mode 100644 index 000000000..6127a66cb --- /dev/null +++ b/docs/source/onnx/python/non-streaming-websocket-server.rst @@ -0,0 +1,334 @@ +Non-Streaming WebSocket Server +============================== + +This section describes how to use the Python non-streaming WebSocket server +of `sherpa-onnx`_ for speech recognition. 
+ +.. hint:: + + The server supports multiple clients connecting at the same time. + +The code for the non-streaming server can be found at + + ``_ + +Please refer to :ref:`sherpa-onnx-pre-trained-models` to download a non-streaming model +before you continue. + +We use the following types of models for demonstration. + +.. list-table:: + + * - Description + - URL + * - Non-streaming transducer + - :ref:`sherpa-onnx-zipformer-en-2023-06-26-english` + * - Non-streaming paraformer + - :ref:`sherpa_onnx_offline_paraformer_zh_2023_03_28_chinese` + * - Non-streaming CTC model from NeMo + - :ref:`stt-en-conformer-ctc-medium-nemo-sherpa-onnx` + * - Non-streaming Whisper tiny.en + - :ref:`whisper_tiny_en_sherpa_onnx` + +Non-streaming transducer +------------------------ + +Start the server +^^^^^^^^^^^^^^^^^ + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-zipformer-en-2023-06-26.tar.bz2 + tar xvf sherpa-onnx-zipformer-en-2023-06-26.tar.bz2 + rm sherpa-onnx-zipformer-en-2023-06-26.tar.bz2 + + python3 ./python-api-examples/non_streaming_server.py \ + --encoder ./sherpa-onnx-zipformer-en-2023-06-26/encoder-epoch-99-avg-1.onnx \ + --decoder ./sherpa-onnx-zipformer-en-2023-06-26/decoder-epoch-99-avg-1.onnx \ + --joiner ./sherpa-onnx-zipformer-en-2023-06-26/joiner-epoch-99-avg-1.onnx \ + --tokens ./sherpa-onnx-zipformer-en-2023-06-26/tokens.txt \ + --port 6006 + +Start the client +^^^^^^^^^^^^^^^^ + +**Decode multiple files in parallel** + +.. code-block:: bash + + python3 ./python-api-examples/offline-websocket-client-decode-files-paralell.py \ + --server-addr localhost \ + --server-port 6006 \ + ./sherpa-onnx-zipformer-en-2023-06-26/test_wavs/0.wav \ + ./sherpa-onnx-zipformer-en-2023-06-26/test_wavs/1.wav \ + ./sherpa-onnx-zipformer-en-2023-06-26/test_wavs/8k.wav + +You should see the following output: + +.. 
code-block:: bash + + 2023-08-11 18:19:26,000 INFO [offline-websocket-client-decode-files-paralell.py:139] {'server_addr': 'localhost', 'server_port': 6006, 'sound_files': ['./sherpa-onnx-zipformer-en-2023-06-26/test_wavs/0.wav', './sherpa-onnx-zipformer-en-2023-06-26/test_wavs/1.wav', './sherpa-onnx-zipformer-en-2023-06-26/test_wavs/8k.wav']} + 2023-08-11 18:19:26,034 INFO [offline-websocket-client-decode-files-paralell.py:113] Sending ./sherpa-onnx-zipformer-en-2023-06-26/test_wavs/8k.wav + 2023-08-11 18:19:26,058 INFO [offline-websocket-client-decode-files-paralell.py:113] Sending ./sherpa-onnx-zipformer-en-2023-06-26/test_wavs/1.wav + 2023-08-11 18:19:26,205 INFO [offline-websocket-client-decode-files-paralell.py:113] Sending ./sherpa-onnx-zipformer-en-2023-06-26/test_wavs/0.wav + 2023-08-11 18:19:26,262 INFO [offline-websocket-client-decode-files-paralell.py:131] ./sherpa-onnx-zipformer-en-2023-06-26/test_wavs/8k.wav + YET THESE THOUGHTS AFFECTED HESTER PRYNNE LESS WITH HOPE THAN APPREHENSION + 2023-08-11 18:19:26,609 INFO [offline-websocket-client-decode-files-paralell.py:131] ./sherpa-onnx-zipformer-en-2023-06-26/test_wavs/1.wav + GOD AS A DIRECT CONSEQUENCE OF THE SIN WHICH MAN THUS PUNISHED HAD GIVEN HER A LOVELY CHILD WHOSE PLACE WAS ON THAT SAME DISHONORED BOSOM TO CONNECT HER PARENT FOREVER WITH THE RACE AND DESCENT OF MORTALS AND TO BE FINALLY A BLESSED SOUL IN HEAVEN + 2023-08-11 18:19:26,773 INFO [offline-websocket-client-decode-files-paralell.py:131] ./sherpa-onnx-zipformer-en-2023-06-26/test_wavs/0.wav + AFTER EARLY NIGHTFALL THE YELLOW LAMPS WOULD LIGHT UP HERE AND THERE THE SQUALID QUARTER OF THE BROTHELS + +**Decode multiple files sequentially** + +.. code-block:: bash + + python3 ./python-api-examples/offline-websocket-client-decode-files-sequential.py \ + --server-addr localhost \ + --server-port 6006 \ + ./sherpa-onnx-zipformer-en-2023-06-26/test_wavs/0.wav \ + ./sherpa-onnx-zipformer-en-2023-06-26/test_wavs/1.wav \ + ./sherpa-onnx-zipformer-en-2023-06-26/test_wavs/8k.wav + +You should see the following output: + +.. code-block:: bash + + 2023-08-11 18:20:36,677 INFO [offline-websocket-client-decode-files-sequential.py:114] Sending ./sherpa-onnx-zipformer-en-2023-06-26/test_wavs/0.wav + AFTER EARLY NIGHTFALL THE YELLOW LAMPS WOULD LIGHT UP HERE AND THERE THE SQUALID QUARTER OF THE BROTHELS + 2023-08-11 18:20:36,861 INFO [offline-websocket-client-decode-files-sequential.py:114] Sending ./sherpa-onnx-zipformer-en-2023-06-26/test_wavs/1.wav + GOD AS A DIRECT CONSEQUENCE OF THE SIN WHICH MAN THUS PUNISHED HAD GIVEN HER A LOVELY CHILD WHOSE PLACE WAS ON THAT SAME DISHONORED BOSOM TO CONNECT HER PARENT FOREVER WITH THE RACE AND DESCENT OF MORTALS AND TO BE FINALLY A BLESSED SOUL IN HEAVEN + 2023-08-11 18:20:37,375 INFO [offline-websocket-client-decode-files-sequential.py:114] Sending ./sherpa-onnx-zipformer-en-2023-06-26/test_wavs/8k.wav + YET THESE THOUGHTS AFFECTED HESTER PRYNNE LESS WITH HOPE THAN APPREHENSION + +Non-streaming paraformer +------------------------ + +Start the server +^^^^^^^^^^^^^^^^ + +.. 
code-block:: bash + + cd /path/to/sherpa-onnx + + wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-paraformer-zh-2023-03-28.tar.bz2 + tar xvf sherpa-onnx-paraformer-zh-2023-03-28.tar.bz2 + rm sherpa-onnx-paraformer-zh-2023-03-28.tar.bz2 + + python3 ./python-api-examples/non_streaming_server.py \ + --paraformer ./sherpa-onnx-paraformer-zh-2023-03-28/model.int8.onnx \ + --tokens ./sherpa-onnx-paraformer-zh-2023-03-28/tokens.txt \ + --port 6006 + +Start the client +^^^^^^^^^^^^^^^^ + +**Decode multiple files in parallel** + +.. code-block:: bash + + python3 ./python-api-examples/offline-websocket-client-decode-files-paralell.py \ + --server-addr localhost \ + --server-port 6006 \ + ./sherpa-onnx-paraformer-zh-2023-03-28/test_wavs/0.wav \ + ./sherpa-onnx-paraformer-zh-2023-03-28/test_wavs/1.wav \ + ./sherpa-onnx-paraformer-zh-2023-03-28/test_wavs/2.wav \ + ./sherpa-onnx-paraformer-zh-2023-03-28/test_wavs/8k.wav + + +You should see the following output: + +.. code-block:: bash + + 2023-08-11 18:22:54,189 INFO [offline-websocket-client-decode-files-paralell.py:113] Sending ./sherpa-onnx-paraformer-zh-2023-03-28/test_wavs/0.wav + 2023-08-11 18:22:54,233 INFO [offline-websocket-client-decode-files-paralell.py:113] Sending ./sherpa-onnx-paraformer-zh-2023-03-28/test_wavs/1.wav + 2023-08-11 18:22:54,275 INFO [offline-websocket-client-decode-files-paralell.py:113] Sending ./sherpa-onnx-paraformer-zh-2023-03-28/test_wavs/8k.wav + 2023-08-11 18:22:54,295 INFO [offline-websocket-client-decode-files-paralell.py:113] Sending ./sherpa-onnx-paraformer-zh-2023-03-28/test_wavs/2.wav + 2023-08-11 18:22:54,380 INFO [offline-websocket-client-decode-files-paralell.py:131] ./sherpa-onnx-paraformer-zh-2023-03-28/test_wavs/0.wav + 对我做了介绍啊那么我想说的是呢大家如果对我的研究感兴趣呢你 + 2023-08-11 18:22:54,673 INFO [offline-websocket-client-decode-files-paralell.py:131] ./sherpa-onnx-paraformer-zh-2023-03-28/test_wavs/8k.wav + 甚至出现交易几乎停滞的情况 + 2023-08-11 18:22:54,673 INFO [offline-websocket-client-decode-files-paralell.py:131] ./sherpa-onnx-paraformer-zh-2023-03-28/test_wavs/2.wav + 深入的分析这一次全球金融动荡背后的根源 + 2023-08-11 18:22:54,674 INFO [offline-websocket-client-decode-files-paralell.py:131] ./sherpa-onnx-paraformer-zh-2023-03-28/test_wavs/1.wav + 重点呢想谈三个问题首先呢就是这一轮全球金融动荡的表现 + +**Decode multiple files sequentially** + +.. code-block:: bash + + python3 ./python-api-examples/offline-websocket-client-decode-files-sequential.py \ + --server-addr localhost \ + --server-port 6006 \ + ./sherpa-onnx-paraformer-zh-2023-03-28/test_wavs/0.wav \ + ./sherpa-onnx-paraformer-zh-2023-03-28/test_wavs/1.wav \ + ./sherpa-onnx-paraformer-zh-2023-03-28/test_wavs/2.wav \ + ./sherpa-onnx-paraformer-zh-2023-03-28/test_wavs/8k.wav + +You should see the following output: + +.. 
code-block:: bash + + 2023-08-11 18:24:32,678 INFO [offline-websocket-client-decode-files-sequential.py:141] {'server_addr': 'localhost', 'server_port': 6006, 'sound_files': ['./sherpa-onnx-paraformer-zh-2023-03-28/test_wavs/0.wav', './sherpa-onnx-paraformer-zh-2023-03-28/test_wavs/1.wav', './sherpa-onnx-paraformer-zh-2023-03-28/test_wavs/2.wav', './sherpa-onnx-paraformer-zh-2023-03-28/test_wavs/8k.wav']} + 2023-08-11 18:24:32,709 INFO [offline-websocket-client-decode-files-sequential.py:114] Sending ./sherpa-onnx-paraformer-zh-2023-03-28/test_wavs/0.wav + 对我做了介绍啊那么我想说的是呢大家如果对我的研究感兴趣呢你 + 2023-08-11 18:24:32,883 INFO [offline-websocket-client-decode-files-sequential.py:114] Sending ./sherpa-onnx-paraformer-zh-2023-03-28/test_wavs/1.wav + 重点呢想谈三个问题首先呢就是这一轮全球金融动荡的表现 + 2023-08-11 18:24:33,042 INFO [offline-websocket-client-decode-files-sequential.py:114] Sending ./sherpa-onnx-paraformer-zh-2023-03-28/test_wavs/2.wav + 深入的分析这一次全球金融动荡背后的根源 + 2023-08-11 18:24:33,175 INFO [offline-websocket-client-decode-files-sequential.py:114] Sending ./sherpa-onnx-paraformer-zh-2023-03-28/test_wavs/8k.wav + 甚至出现交易几乎停滞的情况 + +Non-streaming CTC model from NeMo +--------------------------------- + +Start the server +^^^^^^^^^^^^^^^^ + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-nemo-ctc-en-conformer-medium.tar.bz2 + tar xvf sherpa-onnx-nemo-ctc-en-conformer-medium.tar.bz2 + rm sherpa-onnx-nemo-ctc-en-conformer-medium.tar.bz2 + + python3 ./python-api-examples/non_streaming_server.py \ + --nemo-ctc ./sherpa-onnx-nemo-ctc-en-conformer-medium/model.onnx \ + --tokens ./sherpa-onnx-nemo-ctc-en-conformer-medium/tokens.txt \ + --port 6006 + +Start the client +^^^^^^^^^^^^^^^^ + +**Decode multiple files in parallel** + +.. code-block:: bash + + python3 ./python-api-examples/offline-websocket-client-decode-files-paralell.py \ + --server-addr localhost \ + --server-port 6006 \ + ./sherpa-onnx-nemo-ctc-en-conformer-medium/test_wavs/0.wav \ + ./sherpa-onnx-nemo-ctc-en-conformer-medium/test_wavs/1.wav \ + ./sherpa-onnx-nemo-ctc-en-conformer-medium/test_wavs/8k.wav + +You should see the following output: + +.. 
code-block:: bash + + 2023-08-11 18:31:32,432 INFO [offline-websocket-client-decode-files-paralell.py:139] {'server_addr': 'localhost', 'server_port': 6006, 'sound_files': ['./sherpa-onnx-nemo-ctc-en-conformer-medium/test_wavs/0.wav', './sherpa-onnx-nemo-ctc-en-conformer-medium/test_wavs/1.wav', './sherpa-onnx-nemo-ctc-en-conformer-medium/test_wavs/8k.wav']} + 2023-08-11 18:31:32,462 INFO [offline-websocket-client-decode-files-paralell.py:113] Sending ./sherpa-onnx-nemo-ctc-en-conformer-medium/test_wavs/0.wav + 2023-08-11 18:31:32,513 INFO [offline-websocket-client-decode-files-paralell.py:113] Sending ./sherpa-onnx-nemo-ctc-en-conformer-medium/test_wavs/8k.wav + 2023-08-11 18:31:32,533 INFO [offline-websocket-client-decode-files-paralell.py:113] Sending ./sherpa-onnx-nemo-ctc-en-conformer-medium/test_wavs/1.wav + 2023-08-11 18:31:32,670 INFO [offline-websocket-client-decode-files-paralell.py:131] ./sherpa-onnx-nemo-ctc-en-conformer-medium/test_wavs/0.wav + after early nightfall the yellow lamps would light up here and there the squalid quarter of the brothels + 2023-08-11 18:31:32,741 INFO [offline-websocket-client-decode-files-paralell.py:131] ./sherpa-onnx-nemo-ctc-en-conformer-medium/test_wavs/8k.wav + yet these thoughts affected hester pryne less with hope than apprehension + 2023-08-11 18:31:33,117 INFO [offline-websocket-client-decode-files-paralell.py:131] ./sherpa-onnx-nemo-ctc-en-conformer-medium/test_wavs/1.wav + god as a direct consequence of the sin which man thus punished had given her a lovely child whose place was on that same dishonored bosom to connect her parent for ever with the race and descent of mortals and to be finally a blessed soul in heaven + +**Decode multiple files sequentially** + +.. code-block:: bash + + python3 ./python-api-examples/offline-websocket-client-decode-files-sequential.py \ + --server-addr localhost \ + --server-port 6006 \ + ./sherpa-onnx-nemo-ctc-en-conformer-medium/test_wavs/0.wav \ + ./sherpa-onnx-nemo-ctc-en-conformer-medium/test_wavs/1.wav \ + ./sherpa-onnx-nemo-ctc-en-conformer-medium/test_wavs/8k.wav + +You should see the following output: + +.. code-block:: bash + + 2023-08-11 18:33:14,520 INFO [offline-websocket-client-decode-files-sequential.py:141] {'server_addr': 'localhost', 'server_port': 6006, 'sound_files': ['./sherpa-onnx-nemo-ctc-en-conformer-medium/test_wavs/0.wav', './sherpa-onnx-nemo-ctc-en-conformer-medium/test_wavs/1.wav', './sherpa-onnx-nemo-ctc-en-conformer-medium/test_wavs/8k.wav']} + 2023-08-11 18:33:14,547 INFO [offline-websocket-client-decode-files-sequential.py:114] Sending ./sherpa-onnx-nemo-ctc-en-conformer-medium/test_wavs/0.wav + after early nightfall the yellow lamps would light up here and there the squalid quarter of the brothels + 2023-08-11 18:33:14,716 INFO [offline-websocket-client-decode-files-sequential.py:114] Sending ./sherpa-onnx-nemo-ctc-en-conformer-medium/test_wavs/1.wav + god as a direct consequence of the sin which man thus punished had given her a lovely child whose place was on that same dishonored bosom to connect her parent for ever with the race and descent of mortals and to be finally a blessed soul in heaven + 2023-08-11 18:33:15,218 INFO [offline-websocket-client-decode-files-sequential.py:114] Sending ./sherpa-onnx-nemo-ctc-en-conformer-medium/test_wavs/8k.wav + yet these thoughts affected hester pryne less with hope than apprehension + +Non-streaming Whisper tiny.en +----------------------------- + +Start the server +^^^^^^^^^^^^^^^^^ + +.. 
code-block:: bash + + cd /path/to/sherpa-onnx + + wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-whisper-tiny.en.tar.bz2 + tar xvf sherpa-onnx-whisper-tiny.en.tar.bz2 + rm sherpa-onnx-whisper-tiny.en.tar.bz2 + + python3 ./python-api-examples/non_streaming_server.py \ + --whisper-encoder=./sherpa-onnx-whisper-tiny.en/tiny.en-encoder.onnx \ + --whisper-decoder=./sherpa-onnx-whisper-tiny.en/tiny.en-decoder.onnx \ + --tokens=./sherpa-onnx-whisper-tiny.en/tiny.en-tokens.txt \ + --port 6006 + +Start the client +^^^^^^^^^^^^^^^^ + +**Decode multiple files in parallel** + +.. code-block:: bash + + python3 ./python-api-examples/offline-websocket-client-decode-files-paralell.py \ + --server-addr localhost \ + --server-port 6006 \ + ./sherpa-onnx-whisper-tiny.en/test_wavs/0.wav \ + ./sherpa-onnx-whisper-tiny.en/test_wavs/1.wav \ + ./sherpa-onnx-whisper-tiny.en/test_wavs/8k.wav + +You should see the following output: + +.. code-block:: bash + + 2023-08-11 18:35:28,866 INFO [offline-websocket-client-decode-files-paralell.py:139] {'server_addr': 'localhost', 'server_port': 6006, 'sound_files': ['./sherpa-onnx-whisper-tiny.en/test_wavs/0.wav', './sherpa-onnx-whisper-tiny.en/test_wavs/1.wav', './sherpa-onnx-whisper-tiny.en/test_wavs/8k.wav']} + 2023-08-11 18:35:28,894 INFO [offline-websocket-client-decode-files-paralell.py:113] Sending ./sherpa-onnx-whisper-tiny.en/test_wavs/0.wav + 2023-08-11 18:35:28,947 INFO [offline-websocket-client-decode-files-paralell.py:113] Sending ./sherpa-onnx-whisper-tiny.en/test_wavs/1.wav + 2023-08-11 18:35:29,082 INFO [offline-websocket-client-decode-files-paralell.py:113] Sending ./sherpa-onnx-whisper-tiny.en/test_wavs/8k.wav + 2023-08-11 18:35:29,754 INFO [offline-websocket-client-decode-files-paralell.py:131] ./sherpa-onnx-whisper-tiny.en/test_wavs/0.wav + After early nightfall, the yellow lamps would light up here and there, the squalid quarter of the brothels. + 2023-08-11 18:35:30,276 INFO [offline-websocket-client-decode-files-paralell.py:131] ./sherpa-onnx-whisper-tiny.en/test_wavs/8k.wav + Yet these thoughts affected Hester Prin less with hope than apprehension. + 2023-08-11 18:35:31,592 INFO [offline-websocket-client-decode-files-paralell.py:131] ./sherpa-onnx-whisper-tiny.en/test_wavs/1.wav + God, as a direct consequence of the sin which man thus punished, had given her a lovely child, whose place was on that same dishonored bosom to connect her parent forever with the race and descent of mortals, and to be finally a blessed soul in heaven. + +**Decode multiple files sequentially** + +.. code-block:: bash + + python3 ./python-api-examples/offline-websocket-client-decode-files-sequential.py \ + --server-addr localhost \ + --server-port 6006 \ + ./sherpa-onnx-whisper-tiny.en/test_wavs/0.wav \ + ./sherpa-onnx-whisper-tiny.en/test_wavs/1.wav \ + ./sherpa-onnx-whisper-tiny.en/test_wavs/8k.wav + +You should see the following output: + +.. code-block:: bash + + 2023-08-11 18:36:42,148 INFO [offline-websocket-client-decode-files-sequential.py:141] {'server_addr': 'localhost', 'server_port': 6006, 'sound_files': ['./sherpa-onnx-whisper-tiny.en/test_wavs/0.wav', './sherpa-onnx-whisper-tiny.en/test_wavs/1.wav', './sherpa-onnx-whisper-tiny.en/test_wavs/8k.wav']} + 2023-08-11 18:36:42,176 INFO [offline-websocket-client-decode-files-sequential.py:114] Sending ./sherpa-onnx-whisper-tiny.en/test_wavs/0.wav + After early nightfall, the yellow lamps would light up here and there, the squalid quarter of the brothels. 
+ 2023-08-11 18:36:42,926 INFO [offline-websocket-client-decode-files-sequential.py:114] Sending ./sherpa-onnx-whisper-tiny.en/test_wavs/1.wav + God, as a direct consequence of the sin which man thus punished, had given her a lovely child, whose place was on that same dishonored bosom to connect her parent forever with the race and descent of mortals, and to be finally a blessed soul in heaven. + 2023-08-11 18:36:44,314 INFO [offline-websocket-client-decode-files-sequential.py:114] Sending ./sherpa-onnx-whisper-tiny.en/test_wavs/8k.wav + Yet these thoughts affected Hester Prin less with hope than apprehension. + +colab +----- + +We provide a colab notebook +|Sherpa-onnx python non-streaming websocket example colab notebook| +for you to try the Python non-streaming websocket server example of `sherpa-onnx`_. + +.. |Sherpa-onnx python non-streaming websocket example colab notebook| image:: https://colab.research.google.com/assets/colab-badge.svg + :target: https://github.com/k2-fsa/colab/blob/master/sherpa-onnx/sherpa_onnx_python_non_streaming_websocket_server.ipynb diff --git a/docs/source/onnx/python/pic/streaming-1.png b/docs/source/onnx/python/pic/streaming-1.png new file mode 100644 index 000000000..1edff7107 Binary files /dev/null and b/docs/source/onnx/python/pic/streaming-1.png differ diff --git a/docs/source/onnx/python/pic/streaming-2.png b/docs/source/onnx/python/pic/streaming-2.png new file mode 100644 index 000000000..42a97a891 Binary files /dev/null and b/docs/source/onnx/python/pic/streaming-2.png differ diff --git a/docs/source/onnx/python/real-time-speech-recongition-from-a-microphone.rst b/docs/source/onnx/python/real-time-speech-recongition-from-a-microphone.rst new file mode 100644 index 000000000..4fe4e642f --- /dev/null +++ b/docs/source/onnx/python/real-time-speech-recongition-from-a-microphone.rst @@ -0,0 +1,45 @@ +Real-time speech recognition from a microphone +============================================== + +In this section, we demonstrate how to use the Python API of `sherpa-onnx`_ +for real-time speech recognition with a microphone. + +With endpoint detection +----------------------- + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + python3 ./python-api-examples/speech-recognition-from-microphone-with-endpoint-detection.py \ + --tokens=./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/tokens.txt \ + --encoder=./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/encoder-epoch-99-avg-1.onnx \ + --decoder=./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/decoder-epoch-99-avg-1.onnx \ + --joiner=./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/joiner-epoch-99-avg-1.onnx + +.. hint:: + + ``speech-recognition-from-microphone-with-endpoint-detection.py`` is from ``_ + + In the above demo, the model files are + from :ref:`sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20`. + +Without endpoint detection +-------------------------- + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + python3 ./python-api-examples/speech-recognition-from-microphone.py \ + --tokens=./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/tokens.txt \ + --encoder=./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/encoder-epoch-99-avg-1.onnx \ + --decoder=./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/decoder-epoch-99-avg-1.onnx \ + --joiner=./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/joiner-epoch-99-avg-1.onnx + +.. 
hint:: + + ``speech-recognition-from-microphone.py`` is from ``_ + + In the above demo, the model files are + from :ref:`sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20`. diff --git a/docs/source/onnx/python/speech-recognition-from-urls.rst b/docs/source/onnx/python/speech-recognition-from-urls.rst new file mode 100644 index 000000000..fb790c41e --- /dev/null +++ b/docs/source/onnx/python/speech-recognition-from-urls.rst @@ -0,0 +1,196 @@ +Speech recognition from URLs +============================ + +`sherpa-onnx`_ also supports decoding from URLs. + +.. hint:: + + Only streaming models are currently supported. Please modify the + `code `_ + for non-streaming models on need. + +All types of URLs supported by ``ffmpeg`` are supported. + +The following table lists some example URLs. + +.. list-table:: + + * - Type + - Example + * - `RTMP`_ + - ``rtmp://localhost/live/livestream`` + * - OPUS file + - ``_ + * - WAVE file + - ``_ + * - Local WAVE file + - ``file:///Users/fangjun/open-source/sherpa-onnx/a.wav`` + +Before you continue, please install ``ffmpeg`` first. + +For instance, you can use ``sudo apt-get install ffmpeg`` for Ubuntu +and ``brew install ffmpeg`` for macOS. + +We use the model :ref:`sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20` +for demonstration in the following examples. + +Decode a URL +------------ + +This example shows you how to decode a URL pointing to a file. + +.. hint:: + + The file does not need to be a WAVE file. It can be a file of any format + supported by ``ffmpeg``. + +.. code-block:: bash + + python3 ./python-api-examples/speech-recognition-from-url.py \ + --encoder ./sherpa-onnx-streaming-zipformer-en-2023-06-26/encoder-epoch-99-avg-1-chunk-16-left-128.onnx \ + --decoder ./sherpa-onnx-streaming-zipformer-en-2023-06-26/decoder-epoch-99-avg-1-chunk-16-left-128.onnx \ + --joiner ./sherpa-onnx-streaming-zipformer-en-2023-06-26/joiner-epoch-99-avg-1-chunk-16-left-128.onnx \ + --tokens ./sherpa-onnx-streaming-zipformer-en-2023-06-26/tokens.txt \ + --url https://huggingface.co/spaces/k2-fsa/automatic-speech-recognition/resolve/main/test_wavs/librispeech/1089-134686-0001.wav + +RTMP +---- + +In this example, we use ``ffmpeg`` to capture a microphone and push the +audio stream to a server using RTMP, and then we start `sherpa-onnx`_ to pull +the audio stream from the server for recognition. + +Install the server +~~~~~~~~~~~~~~~~~~ + +We will use `srs`_ as the server. Let us first install `srs`_ from source: + +.. code-block:: bash + + git clone -b develop https://github.com/ossrs/srs.git + cd srs/trunk + ./configure + make + + # Check that we have compiled srs successfully + ./objs/srs --help + + # Note: ./objs/srs is statically linked and depends only on system libraries. + +Start the server +~~~~~~~~~~~~~~~~ + +.. code-block:: bash + + ulimit -HSn 10000 + + # switch to the directory srs/trunk + ./objs/srs -c conf/srs.conf + +The above command gives the following output: + +.. code-block:: bash + + srs(51047,0x7ff8451198c0) malloc: nano zone abandoned due to inability to preallocate reserved vm space. 
+ Asan: Please setup the env MallocNanoZone=0 to disable the warning, see https://stackoverflow.com/a/70209891/17679565 + [2023-07-05 12:19:23.017][INFO][51047][78gw8v44] XCORE-SRS/6.0.55(Bee) + [2023-07-05 12:19:23.021][INFO][51047][78gw8v44] config parse complete + [2023-07-05 12:19:23.021][INFO][51047][78gw8v44] you can check log by: tail -n 30 -f ./objs/srs.log + [2023-07-05 12:19:23.021][INFO][51047][78gw8v44] please check SRS by: ./etc/init.d/srs status + + +To check the status of `srs`_, use + +.. code-block:: bash + + ./etc/init.d/srs status + +which gives the following output: + +.. code-block:: bash + + SRS(pid 51548) is running. [ OK ] + +.. hint:: + + If you fail to start the `srs`_ server, please check the log file + ``./objs/srs.log`` for a fix. + +Start ffmpeg to push audio stream +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +First, let us list available recording devices on the current computer +with the following command: + +.. code-block:: bash + + ffmpeg -hide_banner -f avfoundation -list_devices true -i "" + +It gives the following output on my computer: + +.. code-block:: bash + + [AVFoundation indev @ 0x7f9f41904840] AVFoundation video devices: + [AVFoundation indev @ 0x7f9f41904840] [0] FaceTime HD Camera (Built-in) + [AVFoundation indev @ 0x7f9f41904840] [1] Capture screen 0 + [AVFoundation indev @ 0x7f9f41904840] AVFoundation audio devices: + [AVFoundation indev @ 0x7f9f41904840] [0] Background Music + [AVFoundation indev @ 0x7f9f41904840] [1] MacBook Pro Microphone + [AVFoundation indev @ 0x7f9f41904840] [2] Background Music (UI Sounds) + [AVFoundation indev @ 0x7f9f41904840] [3] WeMeet Audio Device + : Input/output error + +We will use the device ``[1] MacBook Pro Microphone``. Note that its index +is ``1``, so we will use ``-i ":1"`` in the following command to start +recording and push the recorded audio stream to the server under the +address ``rtmp://localhost/live/livestream``. + +.. hint:: + + The default TCP port for `RTMP`_ is ``1935``. + +.. code-block:: bash + + ffmpeg -hide_banner -f avfoundation -i ":1" -acodec aac -ab 64k -ar 16000 -ac 1 -f flv rtmp://localhost/live/livestream + +The above command gives the following output: + +.. code-block:: bash + + Input #0, avfoundation, from ':1': + Duration: N/A, start: 830938.803938, bitrate: 1536 kb/s + Stream #0:0: Audio: pcm_f32le, 48000 Hz, mono, flt, 1536 kb/s + Stream mapping: + Stream #0:0 -> #0:0 (pcm_f32le (native) -> aac (native)) + Press [q] to stop, [?] for help + Output #0, flv, to 'rtmp://localhost/live/livestream': + Metadata: + encoder : Lavf60.3.100 + Stream #0:0: Audio: aac (LC) ([10][0][0][0] / 0x000A), 16000 Hz, mono, fltp, 64 kb/s + Metadata: + encoder : Lavc60.3.100 aac + size= 64kB time=00:00:08.39 bitrate= 62.3kbits/s speed=0.977x + + +Start sherpa-onnx to pull audio stream +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Now we can start `sherpa-onnx`_ to pull audio stream from ``rtmp://localhost/live/livestream`` +for speech recognition. + +.. 
code-block:: bash + + python3 ./python-api-examples/speech-recognition-from-url.py \ + --encoder ./sherpa-onnx-streaming-zipformer-en-2023-06-26/encoder-epoch-99-avg-1-chunk-16-left-128.onnx \ + --decoder ./sherpa-onnx-streaming-zipformer-en-2023-06-26/decoder-epoch-99-avg-1-chunk-16-left-128.onnx \ + --joiner ./sherpa-onnx-streaming-zipformer-en-2023-06-26/joiner-epoch-99-avg-1-chunk-16-left-128.onnx \ + --tokens ./sherpa-onnx-streaming-zipformer-en-2023-06-26/tokens.txt \ + --url rtmp://localhost/live/livestream + +You should see the recognition result printed to the console as you speak. + +.. hint:: + + You can replace ``localhost`` with your server IP + and start `sherpa-onnx`_ on many computers at the same time to pull + audio stream from the address ``_. diff --git a/docs/source/onnx/python/streaming-websocket-server.rst b/docs/source/onnx/python/streaming-websocket-server.rst new file mode 100644 index 000000000..4303d49a6 --- /dev/null +++ b/docs/source/onnx/python/streaming-websocket-server.rst @@ -0,0 +1,219 @@ +Streaming WebSocket Server +========================== + +This section describes how to use the Python streaming WebSocket server +of `sherpa-onnx`_ for speech recognition. + +.. hint:: + + The server supports multiple clients connecting at the same time. + +The code for the streaming server can be found at + + ``_ + +Start the server +---------------- + +.. hint:: + + If you don't use a `X.509 `_ certificate, + due to security reasons imposed by the browser, you are only allowed to + use the domain ``localhost`` to access the server if you want to access the + microphone in the browser. That is, you can only use + + ``_ + + to access the server. You cannot use ``_, or + ``_, or ``_. + + You can use the following command to generate a self-signed certificate: + + .. code-block:: bash + + cd python-api-examples/web + ./generate-certificate.py + + The above commands will generate 3 files. You only need to use the file + ``cert.pem``. When starting the server, you pass the following argument: + + .. code-block:: bash + + --certificate=./python-api-examples/web/cert.pem + + +Please refer to :ref:`sherpa-onnx-pre-trained-models` to download a streaming model +before you continue. + +We will use :ref:`sherpa-onnx-streaming-zipformer-en-2023-06-26-english` as an example. + +First, let us download it: + +.. code-block:: bash + + cd /path/to/sherpa-onnx/ + + wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-streaming-zipformer-en-2023-06-26.tar.bz2 + tar xvf sherpa-onnx-streaming-zipformer-en-2023-06-26.tar.bz2 + rm sherpa-onnx-streaming-zipformer-en-2023-06-26.tar.bz2 + +Now we can use: + +.. code-block:: bash + + cd /path/to/sherpa-onnx/ + + python3 ./python-api-examples/streaming_server.py \ + --encoder ./sherpa-onnx-streaming-zipformer-en-2023-06-26/encoder-epoch-99-avg-1-chunk-16-left-128.onnx \ + --decoder ./sherpa-onnx-streaming-zipformer-en-2023-06-26/decoder-epoch-99-avg-1-chunk-16-left-128.onnx \ + --joiner ./sherpa-onnx-streaming-zipformer-en-2023-06-26/joiner-epoch-99-avg-1-chunk-16-left-128.onnx \ + --tokens ./sherpa-onnx-streaming-zipformer-en-2023-06-26/tokens.txt \ + --port 6006 + +It will print the following logs: + +.. 
code-block:: bash + + 2023-08-11 16:29:51,522 INFO [streaming_server.py:678] {'encoder': './sherpa-onnx-streaming-zipformer-en-2023-06-26/encoder-epoch-99-avg-1-chunk-16-left-128.onnx', 'decoder': './sherpa-onnx-streaming-zipformer-en-2023-06-26/decoder-epoch-99-avg-1-chunk-16-left-128.onnx', 'joiner': './sherpa-onnx-streaming-zipformer-en-2023-06-26/joiner-epoch-99-avg-1-chunk-16-left-128.onnx', 'tokens': './sherpa-onnx-streaming-zipformer-en-2023-06-26/tokens.txt', 'sample_rate': 16000, 'feat_dim': 80, 'provider': 'cpu', 'decoding_method': 'greedy_search', 'num_active_paths': 4, 'use_endpoint': 1, 'rule1_min_trailing_silence': 2.4, 'rule2_min_trailing_silence': 1.2, 'rule3_min_utterance_length': 20, 'port': 6006, 'nn_pool_size': 1, 'max_batch_size': 50, 'max_wait_ms': 10, 'max_message_size': 1048576, 'max_queue_size': 32, 'max_active_connections': 500, 'num_threads': 2, 'certificate': None, 'doc_root': './python-api-examples/web'} + 2023-08-11 16:29:57,476 INFO [streaming_server.py:520] No certificate provided + 2023-08-11 16:29:57,480 INFO [server.py:707] server listening on 0.0.0.0:6006 + 2023-08-11 16:29:57,480 INFO [server.py:707] server listening on [::]:6006 + 2023-08-11 16:29:57,480 INFO [streaming_server.py:546] Please visit one of the following addresses: + + http://localhost:6006 + + Since you are not providing a certificate, you cannot use your microphone from within the browser using public IP addresses. Only localhost can be used.You also cannot use 0.0.0.0 or 127.0.0.1 + +We can use the following two methods to interact with the server: + + - Use Python API + - Use a browser by accessing ``_ + +We describe each method below in details. + +Use Python API +^^^^^^^^^^^^^^ + +We provide two Python example files: + +.. list-table:: + + * - Description + - URL + * - Send a file for decoding + - ``_ + * - Send audio samples from a microphone for decoding + - ``_ + +Send a file for decoding +:::::::::::::::::::::::: + +.. hint:: + + The example file supports only ``*.wav`` files with a single channel + and the each sample should be of type ``int16_t``. The sample rate + does not need to be 16000 Hz, e.g., it can be 48000 Hz, 8000 Hz or some + other value. + +We use the following command to send a file for decoding: + +.. code-block:: + + cd /path/to/sherpa-onnx + + python3 ./python-api-examples/online-websocket-client-decode-file.py \ + --server-addr localhost \ + --server-port 6006 \ + ./sherpa-onnx-streaming-zipformer-en-2023-06-26/test_wavs/0.wav + +It should give the following output: + +.. 
code-block:: bash + + 2023-08-11 16:37:03,877 INFO [online-websocket-client-decode-file.py:133] Sending ./sherpa-onnx-streaming-zipformer-en-2023-06-26/test_wavs/0.wav + 2023-08-11 16:37:03,931 INFO [online-websocket-client-decode-file.py:115] {"text": "", "segment": 0} + 2023-08-11 16:37:04,012 INFO [online-websocket-client-decode-file.py:115] {"text": "", "segment": 0} + 2023-08-11 16:37:04,128 INFO [online-websocket-client-decode-file.py:115] {"text": "AFTER", "segment": 0} + 2023-08-11 16:37:04,170 INFO [online-websocket-client-decode-file.py:115] {"text": "AFTER EARLY", "segment": 0} + 2023-08-11 16:37:04,228 INFO [online-websocket-client-decode-file.py:115] {"text": "AFTER EARLY", "segment": 0} + 2023-08-11 16:37:04,331 INFO [online-websocket-client-decode-file.py:115] {"text": "AFTER EARLY NIGHTFA", "segment": 0} + 2023-08-11 16:37:04,373 INFO [online-websocket-client-decode-file.py:115] {"text": "AFTER EARLY NIGHTFALL THE", "segment": 0} + 2023-08-11 16:37:04,433 INFO [online-websocket-client-decode-file.py:115] {"text": "AFTER EARLY NIGHTFALL THE YELLOW LA", "segment": 0} + 2023-08-11 16:37:04,535 INFO [online-websocket-client-decode-file.py:115] {"text": "AFTER EARLY NIGHTFALL THE YELLOW LAMPS", "segment": 0} + 2023-08-11 16:37:04,576 INFO [online-websocket-client-decode-file.py:115] {"text": "AFTER EARLY NIGHTFALL THE YELLOW LAMPS WOULD LIGHT", "segment": 0} + 2023-08-11 16:37:04,645 INFO [online-websocket-client-decode-file.py:115] {"text": "AFTER EARLY NIGHTFALL THE YELLOW LAMPS WOULD LIGHT UP", "segment": 0} + 2023-08-11 16:37:04,685 INFO [online-websocket-client-decode-file.py:115] {"text": "AFTER EARLY NIGHTFALL THE YELLOW LAMPS WOULD LIGHT UP HERE", "segment": 0} + 2023-08-11 16:37:04,755 INFO [online-websocket-client-decode-file.py:115] {"text": "AFTER EARLY NIGHTFALL THE YELLOW LAMPS WOULD LIGHT UP HERE AND THERE", "segment": 0} + 2023-08-11 16:37:04,847 INFO [online-websocket-client-decode-file.py:115] {"text": "AFTER EARLY NIGHTFALL THE YELLOW LAMPS WOULD LIGHT UP HERE AND THERE", "segment": 0} + 2023-08-11 16:37:04,887 INFO [online-websocket-client-decode-file.py:115] {"text": "AFTER EARLY NIGHTFALL THE YELLOW LAMPS WOULD LIGHT UP HERE AND THERE THE SQUA", "segment": 0} + 2023-08-11 16:37:04,958 INFO [online-websocket-client-decode-file.py:115] {"text": "AFTER EARLY NIGHTFALL THE YELLOW LAMPS WOULD LIGHT UP HERE AND THERE THE SQUALID", "segment": 0} + 2023-08-11 16:37:05,057 INFO [online-websocket-client-decode-file.py:115] {"text": "AFTER EARLY NIGHTFALL THE YELLOW LAMPS WOULD LIGHT UP HERE AND THERE THE SQUALID QUAR", "segment": 0} + 2023-08-11 16:37:05,095 INFO [online-websocket-client-decode-file.py:115] {"text": "AFTER EARLY NIGHTFALL THE YELLOW LAMPS WOULD LIGHT UP HERE AND THERE THE SQUALID QUARTER OF", "segment": 0} + 2023-08-11 16:37:05,164 INFO [online-websocket-client-decode-file.py:115] {"text": "AFTER EARLY NIGHTFALL THE YELLOW LAMPS WOULD LIGHT UP HERE AND THERE THE SQUALID QUARTER OF THE BRO", "segment": 0} + 2023-08-11 16:37:05,268 INFO [online-websocket-client-decode-file.py:115] {"text": "AFTER EARLY NIGHTFALL THE YELLOW LAMPS WOULD LIGHT UP HERE AND THERE THE SQUALID QUARTER OF THE BROTHEL", "segment": 0} + 2023-08-11 16:37:05,369 INFO [online-websocket-client-decode-file.py:115] {"text": "AFTER EARLY NIGHTFALL THE YELLOW LAMPS WOULD LIGHT UP HERE AND THERE THE SQUALID QUARTER OF THE BROTHELS", "segment": 0} + 2023-08-11 16:37:05,370 INFO [online-websocket-client-decode-file.py:154] + Final result is: + {"text": "AFTER EARLY NIGHTFALL THE YELLOW 
LAMPS WOULD LIGHT UP HERE AND THERE THE SQUALID QUARTER OF THE BROTHELS", "segment": 0} + +Send audio samples from a microphone for decoding +::::::::::::::::::::::::::::::::::::::::::::::::: + +We use the following command to run the script: + +.. code-block:: + + cd /path/to/sherpa-onnx + + python3 ./python-api-examples/online-websocket-client-microphone.py \ + --server-addr localhost \ + --server-port 6006 + +It should give you the following output: + +.. code-block:: bash + + {'server_addr': 'localhost', 'server_port': 6006} + Started! Please Speak + 0 Background Music, Core Audio (2 in, 2 out) + 1 Background Music (UI Sounds), Core Audio (2 in, 2 out) + > 2 MacBook Pro Microphone, Core Audio (1 in, 0 out) + < 3 MacBook Pro Speakers, Core Audio (0 in, 2 out) + 4 WeMeet Audio Device, Core Audio (2 in, 2 out) + Use default device: MacBook Pro Microphone + + Started! Please speak + +If you speak, you will see the recognition result returned by the server. + +Use a browser +^^^^^^^^^^^^^ + +Start your browser and visit the following address: + + ``_ + +You should see a page like below: + +.. image:: ./pic/streaming-1.png + :width: 600 + +Click ``Streaming-Record`` and you will see the following page: + +.. image:: ./pic/streaming-2.png + :width: 600 + +Click the button ``Click me to connect`` to connect to the server and then +you can click the ``Streaming-Record`` button to start recording. You should +see the decoded results as you speak. + + +colab +----- + +We provide a colab notebook +|Sherpa-onnx python streaming websocket example colab notebook| +for you to try the Python streaming websocket server example of `sherpa-onnx`_. + +.. |Sherpa-onnx python streaming websocket example colab notebook| image:: https://colab.research.google.com/assets/colab-badge.svg + :target: https://github.com/k2-fsa/colab/blob/master/sherpa-onnx/sherpa_onnx_python_streaming_websocket_server.ipynb diff --git a/docs/source/onnx/rknn/index.rst b/docs/source/onnx/rknn/index.rst new file mode 100644 index 000000000..9f2cec8aa --- /dev/null +++ b/docs/source/onnx/rknn/index.rst @@ -0,0 +1,18 @@ +rknn +==== + +This section describes how to use `sherpa-onnx`_ with RKNPU from Rockchip. + +The following boards are known to work: + + - RK3588 + - RK3576 + - RK3568 + - RK3566 + - RK3562 + +.. toctree:: + :maxdepth: 5 + + ./install.rst + ./models.rst diff --git a/docs/source/onnx/rknn/install.rst b/docs/source/onnx/rknn/install.rst new file mode 100644 index 000000000..dbe94c6bc --- /dev/null +++ b/docs/source/onnx/rknn/install.rst @@ -0,0 +1,70 @@ +.. _sherpa-onnx-rknn-install: + +Install +======= + +You can use any methods below to build and install `sherpa-onnx`_ for RKNPU. + +From pre-built wheels using pip install +--------------------------------------- + +You can find pre-built ``whl`` files at ``_. + +To install it, you can use: + +.. code-block:: bash + + pip install sherpa-onnx -f https://k2-fsa.github.io/sherpa/onnx/rk-npu.html + + # 中国用户 + pip install sherpa-onnx -f https://k2-fsa.github.io/sherpa/onnx/rk-npu-cn.html + +Build sherpa-onnx directly on your board +---------------------------------------- + +.. code-block:: bash + + git clone https://github.com/k2-fsa/sherpa-onnx + cd sherpa-onnx + mkdir build + cd build + + cmake \ + -DSHERPA_ONNX_ENABLE_RKNN=ON \ + -DCMAKE_INSTALL_PREFIX=./install \ + .. + + make + make install + +Cross-compiling +--------------- + +Please first refer to :ref:`sherpa-onnx-linux-aarch64-cross-compiling` +to install toolchains. + +.. 
warning:: + + The toolchains for dynamic linking and static linking are different. + +After installing a toolchain by following :ref:`sherpa-onnx-linux-aarch64-cross-compiling` + +Dynamic link +~~~~~~~~~~~~ + +.. code-block:: bash + + git clone https://github.com/k2-fsa/sherpa-onnx + cd sherpa-onnx + export BUILD_SHARED_LIBS=ON + ./build-rknn-linux-aarch64.sh + +Static link +~~~~~~~~~~~ + +.. code-block:: bash + + git clone https://github.com/k2-fsa/sherpa-onnx + cd sherpa-onnx + export BUILD_SHARED_LIBS=OFF + ./build-rknn-linux-aarch64.sh diff --git a/docs/source/onnx/rknn/models.rst b/docs/source/onnx/rknn/models.rst new file mode 100644 index 000000000..dcb4b8bda --- /dev/null +++ b/docs/source/onnx/rknn/models.rst @@ -0,0 +1,276 @@ +Pre-trained models +================== + + +You can download pre-trained models for RKNPU from ``_. + +In the following, we use models for ``rk3588`` as an example. You can replace +``rk3588`` with ``rk3576``, ``rk3568``, ``rk3566`` or ``rk3562``. + + +Before you continue, we assume you have followed :ref:`sherpa-onnx-rknn-install` +to install `sherpa-onnx`_. The following is an example of installing +`sherpa-onnx`_ with RKNN support on OrangePi 5 max. + +.. code-block:: + + (py310) orangepi@orangepi5max:~/t$ uname -a + Linux orangepi5max 6.1.43-rockchip-rk3588 #1.0.0 SMP Mon Jul 8 11:54:40 CST 2024 aarch64 aarch64 aarch64 GNU/Linux + (py310) orangepi@orangepi5max:~/t$ ls -lh sherpa_onnx-1.11.2-cp310-cp310-linux_aarch64.whl + -rw-r--r-- 1 orangepi orangepi 17M Mar 8 00:20 sherpa_onnx-1.11.2-cp310-cp310-linux_aarch64.whl + (py310) orangepi@orangepi5max:~/t$ pip install ./sherpa_onnx-1.11.2-cp310-cp310-linux_aarch64.whl + Processing ./sherpa_onnx-1.11.2-cp310-cp310-linux_aarch64.whl + Installing collected packages: sherpa-onnx + Successfully installed sherpa-onnx-1.11.2 + + (py310) orangepi@orangepi5max:~/t$ which sherpa-onnx + /home/orangepi/py310/bin/sherpa-onnx + + (py310) orangepi@orangepi5max:~/t$ ldd $(which sherpa-onnx) + linux-vdso.so.1 (0x0000007f9fd93000) + librknnrt.so => /lib/librknnrt.so (0x0000007f9f480000) + libonnxruntime.so => /home/orangepi/py310/bin/../lib/python3.10/site-packages/sherpa_onnx/lib/libonnxruntime.so (0x0000007f9e7f0000) + libm.so.6 => /lib/aarch64-linux-gnu/libm.so.6 (0x0000007f9e750000) + libstdc++.so.6 => /lib/aarch64-linux-gnu/libstdc++.so.6 (0x0000007f9e520000) + libgcc_s.so.1 => /lib/aarch64-linux-gnu/libgcc_s.so.1 (0x0000007f9e4f0000) + libc.so.6 => /lib/aarch64-linux-gnu/libc.so.6 (0x0000007f9e340000) + /lib/ld-linux-aarch64.so.1 (0x0000007f9fd5a000) + libpthread.so.0 => /lib/aarch64-linux-gnu/libpthread.so.0 (0x0000007f9e320000) + libdl.so.2 => /lib/aarch64-linux-gnu/libdl.so.2 (0x0000007f9e300000) + librt.so.1 => /lib/aarch64-linux-gnu/librt.so.1 (0x0000007f9e2e0000) + + (py310) orangepi@orangepi5max:~/t$ strings /lib/librknnrt.so | grep "librknnrt version" + librknnrt version: 2.1.0 (967d001cc8@2024-08-07T19:28:19) + + +sherpa-onnx-rk3588-streaming-zipformer-small-bilingual-zh-en-2023-02-16 +----------------------------------------------------------------------- + +This model is converted from :ref:`sherpa_onnx_streaming_zipformer_small_bilingual_zh_en_2023_02_16`. + +Please use the following commands to download it. + +.. code-block:: bash + + wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-rk3588-streaming-zipformer-small-bilingual-zh-en-2023-02-16.tar.bz2 + tar xvf sherpa-onnx-rk3588-streaming-zipformer-small-bilingual-zh-en-2023-02-16.tar. 
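+   # The extracted directory contains encoder.rknn, decoder.rknn, joiner.rknn,
+   # tokens.txt and a test_wavs/ folder; the downloaded .tar.bz2 archive can
+   # then be removed.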
+ rm sherpa-onnx-rk3588-streaming-zipformer-small-bilingual-zh-en-2023-02-16.tar. + +After downloading, you can check the file size:: + + ls -lh sherpa-onnx-rk3588-streaming-zipformer-small-bilingual-zh-en-2023-02-16/ + total 58M + -rw-r--r-- 1 orangepi orangepi 7.7M Mar 19 2025 decoder.rknn + -rw-r--r-- 1 orangepi orangepi 44M Mar 19 2025 encoder.rknn + -rw-r--r-- 1 orangepi orangepi 6.2M Mar 19 2025 joiner.rknn + drwxr-xr-x 2 orangepi orangepi 4.0K Mar 19 2025 test_wavs + -rw-r--r-- 1 orangepi orangepi 55K Mar 19 2025 tokens.txt + +Decode files +~~~~~~~~~~~~ + +You can use the following command to decode files with the downloaded model files:: + + sherpa-onnx \ + --provider=rknn \ + --encoder=./sherpa-onnx-rk3588-streaming-zipformer-small-bilingual-zh-en-2023-02-16/encoder.rknn \ + --decoder=./sherpa-onnx-rk3588-streaming-zipformer-small-bilingual-zh-en-2023-02-16/decoder.rknn \ + --joiner=./sherpa-onnx-rk3588-streaming-zipformer-small-bilingual-zh-en-2023-02-16/joiner.rknn \ + --tokens=./sherpa-onnx-rk3588-streaming-zipformer-small-bilingual-zh-en-2023-02-16/tokens.txt \ + ./sherpa-onnx-rk3588-streaming-zipformer-small-bilingual-zh-en-2023-02-16/test_wavs/4.wav + +The output is given below:: + + OnlineRecognizerConfig(feat_config=FeatureExtractorConfig(sampling_rate=16000, feature_dim=80, low_freq=20, high_freq=-400, dither=0, normalize_samples=True, snip_edges=False), model_config=OnlineModelConfig(transducer=OnlineTransducerModelConfig(encoder="./sherpa-onnx-rk3588-streaming-zipformer-small-bilingual-zh-en-2023-02-16/encoder.rknn", decoder="./sherpa-onnx-rk3588-streaming-zipformer-small-bilingual-zh-en-2023-02-16/decoder.rknn", joiner="./sherpa-onnx-rk3588-streaming-zipformer-small-bilingual-zh-en-2023-02-16/joiner.rknn"), paraformer=OnlineParaformerModelConfig(encoder="", decoder=""), wenet_ctc=OnlineWenetCtcModelConfig(model="", chunk_size=16, num_left_chunks=4), zipformer2_ctc=OnlineZipformer2CtcModelConfig(model=""), nemo_ctc=OnlineNeMoCtcModelConfig(model=""), provider_config=ProviderConfig(device=0, provider="rknn", cuda_config=CudaConfig(cudnn_conv_algo_search=1), trt_config=TensorrtConfig(trt_max_workspace_size=2147483647, trt_max_partition_iterations=10, trt_min_subgraph_size=5, trt_fp16_enable="True", trt_detailed_build_log="False", trt_engine_cache_enable="True", trt_engine_cache_path=".", trt_timing_cache_enable="True", trt_timing_cache_path=".",trt_dump_subgraphs="False" )), tokens="./sherpa-onnx-rk3588-streaming-zipformer-small-bilingual-zh-en-2023-02-16/tokens.txt", num_threads=1, warm_up=0, debug=False, model_type="", modeling_unit="cjkchar", bpe_vocab=""), lm_config=OnlineLMConfig(model="", scale=0.5, shallow_fusion=True), endpoint_config=EndpointConfig(rule1=EndpointRule(must_contain_nonsilence=False, min_trailing_silence=2.4, min_utterance_length=0), rule2=EndpointRule(must_contain_nonsilence=True, min_trailing_silence=1.2, min_utterance_length=0), rule3=EndpointRule(must_contain_nonsilence=False, min_trailing_silence=0, min_utterance_length=20)), ctc_fst_decoder_config=OnlineCtcFstDecoderConfig(graph="", max_active=3000), enable_endpoint=True, max_active_paths=4, hotwords_score=1.5, hotwords_file="", decoding_method="greedy_search", blank_penalty=0, temperature_scale=2, rule_fsts="", rule_fars="") + ./sherpa-onnx-rk3588-streaming-zipformer-small-bilingual-zh-en-2023-02-16/test_wavs/4.wav + Number of threads: 1, Elapsed seconds: 3.5, Audio duration (s): 18, Real time factor (RTF) = 3.5/18 = 0.2 + 嗯 ON TIME比较准时 IN 
TIME是及时叫他总是准时教他的作业那用一般现在时是没有什么感情色彩的陈述一个事实下一句话为什么要用现在进行时它的意思并不是说说他现在正在教他的 + { "text": "嗯 ON TIME比较准时 IN TIME是及时叫他总是准时教他的作业那用一般现在时是没有什么感情色彩的陈述一个事实下一句话为什么要用现在进行时它的意思并不是说说他现在正在教他的", "tokens": ["嗯", " ON", " TIME", "比", "较", "准", "时", " IN", " TIME", "是", "及", "时", "叫", "他", "总", "是", "准", "时", "教", "他", "的", "作", "业", "那", "用", "一", "般", "现", "在", "时", "是", "没", "有", "什", "么", "感", "情", "色", "彩", "的", "陈", "述", "一", "个", "事", "实", "下", "一", "句", "话", "为", "什", "么", "要", "用", "现", "在", "进", "行", "时", "它", "的", "意", "思", "并", "不", "是", "说", "说", "他", "现", "在", "正", "在", "教", "他", "的"], "timestamps": [0.00, 0.64, 0.80, 1.12, 1.16, 1.36, 1.64, 2.00, 2.16, 2.52, 2.80, 2.92, 3.28, 3.64, 3.92, 4.16, 4.48, 4.60, 4.84, 5.12, 5.28, 5.52, 5.72, 6.20, 6.52, 6.80, 7.04, 7.28, 7.52, 7.72, 7.84, 8.08, 8.24, 8.40, 8.44, 8.68, 8.92, 9.00, 9.24, 9.48, 9.80, 9.92, 10.16, 10.32, 10.56, 10.80, 11.52, 11.60, 11.80, 11.96, 12.20, 12.32, 12.40, 12.56, 12.80, 13.12, 13.32, 13.56, 13.76, 13.92, 14.24, 14.36, 14.52, 14.68, 14.92, 15.04, 15.16, 15.32, 15.72, 16.12, 16.36, 16.48, 16.68, 16.88, 17.08, 17.24, 17.84], "ys_probs": [], "lm_probs": [], "context_scores": [], "segment": 0, "words": [], "start_time": 0.00, "is_final": false} + +.. hint:: + + If you get the following errors:: + + E RKNN: [01:24:27.170] 6, 1 + E RKNN: [01:24:27.170] Invalid RKNN model version 6 + E RKNN: [01:24:27.171] rknn_init, load model failed! + /home/runner/work/sherpa-onnx/sherpa-onnx/sherpa-onnx/csrc/rknn/online-zipformer-transducer-model-rknn.cc:InitEncoder:330 Return code is: -1 + /home/runner/work/sherpa-onnx/sherpa-onnx/sherpa-onnx/csrc/rknn/online-zipformer-transducer-model-rknn.cc:InitEncoder:330 Failed to init encoder './sherpa-onnx-rk3588-streaming-zipformer-small-bilingual-zh-en-2023-02-16/encoder.rknn' + + Please update your ``/lib/librknnrt.so`` or ``/usr/lib/librknnrt.so`` with the + one from ``_. + + Note that you can locate where your ``librknnrt.so`` is by:: + + ldd $(which sherpa-onnx) + +.. note:: + + You can use:: + + watch -n 0.5 cat /sys/kernel/debug/rknpu/load + + to watch the usage of NPU. + + For the RK3588 board, you can use: + + - ``--num-threads=1`` to select ``RKNN_NPU_CORE_AUTO`` + - ``--num-threads=0`` to select ``RKNN_NPU_CORE_0`` + - ``--num-threads=-1`` to select ``RKNN_NPU_CORE_1`` + - ``--num-threads=-2`` to select ``RKNN_NPU_CORE_2`` + - ``--num-threads=-3`` to select ``RKNN_NPU_CORE_0_1`` + - ``--num-threads=-4`` to select ``RKNN_NPU_CORE_0_1_2`` + +Real-time speech recognition from a microphone +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +First, we need to get the name of the microphone on the board:: + + arecord -l + **** List of CAPTURE Hardware Devices **** + card 2: rockchipes8388 [rockchip,es8388], device 0: dailink-multicodecs ES8323 HiFi-0 [dailink-multicodecs ES8323 HiFi-0] + Subdevices: 1/1 + Subdevice #0: subdevice #0 + card 3: UACDemoV10 [UACDemoV1.0], device 0: USB Audio [USB Audio] + Subdevices: 1/1 + Subdevice #0: subdevice #0 + +We will use ``card 3`` ``device 0``, so the name is ``plughw:3,0``. + +.. 
code-block:: + + sherpa-onnx-alsa \ + --provider=rknn \ + --encoder=./sherpa-onnx-rk3588-streaming-zipformer-small-bilingual-zh-en-2023-02-16/encoder.rknn \ + --decoder=./sherpa-onnx-rk3588-streaming-zipformer-small-bilingual-zh-en-2023-02-16/decoder.rknn \ + --joiner=./sherpa-onnx-rk3588-streaming-zipformer-small-bilingual-zh-en-2023-02-16/joiner.rknn \ + --tokens=./sherpa-onnx-rk3588-streaming-zipformer-small-bilingual-zh-en-2023-02-16/tokens.txt \ + plughw:3,0 + +You should see the following output:: + + /home/runner/work/sherpa-onnx/sherpa-onnx/sherpa-onnx/csrc/parse-options.cc:Read:375 sherpa-onnx-alsa --provider=rknn --encoder=./sherpa-onnx-rk3588-streaming-zipformer-small-bilingual-zh-en-2023-02-16/encoder.rknn --decoder=./sherpa-onnx-rk3588-streaming-zipformer-small-bilingual-zh-en-2023-02-16/decoder.rknn --joiner=./sherpa-onnx-rk3588-streaming-zipformer-small-bilingual-zh-en-2023-02-16/joiner.rknn --tokens=./sherpa-onnx-rk3588-streaming-zipformer-small-bilingual-zh-en-2023-02-16/tokens.txt plughw:3,0 + + OnlineRecognizerConfig(feat_config=FeatureExtractorConfig(sampling_rate=16000, feature_dim=80, low_freq=20, high_freq=-400, dither=0, normalize_samples=True, snip_edges=False), model_config=OnlineModelConfig(transducer=OnlineTransducerModelConfig(encoder="./sherpa-onnx-rk3588-streaming-zipformer-small-bilingual-zh-en-2023-02-16/encoder.rknn", decoder="./sherpa-onnx-rk3588-streaming-zipformer-small-bilingual-zh-en-2023-02-16/decoder.rknn", joiner="./sherpa-onnx-rk3588-streaming-zipformer-small-bilingual-zh-en-2023-02-16/joiner.rknn"), paraformer=OnlineParaformerModelConfig(encoder="", decoder=""), wenet_ctc=OnlineWenetCtcModelConfig(model="", chunk_size=16, num_left_chunks=4), zipformer2_ctc=OnlineZipformer2CtcModelConfig(model=""), nemo_ctc=OnlineNeMoCtcModelConfig(model=""), provider_config=ProviderConfig(device=0, provider="rknn", cuda_config=CudaConfig(cudnn_conv_algo_search=1), trt_config=TensorrtConfig(trt_max_workspace_size=2147483647, trt_max_partition_iterations=10, trt_min_subgraph_size=5, trt_fp16_enable="True", trt_detailed_build_log="False", trt_engine_cache_enable="True", trt_engine_cache_path=".", trt_timing_cache_enable="True", trt_timing_cache_path=".",trt_dump_subgraphs="False" )), tokens="./sherpa-onnx-rk3588-streaming-zipformer-small-bilingual-zh-en-2023-02-16/tokens.txt", num_threads=1, warm_up=0, debug=False, model_type="", modeling_unit="cjkchar", bpe_vocab=""), lm_config=OnlineLMConfig(model="", scale=0.5, shallow_fusion=True), endpoint_config=EndpointConfig(rule1=EndpointRule(must_contain_nonsilence=False, min_trailing_silence=2.4, min_utterance_length=0), rule2=EndpointRule(must_contain_nonsilence=True, min_trailing_silence=1.2, min_utterance_length=0), rule3=EndpointRule(must_contain_nonsilence=False, min_trailing_silence=0, min_utterance_length=20)), ctc_fst_decoder_config=OnlineCtcFstDecoderConfig(graph="", max_active=3000), enable_endpoint=True, max_active_paths=4, hotwords_score=1.5, hotwords_file="", decoding_method="greedy_search", blank_penalty=0, temperature_scale=2, rule_fsts="", rule_fars="") + Current sample rate: 16000 + Recording started! + Use recording device: plughw:3,0 + Started! Please speak + 0:这是一个实时的语音识别 + 1:今天是二零二五年三月二十二号 + +sherpa-onnx-rk3588-streaming-zipformer-bilingual-zh-en-2023-02-20 +----------------------------------------------------------------- + +This model is converted from :ref:`sherpa_onnx_streaming_zipformer_small_bilingual_zh_en_2023_02_16`. + +Please use the following commands to download it. + +.. 
code-block:: bash + + wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-rk3588-streaming-zipformer-bilingual-zh-en-2023-02-20.tar.bz2 + tar xvf sherpa-onnx-rk3588-streaming-zipformer-bilingual-zh-en-2023-02-20.tar.bz2 + rm sherpa-onnx-rk3588-streaming-zipformer-bilingual-zh-en-2023-02-20.tar.bz2 + +After downloading, you can check the file size:: + + ls -lh sherpa-onnx-rk3588-streaming-zipformer-bilingual-zh-en-2023-02-20/ + total 146M + -rw-r--r-- 1 orangepi orangepi 7.7M Mar 19 2025 decoder.rknn + -rw-r--r-- 1 orangepi orangepi 132M Mar 19 2025 encoder.rknn + -rw-r--r-- 1 orangepi orangepi 6.2M Mar 19 2025 joiner.rknn + drwxr-xr-x 2 orangepi orangepi 4.0K Mar 19 2025 test_wavs + -rw-r--r-- 1 orangepi orangepi 55K Mar 19 2025 tokens.txt + +Decode files +~~~~~~~~~~~~ + +You can use the following command to decode files with the downloaded model files:: + + sherpa-onnx \ + --provider=rknn \ + --encoder=./sherpa-onnx-rk3588-streaming-zipformer-bilingual-zh-en-2023-02-20/encoder.rknn \ + --decoder=./sherpa-onnx-rk3588-streaming-zipformer-bilingual-zh-en-2023-02-20/decoder.rknn \ + --joiner=./sherpa-onnx-rk3588-streaming-zipformer-bilingual-zh-en-2023-02-20/joiner.rknn \ + --tokens=./sherpa-onnx-rk3588-streaming-zipformer-bilingual-zh-en-2023-02-20/tokens.txt \ + ./sherpa-onnx-rk3588-streaming-zipformer-bilingual-zh-en-2023-02-20/test_wavs/4.wav + +The output is given below:: + + OnlineRecognizerConfig(feat_config=FeatureExtractorConfig(sampling_rate=16000, feature_dim=80, low_freq=20, high_freq=-400, dither=0, normalize_samples=True, snip_edges=False), model_config=OnlineModelConfig(transducer=OnlineTransducerModelConfig(encoder="./sherpa-onnx-rk3588-streaming-zipformer-bilingual-zh-en-2023-02-20/encoder.rknn", decoder="./sherpa-onnx-rk3588-streaming-zipformer-bilingual-zh-en-2023-02-20/decoder.rknn", joiner="./sherpa-onnx-rk3588-streaming-zipformer-bilingual-zh-en-2023-02-20/joiner.rknn"), paraformer=OnlineParaformerModelConfig(encoder="", decoder=""), wenet_ctc=OnlineWenetCtcModelConfig(model="", chunk_size=16, num_left_chunks=4), zipformer2_ctc=OnlineZipformer2CtcModelConfig(model=""), nemo_ctc=OnlineNeMoCtcModelConfig(model=""), provider_config=ProviderConfig(device=0, provider="rknn", cuda_config=CudaConfig(cudnn_conv_algo_search=1), trt_config=TensorrtConfig(trt_max_workspace_size=2147483647, trt_max_partition_iterations=10, trt_min_subgraph_size=5, trt_fp16_enable="True", trt_detailed_build_log="False", trt_engine_cache_enable="True", trt_engine_cache_path=".", trt_timing_cache_enable="True", trt_timing_cache_path=".",trt_dump_subgraphs="False" )), tokens="./sherpa-onnx-rk3588-streaming-zipformer-bilingual-zh-en-2023-02-20/tokens.txt", num_threads=1, warm_up=0, debug=False, model_type="", modeling_unit="cjkchar", bpe_vocab=""), lm_config=OnlineLMConfig(model="", scale=0.5, shallow_fusion=True), endpoint_config=EndpointConfig(rule1=EndpointRule(must_contain_nonsilence=False, min_trailing_silence=2.4, min_utterance_length=0), rule2=EndpointRule(must_contain_nonsilence=True, min_trailing_silence=1.2, min_utterance_length=0), rule3=EndpointRule(must_contain_nonsilence=False, min_trailing_silence=0, min_utterance_length=20)), ctc_fst_decoder_config=OnlineCtcFstDecoderConfig(graph="", max_active=3000), enable_endpoint=True, max_active_paths=4, hotwords_score=1.5, hotwords_file="", decoding_method="greedy_search", blank_penalty=0, temperature_scale=2, rule_fsts="", rule_fars="") + 
./sherpa-onnx-rk3588-streaming-zipformer-bilingual-zh-en-2023-02-20/test_wavs/2.wav + Number of threads: 1, Elapsed seconds: 1.8, Audio duration (s): 4.7, Real time factor (RTF) = 1.8/4.7 = 0.38 + 这个是频繁的啊不认识记下来 FREQUENTLY频繁的 + { "text": "这个是频繁的啊不认识记下来 FREQUENTLY频繁的", "tokens": ["这", "个", "是", "频", "繁", "的", "啊", "不", "认", "识", "记", "下", "来", " F", "RE", "QU", "ENT", "LY", "频", "繁", "的"], "timestamps": [0.00, 0.36, 0.52, 0.80, 1.00, 1.16, 1.44, 1.64, 1.92, 2.00, 2.20, 2.36, 2.52, 2.64, 2.88, 2.96, 3.08, 3.32, 3.60, 3.80, 4.40], "ys_probs": [], "lm_probs": [], "context_scores": [], "segment": 0, "words": [], "start_time": 0.00, "is_final": false} + +.. hint:: + + If you get the following errors:: + + E RKNN: [01:24:27.170] 6, 1 + E RKNN: [01:24:27.170] Invalid RKNN model version 6 + E RKNN: [01:24:27.171] rknn_init, load model failed! + /home/runner/work/sherpa-onnx/sherpa-onnx/sherpa-onnx/csrc/rknn/online-zipformer-transducer-model-rknn.cc:InitEncoder:330 Return code is: -1 + /home/runner/work/sherpa-onnx/sherpa-onnx/sherpa-onnx/csrc/rknn/online-zipformer-transducer-model-rknn.cc:InitEncoder:330 Failed to init encoder './sherpa-onnx-rk3588-streaming-zipformer-bilingual-zh-en-2023-02-20/encoder.rknn' + + Please update your ``/lib/librknnrt.so`` or ``/usr/lib/librknnrt.so`` with the + one from ``_. + + Note that you can locate where your ``librknnrt.so`` is by:: + + ldd $(which sherpa-onnx) + +.. note:: + + You can use:: + + watch -n 0.5 cat /sys/kernel/debug/rknpu/load + + to watch the usage of NPU. + + For the RK3588 board, you can use: + + - ``--num-threads=1`` to select ``RKNN_NPU_CORE_AUTO`` + - ``--num-threads=0`` to select ``RKNN_NPU_CORE_0`` + - ``--num-threads=-1`` to select ``RKNN_NPU_CORE_1`` + - ``--num-threads=-2`` to select ``RKNN_NPU_CORE_2`` + - ``--num-threads=-3`` to select ``RKNN_NPU_CORE_0_1`` + - ``--num-threads=-4`` to select ``RKNN_NPU_CORE_0_1_2`` + +Real-time speech recognition from a microphone +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +First, we need to get the name of the microphone on the board:: + + arecord -l + **** List of CAPTURE Hardware Devices **** + card 2: rockchipes8388 [rockchip,es8388], device 0: dailink-multicodecs ES8323 HiFi-0 [dailink-multicodecs ES8323 HiFi-0] + Subdevices: 1/1 + Subdevice #0: subdevice #0 + card 3: UACDemoV10 [UACDemoV1.0], device 0: USB Audio [USB Audio] + Subdevices: 1/1 + Subdevice #0: subdevice #0 + +We will use ``card 3`` ``device 0``, so the name is ``plughw:3,0``. + +.. 
code-block:: + + sherpa-onnx-alsa \ + --provider=rknn \ + --encoder=./sherpa-onnx-rk3588-streaming-zipformer-bilingual-zh-en-2023-02-20/encoder.rknn \ + --decoder=./sherpa-onnx-rk3588-streaming-zipformer-bilingual-zh-en-2023-02-20/decoder.rknn \ + --joiner=./sherpa-onnx-rk3588-streaming-zipformer-bilingual-zh-en-2023-02-20/joiner.rknn \ + --tokens=./sherpa-onnx-rk3588-streaming-zipformer-bilingual-zh-en-2023-02-20/tokens.txt \ + plughw:3,0 + +You should see the following output:: + + OnlineRecognizerConfig(feat_config=FeatureExtractorConfig(sampling_rate=16000, feature_dim=80, low_freq=20, high_freq=-400, dither=0, normalize_samples=True, snip_edges=False), model_config=OnlineModelConfig(transducer=OnlineTransducerModelConfig(encoder="./sherpa-onnx-rk3588-streaming-zipformer-bilingual-zh-en-2023-02-20/encoder.rknn", decoder="./sherpa-onnx-rk3588-streaming-zipformer-bilingual-zh-en-2023-02-20/decoder.rknn", joiner="./sherpa-onnx-rk3588-streaming-zipformer-bilingual-zh-en-2023-02-20/joiner.rknn"), paraformer=OnlineParaformerModelConfig(encoder="", decoder=""), wenet_ctc=OnlineWenetCtcModelConfig(model="", chunk_size=16, num_left_chunks=4), zipformer2_ctc=OnlineZipformer2CtcModelConfig(model=""), nemo_ctc=OnlineNeMoCtcModelConfig(model=""), provider_config=ProviderConfig(device=0, provider="rknn", cuda_config=CudaConfig(cudnn_conv_algo_search=1), trt_config=TensorrtConfig(trt_max_workspace_size=2147483647, trt_max_partition_iterations=10, trt_min_subgraph_size=5, trt_fp16_enable="True", trt_detailed_build_log="False", trt_engine_cache_enable="True", trt_engine_cache_path=".", trt_timing_cache_enable="True", trt_timing_cache_path=".",trt_dump_subgraphs="False" )), tokens="./sherpa-onnx-rk3588-streaming-zipformer-bilingual-zh-en-2023-02-20/tokens.txt", num_threads=1, warm_up=0, debug=False, model_type="", modeling_unit="cjkchar", bpe_vocab=""), lm_config=OnlineLMConfig(model="", scale=0.5, shallow_fusion=True), endpoint_config=EndpointConfig(rule1=EndpointRule(must_contain_nonsilence=False, min_trailing_silence=2.4, min_utterance_length=0), rule2=EndpointRule(must_contain_nonsilence=True, min_trailing_silence=1.2, min_utterance_length=0), rule3=EndpointRule(must_contain_nonsilence=False, min_trailing_silence=0, min_utterance_length=20)), ctc_fst_decoder_config=OnlineCtcFstDecoderConfig(graph="", max_active=3000), enable_endpoint=True, max_active_paths=4, hotwords_score=1.5, hotwords_file="", decoding_method="greedy_search", blank_penalty=0, temperature_scale=2, rule_fsts="", rule_fars="") + Current sample rate: 16000 + Recording started! + Use recording device: plughw:3,0 + Started! Please speak + 0:现在开始测试 + 1:现在是星期六 + 2:二零二五年三月二十二号 + 3:下午六点四十四分 diff --git a/docs/source/onnx/sense-voice/c-api.rst b/docs/source/onnx/sense-voice/c-api.rst new file mode 100644 index 000000000..3217e18d1 --- /dev/null +++ b/docs/source/onnx/sense-voice/c-api.rst @@ -0,0 +1,228 @@ +C API for SenseVoice +==================== + +This page describes how to use the C API for `SenseVoice`_. + +Please refer to :ref:`sherpa-onnx-c-api` for how to build `sherpa-onnx`_. + +The following is a very quick introduction for using the C API of `sherpa-onnx`_ +in the form of shared libraries on macOS and Linux. + +.. hint:: + + We do support static libraries and also support Windows. + +If you copy, paste, and run the following commands in your terminal, you should be able +to see the following recognition result: + +.. code-block:: + + Decoded text: The tribal chieftain called for the boy and presented him with 50 pieces of gold. 
+ + +.. code-block:: bash + + cd /tmp + + git clone https://github.com/k2-fsa/sherpa-onnx + cd sherpa-onnx + + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2 + tar xvf sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2 + rm sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2 + + + ls -lh sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17 + + echo "---" + + ls -lh sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/test_wavs + + mkdir build + cd build + cmake \ + -D CMAKE_BUILD_TYPE=Release \ + -D BUILD_SHARED_LIBS=ON \ + -D CMAKE_INSTALL_PREFIX=./install \ + -D SHERPA_ONNX_ENABLE_BINARY=OFF \ + .. + + make -j2 install + + ls -lh install/lib + ls -lh install/include + + cd .. + + gcc -o sense-voice-c-api ./c-api-examples/sense-voice-c-api.c \ + -I ./build/install/include \ + -L ./build/install/lib/ \ + -l sherpa-onnx-c-api \ + -l onnxruntime + + ls -lh sense-voice-c-api + + export LD_LIBRARY_PATH=$PWD/build/install/lib:$LD_LIBRARY_PATH + export DYLD_LIBRARY_PATH=$PWD/build/install/lib:$DYLD_LIBRARY_PATH + + ./sense-voice-c-api + +Note that we have hard-coded the file paths inside `sense-voice-c-api.c `_ + +.. hint:: + + Since we are using shared libraries in the above example, you have to set + the environemnt variable ``LD_LIBRARY_PATH`` for Linux and ``DYLD_LIBRARY_PATH`` + for macOS. Otherwise, you would get runtime errors when running ``./sense-voice-c-api``. + +Explanations +------------ + +1. Download `sherpa-onnx`_ +^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. code-block:: bash + + cd /tmp + + git clone https://github.com/k2-fsa/sherpa-onnx + cd sherpa-onnx + +In this example, we download `sherpa-onnx`_ and place it inside the directory +``/tmp/``. You can replace ``/tmp/`` with any directory you like. + +Please always download the latest master of `sherpa-onnx`_. + +2. Download the model +^^^^^^^^^^^^^^^^^^^^^ + +.. code-block:: bash + + + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2 + tar xvf sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2 + rm sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2 + +Note that we have placed the model in the directory ``/tmp/sherpa-onnx``. + +3. Build sherpa-onnx +^^^^^^^^^^^^^^^^^^^^ + +.. code-block:: bash + + mkdir build + cd build + cmake \ + -D CMAKE_BUILD_TYPE=Release \ + -D BUILD_SHARED_LIBS=ON \ + -D CMAKE_INSTALL_PREFIX=./install \ + -D SHERPA_ONNX_ENABLE_BINARY=OFF \ + .. + + make -j2 install + +We build a Release version of `sherpa-onnx`_. Also, we use shared libraries here. +The header file ``c-api.h`` and shared libraries are installed into the directory +``./build/install``. + +If you are using Linux, you should see the following content:: + + Install the project... + -- Install configuration: "Release" + -- Installing: /tmp/sherpa-onnx/build/install/lib/libonnxruntime.so + -- Installing: /tmp/sherpa-onnx/build/install/./sherpa-onnx.pc + -- Installing: /tmp/sherpa-onnx/build/install/lib/libsherpa-onnx-c-api.so + -- Set non-toolchain portion of runtime path of "/tmp/sherpa-onnx/build/install/lib/libsherpa-onnx-c-api.so" to "$ORIGIN" + -- Installing: /tmp/sherpa-onnx/build/install/include/sherpa-onnx/c-api/c-api.h + +If you are using macOS, you should see:: + + Install the project... 
+ -- Install configuration: "Release" + -- Installing: /tmp/sherpa-onnx/build/install/lib/libonnxruntime.1.17.1.dylib + -- Installing: /tmp/sherpa-onnx/build/install/lib/libonnxruntime.dylib + -- Installing: /tmp/sherpa-onnx/build/install/./sherpa-onnx.pc + -- Installing: /tmp/sherpa-onnx/build/install/lib/libsherpa-onnx-c-api.dylib + -- Installing: /tmp/sherpa-onnx/build/install/include/sherpa-onnx/c-api/c-api.h + +4. View the build result +^^^^^^^^^^^^^^^^^^^^^^^^ + +.. code-block:: bash + + ls -lh install/lib + ls -lh install/include + +If you are using Linux, you should see the following content:: + + total 19M + -rw-r--r-- 1 runner docker 15M Jul 22 08:47 libonnxruntime.so + -rw-r--r-- 1 runner docker 4.1M Jul 22 08:47 libsherpa-onnx-c-api.so + drwxr-xr-x 2 runner docker 4.0K Jul 22 08:47 pkgconfig + total 4.0K + drwxr-xr-x 3 runner docker 4.0K Jul 22 08:47 sherpa-onnx + +If you are using macOS, you should see the following content:: + + total 53976 + -rw-r--r-- 1 runner staff 23M Jul 22 08:48 libonnxruntime.1.17.1.dylib + lrwxr-xr-x 1 runner staff 27B Jul 22 08:48 libonnxruntime.dylib -> libonnxruntime.1.17.1.dylib + -rwxr-xr-x 1 runner staff 3.5M Jul 22 08:48 libsherpa-onnx-c-api.dylib + drwxr-xr-x 3 runner staff 96B Jul 22 08:48 pkgconfig + total 0 + drwxr-xr-x 3 runner staff 96B Jul 22 08:48 sherpa-onnx + + +5. Build the C API example +^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. code-block:: bash + + cd .. + + gcc -o sense-voice-c-api ./c-api-examples/sense-voice-c-api.c \ + -I ./build/install/include \ + -L ./build/install/lib/ \ + -l sherpa-onnx-c-api + + ls -lh sense-voice-c-api + +Note that: + + - ``-I ./build/install/include`` is to add the directory ``./build/install/include`` + to the header search path so that ``#include "sherpa-onnx/c-api/c-api.h`` won't throw an error. + - ``-L ./build/install/lib/`` is to add the directory ``./build/install/lib`` + to the library search path so that it can find ``-l sherpa-onnx-c-api`` + - ``-l sherpa-onnx-c-api`` is to link the library ``libsherpa-onnx-c-api.so`` for Linux + and ``libsherpa-onnx-c-api.dylib`` for macOS. + +6. Run it +^^^^^^^^^ + +.. code-block:: bash + + export LD_LIBRARY_PATH=$PWD/build/install/lib:$LD_LIBRARY_PATH + export DYLD_LIBRARY_PATH=$PWD/build/install/lib:$DYLD_LIBRARY_PATH + + ./sense-voice-c-api + +Note that we have to use:: + + # For Linux + export LD_LIBRARY_PATH=$PWD/build/install/lib:$LD_LIBRARY_PATH + +and:: + + # for macOS + export DYLD_LIBRARY_PATH=$PWD/build/install/lib:$DYLD_LIBRARY_PATH + +Otherwise, it cannot find ``libsherpa-onnx-c-api.so`` for Linux +and ``libsherpa-onnx-c-api.dylib`` at ``runtime``. + +7. 
Where to find sense-voice-c-api.c +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +You can find ``sense-voice-c-api.c`` at the following address: + + ``_ + diff --git a/docs/source/onnx/sense-voice/code/2024-07-17-itn.txt b/docs/source/onnx/sense-voice/code/2024-07-17-itn.txt new file mode 100644 index 000000000..eca10eaae --- /dev/null +++ b/docs/source/onnx/sense-voice/code/2024-07-17-itn.txt @@ -0,0 +1,17 @@ +/Users/fangjun/open-source/sherpa-onnx/sherpa-onnx/csrc/parse-options.cc:Read:375 ./build/bin/sherpa-onnx-offline --tokens=./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/tokens.txt --sense-voice-model=./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/model.onnx --num-threads=1 --sense-voice-use-itn=1 --debug=0 ./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/test_wavs/zh.wav ./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/test_wavs/en.wav + +OfflineRecognizerConfig(feat_config=FeatureExtractorConfig(sampling_rate=16000, feature_dim=80, low_freq=20, high_freq=-400, dither=0), model_config=OfflineModelConfig(transducer=OfflineTransducerModelConfig(encoder_filename="", decoder_filename="", joiner_filename=""), paraformer=OfflineParaformerModelConfig(model=""), nemo_ctc=OfflineNemoEncDecCtcModelConfig(model=""), whisper=OfflineWhisperModelConfig(encoder="", decoder="", language="", task="transcribe", tail_paddings=-1), tdnn=OfflineTdnnModelConfig(model=""), zipformer_ctc=OfflineZipformerCtcModelConfig(model=""), wenet_ctc=OfflineWenetCtcModelConfig(model=""), sense_voice=OfflineSenseVoiceModelConfig(model="./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/model.onnx", language="auto", use_itn=True), telespeech_ctc="", tokens="./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/tokens.txt", num_threads=1, debug=False, provider="cpu", model_type="", modeling_unit="cjkchar", bpe_vocab=""), lm_config=OfflineLMConfig(model="", scale=0.5), ctc_fst_decoder_config=OfflineCtcFstDecoderConfig(graph="", max_active=3000), decoding_method="greedy_search", max_active_paths=4, hotwords_file="", hotwords_score=1.5, blank_penalty=0, rule_fsts="", rule_fars="") +Creating recognizer ... +Started +Done! 
+ +./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/test_wavs/zh.wav +{"text": "开放时间早上9点至下午5点。", "timestamps": [0.72, 0.96, 1.26, 1.44, 1.92, 2.10, 2.58, 2.82, 3.30, 3.90, 4.20, 4.56, 4.74, 5.46], "tokens":["开", "放", "时", "间", "早", "上", "9", "点", "至", "下", "午", "5", "点", "。"], "words": []} +---- +./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/test_wavs/en.wav +{"text": "The tribal chieftain called for the boy and presented him with 50 pieces of gold.", "timestamps": [0.90, 1.26, 1.56, 1.80, 2.16, 2.46, 2.76, 2.94, 3.12, 3.60, 3.96, 4.50, 4.74, 4.92, 5.10, 5.28, 5.52, 5.88, 6.18, 7.02], "tokens":["The", " tri", "bal", " chief", "tain", " called", " for", " the", " boy", " and", " presented", " him", " with", " ", "5", "0", " pieces", " of", " gold", "."], "words": []} +---- +num threads: 1 +decoding method: greedy_search +Elapsed seconds: 1.543 s +Real time factor (RTF): 1.543 / 12.744 = 0.121 diff --git a/docs/source/onnx/sense-voice/code/2024-07-17-lang.txt b/docs/source/onnx/sense-voice/code/2024-07-17-lang.txt new file mode 100644 index 000000000..5a758a7f3 --- /dev/null +++ b/docs/source/onnx/sense-voice/code/2024-07-17-lang.txt @@ -0,0 +1,14 @@ +/Users/fangjun/open-source/sherpa-onnx/sherpa-onnx/csrc/parse-options.cc:Read:375 ./build/bin/sherpa-onnx-offline --tokens=./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/tokens.txt --sense-voice-model=./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/model.onnx --num-threads=1 --sense-voice-language=zh --debug=0 ./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/test_wavs/zh.wav + +OfflineRecognizerConfig(feat_config=FeatureExtractorConfig(sampling_rate=16000, feature_dim=80, low_freq=20, high_freq=-400, dither=0), model_config=OfflineModelConfig(transducer=OfflineTransducerModelConfig(encoder_filename="", decoder_filename="", joiner_filename=""), paraformer=OfflineParaformerModelConfig(model=""), nemo_ctc=OfflineNemoEncDecCtcModelConfig(model=""), whisper=OfflineWhisperModelConfig(encoder="", decoder="", language="", task="transcribe", tail_paddings=-1), tdnn=OfflineTdnnModelConfig(model=""), zipformer_ctc=OfflineZipformerCtcModelConfig(model=""), wenet_ctc=OfflineWenetCtcModelConfig(model=""), sense_voice=OfflineSenseVoiceModelConfig(model="./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/model.onnx", language="zh", use_itn=False), telespeech_ctc="", tokens="./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/tokens.txt", num_threads=1, debug=False, provider="cpu", model_type="", modeling_unit="cjkchar", bpe_vocab=""), lm_config=OfflineLMConfig(model="", scale=0.5), ctc_fst_decoder_config=OfflineCtcFstDecoderConfig(graph="", max_active=3000), decoding_method="greedy_search", max_active_paths=4, hotwords_file="", hotwords_score=1.5, blank_penalty=0, rule_fsts="", rule_fars="") +Creating recognizer ... +Started +Done! 
+ +./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/test_wavs/zh.wav +{"text": "开饭时间早上九点至下午五点", "timestamps": [0.72, 0.96, 1.26, 1.44, 1.92, 2.10, 2.58, 2.82, 3.30, 3.90, 4.20, 4.56, 4.74], "tokens":["开", "饭", "时", "间", "早", "上", "九", "点", "至", "下", "午", "五", "点"], "words": []} +---- +num threads: 1 +decoding method: greedy_search +Elapsed seconds: 0.625 s +Real time factor (RTF): 0.625 / 5.592 = 0.112 diff --git a/docs/source/onnx/sense-voice/code/2024-07-17.txt b/docs/source/onnx/sense-voice/code/2024-07-17.txt new file mode 100644 index 000000000..fe1121068 --- /dev/null +++ b/docs/source/onnx/sense-voice/code/2024-07-17.txt @@ -0,0 +1,17 @@ +/Users/fangjun/open-source/sherpa-onnx/sherpa-onnx/csrc/parse-options.cc:Read:375 ./build/bin/sherpa-onnx-offline --tokens=./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/tokens.txt --sense-voice-model=./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/model.onnx --num-threads=1 --debug=0 ./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/test_wavs/zh.wav ./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/test_wavs/en.wav + +OfflineRecognizerConfig(feat_config=FeatureExtractorConfig(sampling_rate=16000, feature_dim=80, low_freq=20, high_freq=-400, dither=0), model_config=OfflineModelConfig(transducer=OfflineTransducerModelConfig(encoder_filename="", decoder_filename="", joiner_filename=""), paraformer=OfflineParaformerModelConfig(model=""), nemo_ctc=OfflineNemoEncDecCtcModelConfig(model=""), whisper=OfflineWhisperModelConfig(encoder="", decoder="", language="", task="transcribe", tail_paddings=-1), tdnn=OfflineTdnnModelConfig(model=""), zipformer_ctc=OfflineZipformerCtcModelConfig(model=""), wenet_ctc=OfflineWenetCtcModelConfig(model=""), sense_voice=OfflineSenseVoiceModelConfig(model="./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/model.onnx", language="auto", use_itn=False), telespeech_ctc="", tokens="./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/tokens.txt", num_threads=1, debug=False, provider="cpu", model_type="", modeling_unit="cjkchar", bpe_vocab=""), lm_config=OfflineLMConfig(model="", scale=0.5), ctc_fst_decoder_config=OfflineCtcFstDecoderConfig(graph="", max_active=3000), decoding_method="greedy_search", max_active_paths=4, hotwords_file="", hotwords_score=1.5, blank_penalty=0, rule_fsts="", rule_fars="") +Creating recognizer ... +Started +Done! 
+ +./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/test_wavs/zh.wav +{"text": "开饭时间早上九点至下午五点", "timestamps": [0.72, 0.96, 1.26, 1.44, 1.92, 2.10, 2.58, 2.82, 3.30, 3.90, 4.20, 4.56, 4.74], "tokens":["开", "饭", "时", "间", "早", "上", "九", "点", "至", "下", "午", "五", "点"], "words": []} +---- +./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/test_wavs/en.wav +{"text": "the tribal chieftain called for the boy and presented him with fifty pieces of gold", "timestamps": [0.90, 1.26, 1.56, 1.80, 2.16, 2.46, 2.76, 2.94, 3.12, 3.60, 3.96, 4.50, 4.74, 5.10, 5.52, 5.88, 6.18], "tokens":["the", " tri", "bal", " chief", "tain", " called", " for", " the", " boy", " and", " presented", " him", " with", " fifty", " pieces", " of", " gold"], "words": []} +---- +num threads: 1 +decoding method: greedy_search +Elapsed seconds: 2.320 s +Real time factor (RTF): 2.320 / 12.744 = 0.182 diff --git a/docs/source/onnx/sense-voice/code/Obama.srt b/docs/source/onnx/sense-voice/code/Obama.srt new file mode 100644 index 000000000..208b9f1d3 --- /dev/null +++ b/docs/source/onnx/sense-voice/code/Obama.srt @@ -0,0 +1,184 @@ +1 +0:00:09,286 --> 0:00:12,486 +Everybody, all right, everybody, go ahead and have a seat. + +2 +0:00:13,094 --> 0:00:15,014 +How's everybody doing today? + +3 +0:00:18,694 --> 0:00:20,742 +How about Tim Sper. + +4 +0:00:25,894 --> 0:00:31,942 +I am here with students at Wakefield High School in Arlington, Virginia. + +5 +0:00:32,710 --> 0:00:48,326 +And we've got students tuning in from all across America, from kindergarten through 12th grade. And I am just so glad that all could join us today. And I want to thank Wakefield for being such an outstanding host. Give yourselves a big line. + +6 +0:00:54,406 --> 0:00:59,238 +Now I know that for many of you, today is the first day of school. + +7 +0:00:59,590 --> 0:01:09,798 +And for those of you in kindergarten or starting middle or high school, it's your first day in a new school, so it's understandable if you're a little nervous. + +8 +0:01:10,630 --> 0:01:16,006 +I imagine there's some seniors out there who are feeling pretty good right now with just one more year to go. + +9 +0:01:18,790 --> 0:01:27,142 +And no matter what grade you're in, some of you are probably wishing it we're still summer, and you could have stayed in bed just a little bit longer this morning. + +10 +0:01:27,942 --> 0:01:29,414 +I know that field. + +11 +0:01:31,654 --> 0:01:51,708 +When I was young, my family lived overseas, I lived in Indonesia for a few years and my mother, she didn't have the money to send me where all the American kids went to school, but she thought it was important for me to keep up with American education, so she decided to teach me extra lessons herself. + +12 +0:01:52,230 --> 0:01:58,790 +Monday through Friday, but because she had to go to work, the only time she could do it was at 430 in the morning. + +13 +0:02:00,038 --> 0:02:03,750 +Now as you might imagine, I wasn't too happy about getting up that early. + +14 +0:02:04,102 --> 0:02:07,302 +A lot of times I'd fall asleep right there at the kitchen table. + +15 +0:02:08,262 --> 0:02:15,014 +But whenever I'd complained, my mother would just give me one of those looks and she'd say, this is no picnic for me either, Buster. + +16 +0:02:17,094 --> 0:02:25,382 +So I know that some of you are still adjusting to being back at school, but I'm here today because I have something important to discuss with you. 
+ +17 +0:02:25,798 --> 0:02:33,798 +I'm here because I want to talk with you about your education and what's expected of all of you in this new school year. + +18 +0:02:34,470 --> 0:02:40,422 +I've given a lot of speeches about education, and I've talked about responsibility a lot. + +19 +0:02:40,806 --> 0:02:47,174 +I've talked about teachers responsibility for inspiring students and pushing you to learn. + +20 +0:02:47,430 --> 0:02:58,726 +I talked about your parents' responsibility for making sure you stay on track and you get your homework done and don't spend every waking hour in front of the TV or with the Xbox. + +21 +0:02:59,078 --> 0:03:00,774 +I've talked a lot about. + +22 +0:03:01,350 --> 0:03:13,286 +Your government's responsibility for setting high standards and supporting teachers and principals and turning around schools that aren't working where students aren't getting the opportunities that they deserve. + +23 +0:03:13,990 --> 0:03:15,366 +But at the end of the day. + +24 +0:03:16,006 --> 0:03:26,054 +We can have the most dedicated teachers, the most supportive parents, the best schools in the world, and none of it will make a difference, none of it will matter. + +25 +0:03:26,694 --> 0:03:30,694 +Unless all of you fulfill your responsibilities. + +26 +0:03:31,238 --> 0:03:43,814 +Unless you show up to those schools, unless you pay attention to those teachers, unless you listen to your parents and grandparents and other adults and put in the hard work it takes to succeed. + +27 +0:03:44,646 --> 0:03:46,598 +That's what I want to focus on today. + +28 +0:03:47,110 --> 0:03:50,918 +The responsibility each of you has for your education. + +29 +0:03:51,718 --> 0:03:54,854 +I want to start with the responsibility you have to yourself. + +30 +0:03:55,654 --> 0:03:59,078 +Every single one of you has something that you're good at. + +31 +0:03:59,782 --> 0:04:02,406 +Every single one of you has something to offer. + +32 +0:04:02,982 --> 0:04:07,590 +And you have a responsibility to yourself to discover what that is. + +33 +0:04:08,326 --> 0:04:11,494 +That's the opportunity an education can provide. + +34 +0:04:12,326 --> 0:04:22,598 +Maybe you could be a great writer, maybe even good enough to write a book or articles in a newspaper, but you might not know it until you write that English paper. + +35 +0:04:23,078 --> 0:04:25,894 +That English class paper that's assigned to you. + +36 +0:04:26,694 --> 0:04:38,726 +Maybe you could be an innovator or an inventor, maybe even good enough to come up with the next iPhone or the new medicine or a vaccine, but you might not know it until you do your project for your science class. + +37 +0:04:39,814 --> 0:04:44,838 +Maybe you could be a mayor or a senator or a Supreme Court justice. + +38 +0:04:45,350 --> 0:04:50,182 +But you might not know that until you join student government or the debate team. + +39 +0:04:51,558 --> 0:04:56,774 +And no matter what you want to do with your life, I guarantee that you'll need an education to do it. + +40 +0:04:57,318 --> 0:05:00,710 +You want to be a doctor or a teacher or a police officer? + +41 +0:05:00,998 --> 0:05:09,702 +You want to be a nurse or an architect, a lawyer or a member of our military, you're going to need a good education for every single one of those careers. + +42 +0:05:10,054 --> 0:05:14,278 +You cannot drop out of school and just drop into a good job. + +43 +0:05:15,174 --> 0:05:19,846 +You've got to train for it and work for it and learn for it. 
+ +44 +0:05:20,518 --> 0:05:23,654 +And this isn't just important for your own life and your own future. + +45 +0:05:24,678 --> 0:05:29,670 +What you make of your education will decide nothing less than the future of this country. + +46 +0:05:29,958 --> 0:05:32,998 +The future of America depends on you. + diff --git a/docs/source/onnx/sense-voice/code/lei-jun-test.srt b/docs/source/onnx/sense-voice/code/lei-jun-test.srt new file mode 100644 index 000000000..c48d0eae1 --- /dev/null +++ b/docs/source/onnx/sense-voice/code/lei-jun-test.srt @@ -0,0 +1,280 @@ +1 +0:00:28,934 --> 0:00:36,006 +朋友们晚上好,欢迎大家来参加今天晚上的活动,谢谢大家。 + +2 +0:00:42,118 --> 0:00:46,374 +这是我第四次颁年度演讲。 + +3 +0:00:46,918 --> 0:00:50,118 +前三次呢,因为疫情的原因。 + +4 +0:00:50,406 --> 0:00:55,750 +都在小米科技园内举办,现场的人很少。 + +5 +0:00:56,134 --> 0:00:57,574 +这是第四次。 + +6 +0:00:58,182 --> 0:01:06,854 +我们仔细想了想,我们还是想办一个比较大的聚会,然后呢让我们的新朋友老朋友一起聚一聚。 + +7 +0:01:07,718 --> 0:01:10,886 +今天的话呢我们就在北京的。 + +8 +0:01:11,654 --> 0:01:15,142 +国家会议中心呢举办了这么一个活动。 + +9 +0:01:15,430 --> 0:01:19,526 +现场呢来了很多人,大概有3500人。 + +10 +0:01:19,942 --> 0:01:22,278 +还有很多很多的朋友呢。 + +11 +0:01:22,694 --> 0:01:25,798 +通过观看直播的方式来参与。 + +12 +0:01:26,342 --> 0:01:30,886 +再一次呢对大家的参加表示感谢,谢谢大家。 + +13 +0:01:38,470 --> 0:01:39,910 +两个月前。 + +14 +0:01:40,358 --> 0:01:44,486 +我参加了今年武汉大学的毕业典礼。 + +15 +0:01:45,926 --> 0:01:47,334 +今年呢是。 + +16 +0:01:47,910 --> 0:01:50,694 +武汉大学建校130周年。 + +17 +0:01:51,750 --> 0:01:52,838 +作为校友。 + +18 +0:01:53,350 --> 0:01:54,886 +被母校邀请。 + +19 +0:01:55,206 --> 0:01:57,222 +在毕业典礼上致辞。 + +20 +0:01:58,054 --> 0:01:59,558 +这对我来说。 + +21 +0:01:59,814 --> 0:02:02,598 +是至高无上的荣誉。 + +22 +0:02:03,654 --> 0:02:05,670 +站在讲台的那一刻。 + +23 +0:02:06,246 --> 0:02:08,614 +面对全校师生。 + +24 +0:02:09,190 --> 0:02:11,462 +关于武大的所有的记忆。 + +25 +0:02:11,686 --> 0:02:14,182 +一下子涌现在脑海里。 + +26 +0:02:14,982 --> 0:02:17,670 +今天呢我就先和大家聊聊。 + +27 +0:02:18,278 --> 0:02:19,494 +大往事。 + +28 +0:02:21,830 --> 0:02:23,814 +那还是36年前。 + +29 +0:02:25,926 --> 0:02:27,654 +1987年。 + +30 +0:02:28,678 --> 0:02:31,622 +我呢考上了武汉大学的计算机系。 + +31 +0:02:32,678 --> 0:02:35,174 +在武汉大学的图书馆里。 + +32 +0:02:35,398 --> 0:02:36,710 +看了一本书。 + +33 +0:02:37,574 --> 0:02:38,630 +硅谷之火。 + +34 +0:02:39,334 --> 0:02:41,638 +建立了我一生的梦想。 + +35 +0:02:43,302 --> 0:02:44,454 +看完书以后。 + +36 +0:02:45,286 --> 0:02:46,438 +热血沸腾。 + +37 +0:02:47,590 --> 0:02:49,318 +激动的睡不着觉。 + +38 +0:02:50,406 --> 0:02:51,238 +我还记得。 + +39 +0:02:52,006 --> 0:02:52,966 +那天晚上。 + +40 +0:02:53,318 --> 0:02:54,662 +星光很亮。 + +41 +0:02:55,398 --> 0:02:57,670 +我就在五大的操场上。 + +42 +0:02:58,342 --> 0:02:59,782 +就是屏幕上这个超场。 + +43 +0:03:00,774 --> 0:03:02,470 +走了一圈又一圈。 + +44 +0:03:02,950 --> 0:03:05,222 +走了整整一个晚上。 + +45 +0:03:06,470 --> 0:03:07,750 +我心里有团火。 + +46 +0:03:08,934 --> 0:03:10,310 +我也想搬一个。 + +47 +0:03:10,598 --> 0:03:11,814 +伟大的公司。 + +48 +0:03:13,958 --> 0:03:14,822 +就是这样。 + +49 +0:03:17,606 --> 0:03:18,822 +梦想之火。 + +50 +0:03:19,270 --> 0:03:22,502 +在我心里彻底点燃了。 + +51 +0:03:29,766 --> 0:03:30,534 +但是。 + +52 +0:03:30,758 --> 0:03:32,550 +一个大一的新生。 + +53 +0:03:40,326 --> 0:03:42,726 +是一个大一的新生。 + +54 +0:03:43,814 --> 0:03:47,046 +一个从县城里出来的年轻人。 + +55 +0:03:48,134 --> 0:03:50,630 +什么也不会,什么也没有。 + +56 +0:03:51,526 --> 0:03:56,326 +就想创办一家伟大的公司,这不就是天方夜谭吗? + +57 +0:03:57,574 --> 0:04:00,102 +这么离谱的一个梦想。 + +58 +0:04:00,358 --> 0:04:02,278 +该如何实现呢? 
+ +59 +0:04:03,846 --> 0:04:04,934 +那天晚上。 + +60 +0:04:05,190 --> 0:04:06,918 +我想了一整晚上。 + +61 +0:04:07,974 --> 0:04:08,966 +说实话。 + +62 +0:04:10,342 --> 0:04:13,798 +越想越糊涂,完全理不清头绪。 + +63 +0:04:14,982 --> 0:04:16,102 +后来我在想。 + +64 +0:04:16,774 --> 0:04:18,022 +干脆别想了。 + +65 +0:04:18,342 --> 0:04:19,878 +把书练好。 + +66 +0:04:20,422 --> 0:04:21,382 +是正慑。 + +67 +0:04:22,150 --> 0:04:22,982 +所以呢。 + +68 +0:04:23,366 --> 0:04:25,670 +我就下定决心认认真真读书。 + +69 +0:04:26,662 --> 0:04:27,174 +那么。 + +70 +0:04:28,486 --> 0:04:31,398 +我怎么能够把书读的不同凡响呢? + diff --git a/docs/source/onnx/sense-voice/dart-api.rst b/docs/source/onnx/sense-voice/dart-api.rst new file mode 100644 index 000000000..77b5b97bd --- /dev/null +++ b/docs/source/onnx/sense-voice/dart-api.rst @@ -0,0 +1,105 @@ +Dart API for SenseVoice +======================= + +This page describes how to use the Dart API to run `SenseVoice`_ models +in `sherpa-onnx`_ + +Note that we have published the package ``sherpa_onnx`` at ``_. + +.. figure:: ./pic/pub-dev.png + :alt: screenshot of the sherpa-onnx package on pub.dev + :align: center + :width: 600 + + Screenshot of `sherpa-onnx`_ on ``pub.dev``. + +Note that the package supports the following platforms: + + - Android + - iOS + - Linux + - macOS + - Windows + +In the following, we show how to use the pure Dart API to decode files +with `SenseVoice`_ models. + +.. code-block:: bash + + cd /tmp + + git clone http://github.com/k2-fsa/sherpa-onnx + + cd sherpa-onnx + cd dart-api-examples + cd non-streaming-asr + dart pub get + ./run-sense-voice.sh + +You should see the following recognition result: + + 开饭时间早上9点至下午5点。 + +Explanations +------------ + +1. Download the code +^^^^^^^^^^^^^^^^^^^^ + +.. code-block:: bash + + cd /tmp + + git clone http://github.com/k2-fsa/sherpa-onnx + +In this example, we download `sherpa-onnx`_ and place it inside the directory +``/tmp/``. You can replace ``/tmp/`` with any directory you like. + +2. Download the sherpa-onnx package +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. code-block:: bash + + cd sherpa-onnx + cd dart-api-examples + cd non-streaming-asr + dart pub get + +The command ``dart pub get`` will download the ``sherpa_onnx`` package automagically +from ``pub.dev``. + +You should see something like below after running ``dart pub get``:: + + (py38) fangjuns-MacBook-Pro:non-streaming-asr fangjun$ dart pub get + Resolving dependencies... (1.2s) + Downloading packages... (33.3s) + collection 1.18.0 (1.19.0 available) + lints 3.0.0 (4.0.0 available) + material_color_utilities 0.8.0 (0.12.0 available) + meta 1.12.0 (1.15.0 available) + > sherpa_onnx 1.10.17 (was 1.9.29) + + sherpa_onnx_android 1.10.17 + + sherpa_onnx_ios 1.10.17 + + sherpa_onnx_linux 1.10.17 + + sherpa_onnx_macos 1.10.17 + + sherpa_onnx_windows 1.10.17 + Changed 6 dependencies! + 4 packages have newer versions incompatible with dependency constraints. + Try `dart pub outdated` for more information. + +3. Run it +^^^^^^^^^ + +.. code-block:: bash + + ./run-sense-voice.sh + +The above script downloads models and run the code automatically. 
+ +You can find ``run-sense-voice.sh`` at the following address: + + ``_ + +The Dart API example code can be found at: + + ``_ diff --git a/docs/source/onnx/sense-voice/export.rst b/docs/source/onnx/sense-voice/export.rst new file mode 100644 index 000000000..b712e8219 --- /dev/null +++ b/docs/source/onnx/sense-voice/export.rst @@ -0,0 +1,71 @@ +Export SenseVoice to sherpa-onnx +================================ + +This page describes how to export `SenseVoice`_ to onnx so that you can use +it with `sherpa-onnx`_. + + +The code +-------- + +Please refer to `export-onnx.py `_ + +The entry point is `run.sh `_ + +After executing `run.sh `_, you should get +the following files + + - ``model.onnx``, the float32 onnx model + - ``model.int8.onnx``, the 8-bit quantized model + - ``tokens.txt``, for converting integer token IDs to strings + - ``test_wavs/zh.wav``, test wave for Chinese + - ``test_wavs/en.wav``, test wave for English + - ``test_wavs/ko.wav``, test wave for Korean + - ``test_wavs/ja.wav``, test wave for Japanese + - ``test_wavs/yue.wav``, test wave for Cantonese + +Test the exported model +----------------------- + +You can use `test.py `_ +to test the exported model. + +Note that `test.py `_ +does not depend on `sherpa-onnx`_. It uses onnxruntime Python API. + +Where to find exported models +------------------------------ + +You can find the exported `SenseVoice`_ models at + + ``_ + +The following is an example about how to download an exported `SenseVoice`_ model:: + + wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2 + + tar xvf sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2 + rm sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2 + + +To view the downloaded files, please use:: + + ls -lh sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17 + + total 1.1G + -rw-r--r-- 1 runner docker 71 Jul 18 13:06 LICENSE + -rw-r--r-- 1 runner docker 104 Jul 18 13:06 README.md + -rwxr-xr-x 1 runner docker 5.8K Jul 18 13:06 export-onnx.py + -rw-r--r-- 1 runner docker 229M Jul 18 13:06 model.int8.onnx + -rw-r--r-- 1 runner docker 895M Jul 18 13:06 model.onnx + drwxr-xr-x 2 runner docker 4.0K Jul 18 13:06 test_wavs + -rw-r--r-- 1 runner docker 309K Jul 18 13:06 tokens.txt + + ls -lh sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/test_wavs + + total 940K + -rw-r--r-- 1 runner docker 224K Jul 18 13:06 en.wav + -rw-r--r-- 1 runner docker 226K Jul 18 13:06 ja.wav + -rw-r--r-- 1 runner docker 145K Jul 18 13:06 ko.wav + -rw-r--r-- 1 runner docker 161K Jul 18 13:06 yue.wav + -rw-r--r-- 1 runner docker 175K Jul 18 13:06 zh.wav diff --git a/docs/source/onnx/sense-voice/huggingface-space.rst b/docs/source/onnx/sense-voice/huggingface-space.rst new file mode 100644 index 000000000..e7d7d73f5 --- /dev/null +++ b/docs/source/onnx/sense-voice/huggingface-space.rst @@ -0,0 +1,20 @@ +Huggingface space +================= + +You can try `SenseVoice`_ with `sherpa-onnx`_ with the following huggingface space + + ``_ + + +.. hint:: + + You don't need to install anything. All you need is a browser. + + You can even run it on your phone or tablet. + +.. 
figure:: ./pic/sense-voice-hf-space.jpg + :alt: screenshot of hf space for SenseVoice + :align: center + :width: 600 + + Try `SenseVoice`_ in our Huggingface space with `sherpa-onnx`_ diff --git a/docs/source/onnx/sense-voice/index.rst b/docs/source/onnx/sense-voice/index.rst new file mode 100644 index 000000000..0ebe28d11 --- /dev/null +++ b/docs/source/onnx/sense-voice/index.rst @@ -0,0 +1,49 @@ +SenseVoice +========== + +This section describes how to use models from ``_. + +A single model from `SenseVoice`_ supports the following languages + + - Chinese (Mandarin, 普通话) + - Cantonese (粤语, 广东话) + - English + - Japanese + - Korean + +which is similar to what multilingual `Whisper`_ is doing. + +We have converted `SenseVoice`_ to onnx and provided APIs for the following programming languages + + - 1. C++ + - 2. C + - 3. Python + - 4. C# + - 5. Go + - 6. Kotlin + - 7. Java + - 8. JavaScript (Support `WebAssembly`_ and `Node`_) + - 9. Swift + - 10. `Dart`_ (Support `Flutter`_) + +Note that you can use `SenseVoice`_ with `sherpa-onnx`_ on the following platforms: + + - Linux (x64, aarch64, arm, riscv64) + - macOS (x64, arm64) + - Windows (x64, x86, arm64) + - Android (arm64-v8a, armv7-eabi, x86, x86_64) + - iOS (arm64) + +In the following, we describe how to download pre-trained `SenseVoice`_ models +and use them in `sherpa-onnx`_. + + +.. toctree:: + :maxdepth: 5 + + ./huggingface-space.rst + ./export.rst + ./pretrained.rst + ./c-api.rst + ./dart-api.rst + ./python-api.rst diff --git a/docs/source/onnx/sense-voice/pic/pub-dev.png b/docs/source/onnx/sense-voice/pic/pub-dev.png new file mode 100644 index 000000000..27173cb19 Binary files /dev/null and b/docs/source/onnx/sense-voice/pic/pub-dev.png differ diff --git a/docs/source/onnx/sense-voice/pic/python-websocket/client-1.jpg b/docs/source/onnx/sense-voice/pic/python-websocket/client-1.jpg new file mode 100644 index 000000000..76d1707e9 Binary files /dev/null and b/docs/source/onnx/sense-voice/pic/python-websocket/client-1.jpg differ diff --git a/docs/source/onnx/sense-voice/pic/python-websocket/client-2.jpg b/docs/source/onnx/sense-voice/pic/python-websocket/client-2.jpg new file mode 100644 index 000000000..102bfa3b5 Binary files /dev/null and b/docs/source/onnx/sense-voice/pic/python-websocket/client-2.jpg differ diff --git a/docs/source/onnx/sense-voice/pic/python-websocket/client-3.jpg b/docs/source/onnx/sense-voice/pic/python-websocket/client-3.jpg new file mode 100644 index 000000000..cf646ee22 Binary files /dev/null and b/docs/source/onnx/sense-voice/pic/python-websocket/client-3.jpg differ diff --git a/docs/source/onnx/sense-voice/pic/python-websocket/client-4.jpg b/docs/source/onnx/sense-voice/pic/python-websocket/client-4.jpg new file mode 100644 index 000000000..3f5176cc6 Binary files /dev/null and b/docs/source/onnx/sense-voice/pic/python-websocket/client-4.jpg differ diff --git a/docs/source/onnx/sense-voice/pic/python-websocket/client-5.jpg b/docs/source/onnx/sense-voice/pic/python-websocket/client-5.jpg new file mode 100644 index 000000000..df6908baa Binary files /dev/null and b/docs/source/onnx/sense-voice/pic/python-websocket/client-5.jpg differ diff --git a/docs/source/onnx/sense-voice/pic/sense-voice-hf-space.jpg b/docs/source/onnx/sense-voice/pic/sense-voice-hf-space.jpg new file mode 100644 index 000000000..5ebe5b1b7 Binary files /dev/null and b/docs/source/onnx/sense-voice/pic/sense-voice-hf-space.jpg differ diff --git a/docs/source/onnx/sense-voice/pretrained.rst b/docs/source/onnx/sense-voice/pretrained.rst new file 
mode 100644 index 000000000..fea42d8d9 --- /dev/null +++ b/docs/source/onnx/sense-voice/pretrained.rst @@ -0,0 +1,153 @@ +Pre-trained Models +================== + +This page describes how to download pre-trained `SenseVoice`_ models. + + +.. _sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17: + +sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17 +-------------------------------------------------- + +This model is converted from ``_ +using the script `export-onnx.py `_. + +It supports the following 5 languages: + + - Chinese (Mandarin, 普通话) + - Cantonese (粤语, 广东话) + - English + - Japanese + - Korean + +In the following, we describe how to download it. + +Download +^^^^^^^^ + +Please use the following commands to download it:: + + cd /path/to/sherpa-onnx + + wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2 + + tar xvf sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2 + rm sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2 + +After downloading, you should find the following files:: + + ls -lh sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17 + + total 1.1G + -rw-r--r-- 1 runner docker 71 Jul 18 13:06 LICENSE + -rw-r--r-- 1 runner docker 104 Jul 18 13:06 README.md + -rwxr-xr-x 1 runner docker 5.8K Jul 18 13:06 export-onnx.py + -rw-r--r-- 1 runner docker 229M Jul 18 13:06 model.int8.onnx + -rw-r--r-- 1 runner docker 895M Jul 18 13:06 model.onnx + drwxr-xr-x 2 runner docker 4.0K Jul 18 13:06 test_wavs + -rw-r--r-- 1 runner docker 309K Jul 18 13:06 tokens.txt + + ls -lh sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/test_wavs + + total 940K + -rw-r--r-- 1 runner docker 224K Jul 18 13:06 en.wav + -rw-r--r-- 1 runner docker 226K Jul 18 13:06 ja.wav + -rw-r--r-- 1 runner docker 145K Jul 18 13:06 ko.wav + -rw-r--r-- 1 runner docker 161K Jul 18 13:06 yue.wav + -rw-r--r-- 1 runner docker 175K Jul 18 13:06 zh.wav + +Decode a file with model.onnx +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Without inverse text normalization +:::::::::::::::::::::::::::::::::: + +To decode a file without inverse text normalization, please use: + +.. code-block:: bash + + ./build/bin/sherpa-onnx-offline \ + --tokens=./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/tokens.txt \ + --sense-voice-model=./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/model.onnx \ + --num-threads=1 \ + --debug=0 \ + ./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/test_wavs/zh.wav \ + ./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/test_wavs/en.wav + +You should see the following output: + +.. literalinclude:: ./code/2024-07-17.txt + +With inverse text normalization +::::::::::::::::::::::::::::::: + +To decode a file with inverse text normalization, please use: + +.. code-block:: bash + + ./build/bin/sherpa-onnx-offline \ + --tokens=./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/tokens.txt \ + --sense-voice-model=./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/model.onnx \ + --num-threads=1 \ + --sense-voice-use-itn=1 \ + --debug=0 \ + ./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/test_wavs/zh.wav \ + ./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/test_wavs/en.wav + +You should see the following output: + +.. literalinclude:: ./code/2024-07-17-itn.txt + +.. hint:: + + When inverse text normalziation is enabled, the results also + punctuations. + +Specify a language +:::::::::::::::::: + +If you don't provide a language when decoding, it uses ``auto``. + +To specify the language when decoding, please use: + +.. 
code-block:: bash + + ./build/bin/sherpa-onnx-offline \ + --tokens=./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/tokens.txt \ + --sense-voice-model=./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/model.onnx \ + --num-threads=1 \ + --sense-voice-language=zh \ + --debug=0 \ + ./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/test_wavs/zh.wav + +You should see the following output: + +.. literalinclude:: ./code/2024-07-17-lang.txt + +.. hint:: + + Valid values for ``--sense-voice-language`` are ``auto``, ``zh``, ``en``, ``ko``, ``ja``, and ``yue``. + where ``zh`` is for Chinese, ``en`` for English, ``ko`` for Korean, ``ja`` for Japanese, and + ``yue`` for ``Cantonese``. + + +Speech recognition from a microphone +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. code-block:: bash + + ./build/bin/sherpa-onnx-microphone-offline \ + --tokens=./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/tokens.txt \ + --sense-voice-model=./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/model.int8.onnx + +Speech recognition from a microphone with VAD +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. code-block:: bash + + wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx + + ./build/bin/sherpa-onnx-vad-microphone-offline-asr \ + --silero-vad-model=./silero_vad.onnx \ + --tokens=./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/tokens.txt \ + --sense-voice-model=./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/model.int8.onnx diff --git a/docs/source/onnx/sense-voice/python-api.rst b/docs/source/onnx/sense-voice/python-api.rst new file mode 100644 index 000000000..208832ad7 --- /dev/null +++ b/docs/source/onnx/sense-voice/python-api.rst @@ -0,0 +1,449 @@ +Python API for SenseVoice +========================= + +This page describes how to use the Python API for `SenseVoice`_. + +Please refer to :ref:`install_sherpa_onnx_python` for how to install the Python package +of `sherpa-onnx`_. + +The following is a quick way to do that:: + + pip install sherpa-onnx + +Decode a file +------------- + +After installing the Python package, you can download the Python example code and run it with +the following commands:: + + cd /tmp + git clone https://github.com/k2-fsa/sherpa-onnx.git/ + cd sherpa-onnx + + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2 + tar xvf sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2 + rm sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2 + + python3 ./python-api-examples/offline-sense-voice-ctc-decode-files.py + + + +You should see something like below:: + + ./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/test_wavs/zh.wav + {"text": "开放时间早上9点至下午5点。", "timestamps": [0.72, 0.96, 1.26, 1.44, 1.92, 2.10, 2.58, 2.82, 3.30, 3.90, 4.20, 4.56, 4.74, 5.46], "tokens":["开", "放", "时", "间", "早", "上", "9", "点", "至", "下", "午", "5", "点", "。"], "words": []} + + (py38) fangjuns-MacBook-Pro:sherpa-onnx fangjun$ #python3 ./python-api-examples/offline-sense-voice-ctc-decode-files.py + +.. raw:: html + + + + + + + + + + +
    <table>
      <tr><th>Wave filename</th><th>Content</th></tr>
      <tr><td>zh.wav</td><td>(embedded audio player, not reproduced here)</td></tr>
    </table>
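
The example script above is a thin wrapper around the ``sherpa_onnx`` Python API.
The following is a minimal sketch showing roughly what the script does for a single
file; the keyword arguments of ``OfflineRecognizer.from_sense_voice`` follow the
example script and may differ slightly between ``sherpa-onnx`` versions.

.. code-block:: python

    # Minimal sketch: decode one file with the SenseVoice model downloaded above.
    import wave

    import numpy as np
    import sherpa_onnx

    d = "./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17"

    recognizer = sherpa_onnx.OfflineRecognizer.from_sense_voice(
        model=f"{d}/model.onnx",
        tokens=f"{d}/tokens.txt",
        num_threads=1,
        use_itn=True,  # add punctuation and inverse text normalization
    )

    with wave.open(f"{d}/test_wavs/zh.wav") as f:
        # the test waves are 16-bit mono PCM
        sample_rate = f.getframerate()
        samples = np.frombuffer(f.readframes(f.getnframes()), dtype=np.int16)
        samples = samples.astype(np.float32) / 32768

    stream = recognizer.create_stream()
    stream.accept_waveform(sample_rate, samples)
    recognizer.decode_stream(stream)
    print(stream.result.text)

You can pass ``model.int8.onnx`` instead of ``model.onnx`` to use the quantized model.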
+ +You can find ``offline-sense-voice-ctc-decode-files.py`` at the following address: + + ``_ + +Speech recognition from a microphone +------------------------------------ + +The following example shows how to use a microphone with `SenseVoice`_ and `silero-vad`_ +for speech recognition:: + + cd /tmp/sherpa-onnx + + # Assuem you have downloaded the SenseVoice model + + wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx + + python3 ./python-api-examples/vad-with-non-streaming-asr.py \ + --silero-vad-model=./silero_vad.onnx \ + --sense-voice=./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/model.int8.onnx \ + --tokens=./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/tokens.txt \ + --num-threads=2 + +You should see something like below:: + + 0 Background Music, Core Audio (2 in, 2 out) + 1 Background Music (UI Sounds), Core Audio (2 in, 2 out) + > 2 MacBook Pro Microphone, Core Audio (1 in, 0 out) + < 3 MacBook Pro Speakers, Core Audio (0 in, 2 out) + 4 WeMeet Audio Device, Core Audio (2 in, 2 out) + Use default device: MacBook Pro Microphone + Creating recognizer. Please wait... + Started! Please speak + +If you start speaking, you should see some output after you stop speaking. + +.. hint:: + + It starts speech recognition after `silero-vad`_ detects a pause. + +Generate subtitles +------------------ + +This section describes how to use `SenseVoice`_ and `silero-vad`_ +to generate subtitles. + +Chinese +^^^^^^^ + +Test with a wave file containing Chinese: + +.. code-block:: bash + + cd /tmp/sherpa-onnx + + # Assuem you have downloaded the SenseVoice model + + wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx + wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/lei-jun-test.wav + + python3 ./python-api-examples/generate-subtitles.py \ + --silero-vad-model=./silero_vad.onnx \ + --sense-voice=./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/model.onnx \ + --tokens=./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/tokens.txt \ + --num-threads=2 \ + ./lei-jun-test.wav + +.. raw:: html + + + + + + + + + + +
    <table>
      <tr><th>Wave filename</th><th>Content</th></tr>
      <tr><td>lei-jun-test.wav</td><td>(embedded audio player, not reproduced here)</td></tr>
    </table>
+ +It will generate a text file ``lei-jun-test.srt``, which is given below: + + +.. container:: toggle + + .. container:: header + + Click ▶ to see ``lei-jun-test.srt``. + + .. literalinclude:: ./code/lei-jun-test.srt + +English +^^^^^^^ + +Test with a wave file containing English: + +.. code-block:: bash + + cd /tmp/sherpa-onnx + + # Assuem you have downloaded the SenseVoice model + + wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx + wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/Obama.wav + + python3 ./python-api-examples/generate-subtitles.py \ + --silero-vad-model=./silero_vad.onnx \ + --sense-voice=./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/model.onnx \ + --tokens=./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/tokens.txt \ + --num-threads=2 \ + ./Obama.wav + +.. raw:: html + + + + + + + + + + +
    <table>
      <tr><th>Wave filename</th><th>Content</th></tr>
      <tr><td>Obama.wav</td><td>(embedded audio player, not reproduced here)</td></tr>
    </table>
+ +It will generate a text file ``Obama.srt``, which is given below: + +.. container:: toggle + + .. container:: header + + Click ▶ to see ``Obama.srt``. + + .. literalinclude:: ./code/Obama.srt + +WebSocket server and client example +----------------------------------- + +This example shows how to use a WebSocket server with `SenseVoice`_ for speech recognition. + +1. Start the server +^^^^^^^^^^^^^^^^^^^ + +Please run + +.. code-block:: bash + + cd /tmp/sherpa-onnx + + # Assuem you have downloaded the SenseVoice model + + python3 ./python-api-examples/non_streaming_server.py \ + --sense-voice=./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/model.int8.onnx \ + --tokens=./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/tokens.txt + +You should see the following output after starting the server:: + + 2024-07-28 20:22:38,389 INFO [non_streaming_server.py:1001] {'encoder': '', 'decoder': '', 'joiner': '', 'paraformer': '', 'sense_voice': './sherpa-o + nnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/model.int8.onnx', 'nemo_ctc': '', 'wenet_ctc': '', 'tdnn_model': '', 'whisper_encoder': '', 'whisper_decod + er': '', 'whisper_language': '', 'whisper_task': 'transcribe', 'whisper_tail_paddings': -1, 'tokens': './sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024 + -07-17/tokens.txt', 'num_threads': 2, 'provider': 'cpu', 'sample_rate': 16000, 'feat_dim': 80, 'decoding_method': 'greedy_search', 'max_active_paths' + : 4, 'hotwords_file': '', 'hotwords_score': 1.5, 'blank_penalty': 0.0, 'port': 6006, 'max_batch_size': 3, 'max_wait_ms': 5, 'nn_pool_size': 1, 'max_m + essage_size': 1048576, 'max_queue_size': 32, 'max_active_connections': 200, 'certificate': None, 'doc_root': './python-api-examples/web'} + 2024-07-28 20:22:41,861 INFO [non_streaming_server.py:647] started + 2024-07-28 20:22:41,861 INFO [non_streaming_server.py:659] No certificate provided + 2024-07-28 20:22:41,866 INFO [server.py:707] server listening on 0.0.0.0:6006 + 2024-07-28 20:22:41,866 INFO [server.py:707] server listening on [::]:6006 + 2024-07-28 20:22:41,866 INFO [non_streaming_server.py:679] Please visit one of the following addresses: + + http://localhost:6006 + +You can either visit the address ``_ or write code to interact with the server. + +In the following, we describe possible approaches for interacting with the WebSocket server. + +.. hint:: + + The WebSocket server is able to handle multiple clients/connections at the same time. + +2. Start the client (decode files sequentially) +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The following code sends the files in sequential one by one to the server for decoding. + +.. code-block:: bash + + cd /tmp/sherpa-onnx + + python3 ./python-api-examples/offline-websocket-client-decode-files-sequential.py ./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/test_wavs/zh.wav ./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/test_wavs/en.wav + +.. raw:: html + + + + + + + + + + + + + + +
    <table>
      <tr><th>Wave filename</th><th>Content</th></tr>
      <tr><td>zh.wav</td><td>(embedded audio player, not reproduced here)</td></tr>
      <tr><td>en.wav</td><td>(embedded audio player, not reproduced here)</td></tr>
    </table>
+ +You should see something like below on the server side:: + + 2024-07-28 20:28:15,749 INFO [server.py:642] connection open + 2024-07-28 20:28:15,749 INFO [non_streaming_server.py:835] Connected: ('::1', 53252, 0, 0). Number of connections: 1/200 + 2024-07-28 20:28:15,933 INFO [non_streaming_server.py:851] result: 开放时间早上9点至下午5点。 + 2024-07-28 20:28:16,194 INFO [non_streaming_server.py:851] result: The tribal chieftain called for the boy and presented him with 50 pieces of gold. + 2024-07-28 20:28:16,195 INFO [non_streaming_server.py:819] Disconnected: ('::1', 53252, 0, 0). Number of connections: 0/200 + 2024-07-28 20:28:16,196 INFO [server.py:260] connection closed + +You should see something like below on the client side:: + + 2024-07-28 20:28:15,750 INFO [offline-websocket-client-decode-files-sequential.py:114] Sending ./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/test_wavs/zh.wav + 开放时间早上9点至下午5点。 + 2024-07-28 20:28:15,934 INFO [offline-websocket-client-decode-files-sequential.py:114] Sending ./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/test_wavs/en.wav + The tribal chieftain called for the boy and presented him with 50 pieces of gold. + +3. Start the client (decode files in parallel) +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The following code sends the files in parallel at the same time to the server for decoding. + +.. code-block:: bash + + cd /tmp/sherpa-onnx + + python3 ./python-api-examples/offline-websocket-client-decode-files-paralell.py ./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/test_wavs/zh.wav ./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/test_wavs/en.wav + + +.. raw:: html + + + + + + + + + + + + + + +
    <!-- A table with columns "Wave filename" and "Content" goes here; it embeds
         audio players for zh.wav and en.wav. -->
+You should see something like below on the server side:: + + 2024-07-28 20:31:10,147 INFO [server.py:642] connection open + 2024-07-28 20:31:10,148 INFO [non_streaming_server.py:835] Connected: ('::1', 53436, 0, 0). Number of connections: 1/200 + 2024-07-28 20:31:10,149 INFO [server.py:642] connection open + 2024-07-28 20:31:10,149 INFO [non_streaming_server.py:835] Connected: ('::1', 53437, 0, 0). Number of connections: 2/200 + 2024-07-28 20:31:10,353 INFO [non_streaming_server.py:851] result: 开放时间早上9点至下午5点。 + 2024-07-28 20:31:10,354 INFO [non_streaming_server.py:819] Disconnected: ('::1', 53436, 0, 0). Number of connections: 1/200 + 2024-07-28 20:31:10,356 INFO [server.py:260] connection closed + 2024-07-28 20:31:10,541 INFO [non_streaming_server.py:851] result: The tribal chieftain called for the boy and presented him with 50 pieces of gold. + 2024-07-28 20:31:10,542 INFO [non_streaming_server.py:819] Disconnected: ('::1', 53437, 0, 0). Number of connections: 0/200 + 2024-07-28 20:31:10,544 INFO [server.py:260] connection closed + +You should see something like below on the client side:: + + 2024-07-28 20:31:10,112 INFO [offline-websocket-client-decode-files-paralell.py:139] {'server_addr': 'localhost', 'server_port': 6006, 'sound_files': ['./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/test_wavs/zh.wav', './sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/test_wavs/en.wav']} + 2024-07-28 20:31:10,148 INFO [offline-websocket-client-decode-files-paralell.py:113] Sending ./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/test_wavs/zh.wav + 2024-07-28 20:31:10,191 INFO [offline-websocket-client-decode-files-paralell.py:113] Sending ./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/test_wavs/en.wav + 2024-07-28 20:31:10,353 INFO [offline-websocket-client-decode-files-paralell.py:131] ./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/test_wavs/zh.wav + 开放时间早上9点至下午5点。 + 2024-07-28 20:31:10,542 INFO [offline-websocket-client-decode-files-paralell.py:131] ./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/test_wavs/en.wav + The tribal chieftain called for the boy and presented him with 50 pieces of gold. + +4. Start the Web browser client +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +You can also start a browser to interact with the WebSocket server. + +Please visit ``_. + +.. warning:: + + We are not using a certificate to start the server, so the only + ``correct`` URL is ``_. + + All of the following addresses are ``incorrect``: + + - Incorrect/Wrong address: ``_ + - Incorrect/Wrong address: ``_ + - Incorrect/Wrong address: ``_ + - Incorrect/Wrong address: ``_ + - Incorrect/Wrong address: ``_ + +After starting the browser, you should see the following page: + + .. image:: ./pic/python-websocket/client-1.jpg + :align: center + :width: 600 + +Upload a file for recognition +::::::::::::::::::::::::::::: + +If we click ``Upload``, we will see the following page: + + .. image:: ./pic/python-websocket/client-2.jpg + :align: center + :width: 600 + +After clicking ``Click me to connect`` and ``Choose File``, you will +see the recognition result returned from the server: + + .. image:: ./pic/python-websocket/client-3.jpg + :align: center + :width: 600 + +Record your speech with a microphone for recognition +:::::::::::::::::::::::::::::::::::::::::::::::::::: + +If you click ``Offline-Record``, you should see the following page: + + .. 
image:: ./pic/python-websocket/client-4.jpg + :align: center + :width: 600 + +Please click the button ``Click me to connect``, then click the button +``Offline-Record`` and start speaking. Finally, click the button ``Offline-Stop``; +you should see the results from the server. A screenshot is given below: + + .. image:: ./pic/python-websocket/client-5.jpg + :align: center + :width: 600 + +Note that you can save the recorded audio into a wave file for debugging. + +The recorded audio from the above screenshot is saved to ``test.wav`` and +is given below:: + + Input File : 'test.wav' + Channels : 1 + Sample Rate : 16000 + Precision : 16-bit + Duration : 00:00:07.00 = 112012 samples ~ 525.056 CDDA sectors + File Size : 224k + Bit Rate : 256k + Sample Encoding: 16-bit Signed Integer PCM + +.. raw:: html
    <!-- A table with columns "Wave filename" and "Content" goes here; it embeds
         an audio player for test.wav. -->
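Before moving on, note that the startup log above shows ``max_active_connections: 200``, so the server can serve many clients at once. The following is a small helper sketch (it is not part of sherpa-onnx) that pushes a whole directory of ``*.wav`` files through the documented ``offline-websocket-client-decode-files-sequential.py`` client, a few files at a time. The client path and the assumption that the server is already running on ``localhost:6006`` are taken from the commands above.

.. code-block:: python

    #!/usr/bin/env python3
    # A convenience wrapper: it only shells out to the client script shown
    # above, so it does not depend on the server's wire protocol.
    import subprocess
    import sys
    from concurrent.futures import ThreadPoolExecutor
    from pathlib import Path

    CLIENT = "./python-api-examples/offline-websocket-client-decode-files-sequential.py"


    def decode_one(wav: Path) -> None:
        # Each call opens its own connection to the server; the client script
        # prints the recognition result itself.
        subprocess.run(["python3", CLIENT, str(wav)], check=True)


    def main(wav_dir: str) -> None:
        wavs = sorted(Path(wav_dir).glob("*.wav"))
        # Four workers keep us far below the 200-connection limit shown in
        # the server log above.
        with ThreadPoolExecutor(max_workers=4) as pool:
            list(pool.map(decode_one, wavs))


    if __name__ == "__main__":
        if len(sys.argv) != 2:
            sys.exit(f"usage: {sys.argv[0]} <directory-with-wav-files>")
        main(sys.argv[1])

For example, you could run it as ``python3 ./batch-decode.py ./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/test_wavs``. For finer control, you can instead adapt ``offline-websocket-client-decode-files-paralell.py``, which already decodes several files over separate connections.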
diff --git a/docs/source/onnx/speaker-diarization/android.rst b/docs/source/onnx/speaker-diarization/android.rst new file mode 100644 index 000000000..817a0b513 --- /dev/null +++ b/docs/source/onnx/speaker-diarization/android.rst @@ -0,0 +1,21 @@ +Android APKs for speaker diarization +==================================== + +You can find Android APKs for speaker diarization at the following page + + ``_ + +For users from China, you can also visit + + ``_ + + +The source code for the APKs can be found at + + ``_ + +You can find the script for building the APKs at + + ``_ + +Please see :ref:`sherpa-onnx-android` for more details. diff --git a/docs/source/onnx/speaker-diarization/c.rst b/docs/source/onnx/speaker-diarization/c.rst new file mode 100644 index 000000000..1e3dde1d1 --- /dev/null +++ b/docs/source/onnx/speaker-diarization/c.rst @@ -0,0 +1,8 @@ +C API examples +============== + +Please see + + ``_ + +and :ref:`sherpa-onnx-c-api`. diff --git a/docs/source/onnx/speaker-diarization/code/pyannote-segmentation-3-0-3dspeaker.int8.txt b/docs/source/onnx/speaker-diarization/code/pyannote-segmentation-3-0-3dspeaker.int8.txt new file mode 100644 index 000000000..4b4c0df31 --- /dev/null +++ b/docs/source/onnx/speaker-diarization/code/pyannote-segmentation-3-0-3dspeaker.int8.txt @@ -0,0 +1,20 @@ +/Users/fangjun/open-source/sherpa-onnx/sherpa-onnx/csrc/parse-options.cc:Read:375 ./build/bin/sherpa-onnx-offline-speaker-diarization --clustering.num-clusters=4 --segmentation.pyannote-model=./sherpa-onnx-pyannote-segmentation-3-0/model.int8.onnx --embedding.model=./3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx ./0-four-speakers-zh.wav + +OfflineSpeakerDiarizationConfig(segmentation=OfflineSpeakerSegmentationModelConfig(pyannote=OfflineSpeakerSegmentationPyannoteModelConfig(model="./sherpa-onnx-pyannote-segmentation-3-0/model.int8.onnx"), num_threads=1, debug=False, provider="cpu"), embedding=SpeakerEmbeddingExtractorConfig(model="./3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx", num_threads=1, debug=False, provider="cpu"), clustering=FastClusteringConfig(num_clusters=4, threshold=0.5), min_duration_on=0.3, min_duration_off=0.5) + +Started + +0.638 -- 6.848 speaker_00 +7.017 -- 10.696 speaker_01 +11.472 -- 13.548 speaker_01 +13.784 -- 16.990 speaker_02 +22.154 -- 24.837 speaker_00 +27.655 -- 29.461 speaker_03 +30.018 -- 31.503 speaker_03 +33.680 -- 37.915 speaker_03 +48.040 -- 50.487 speaker_02 +52.546 -- 54.605 speaker_00 + +Duration : 56.861 s +Elapsed seconds: 13.679 s +Real time factor (RTF): 13.679 / 56.861 = 0.241 diff --git a/docs/source/onnx/speaker-diarization/code/pyannote-segmentation-3-0-3dspeaker.txt b/docs/source/onnx/speaker-diarization/code/pyannote-segmentation-3-0-3dspeaker.txt new file mode 100644 index 000000000..3588de26d --- /dev/null +++ b/docs/source/onnx/speaker-diarization/code/pyannote-segmentation-3-0-3dspeaker.txt @@ -0,0 +1,20 @@ +/Users/fangjun/open-source/sherpa-onnx/sherpa-onnx/csrc/parse-options.cc:Read:375 ./build/bin/sherpa-onnx-offline-speaker-diarization --clustering.num-clusters=4 --segmentation.pyannote-model=./sherpa-onnx-pyannote-segmentation-3-0/model.onnx --embedding.model=./3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx ./0-four-speakers-zh.wav + +OfflineSpeakerDiarizationConfig(segmentation=OfflineSpeakerSegmentationModelConfig(pyannote=OfflineSpeakerSegmentationPyannoteModelConfig(model="./sherpa-onnx-pyannote-segmentation-3-0/model.onnx"), num_threads=1, debug=False, provider="cpu"), 
embedding=SpeakerEmbeddingExtractorConfig(model="./3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx", num_threads=1, debug=False, provider="cpu"), clustering=FastClusteringConfig(num_clusters=4, threshold=0.5), min_duration_on=0.3, min_duration_off=0.5) + +Started + +0.318 -- 6.865 speaker_00 +7.017 -- 10.747 speaker_01 +11.455 -- 13.632 speaker_01 +13.750 -- 17.041 speaker_02 +22.137 -- 24.837 speaker_00 +27.638 -- 29.478 speaker_03 +30.001 -- 31.553 speaker_03 +33.680 -- 37.932 speaker_03 +48.040 -- 50.470 speaker_02 +52.529 -- 54.605 speaker_00 + +Duration : 56.861 s +Elapsed seconds: 16.870 s +Real time factor (RTF): 16.870 / 56.861 = 0.297 diff --git a/docs/source/onnx/speaker-diarization/code/pyannote-segmentation-3-0-nemo.int8.txt b/docs/source/onnx/speaker-diarization/code/pyannote-segmentation-3-0-nemo.int8.txt new file mode 100644 index 000000000..e60953d2a --- /dev/null +++ b/docs/source/onnx/speaker-diarization/code/pyannote-segmentation-3-0-nemo.int8.txt @@ -0,0 +1,20 @@ +/Users/fangjun/open-source/sherpa-onnx/sherpa-onnx/csrc/parse-options.cc:Read:375 ./build/bin/sherpa-onnx-offline-speaker-diarization --clustering.num-clusters=4 --segmentation.pyannote-model=./sherpa-onnx-pyannote-segmentation-3-0/model.int8.onnx --embedding.model=./nemo_en_titanet_small.onnx ./0-four-speakers-zh.wav + +OfflineSpeakerDiarizationConfig(segmentation=OfflineSpeakerSegmentationModelConfig(pyannote=OfflineSpeakerSegmentationPyannoteModelConfig(model="./sherpa-onnx-pyannote-segmentation-3-0/model.int8.onnx"), num_threads=1, debug=False, provider="cpu"), embedding=SpeakerEmbeddingExtractorConfig(model="./nemo_en_titanet_small.onnx", num_threads=1, debug=False, provider="cpu"), clustering=FastClusteringConfig(num_clusters=4, threshold=0.5), min_duration_on=0.3, min_duration_off=0.5) + +Started + +0.638 -- 6.848 speaker_00 +7.017 -- 10.696 speaker_01 +11.472 -- 13.548 speaker_01 +13.784 -- 16.990 speaker_02 +22.154 -- 24.837 speaker_00 +27.655 -- 29.461 speaker_03 +30.018 -- 31.503 speaker_03 +33.680 -- 37.915 speaker_03 +48.040 -- 50.487 speaker_02 +52.546 -- 54.605 speaker_00 + +Duration : 56.861 s +Elapsed seconds: 6.231 s +Real time factor (RTF): 6.231 / 56.861 = 0.110 diff --git a/docs/source/onnx/speaker-diarization/code/pyannote-segmentation-3-0-nemo.txt b/docs/source/onnx/speaker-diarization/code/pyannote-segmentation-3-0-nemo.txt new file mode 100644 index 000000000..24b99293d --- /dev/null +++ b/docs/source/onnx/speaker-diarization/code/pyannote-segmentation-3-0-nemo.txt @@ -0,0 +1,20 @@ +/Users/fangjun/open-source/sherpa-onnx/sherpa-onnx/csrc/parse-options.cc:Read:375 ./build/bin/sherpa-onnx-offline-speaker-diarization --clustering.num-clusters=4 --segmentation.pyannote-model=./sherpa-onnx-pyannote-segmentation-3-0/model.onnx --embedding.model=./nemo_en_titanet_small.onnx ./0-four-speakers-zh.wav + +OfflineSpeakerDiarizationConfig(segmentation=OfflineSpeakerSegmentationModelConfig(pyannote=OfflineSpeakerSegmentationPyannoteModelConfig(model="./sherpa-onnx-pyannote-segmentation-3-0/model.onnx"), num_threads=1, debug=False, provider="cpu"), embedding=SpeakerEmbeddingExtractorConfig(model="./nemo_en_titanet_small.onnx", num_threads=1, debug=False, provider="cpu"), clustering=FastClusteringConfig(num_clusters=4, threshold=0.5), min_duration_on=0.3, min_duration_off=0.5) + +Started + +0.318 -- 6.865 speaker_00 +7.017 -- 10.747 speaker_01 +11.455 -- 13.632 speaker_01 +13.750 -- 17.041 speaker_02 +22.137 -- 24.837 speaker_00 +27.638 -- 29.478 speaker_03 +30.001 -- 31.553 speaker_03 
+33.680 -- 37.932 speaker_03 +48.040 -- 50.470 speaker_02 +52.529 -- 54.605 speaker_00 + +Duration : 56.861 s +Elapsed seconds: 6.756 s +Real time factor (RTF): 6.756 / 56.861 = 0.119 diff --git a/docs/source/onnx/speaker-diarization/code/revai-segmentation-3-0-3dspeaker.int8.txt b/docs/source/onnx/speaker-diarization/code/revai-segmentation-3-0-3dspeaker.int8.txt new file mode 100644 index 000000000..1afe1d5e0 --- /dev/null +++ b/docs/source/onnx/speaker-diarization/code/revai-segmentation-3-0-3dspeaker.int8.txt @@ -0,0 +1,18 @@ +/Users/fangjun/open-source/sherpa-onnx/sherpa-onnx/csrc/parse-options.cc:Read:375 ./build/bin/sherpa-onnx-offline-speaker-diarization --clustering.num-clusters=4 --segmentation.pyannote-model=./sherpa-onnx-reverb-diarization-v1/model.int8.onnx --embedding.model=./3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx ./0-four-speakers-zh.wav + +OfflineSpeakerDiarizationConfig(segmentation=OfflineSpeakerSegmentationModelConfig(pyannote=OfflineSpeakerSegmentationPyannoteModelConfig(model="./sherpa-onnx-reverb-diarization-v1/model.int8.onnx"), num_threads=1, debug=False, provider="cpu"), embedding=SpeakerEmbeddingExtractorConfig(model="./3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx", num_threads=1, debug=False, provider="cpu"), clustering=FastClusteringConfig(num_clusters=4, threshold=0.5), min_duration_on=0.3, min_duration_off=0.5) + +Started + +0.031 -- 6.815 speaker_01 +7.017 -- 13.666 speaker_00 +13.784 -- 16.973 speaker_02 +21.023 -- 24.854 speaker_01 +27.655 -- 38.084 speaker_03 +38.084 -- 46.943 speaker_00 +45.526 -- 50.352 speaker_02 +52.580 -- 54.622 speaker_01 + +Duration : 56.861 s +Elapsed seconds: 22.323 s +Real time factor (RTF): 22.323 / 56.861 = 0.393 diff --git a/docs/source/onnx/speaker-diarization/code/revai-segmentation-3-0-3dspeaker.txt b/docs/source/onnx/speaker-diarization/code/revai-segmentation-3-0-3dspeaker.txt new file mode 100644 index 000000000..5b6d399c0 --- /dev/null +++ b/docs/source/onnx/speaker-diarization/code/revai-segmentation-3-0-3dspeaker.txt @@ -0,0 +1,18 @@ +/Users/fangjun/open-source/sherpa-onnx/sherpa-onnx/csrc/parse-options.cc:Read:375 ./build/bin/sherpa-onnx-offline-speaker-diarization --clustering.num-clusters=4 --segmentation.pyannote-model=./sherpa-onnx-reverb-diarization-v1/model.onnx --embedding.model=./3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx ./0-four-speakers-zh.wav + +OfflineSpeakerDiarizationConfig(segmentation=OfflineSpeakerSegmentationModelConfig(pyannote=OfflineSpeakerSegmentationPyannoteModelConfig(model="./sherpa-onnx-reverb-diarization-v1/model.onnx"), num_threads=1, debug=False, provider="cpu"), embedding=SpeakerEmbeddingExtractorConfig(model="./3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx", num_threads=1, debug=False, provider="cpu"), clustering=FastClusteringConfig(num_clusters=4, threshold=0.5), min_duration_on=0.3, min_duration_off=0.5) + +Started + +0.031 -- 6.798 speaker_01 +7.017 -- 13.649 speaker_00 +13.801 -- 16.957 speaker_02 +21.023 -- 24.820 speaker_01 +27.638 -- 38.017 speaker_03 +44.345 -- 45.526 speaker_00 +45.526 -- 50.268 speaker_02 +52.563 -- 54.605 speaker_01 + +Duration : 56.861 s +Elapsed seconds: 25.715 s +Real time factor (RTF): 25.715 / 56.861 = 0.452 diff --git a/docs/source/onnx/speaker-diarization/code/revai-segmentation-3-0-nemo.int8.txt b/docs/source/onnx/speaker-diarization/code/revai-segmentation-3-0-nemo.int8.txt new file mode 100644 index 000000000..9dd925263 --- /dev/null +++ 
b/docs/source/onnx/speaker-diarization/code/revai-segmentation-3-0-nemo.int8.txt @@ -0,0 +1,18 @@ +/Users/fangjun/open-source/sherpa-onnx/sherpa-onnx/csrc/parse-options.cc:Read:375 ./build/bin/sherpa-onnx-offline-speaker-diarization --clustering.num-clusters=4 --segmentation.pyannote-model=./sherpa-onnx-reverb-diarization-v1/model.int8.onnx --embedding.model=./nemo_en_titanet_small.onnx ./0-four-speakers-zh.wav + +OfflineSpeakerDiarizationConfig(segmentation=OfflineSpeakerSegmentationModelConfig(pyannote=OfflineSpeakerSegmentationPyannoteModelConfig(model="./sherpa-onnx-reverb-diarization-v1/model.int8.onnx"), num_threads=1, debug=False, provider="cpu"), embedding=SpeakerEmbeddingExtractorConfig(model="./nemo_en_titanet_small.onnx", num_threads=1, debug=False, provider="cpu"), clustering=FastClusteringConfig(num_clusters=4, threshold=0.5), min_duration_on=0.3, min_duration_off=0.5) + +Started + +0.031 -- 6.815 speaker_01 +7.017 -- 13.666 speaker_00 +13.784 -- 16.973 speaker_02 +21.023 -- 24.854 speaker_01 +27.655 -- 38.877 speaker_02 +38.168 -- 45.914 speaker_03 +45.526 -- 50.352 speaker_02 +52.580 -- 54.622 speaker_01 + +Duration : 56.861 s +Elapsed seconds: 9.688 s +Real time factor (RTF): 9.688 / 56.861 = 0.170 diff --git a/docs/source/onnx/speaker-diarization/code/revai-segmentation-3-0-nemo.txt b/docs/source/onnx/speaker-diarization/code/revai-segmentation-3-0-nemo.txt new file mode 100644 index 000000000..3af010570 --- /dev/null +++ b/docs/source/onnx/speaker-diarization/code/revai-segmentation-3-0-nemo.txt @@ -0,0 +1,18 @@ +/Users/fangjun/open-source/sherpa-onnx/sherpa-onnx/csrc/parse-options.cc:Read:375 ./build/bin/sherpa-onnx-offline-speaker-diarization --clustering.num-clusters=4 --segmentation.pyannote-model=./sherpa-onnx-reverb-diarization-v1/model.onnx --embedding.model=./nemo_en_titanet_small.onnx ./0-four-speakers-zh.wav + +OfflineSpeakerDiarizationConfig(segmentation=OfflineSpeakerSegmentationModelConfig(pyannote=OfflineSpeakerSegmentationPyannoteModelConfig(model="./sherpa-onnx-reverb-diarization-v1/model.onnx"), num_threads=1, debug=False, provider="cpu"), embedding=SpeakerEmbeddingExtractorConfig(model="./nemo_en_titanet_small.onnx", num_threads=1, debug=False, provider="cpu"), clustering=FastClusteringConfig(num_clusters=4, threshold=0.5), min_duration_on=0.3, min_duration_off=0.5) + +Started + +0.031 -- 6.798 speaker_01 +7.017 -- 13.649 speaker_00 +13.801 -- 16.957 speaker_02 +21.023 -- 24.820 speaker_01 +27.638 -- 38.017 speaker_02 +44.345 -- 45.357 speaker_03 +45.290 -- 50.268 speaker_02 +52.563 -- 54.605 speaker_01 + +Duration : 56.861 s +Elapsed seconds: 11.465 s +Real time factor (RTF): 11.465 / 56.861 = 0.202 diff --git a/docs/source/onnx/speaker-diarization/cpp.rst b/docs/source/onnx/speaker-diarization/cpp.rst new file mode 100644 index 000000000..877cd9fc8 --- /dev/null +++ b/docs/source/onnx/speaker-diarization/cpp.rst @@ -0,0 +1,6 @@ +C++ API examples +================ + +Please see + + ``_ diff --git a/docs/source/onnx/speaker-diarization/csharp.rst b/docs/source/onnx/speaker-diarization/csharp.rst new file mode 100644 index 000000000..efddb1b43 --- /dev/null +++ b/docs/source/onnx/speaker-diarization/csharp.rst @@ -0,0 +1,6 @@ +C# API examples +=============== + +Please see + + ``_ diff --git a/docs/source/onnx/speaker-diarization/dart.rst b/docs/source/onnx/speaker-diarization/dart.rst new file mode 100644 index 000000000..33367b748 --- /dev/null +++ b/docs/source/onnx/speaker-diarization/dart.rst @@ -0,0 +1,6 @@ +Dart API examples +================= + 
+Please see + + ``_ diff --git a/docs/source/onnx/speaker-diarization/go.rst b/docs/source/onnx/speaker-diarization/go.rst new file mode 100644 index 000000000..b2bc248de --- /dev/null +++ b/docs/source/onnx/speaker-diarization/go.rst @@ -0,0 +1,6 @@ +Go API examples +=============== + +Please see + + ``_ diff --git a/docs/source/onnx/speaker-diarization/hf.rst b/docs/source/onnx/speaker-diarization/hf.rst new file mode 100644 index 000000000..c0140ad39 --- /dev/null +++ b/docs/source/onnx/speaker-diarization/hf.rst @@ -0,0 +1,8 @@ +Hugginface space for speaker diarization +======================================== + +Please visit + + ``_ + +You don't need to install anything to try it in your browser. diff --git a/docs/source/onnx/speaker-diarization/index.rst b/docs/source/onnx/speaker-diarization/index.rst new file mode 100644 index 000000000..9d0691d9e --- /dev/null +++ b/docs/source/onnx/speaker-diarization/index.rst @@ -0,0 +1,31 @@ +Speaker Diarization +=================== + +This page describes how to use `sherpa-onnx`_ for speaker diarization. + +Pre-trained models for speaker segmentation can be found +at ``_ + +Pre-trained models for speaker embedding extraction can be found +at ``_ + +In the following, we describe different programming language APIs for speaker diarization. + +.. toctree:: + :maxdepth: 5 + + ./models.rst + ./hf.rst + ./android.rst + ./c.rst + ./cpp.rst + ./csharp.rst + ./dart.rst + ./go.rst + ./java.rst + ./javascript.rst + ./kotlin.rst + ./pascal.rst + ./python.rst + ./rust.rst + ./swift.rst diff --git a/docs/source/onnx/speaker-diarization/java.rst b/docs/source/onnx/speaker-diarization/java.rst new file mode 100644 index 000000000..6f9b3315e --- /dev/null +++ b/docs/source/onnx/speaker-diarization/java.rst @@ -0,0 +1,9 @@ +Java API examples +================= + +Please see + + ``_ + +and :ref:`sherpa-onnx-java-api`. + diff --git a/docs/source/onnx/speaker-diarization/javascript.rst b/docs/source/onnx/speaker-diarization/javascript.rst new file mode 100644 index 000000000..094754d24 --- /dev/null +++ b/docs/source/onnx/speaker-diarization/javascript.rst @@ -0,0 +1,36 @@ +JavaScript API examples +======================= + +We provide two npm packages. + +WebAssembly based npm package +----------------------------- + +You can find the package at + + ``_ + +This package does not support multi-threading. + +The example for speaker diarzation can be found at + + ``_ + +node-addon based npm package +---------------------------- + +You can find the package at + + ``_ + +This package supports multi-threading. + +Please see + + ``_ + +for installation. + +The example for speaker diarization can be found at + + ``_ diff --git a/docs/source/onnx/speaker-diarization/kotlin.rst b/docs/source/onnx/speaker-diarization/kotlin.rst new file mode 100644 index 000000000..39d7eb963 --- /dev/null +++ b/docs/source/onnx/speaker-diarization/kotlin.rst @@ -0,0 +1,6 @@ +Kotlin API examples +=================== + +Please see + + ``_ diff --git a/docs/source/onnx/speaker-diarization/models.rst b/docs/source/onnx/speaker-diarization/models.rst new file mode 100644 index 000000000..497b24fb7 --- /dev/null +++ b/docs/source/onnx/speaker-diarization/models.rst @@ -0,0 +1,327 @@ +Pre-trained models +================== + +This page lists pre-trained models for speaker segmentation. 
+ +Models for speaker embedding extraction can be found at + + ``_ + +Colab notebook +-------------- + + +We provide a colab notebook +|speaker diarization with sherpa-onnx colab notebook| +for you to try this section step by step. + +.. |speaker diarization with sherpa-onnx colab notebook| image:: https://colab.research.google.com/assets/colab-badge.svg + :target: https://github.com/k2-fsa/colab/blob/master/sherpa-onnx/sherpa_onnx_speaker_diarization.ipynb + +sherpa-onnx-pyannote-segmentation-3-0 +------------------------------------- + +This model is converted from ``_. +You can find the conversion script at ``_. + +In the following, we describe how to use it together with +a speaker embedding extraction model for speaker diarization. + +Download the model +~~~~~~~~~~~~~~~~~~ + +Please use the following code to download the model: + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + wget https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/sherpa-onnx-pyannote-segmentation-3-0.tar.bz2 + tar xvf sherpa-onnx-pyannote-segmentation-3-0.tar.bz2 + rm sherpa-onnx-pyannote-segmentation-3-0.tar.bz2 + + ls -lh sherpa-onnx-pyannote-segmentation-3-0/{*.onnx,LICENSE,README.md} + +You should see the following output:: + + -rw-r--r-- 1 fangjun staff 1.0K Oct 8 20:54 sherpa-onnx-pyannote-segmentation-3-0/LICENSE + -rw-r--r-- 1 fangjun staff 115B Oct 8 20:54 sherpa-onnx-pyannote-segmentation-3-0/README.md + -rw-r--r-- 1 fangjun staff 1.5M Oct 8 20:54 sherpa-onnx-pyannote-segmentation-3-0/model.int8.onnx + -rw-r--r-- 1 fangjun staff 5.7M Oct 8 20:54 sherpa-onnx-pyannote-segmentation-3-0/model.onnx + +Usage for speaker diarization +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +First, let's download a test wave file. The model expects wave files of 16kHz, 16-bit and a single channel. + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + wget https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/0-four-speakers-zh.wav + +Next, let's download a model for extracting speaker embeddings. You can find lots of models from +``_. We +download two models in this example:: + + wget https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-recongition-models/3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx + wget https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-recongition-models/nemo_en_titanet_small.onnx + +Now let's run it. + +3D-Speaker + model.onnx +::::::::::::::::::::::: + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + ./build/bin/sherpa-onnx-offline-speaker-diarization \ + --clustering.num-clusters=4 \ + --segmentation.pyannote-model=./sherpa-onnx-pyannote-segmentation-3-0/model.onnx \ + --embedding.model=./3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx \ + ./0-four-speakers-zh.wav + + # Note: Since we know there are 4 speakers in ./0-four-speakers-zh.wav file, we + # provide the argument --clustering.num-clusters=4. + # If you don't have such information, please use the argument --clustering.cluster-threshold. + # A larger threshold results in fewer speakers. + # A smaller threshold results in more speakers. + # + # Hint: You can use --clustering.cluster-threshold=0.9 for this specific wave file. + +The output is given below: + +.. container:: toggle + + .. container:: header + + Click ▶ to see the output. + + .. literalinclude:: ./code/pyannote-segmentation-3-0-3dspeaker.txt + +3D-Speaker + model.int8.onnx +::::::::::::::::::::::::::::: + +.. 
code-block:: bash + + cd /path/to/sherpa-onnx + + ./build/bin/sherpa-onnx-offline-speaker-diarization \ + --clustering.num-clusters=4 \ + --segmentation.pyannote-model=./sherpa-onnx-pyannote-segmentation-3-0/model.int8.onnx \ + --embedding.model=./3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx \ + ./0-four-speakers-zh.wav + +The output is given below: + +.. container:: toggle + + .. container:: header + + Click ▶ to see the output. + + .. literalinclude:: ./code/pyannote-segmentation-3-0-3dspeaker.int8.txt + + +NeMo + model.onnx +::::::::::::::::: + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + ./build/bin/sherpa-onnx-offline-speaker-diarization \ + --clustering.num-clusters=4 \ + --segmentation.pyannote-model=./sherpa-onnx-pyannote-segmentation-3-0/model.onnx \ + --embedding.model=./nemo_en_titanet_small.onnx \ + ./0-four-speakers-zh.wav + +The output is given below: + +.. container:: toggle + + .. container:: header + + Click ▶ to see the output. + + .. literalinclude:: ./code/pyannote-segmentation-3-0-nemo.txt + +NeMo + model.int8.onnx +:::::::::::::::::::::: + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + ./build/bin/sherpa-onnx-offline-speaker-diarization \ + --clustering.num-clusters=4 \ + --segmentation.pyannote-model=./sherpa-onnx-pyannote-segmentation-3-0/model.int8.onnx \ + --embedding.model=./nemo_en_titanet_small.onnx \ + ./0-four-speakers-zh.wav + +The output is given below: + +.. container:: toggle + + .. container:: header + + Click ▶ to see the output. + + .. literalinclude:: ./code/pyannote-segmentation-3-0-nemo.int8.txt + +sherpa-onnx-reverb-diarization-v1 +--------------------------------- + +This model is converted from ``_. +You can find the conversion script at ``_. + +.. caution:: + + It is accessible under a ``non-commercial`` license. + You can find its license at ``_. + +In the following, we describe how to use it together with +a speaker embedding extraction model for speaker diarization. + +Download the model +~~~~~~~~~~~~~~~~~~ + +Please use the following code to download the model: + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + wget https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/sherpa-onnx-reverb-diarization-v1.tar.bz2 + tar xvf sherpa-onnx-reverb-diarization-v1.tar.bz2 + rm sherpa-onnx-reverb-diarization-v1.tar.bz2 + + ls -lh sherpa-onnx-reverb-diarization-v1/{*.onnx,LICENSE,README.md} + +You should see the following output:: + + -rw-r--r-- 1 fangjun staff 11K Oct 17 10:49 sherpa-onnx-reverb-diarization-v1/LICENSE + -rw-r--r-- 1 fangjun staff 320B Oct 17 10:49 sherpa-onnx-reverb-diarization-v1/README.md + -rw-r--r-- 1 fangjun staff 2.3M Oct 17 10:49 sherpa-onnx-reverb-diarization-v1/model.int8.onnx + -rw-r--r-- 1 fangjun staff 9.1M Oct 17 10:49 sherpa-onnx-reverb-diarization-v1/model.onnx + +Usage for speaker diarization +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +First, let's download a test wave file. The model expects wave files of 16kHz, 16-bit and a single channel. + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + wget https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/0-four-speakers-zh.wav + +Next, let's download a model for extracting speaker embeddings. You can find lots of models from +``_. 
We +download two models in this example:: + + wget https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-recongition-models/3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx + wget https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-recongition-models/nemo_en_titanet_small.onnx + +Now let's run it. + +3D-Speaker + model.onnx +::::::::::::::::::::::: + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + ./build/bin/sherpa-onnx-offline-speaker-diarization \ + --clustering.num-clusters=4 \ + --segmentation.pyannote-model=./sherpa-onnx-reverb-diarization-v1/model.onnx \ + --embedding.model=./3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx \ + ./0-four-speakers-zh.wav + + # Note: Since we know there are 4 speakers in ./0-four-speakers-zh.wav file, we + # provide the argument --clustering.num-clusters=4. + # If you don't have such information, please use the argument --clustering.cluster-threshold. + # A larger threshold results in fewer speakers. + # A smaller threshold results in more speakers. + # + # Hint: You can use --clustering.cluster-threshold=0.9 for this specific wave file. + +The output is given below: + +.. container:: toggle + + .. container:: header + + Click ▶ to see the output. + + .. literalinclude:: ./code/revai-segmentation-3-0-3dspeaker.txt + +3D-Speaker + model.int8.onnx +:::::::::::::::::::::::::::: + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + ./build/bin/sherpa-onnx-offline-speaker-diarization \ + --clustering.num-clusters=4 \ + --segmentation.pyannote-model=./sherpa-onnx-reverb-diarization-v1/model.int8.onnx \ + --embedding.model=./3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx \ + ./0-four-speakers-zh.wav + +The output is given below: + +.. container:: toggle + + .. container:: header + + Click ▶ to see the output. + + .. literalinclude:: ./code/revai-segmentation-3-0-3dspeaker.int8.txt + +NeMo + model.onnx +::::::::::::::::: + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + ./build/bin/sherpa-onnx-offline-speaker-diarization \ + --clustering.num-clusters=4 \ + --segmentation.pyannote-model=./sherpa-onnx-reverb-diarization-v1/model.onnx \ + --embedding.model=./nemo_en_titanet_small.onnx \ + ./0-four-speakers-zh.wav + +The output is given below: + +.. container:: toggle + + .. container:: header + + Click ▶ to see the output. + + .. literalinclude:: ./code/revai-segmentation-3-0-nemo.txt + +NeMo + model.int8.onnx +:::::::::::::::::::::: + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + ./build/bin/sherpa-onnx-offline-speaker-diarization \ + --clustering.num-clusters=4 \ + --segmentation.pyannote-model=./sherpa-onnx-reverb-diarization-v1/model.int8.onnx \ + --embedding.model=./nemo_en_titanet_small.onnx \ + ./0-four-speakers-zh.wav + +The output is given below: + +.. container:: toggle + + .. container:: header + + Click ▶ to see the output. + + .. 
literalinclude:: ./code/revai-segmentation-3-0-nemo.int8.txt diff --git a/docs/source/onnx/speaker-diarization/pascal.rst b/docs/source/onnx/speaker-diarization/pascal.rst new file mode 100644 index 000000000..e4c4a7f02 --- /dev/null +++ b/docs/source/onnx/speaker-diarization/pascal.rst @@ -0,0 +1,7 @@ +Pascal API examples +=================== + +Please see + + ``_ + diff --git a/docs/source/onnx/speaker-diarization/python.rst b/docs/source/onnx/speaker-diarization/python.rst new file mode 100644 index 000000000..c19814b65 --- /dev/null +++ b/docs/source/onnx/speaker-diarization/python.rst @@ -0,0 +1,10 @@ +Python API examples +=================== + +.. note:: + + You need to install `sherpa-onnx>=1.10.28`. + + +Please see ``_ +for usages. diff --git a/docs/source/onnx/speaker-diarization/rust.rst b/docs/source/onnx/speaker-diarization/rust.rst new file mode 100644 index 000000000..621607e54 --- /dev/null +++ b/docs/source/onnx/speaker-diarization/rust.rst @@ -0,0 +1,5 @@ +Rust API examples +================= + +Please see ``_ +for usages. diff --git a/docs/source/onnx/speaker-diarization/swift.rst b/docs/source/onnx/speaker-diarization/swift.rst new file mode 100644 index 000000000..0092cdd90 --- /dev/null +++ b/docs/source/onnx/speaker-diarization/swift.rst @@ -0,0 +1,6 @@ +Swift API examples +================== + +Please see + + ``_ diff --git a/docs/source/onnx/speaker-identification/index.rst b/docs/source/onnx/speaker-identification/index.rst new file mode 100644 index 000000000..3f2b80745 --- /dev/null +++ b/docs/source/onnx/speaker-identification/index.rst @@ -0,0 +1,19 @@ +Speaker Identification +====================== + +This page describes how to use `sherpa-onnx`_ for speaker identification. + +Please first follow :ref:`install_sherpa_onnx` and/or :ref:`install_sherpa_onnx_python` +to install `sherpa-onnx`_ before you continue. + + +Pre-trained models can be found at ``_ + +.. hint:: + + You can find Android APKs for each model at the following page + + ``_ + +Please refer to ``_ +for usage examples. diff --git a/docs/source/onnx/speech-enhancment/hf.rst b/docs/source/onnx/speech-enhancment/hf.rst new file mode 100644 index 000000000..4e3e85d54 --- /dev/null +++ b/docs/source/onnx/speech-enhancment/hf.rst @@ -0,0 +1,17 @@ +Hugginface space for speech enhancement +======================================= + +Please visit + + ``_ + +You don't need to install anything to try it in your browser. + +If you don't have access to huggingface, please try the following mirror + + ``_ + +For modelscope users, you can visit + + ``_ + diff --git a/docs/source/onnx/speech-enhancment/index.rst b/docs/source/onnx/speech-enhancment/index.rst new file mode 100644 index 000000000..55e3e17c2 --- /dev/null +++ b/docs/source/onnx/speech-enhancment/index.rst @@ -0,0 +1,10 @@ +Speech enhancement +================== + +This page describes how to use `sherpa-onnx`_ for speech enhancement. + +.. toctree:: + :maxdepth: 5 + + ./models.rst + ./hf.rst diff --git a/docs/source/onnx/speech-enhancment/models.rst b/docs/source/onnx/speech-enhancment/models.rst new file mode 100644 index 000000000..f6b11cfe0 --- /dev/null +++ b/docs/source/onnx/speech-enhancment/models.rst @@ -0,0 +1,118 @@ +Pre-trained models +================== + +Pre-trained models can be found +at ``_ + +gtcrn_simple +------------ + +This model is from ``_. +You can find its paper at ``_. + +In the following, we describe how to download and use it with `sherpa-onnx`_. 
+ +Download the model +~~~~~~~~~~~~~~~~~~ + +Please use the following code to download the model: + +.. code-block:: bash + + wget https://github.com/k2-fsa/sherpa-onnx/releases/download/speech-enhancement-models/gtcrn_simple.onnx + +After downloading, you can check its file size:: + + ls -lh gtcrn_simple.onnx + -rw-r--r-- 1 fangjun staff 523K Mar 10 18:44 gtcrn_simple.onnx + +Then we download a wave file for testing + +.. code-block:: bash + + wget https://github.com/k2-fsa/sherpa-onnx/releases/download/speech-enhancement-models/speech_with_noise.wav + +.. hint:: + + You can find more test wave files at + + ``_ + +The info about the downloaded test wave file is given below:: + + soxi ./speech_with_noise.wav + + Input File : './speech_with_noise.wav' + Channels : 1 + Sample Rate : 16000 + Precision : 16-bit + Duration : 00:00:02.40 = 38363 samples ~ 179.827 CDDA sectors + File Size : 76.8k + Bit Rate : 256k + Sample Encoding: 16-bit Signed Integer PCM + +Now we can run:: + + ./build/bin/sherpa-onnx-offline-denoiser \ + --speech-denoiser-gtcrn-model=./gtcrn_simple.onnx \ + --input-wav=./speech_with_noise.wav \ + --output-wav=./enhanced-16k.wav + +The log of the above command is:: + + /Users/fangjun/open-source/sherpa-onnx/sherpa-onnx/csrc/parse-options.cc:Read:375 ./build/bin/sherpa-onnx-offline-denoiser --speech-denoiser-gtcrn-model=./gtcrn_simple.onnx --input-wav=./speech_with_noise.wav --output-wav=./enhanced-16k.wav + + OfflineSpeechDenoiserConfig(model=OfflineSpeechDenoiserModelConfig(gtcrn=OfflineSpeechDenoiserGtcrnModelConfig(model="./gtcrn_simple.onnx"), num_threads=1, debug=False, provider="cpu")) + Started + Done + Saved to ./enhanced-16k.wav + num threads: 1 + Elapsed seconds: 0.171 s + Real time factor (RTF): 0.171 / 2.398 = 0.071 + +.. code-block:: bash + + ls -lh enhanced-16k.wav + -rw-r--r-- 1 fangjun staff 75K Mar 22 16:08 enhanced-16k.wav + + soxi ./enhanced-16k.wav + + Input File : './enhanced-16k.wav' + Channels : 1 + Sample Rate : 16000 + Precision : 16-bit + Duration : 00:00:02.38 = 38144 samples ~ 178.8 CDDA sectors + File Size : 76.3k + Bit Rate : 256k + Sample Encoding: 16-bit Signed Integer PCM + +For comparison, we give the two wave files below so that you can listen to them. + +.. raw:: html + + + + + + + + + + + + + + + + +
    <!-- A table with columns "Wave filename" and "Content" goes here; it embeds
         audio players for speech_with_noise.wav and enhanced-16k.wav. -->
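Since the test file in this example is 16 kHz, 16-bit, mono (see the ``soxi`` output above), it can be handy to verify a file's format before passing it to ``sherpa-onnx-offline-denoiser``. The sketch below is a small, optional check written with Python's standard ``wave`` module; the expected values are simply those of the test file above, not a general requirement stated by sherpa-onnx.

.. code-block:: python

    #!/usr/bin/env python3
    # Report the format of one or more wave files and flag any file that does
    # not look like the 16 kHz / 16-bit / mono test file used above.
    import sys
    import wave


    def check(path: str) -> None:
        with wave.open(path, "rb") as f:
            rate = f.getframerate()
            width = f.getsampwidth()  # bytes per sample; 2 means 16-bit
            channels = f.getnchannels()
            duration = f.getnframes() / rate

        print(f"{path}: {rate} Hz, {8 * width}-bit, {channels} channel(s), {duration:.2f} s")
        if (rate, width, channels) != (16000, 2, 1):
            print("  -> format differs from the test file used in this example")


    if __name__ == "__main__":
        for wav in sys.argv[1:]:
            check(wav)

For instance, ``python3 ./check-wave.py ./speech_with_noise.wav ./enhanced-16k.wav`` should report 16000 Hz, 16-bit, and 1 channel for both files.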
diff --git a/docs/source/onnx/spoken-language-identification/index.rst b/docs/source/onnx/spoken-language-identification/index.rst new file mode 100644 index 000000000..bb2906cde --- /dev/null +++ b/docs/source/onnx/spoken-language-identification/index.rst @@ -0,0 +1,9 @@ +Spoken language identification +============================== + +This section describes how to use `sherpa-onnx`_ for spoken language identification. + +.. toctree:: + :maxdepth: 5 + + ./pretrained_models.rst diff --git a/docs/source/onnx/spoken-language-identification/pretrained_models.rst b/docs/source/onnx/spoken-language-identification/pretrained_models.rst new file mode 100644 index 000000000..1dca1cb0f --- /dev/null +++ b/docs/source/onnx/spoken-language-identification/pretrained_models.rst @@ -0,0 +1,144 @@ +Pre-trained models +================== + +whisper +------- + +Currently, we support whisper multilingual models for spoken language identification. + +.. list-table:: + + * - Model type + - Huggingface repo + * - ``tiny`` + - ``_ + * - ``base`` + - ``_ + * - ``small`` + - ``_ + * - ``medium`` + - ``_ + +.. hint:: + + You can also download them from + + ``_ + + +In the following, we use the ``tiny`` model as an example. You can +replace ``tiny`` with ``base``, ``small``, or ``medium`` and everything still holds. + +Download the model +^^^^^^^^^^^^^^^^^^ + +Please use the following commands to download the ``tiny`` model:: + + wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-whisper-tiny.tar.bz2 + + tar xvf sherpa-onnx-whisper-tiny.tar.bz2 + rm sherpa-onnx-whisper-tiny.tar.bz2 + +You should find the following files after unzipping:: + + -rw-r--r-- 1 fangjun staff 427B Jan 31 16:21 README.md + -rwxr-xr-x 1 fangjun staff 19K Jan 31 16:21 export-onnx.py + -rw-r--r-- 1 fangjun staff 15B Jan 31 16:21 requirements.txt + -rwxr-xr-x 1 fangjun staff 12K Jan 31 16:21 test.py + drwxr-xr-x 6 fangjun staff 192B Jan 31 16:22 test_wavs + -rw-r--r-- 1 fangjun staff 86M Jan 31 16:22 tiny-decoder.int8.onnx + -rw-r--r-- 1 fangjun staff 109M Jan 31 16:22 tiny-decoder.onnx + -rw-r--r-- 1 fangjun staff 12M Jan 31 16:22 tiny-encoder.int8.onnx + -rw-r--r-- 1 fangjun staff 36M Jan 31 16:22 tiny-encoder.onnx + -rw-r--r-- 1 fangjun staff 798K Jan 31 16:22 tiny-tokens.txt + +Download test waves +^^^^^^^^^^^^^^^^^^^ + +Please use the following command to download test data:: + + wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/spoken-language-identification-test-wavs.tar.bz2 + + tar xvf spoken-language-identification-test-wavs.tar.bz2 + rm spoken-language-identification-test-wavs.tar.bz2 + +You can find the following test files after unzipping:: + + -rw-r--r-- 1 fangjun staff 222K Mar 24 12:51 ar-arabic.wav + -rw-r--r--@ 1 fangjun staff 137K Mar 24 13:09 bg-bulgarian.wav + -rw-r--r-- 1 fangjun staff 83K Mar 24 13:07 cs-czech.wav + -rw-r--r-- 1 fangjun staff 112K Mar 24 13:07 da-danish.wav + -rw-r--r-- 1 fangjun staff 199K Mar 24 12:50 de-german.wav + -rw-r--r-- 1 fangjun staff 207K Mar 24 13:06 el-greek.wav + -rw-r--r-- 1 fangjun staff 31K Mar 24 12:45 en-english.wav + -rw-r--r--@ 1 fangjun staff 77K Mar 24 12:23 es-spanish.wav + -rw-r--r--@ 1 fangjun staff 371K Mar 24 12:21 fa-persian.wav + -rw-r--r-- 1 fangjun staff 136K Mar 24 13:08 fi-finnish.wav + -rw-r--r-- 1 fangjun staff 112K Mar 24 12:49 fr-french.wav + -rw-r--r-- 1 fangjun staff 179K Mar 24 12:47 hi-hindi.wav + -rw-r--r--@ 1 fangjun staff 177K Mar 24 12:29 hr-croatian.wav + -rw-r--r-- 1 fangjun staff 167K Mar 24 12:53 
id-indonesian.wav + -rw-r--r-- 1 fangjun staff 136K Mar 24 12:54 it-italian.wav + -rw-r--r-- 1 fangjun staff 46K Mar 24 12:44 ja-japanese.wav + -rw-r--r--@ 1 fangjun staff 122K Mar 24 12:52 ko-korean.wav + -rw-r--r-- 1 fangjun staff 85K Mar 24 12:54 nl-dutch.wav + -rw-r--r--@ 1 fangjun staff 241K Mar 24 12:38 no-norwegian.wav + -rw-r--r--@ 1 fangjun staff 121K Mar 24 12:35 po-polish.wav + -rw-r--r-- 1 fangjun staff 166K Mar 24 12:48 pt-portuguese.wav + -rw-r--r--@ 1 fangjun staff 144K Mar 24 12:33 ro-romanian.wav + -rw-r--r-- 1 fangjun staff 111K Mar 24 12:51 ru-russian.wav + -rw-r--r--@ 1 fangjun staff 239K Mar 24 12:40 sk-slovak.wav + -rw-r--r-- 1 fangjun staff 196K Mar 24 13:01 sv-swedish.wav + -rw-r--r-- 1 fangjun staff 106K Mar 24 13:14 ta-tamil.wav + -rw-r--r-- 1 fangjun staff 104K Mar 24 13:02 tl-tagalog.wav + -rw-r--r-- 1 fangjun staff 76K Mar 24 13:00 tr-turkish.wav + -rw-r--r-- 1 fangjun staff 188K Mar 24 13:05 uk-ukrainian.wav + -rw-r--r-- 1 fangjun staff 181K Mar 24 13:20 zh-chinese.wav + +Test with Python APIs +^^^^^^^^^^^^^^^^^^^^^ + +After installing `sherpa-onnx`_ either from source or from using ``pip install sherpa-onnx``, you can run:: + + python3 ./python-api-examples/spoken-language-identification.py \ + --whisper-encoder ./sherpa-onnx-whisper-tiny/tiny-encoder.int8.onnx \ + --whisper-decoder ./sherpa-onnx-whisper-tiny/tiny-decoder.onnx \ + ./spoken-language-identification-test-wavs/de-german.wav + +You should see the following output:: + + + 2024-04-17 15:53:23,104 INFO [spoken-language-identification.py:158] File: ./spoken-language-identification-test-wavs/de-german.wav + 2024-04-17 15:53:23,104 INFO [spoken-language-identification.py:159] Detected language: de + 2024-04-17 15:53:23,104 INFO [spoken-language-identification.py:160] Elapsed seconds: 0.275 + 2024-04-17 15:53:23,105 INFO [spoken-language-identification.py:161] Audio duration in seconds: 6.374 + 2024-04-17 15:53:23,105 INFO [spoken-language-identification.py:162] RTF: 0.275/6.374 = 0.043 + + +.. hint:: + + You can find ``spoken-language-identification.py`` at + + ``_ + +Android APKs +^^^^^^^^^^^^ + +You can find pre-built Android APKs for spoken language identification at the following address: + + ``_ + +Huggingface space +^^^^^^^^^^^^^^^^^ + +We provide a huggingface space for spoken language identification. + +You can visit the following URL: + + ``_ + +.. note:: + + For Chinese users, you can use the following mirror: + + ``_ diff --git a/docs/source/onnx/swift-api/build.rst b/docs/source/onnx/swift-api/build.rst new file mode 100644 index 000000000..11fb67c33 --- /dev/null +++ b/docs/source/onnx/swift-api/build.rst @@ -0,0 +1,12 @@ +Build +===== + +Please use the following script to build `sherpa-onnx`_ for Swift API: + + ``_ + +The following is an example command:: + + git clone https://github.com/k2-fsa/sherpa-onnx + cd sherpa-onnx + ./build-swift-macos.sh diff --git a/docs/source/onnx/swift-api/examples.rst b/docs/source/onnx/swift-api/examples.rst new file mode 100644 index 000000000..16d6c5820 --- /dev/null +++ b/docs/source/onnx/swift-api/examples.rst @@ -0,0 +1,13 @@ +Examples +======== + +Please see ``_ + + +Each example has a corresponding shell script. Please use the shell script to run it. 
+ + +For instance, + + - to run the text-to-speech example, please use ``_ + - to run the speech-to-text example, please use ``_ diff --git a/docs/source/onnx/swift-api/index.rst b/docs/source/onnx/swift-api/index.rst new file mode 100644 index 000000000..c40cd508c --- /dev/null +++ b/docs/source/onnx/swift-api/index.rst @@ -0,0 +1,10 @@ +.. _sherpa-onnx-swift-api: + +Swift API +========= + +.. toctree:: + :maxdepth: 5 + + ./build.rst + ./examples.rst diff --git a/docs/source/onnx/tts/code/piper.py b/docs/source/onnx/tts/code/piper.py new file mode 100644 index 000000000..795303355 --- /dev/null +++ b/docs/source/onnx/tts/code/piper.py @@ -0,0 +1,68 @@ +#!/usr/bin/env python3 + +import json +import os +from typing import Any, Dict + +import onnx + + +def add_meta_data(filename: str, meta_data: Dict[str, Any]): + """Add meta data to an ONNX model. It is changed in-place. + + Args: + filename: + Filename of the ONNX model to be changed. + meta_data: + Key-value pairs. + """ + model = onnx.load(filename) + for key, value in meta_data.items(): + meta = model.metadata_props.add() + meta.key = key + meta.value = str(value) + + onnx.save(model, filename) + + +def load_config(model): + with open(f"{model}.json", "r") as file: + config = json.load(file) + return config + + +def generate_tokens(config): + id_map = config["phoneme_id_map"] + with open("tokens.txt", "w", encoding="utf-8") as f: + for s, i in id_map.items(): + f.write(f"{s} {i[0]}\n") + print("Generated tokens.txt") + + +def main(): + # Caution: Please change the filename + filename = "en_US-amy-low.onnx" + + # The rest of the file should not be changed. + # You only need to change the above filename = "xxx.onxx" in this file + + config = load_config(filename) + + print("generate tokens") + generate_tokens(config) + + print("add model metadata") + meta_data = { + "model_type": "vits", + "comment": "piper", # must be piper for models from piper + "language": config["language"]["name_english"], + "voice": config["espeak"]["voice"], # e.g., en-us + "has_espeak": 1, + "n_speakers": config["num_speakers"], + "sample_rate": config["audio"]["sample_rate"], + } + print(meta_data) + add_meta_data(filename, meta_data) + + +main() diff --git a/docs/source/onnx/tts/code/vits-mms.py b/docs/source/onnx/tts/code/vits-mms.py new file mode 100755 index 000000000..693da4292 --- /dev/null +++ b/docs/source/onnx/tts/code/vits-mms.py @@ -0,0 +1,150 @@ +#!/usr/bin/env python3 + +import collections +import os +from typing import Any, Dict + +import onnx +import torch +from vits import commons, utils +from vits.models import SynthesizerTrn + + +class OnnxModel(torch.nn.Module): + def __init__(self, model: SynthesizerTrn): + super().__init__() + self.model = model + + def forward( + self, + x, + x_lengths, + noise_scale=0.667, + length_scale=1.0, + noise_scale_w=0.8, + ): + return self.model.infer( + x=x, + x_lengths=x_lengths, + noise_scale=noise_scale, + length_scale=length_scale, + noise_scale_w=noise_scale_w, + )[0] + + +def add_meta_data(filename: str, meta_data: Dict[str, Any]): + """Add meta data to an ONNX model. It is changed in-place. + + Args: + filename: + Filename of the ONNX model to be changed. + meta_data: + Key-value pairs. 
+ """ + model = onnx.load(filename) + for key, value in meta_data.items(): + meta = model.metadata_props.add() + meta.key = key + meta.value = str(value) + + onnx.save(model, filename) + + +def load_vocab(): + return [ + x.replace("\n", "") for x in open("vocab.txt", encoding="utf-8").readlines() + ] + + +@torch.no_grad() +def main(): + hps = utils.get_hparams_from_file("config.json") + is_uroman = hps.data.training_files.split(".")[-1] == "uroman" + if is_uroman: + raise ValueError("We don't support uroman!") + + symbols = load_vocab() + + # Now generate tokens.txt + all_upper_tokens = [i.upper() for i in symbols] + duplicate = set( + [ + item + for item, count in collections.Counter(all_upper_tokens).items() + if count > 1 + ] + ) + + print("generate tokens.txt") + + with open("tokens.txt", "w", encoding="utf-8") as f: + for idx, token in enumerate(symbols): + f.write(f"{token} {idx}\n") + + # both upper case and lower case correspond to the same ID + if ( + token.lower() != token.upper() + and len(token.upper()) == 1 + and token.upper() not in duplicate + ): + f.write(f"{token.upper()} {idx}\n") + + net_g = SynthesizerTrn( + len(symbols), + hps.data.filter_length // 2 + 1, + hps.train.segment_size // hps.data.hop_length, + **hps.model, + ) + net_g.cpu() + _ = net_g.eval() + + _ = utils.load_checkpoint("G_100000.pth", net_g, None) + + model = OnnxModel(net_g) + + x = torch.randint(low=1, high=10, size=(50,), dtype=torch.int64) + x = x.unsqueeze(0) + + x_length = torch.tensor([x.shape[1]], dtype=torch.int64) + noise_scale = torch.tensor([1], dtype=torch.float32) + length_scale = torch.tensor([1], dtype=torch.float32) + noise_scale_w = torch.tensor([1], dtype=torch.float32) + + opset_version = 13 + + filename = "model.onnx" + + torch.onnx.export( + model, + (x, x_length, noise_scale, length_scale, noise_scale_w), + filename, + opset_version=opset_version, + input_names=[ + "x", + "x_length", + "noise_scale", + "length_scale", + "noise_scale_w", + ], + output_names=["y"], + dynamic_axes={ + "x": {0: "N", 1: "L"}, # n_audio is also known as batch_size + "x_length": {0: "N"}, + "y": {0: "N", 2: "L"}, + }, + ) + meta_data = { + "model_type": "vits", + "comment": "mms", + "url": "https://huggingface.co/facebook/mms-tts/tree/main", + "add_blank": int(hps.data.add_blank), + "language": os.environ.get("language", "unknown"), + "frontend": "characters", + "n_speakers": int(hps.data.n_speakers), + "sample_rate": hps.data.sampling_rate, + } + print("meta_data", meta_data) + add_meta_data(filename=filename, meta_data=meta_data) + + +main() diff --git a/docs/source/onnx/tts/faq.rst b/docs/source/onnx/tts/faq.rst new file mode 100644 index 000000000..4e31889f1 --- /dev/null +++ b/docs/source/onnx/tts/faq.rst @@ -0,0 +1,98 @@ +.. _tts_faqs: + +Frequently Asked Question (FAQs) +================================ + +Is there a colab notebook +------------------------- + +Yes, we have one. Please see + + ``_ + +It shows you + + - How to install `sherpa-onnx`_ + - How to download pre-trained text-to-speech (TTS) models + - How to use `sherpa-onnx`_ with pre-trained models for TTS + +.. _how_to_enable_utf8_on_windows: + +How to enable UTF-8 on Windows +------------------------------ + +For Chinese Users: 如果英文模型正常,中文模型不正常。请看下面的解决方法 +和这个 `issue `_ + +Please see `win11 下永久设置cmd编码为utf-8 `_ + +.. 
image:: ./pic/utf8-setting.jpg + :alt: screenshot of how to set utf8 on windows + +How to install sherpa-onnx for TTS +---------------------------------- + +For Python users +^^^^^^^^^^^^^^^^ + +The fastest way to install `sherpa-onnx`_ for TTS is: + +.. code-block:: bash + + pip install sherpa-onnx + +The above command does ``NOT`` require you to install a C++ compiler and it +supports a variety of platforms, such as: + + - Linux + + - x64 + - arm, e.g., 32-bit Raspberry Pi + - arm64, e.g., 64-bit Raspberry Pi + + - Windows + + - x64, e.g., 64-bit Windows + - x86, e.g., 32-bit Windows + + - macOS + + - x64 + - arm64, e.g., M1 and M2 chips + +If you want to build the `sherpa-onnx`_ Python package from source, please +refer to :ref:`install_sherpa_onnx_python`. + +After installation, please refer to +``_ +for example usage. + +.. hint:: + + ``pip install sherpa-onnx`` also installs an executable ``sherpa-onnx-offline-tts``. + The directory where it is installed should be already on your ``PATH`` after you + activate your Python virtual environment. + + You can run + + .. code-block:: bash + + sherpa-onnx-offline-tts --help + + in your terminal to get the help information about it. + + +Build from source +^^^^^^^^^^^^^^^^^ + +Please refer to :ref:`install_sherpa_onnx`. + +Where to get pre-trained TTS models +----------------------------------- + +Please refer to :ref:`onnx-tts-pretrained-models`. + +How to handle OOVs +------------------ + +Please add them to ``lexicon.txt``. diff --git a/docs/source/onnx/tts/hf-space.rst b/docs/source/onnx/tts/hf-space.rst new file mode 100644 index 000000000..4a793b26c --- /dev/null +++ b/docs/source/onnx/tts/hf-space.rst @@ -0,0 +1,24 @@ +Huggingface space +================= + +We provide a huggingface space where you can try text-to-speech with +`sherpa-onnx`_ from within your browser without installing anything. + +.. hint:: + + We also have spaces using `WebAssembly`_ for text-to-speech. Please + see :ref:`try sherpa onnx wasm with huggingface`. + +All you need is a browser, either running on your desk computer, your phone, or +your iPad, etc. + + +Please visit + + ``_ + +.. image:: ./pic/hf-space.png + :alt: screenshot of ``_ + :target: https://huggingface.co/spaces/k2-fsa/text-to-speech + + diff --git a/docs/source/onnx/tts/index.rst b/docs/source/onnx/tts/index.rst new file mode 100644 index 000000000..e27649d12 --- /dev/null +++ b/docs/source/onnx/tts/index.rst @@ -0,0 +1,18 @@ +Text-to-speech (TTS) +==================== + +This page describes how to use `sherpa-onnx`_ for text-to-speech (TTS). + + +Please first follow :ref:`install_sherpa_onnx` and/or :ref:`install_sherpa_onnx_python` +to install `sherpa-onnx`_ before you continue. + +.. toctree:: + :maxdepth: 5 + + ./hf-space.rst + ./pretrained_models/index + ./wasm/index + ./piper + ./mms + ./faq diff --git a/docs/source/onnx/tts/mms.rst b/docs/source/onnx/tts/mms.rst new file mode 100644 index 000000000..3e7518c84 --- /dev/null +++ b/docs/source/onnx/tts/mms.rst @@ -0,0 +1,125 @@ +MMS +=== + +This section describes how to convert models +from ``_ +to `sherpa-onnx`_. + +Note that `facebook/mms-tts `_ +supports more than 1000 languages. You can try models from +`facebook/mms-tts `_ at +the huggingface space ``_. + +You can try the converted models by visiting ``_. +To download the converted models, please visit ``_. +If a filename contains ``vits-mms``, it means the model is from +`facebook/mms-tts `_. + +Install dependencies +-------------------- + +.. 
code-block:: bash + + pip install -qq onnx scipy Cython + pip install -qq torch==1.13.0+cpu -f https://download.pytorch.org/whl/torch_stable.html + +Download the model file +----------------------- + +Suppose that we want to convert the English model, we need to +use the following commands to download the model: + +.. code-block:: bash + + name=eng + wget -q https://huggingface.co/facebook/mms-tts/resolve/main/models/$name/G_100000.pth + wget -q https://huggingface.co/facebook/mms-tts/resolve/main/models/$name/config.json + wget -q https://huggingface.co/facebook/mms-tts/resolve/main/models/$name/vocab.txt + +Download MMS source code +------------------------ + +.. code-block:: bash + + git clone https://huggingface.co/spaces/mms-meta/MMS + export PYTHONPATH=$PWD/MMS:$PYTHONPATH + export PYTHONPATH=$PWD/MMS/vits:$PYTHONPATH + + pushd MMS/vits/monotonic_align + + python3 setup.py build + + ls -lh build/ + ls -lh build/lib*/ + ls -lh build/lib*/*/ + + cp build/lib*/vits/monotonic_align/core*.so . + + sed -i.bak s/.monotonic_align.core/.core/g ./__init__.py + popd + +Convert the model +----------------- + +Please save the following code into a file with name ``./vits-mms.py``: + +.. literalinclude:: ./code/vits-mms.py + +The you can run it with: + +.. code-block:: bash + + export PYTHONPATH=$PWD/MMS:$PYTHONPATH + export PYTHONPATH=$PWD/MMS/vits:$PYTHONPATH + export lang=eng + python3 ./vits-mms.py + +It will generate the following two files: + + - ``model.onnx`` + - ``tokens.txt`` + +Use the converted model +----------------------- + +We can use the converted model with the following command after installing +`sherpa-onnx`_. + +.. code-block:: bash + + ./build/bin/sherpa-onnx-offline-tts \ + --vits-model=./model.onnx \ + --vits-tokens=./tokens.txt \ + --debug=1 \ + --output-filename=./mms-eng.wav \ + "How are you doing today? This is a text-to-speech application using models from facebook with next generation Kaldi" + +The above command should generate a wave file ``mms-eng.wav``. + +.. raw:: html + + + + + + + + + + + + +
    <!-- A table with columns "Wave filename", "Content", and "Text" goes here; it embeds
         an audio player for mms-eng.wav with the text: "How are you doing today? This is
         a text-to-speech application using models from facebook with next generation Kaldi" -->
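If you want to double-check the export before moving on, the metadata written by ``./vits-mms.py`` can be read back with the same ``onnx`` package that was used to add it. The following is a short, optional sanity check; it is not part of the conversion itself.

.. code-block:: python

    #!/usr/bin/env python3
    # Print the metadata stored in the exported model so that you can compare
    # it with the meta_data dict printed by ./vits-mms.py.
    import onnx

    model = onnx.load("model.onnx")
    for prop in model.metadata_props:
        print(f"{prop.key} = {prop.value}")

    # A model exported by ./vits-mms.py should list model_type=vits and
    # comment=mms among its entries.
    keys = {p.key for p in model.metadata_props}
    assert {"model_type", "comment"}.issubset(keys), "metadata is missing"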
+ + +Congratulations! You have successfully converted a model from `MMS`_ and run it with `sherpa-onnx`_. + +We are using ``eng`` in this section as an example, you can replace it with other languages, such as +``deu`` for German, ``fra`` for French, etc. diff --git a/docs/source/onnx/tts/pic/hf-space.png b/docs/source/onnx/tts/pic/hf-space.png new file mode 100644 index 000000000..7b8a1d5bb Binary files /dev/null and b/docs/source/onnx/tts/pic/hf-space.png differ diff --git a/docs/source/onnx/tts/pic/utf8-setting.jpg b/docs/source/onnx/tts/pic/utf8-setting.jpg new file mode 100644 index 000000000..d1cabb03f Binary files /dev/null and b/docs/source/onnx/tts/pic/utf8-setting.jpg differ diff --git a/docs/source/onnx/tts/piper.rst b/docs/source/onnx/tts/piper.rst new file mode 100644 index 000000000..23122f674 --- /dev/null +++ b/docs/source/onnx/tts/piper.rst @@ -0,0 +1,128 @@ +Piper +===== + +In this section, we describe how to convert `piper`_ pre-trained models +from ``_. + +.. hint:: + + You can find ``all`` of the converted models from `piper`_ in the following address: + + ``_ + + If you want to convert your own pre-trained `piper`_ models or if you want to + learn how the conversion works, please read on. + + Otherwise, you only need to download the converted models from the above link. + +Note that there are pre-trained models for over 30 languages from `piper`_. All models +share the same converting method, so we use an American English model in this +section as an example. + +Install dependencies +-------------------- + +.. code-block:: bash + + pip install onnx onnxruntime + +.. hint:: + + We suggest that you always use the latest version of onnxruntime. + +Find the pre-trained model from piper +------------------------------------- + +All American English models from `piper`_ can be found at +``_. + +We use ``_ as +an example in this section. + +Download the pre-trained model +------------------------------ + +We need to download two files for each model: + +.. code-block:: bash + + wget https://huggingface.co/rhasspy/piper-voices/resolve/main/en/en_US/amy/low/en_US-amy-low.onnx + wget https://huggingface.co/rhasspy/piper-voices/resolve/main/en/en_US/amy/low/en_US-amy-low.onnx.json + +Add meta data to the onnx model +------------------------------- + +Please use the following code to add meta data to the downloaded onnx model. + +.. literalinclude:: ./code/piper.py + :language: python + +After running the above script, your ``en_US-amy-low.onnx`` is updated with +meta data and it also generates a new file ``tokens.txt``. + +From now on, you don't need the config json file ``en_US-amy-low.onnx.json`` any longer. + +Download espeak-ng-data +----------------------- + +.. code-block:: bash + + wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/espeak-ng-data.tar.bz2 + tar xf espeak-ng-data.tar.bz2 + +Note that ``espeak-ng-data.tar.bz2`` is shared by all models from `piper`_, no matter +which language your are using for your model. + +Test your converted model +------------------------- + +To have a quick test of your converted model, you can use + +.. code-block:: bash + + pip install sherpa-onnx + +to install `sherpa-onnx`_ and then use the following commands to test your model: + +.. 
code-block:: bash + + # The command "pip install sherpa-onnx" will install several binaries, + # including the following one + + which sherpa-onnx-offline-tts + + sherpa-onnx-offline-tts \ + --vits-model=./en_US-amy-low.onnx \ + --vits-tokens=./tokens.txt \ + --vits-data-dir=./espeak-ng-data \ + --output-filename=./test.wav \ + "How are you doing? This is a text-to-speech application using next generation Kaldi." + +The above command should generate a wave file ``test.wav``. + +.. raw:: html + + + + + + + + + + + + +
Wave filenameContentText
test.wav + + + How are you doing? This is a text-to-speech application using next generation Kaldi. +
+ + +Congratulations! You have successfully converted a model from `piper`_ and run it with `sherpa-onnx`_. + + diff --git a/docs/source/onnx/tts/pretrained_models/index.rst b/docs/source/onnx/tts/pretrained_models/index.rst new file mode 100644 index 000000000..c0c34efbb --- /dev/null +++ b/docs/source/onnx/tts/pretrained_models/index.rst @@ -0,0 +1,20 @@ +.. _onnx-tts-pretrained-models: + +Pre-trained models +================== + +This page list pre-trained models for text-to-speech. + +.. hint:: + + Please install `git-lfs `_ before you continue. + + Otherwise, you will be ``SAD`` later. + +.. toctree:: + :maxdepth: 5 + + ./rtf + ./matcha + ./kokoro + ./vits diff --git a/docs/source/onnx/tts/pretrained_models/kokoro.rst b/docs/source/onnx/tts/pretrained_models/kokoro.rst new file mode 100644 index 000000000..1fd164f0f --- /dev/null +++ b/docs/source/onnx/tts/pretrained_models/kokoro.rst @@ -0,0 +1,1488 @@ +Kokoro +====== + +This page lists pre-trained models from ``_. + +.. _kokoro-multi-lang-v1_0: + +kokoro-multi-lang-v1_0 (Chinese + English, 53 speakers) +------------------------------------------------------- + +This model contains 53 speakers. The ONNX model is from +``_ + +.. hint:: + + If you want to convert the kokoro 1.0 onnx model to sherpa-onnx, please + see ``_ + +This model in sherpa-onnx supports both English and Chinese. + +In the following, we describe how to download it and use it with `sherpa-onnx`_. + +.. warning:: + + It is a multi-lingual model, but we only add English and Chinese support for it. + +Download the model +~~~~~~~~~~~~~~~~~~ + +Please use the following commands to download it. + +.. code-block:: bash + + cd /path/to/sherpa-onnx + wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/kokoro-multi-lang-v1_0.tar.bz2 + tar xf kokoro-multi-lang-v1_0.tar.bz2 + rm kokoro-multi-lang-v1_0.tar.bz2 + +Please check that the file sizes of the pre-trained models are correct. See +the file sizes of ``*.onnx`` files below. + +.. code-block:: + + ls -lh kokoro-multi-lang-v1_0/ + total 718872 + -rw-r--r-- 1 fangjun staff 11K Feb 7 10:16 LICENSE + -rw-r--r-- 1 fangjun staff 50B Feb 7 10:18 README.md + -rw-r--r-- 1 fangjun staff 58K Feb 7 10:18 date-zh.fst + drwxr-xr-x 9 fangjun staff 288B Apr 19 2024 dict + drwxr-xr-x 122 fangjun staff 3.8K Nov 28 2023 espeak-ng-data + -rw-r--r-- 1 fangjun staff 6.0M Feb 7 10:18 lexicon-gb-en.txt + -rw-r--r-- 1 fangjun staff 5.6M Feb 7 10:18 lexicon-us-en.txt + -rw-r--r-- 1 fangjun staff 2.3M Feb 7 10:18 lexicon-zh.txt + -rw-r--r-- 1 fangjun staff 310M Feb 7 10:18 model.onnx + -rw-r--r-- 1 fangjun staff 63K Feb 7 10:18 number-zh.fst + -rw-r--r-- 1 fangjun staff 87K Feb 7 10:18 phone-zh.fst + -rw-r--r-- 1 fangjun staff 687B Feb 7 10:18 tokens.txt + -rw-r--r-- 1 fangjun staff 26M Feb 7 10:18 voices.bin + +Map between speaker ID and speaker name +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +This model contains 53 speakers and we use integer IDs ``0-52`` to represent +each speaker. + +Please visit ``_ to listen to +audio samples from different speakers. + +The map is given below: + + - **ID to Speaker** + + .. 
code-block:: + + 0->af_alloy, 1->af_aoede, 2->af_bella, 3->af_heart, 4->af_jessica, + 5->af_kore, 6->af_nicole, 7->af_nova, 8->af_river, 9->af_sarah, + 10->af_sky, 11->am_adam, 12->am_echo, 13->am_eric, 14->am_fenrir, + 15->am_liam, 16->am_michael, 17->am_onyx, 18->am_puck, 19->am_santa, + 20->bf_alice, 21->bf_emma, 22->bf_isabella, 23->bf_lily, 24->bm_daniel, + 25->bm_fable, 26->bm_george, 27->bm_lewis, 28->ef_dora, 29->em_alex, + 30->ff_siwis, 31->hf_alpha, 32->hf_beta, 33->hm_omega, 34->hm_psi, + 35->if_sara, 36->im_nicola, 37->jf_alpha, 38->jf_gongitsune, + 39->jf_nezumi, 40->jf_tebukuro, 41->jm_kumo, + 42->pf_dora, 43->pm_alex, 44->pm_santa, 45->zf_xiaobei, 46->zf_xiaoni, + 47->zf_xiaoxiao, 48->zf_xiaoyi,49->zm_yunjian, 50->zm_yunxi, + 51->zm_yunxia, 52->zm_yunyang, + + - **Speaker to ID** + + .. code-block:: + + af_alloy->0, af_aoede->1, af_bella->2, af_heart->3, af_jessica->4, + af_kore->5, af_nicole->6, af_nova->7, af_river->8, af_sarah->9, + af_sky->10, am_adam->11, am_echo->12, am_eric->13, am_fenrir->14, + am_liam->15, am_michael->16, am_onyx->17, am_puck->18, am_santa->19, + bf_alice->20, bf_emma->21, bf_isabella->22, bf_lily->23, bm_daniel->24, + bm_fable->25, bm_george->26, bm_lewis->27, ef_dora->28, em_alex->29, + ff_siwis->30, hf_alpha->31, hf_beta->32, hm_omega->33, hm_psi->34, + if_sara->35, im_nicola->36, jf_alpha->37, jf_gongitsune->38, + jf_nezumi->39, jf_tebukuro->40, jm_kumo->41, pf_dora->42, pm_alex->43, + pm_santa->44, zf_xiaobei->45, zf_xiaoni->46, zf_xiaoxiao->47, + zf_xiaoyi->48, zm_yunjian->49, zm_yunxi->50, zm_yunxia->51, + zm_yunyang->52 + +Generate speech with executables compiled from C++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. container:: toggle + + .. container:: header + + Click ▶ to see it. + + .. code-block:: bash + + cd /path/to/sherpa-onnx + + for sid in $(seq 0 19); do + build/bin/sherpa-onnx-offline-tts \ + --debug=0 \ + --kokoro-model=./kokoro-multi-lang-v1_0/model.onnx \ + --kokoro-voices=./kokoro-multi-lang-v1_0/voices.bin \ + --kokoro-tokens=./kokoro-multi-lang-v1_0/tokens.txt \ + --kokoro-data-dir=./kokoro-multi-lang-v1_0/espeak-ng-data \ + --kokoro-dict-dir=./kokoro-multi-lang-v1_0/dict \ + --kokoro-lexicon=./kokoro-multi-lang-v1_0/lexicon-us-en.txt,./kokoro-multi-lang-v1_0/lexicon-zh.txt \ + --num-threads=2 \ + --sid=$sid \ + --output-filename="./kokoro-1.0-sid-$sid-en-us.wav" \ + "Friends fell out often because life was changing so fast. The easiest thing in the world was to lose touch with someone." + done + + for sid in $(seq 20 27); do + build/bin/sherpa-onnx-offline-tts \ + --debug=0 \ + --kokoro-model=./kokoro-multi-lang-v1_0/model.onnx \ + --kokoro-voices=./kokoro-multi-lang-v1_0/voices.bin \ + --kokoro-tokens=./kokoro-multi-lang-v1_0/tokens.txt \ + --kokoro-data-dir=./kokoro-multi-lang-v1_0/espeak-ng-data \ + --kokoro-dict-dir=./kokoro-multi-lang-v1_0/dict \ + --kokoro-lexicon=./kokoro-multi-lang-v1_0/lexicon-us-en.txt,./kokoro-multi-lang-v1_0/lexicon-zh.txt \ + --num-threads=2 \ + --sid=$sid \ + --output-filename="./kokoro-1.0-sid-$sid-en-gb.wav" \ + "Friends fell out often because life was changing so fast. The easiest thing in the world was to lose touch with someone." 
+ done + + + build/bin/sherpa-onnx-offline-tts \ + --debug=0 \ + --kokoro-model=./kokoro-multi-lang-v1_0/model.onnx \ + --kokoro-voices=./kokoro-multi-lang-v1_0/voices.bin \ + --kokoro-tokens=./kokoro-multi-lang-v1_0/tokens.txt \ + --kokoro-data-dir=./kokoro-multi-lang-v1_0/espeak-ng-data \ + --kokoro-dict-dir=./kokoro-multi-lang-v1_0/dict \ + --kokoro-lexicon=./kokoro-multi-lang-v1_0/lexicon-us-en.txt,./kokoro-multi-lang-v1_0/lexicon-zh.txt \ + --num-threads=2 \ + --sid=23 \ + --output-filename="./kokoro-1.0-sid-23-en-gb.wav" \ + "Liliana, the most beautiful and lovely assistant of our team" + + build/bin/sherpa-onnx-offline-tts \ + --debug=0 \ + --kokoro-model=./kokoro-multi-lang-v1_0/model.onnx \ + --kokoro-voices=./kokoro-multi-lang-v1_0/voices.bin \ + --kokoro-tokens=./kokoro-multi-lang-v1_0/tokens.txt \ + --kokoro-data-dir=./kokoro-multi-lang-v1_0/espeak-ng-data \ + --kokoro-dict-dir=./kokoro-multi-lang-v1_0/dict \ + --kokoro-lexicon=./kokoro-multi-lang-v1_0/lexicon-us-en.txt,./kokoro-multi-lang-v1_0/lexicon-zh.txt \ + --num-threads=2 \ + --sid=24 \ + --output-filename="./kokoro-1.0-sid-24-en-gb.wav" \ + "Liliana, the most beautiful and lovely assistant of our team" + + build/bin/sherpa-onnx-offline-tts \ + --debug=0 \ + --kokoro-model=./kokoro-multi-lang-v1_0/model.onnx \ + --kokoro-voices=./kokoro-multi-lang-v1_0/voices.bin \ + --kokoro-tokens=./kokoro-multi-lang-v1_0/tokens.txt \ + --kokoro-data-dir=./kokoro-multi-lang-v1_0/espeak-ng-data \ + --kokoro-dict-dir=./kokoro-multi-lang-v1_0/dict \ + --kokoro-lexicon=./kokoro-multi-lang-v1_0/lexicon-us-en.txt,./kokoro-multi-lang-v1_0/lexicon-zh.txt \ + --num-threads=2 \ + --sid=45 \ + --output-filename="./kokoro-1.0-sid-45-zh.wav" \ + "小米的核心价值观是什么?答案是真诚热爱!" + + build/bin/sherpa-onnx-offline-tts \ + --debug=0 \ + --kokoro-model=./kokoro-multi-lang-v1_0/model.onnx \ + --kokoro-voices=./kokoro-multi-lang-v1_0/voices.bin \ + --kokoro-tokens=./kokoro-multi-lang-v1_0/tokens.txt \ + --kokoro-data-dir=./kokoro-multi-lang-v1_0/espeak-ng-data \ + --kokoro-dict-dir=./kokoro-multi-lang-v1_0/dict \ + --kokoro-lexicon=./kokoro-multi-lang-v1_0/lexicon-us-en.txt,./kokoro-multi-lang-v1_0/lexicon-zh.txt \ + --num-threads=2 \ + --sid=45 \ + --output-filename="./kokoro-1.0-sid-45-zh-1.wav" \ + "当夜幕降临,星光点点,伴随着微风拂面,我在静谧中感受着时光的流转,思念如涟漪荡漾,梦境如画卷展开,我与自然融为一体,沉静在这片宁静的美丽之中,感受着生命的奇迹与温柔." + + build/bin/sherpa-onnx-offline-tts \ + --debug=0 \ + --kokoro-model=./kokoro-multi-lang-v1_0/model.onnx \ + --kokoro-voices=./kokoro-multi-lang-v1_0/voices.bin \ + --kokoro-tokens=./kokoro-multi-lang-v1_0/tokens.txt \ + --kokoro-data-dir=./kokoro-multi-lang-v1_0/espeak-ng-data \ + --kokoro-dict-dir=./kokoro-multi-lang-v1_0/dict \ + --kokoro-lexicon=./kokoro-multi-lang-v1_0/lexicon-us-en.txt,./kokoro-multi-lang-v1_0/lexicon-zh.txt \ + --num-threads=2 \ + --sid=46 \ + --output-filename="./kokoro-1.0-sid-46-zh.wav" \ + "小米的使命是,始终坚持做感动人心、价格厚道的好产品,让全球每个人都能享受科技带来的美好生活。" + + build/bin/sherpa-onnx-offline-tts \ + --debug=0 \ + --kokoro-model=./kokoro-multi-lang-v1_0/model.onnx \ + --kokoro-voices=./kokoro-multi-lang-v1_0/voices.bin \ + --kokoro-tokens=./kokoro-multi-lang-v1_0/tokens.txt \ + --kokoro-data-dir=./kokoro-multi-lang-v1_0/espeak-ng-data \ + --kokoro-dict-dir=./kokoro-multi-lang-v1_0/dict \ + --kokoro-lexicon=./kokoro-multi-lang-v1_0/lexicon-us-en.txt,./kokoro-multi-lang-v1_0/lexicon-zh.txt \ + --num-threads=2 \ + --sid=46 \ + --output-filename="./kokoro-1.0-sid-46-zh-1.wav" \ + "当夜幕降临,星光点点,伴随着微风拂面,我在静谧中感受着时光的流转,思念如涟漪荡漾,梦境如画卷展开,我与自然融为一体,沉静在这片宁静的美丽之中,感受着生命的奇迹与温柔." 
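+
+      # Note (added for clarity): some of the commands below pass --tts-rule-fsts
+      # with one or more of number-zh.fst, phone-zh.fst, and date-zh.fst from the
+      # model directory. These rule FSTs rewrite numbers, phone numbers, and dates
+      # in the input text into their spoken Chinese form before synthesis, which is
+      # why they are enabled for the example sentences that contain such patterns.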
+ + build/bin/sherpa-onnx-offline-tts \ + --debug=0 \ + --kokoro-model=./kokoro-multi-lang-v1_0/model.onnx \ + --kokoro-voices=./kokoro-multi-lang-v1_0/voices.bin \ + --kokoro-tokens=./kokoro-multi-lang-v1_0/tokens.txt \ + --kokoro-data-dir=./kokoro-multi-lang-v1_0/espeak-ng-data \ + --kokoro-dict-dir=./kokoro-multi-lang-v1_0/dict \ + --kokoro-lexicon=./kokoro-multi-lang-v1_0/lexicon-us-en.txt,./kokoro-multi-lang-v1_0/lexicon-zh.txt \ + --tts-rule-fsts=./kokoro-multi-lang-v1_0/number-zh.fst \ + --num-threads=2 \ + --sid=47 \ + --output-filename="./kokoro-1.0-sid-47-zh.wav" \ + "35年前,他于长沙出生, 在长白山长大。9年前他当上了银行的领导,主管行政。" + + build/bin/sherpa-onnx-offline-tts \ + --debug=0 \ + --kokoro-model=./kokoro-multi-lang-v1_0/model.onnx \ + --kokoro-voices=./kokoro-multi-lang-v1_0/voices.bin \ + --kokoro-tokens=./kokoro-multi-lang-v1_0/tokens.txt \ + --kokoro-data-dir=./kokoro-multi-lang-v1_0/espeak-ng-data \ + --kokoro-dict-dir=./kokoro-multi-lang-v1_0/dict \ + --kokoro-lexicon=./kokoro-multi-lang-v1_0/lexicon-us-en.txt,./kokoro-multi-lang-v1_0/lexicon-zh.txt \ + --num-threads=2 \ + --sid=47 \ + --output-filename="./kokoro-1.0-sid-47-zh-1.wav" \ + "当夜幕降临,星光点点,伴随着微风拂面,我在静谧中感受着时光的流转,思念如涟漪荡漾,梦境如画卷展开,我与自然融为一体,沉静在这片宁静的美丽之中,感受着生命的奇迹与温柔." + + + build/bin/sherpa-onnx-offline-tts \ + --debug=0 \ + --kokoro-model=./kokoro-multi-lang-v1_0/model.onnx \ + --kokoro-voices=./kokoro-multi-lang-v1_0/voices.bin \ + --kokoro-tokens=./kokoro-multi-lang-v1_0/tokens.txt \ + --kokoro-data-dir=./kokoro-multi-lang-v1_0/espeak-ng-data \ + --kokoro-dict-dir=./kokoro-multi-lang-v1_0/dict \ + --kokoro-lexicon=./kokoro-multi-lang-v1_0/lexicon-us-en.txt,./kokoro-multi-lang-v1_0/lexicon-zh.txt \ + --tts-rule-fsts=./kokoro-multi-lang-v1_0/phone-zh.fst,./kokoro-multi-lang-v1_0/number-zh.fst \ + --num-threads=2 \ + --sid=48 \ + --output-filename="./kokoro-1.0-sid-48-zh-1.wav" \ + "有困难,请拨打110 或者18601200909" + + build/bin/sherpa-onnx-offline-tts \ + --debug=0 \ + --kokoro-model=./kokoro-multi-lang-v1_0/model.onnx \ + --kokoro-voices=./kokoro-multi-lang-v1_0/voices.bin \ + --kokoro-tokens=./kokoro-multi-lang-v1_0/tokens.txt \ + --kokoro-data-dir=./kokoro-multi-lang-v1_0/espeak-ng-data \ + --kokoro-dict-dir=./kokoro-multi-lang-v1_0/dict \ + --kokoro-lexicon=./kokoro-multi-lang-v1_0/lexicon-us-en.txt,./kokoro-multi-lang-v1_0/lexicon-zh.txt \ + --num-threads=2 \ + --sid=48 \ + --output-filename="./kokoro-1.0-sid-48-zh-2.wav" \ + "当夜幕降临,星光点点,伴随着微风拂面,我在静谧中感受着时光的流转,思念如涟漪荡漾,梦境如画卷展开,我与自然融为一体,沉静在这片宁静的美丽之中,感受着生命的奇迹与温柔." + + + build/bin/sherpa-onnx-offline-tts \ + --debug=0 \ + --kokoro-model=./kokoro-multi-lang-v1_0/model.onnx \ + --kokoro-voices=./kokoro-multi-lang-v1_0/voices.bin \ + --kokoro-tokens=./kokoro-multi-lang-v1_0/tokens.txt \ + --kokoro-data-dir=./kokoro-multi-lang-v1_0/espeak-ng-data \ + --kokoro-dict-dir=./kokoro-multi-lang-v1_0/dict \ + --kokoro-lexicon=./kokoro-multi-lang-v1_0/lexicon-us-en.txt,./kokoro-multi-lang-v1_0/lexicon-zh.txt \ + --tts-rule-fsts=./kokoro-multi-lang-v1_0/date-zh.fst,./kokoro-multi-lang-v1_0/number-zh.fst \ + --num-threads=2 \ + --sid=48 \ + --output-filename="./kokoro-1.0-sid-48-zh.wav" \ + "现在是2025年12点55分, 星期5。明天是周6,不用上班, 太棒啦!" 
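+
+      # Note (added for clarity): the next command chains three rule FSTs
+      # (date-zh.fst, phone-zh.fst, and number-zh.fst), separated by commas,
+      # so that the census figures and phone numbers in the text are all
+      # normalized before synthesis.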
+ + build/bin/sherpa-onnx-offline-tts \ + --debug=0 \ + --kokoro-model=./kokoro-multi-lang-v1_0/model.onnx \ + --kokoro-voices=./kokoro-multi-lang-v1_0/voices.bin \ + --kokoro-tokens=./kokoro-multi-lang-v1_0/tokens.txt \ + --kokoro-data-dir=./kokoro-multi-lang-v1_0/espeak-ng-data \ + --kokoro-dict-dir=./kokoro-multi-lang-v1_0/dict \ + --kokoro-lexicon=./kokoro-multi-lang-v1_0/lexicon-us-en.txt,./kokoro-multi-lang-v1_0/lexicon-zh.txt \ + --tts-rule-fsts=./kokoro-multi-lang-v1_0/date-zh.fst,./kokoro-multi-lang-v1_0/phone-zh.fst,./kokoro-multi-lang-v1_0/number-zh.fst \ + --num-threads=2 \ + --sid=49 \ + --output-filename="./kokoro-1.0-sid-49-zh.wav" \ + "根据第7次全国人口普查结果表明,我国总人口有1443497378人。普查登记的大陆31个省、自治区、直辖市和现役军人的人口共1411778724人。电话号码是110。手机号是13812345678" + + build/bin/sherpa-onnx-offline-tts \ + --debug=0 \ + --kokoro-model=./kokoro-multi-lang-v1_0/model.onnx \ + --kokoro-voices=./kokoro-multi-lang-v1_0/voices.bin \ + --kokoro-tokens=./kokoro-multi-lang-v1_0/tokens.txt \ + --kokoro-data-dir=./kokoro-multi-lang-v1_0/espeak-ng-data \ + --kokoro-dict-dir=./kokoro-multi-lang-v1_0/dict \ + --kokoro-lexicon=./kokoro-multi-lang-v1_0/lexicon-us-en.txt,./kokoro-multi-lang-v1_0/lexicon-zh.txt \ + --num-threads=2 \ + --sid=49 \ + --output-filename="./kokoro-1.0-sid-49-zh-1.wav" \ + "当夜幕降临,星光点点,伴随着微风拂面,我在静谧中感受着时光的流转,思念如涟漪荡漾,梦境如画卷展开,我与自然融为一体,沉静在这片宁静的美丽之中,感受着生命的奇迹与温柔." + + + build/bin/sherpa-onnx-offline-tts \ + --debug=0 \ + --kokoro-model=./kokoro-multi-lang-v1_0/model.onnx \ + --kokoro-voices=./kokoro-multi-lang-v1_0/voices.bin \ + --kokoro-tokens=./kokoro-multi-lang-v1_0/tokens.txt \ + --kokoro-data-dir=./kokoro-multi-lang-v1_0/espeak-ng-data \ + --kokoro-dict-dir=./kokoro-multi-lang-v1_0/dict \ + --kokoro-lexicon=./kokoro-multi-lang-v1_0/lexicon-us-en.txt,./kokoro-multi-lang-v1_0/lexicon-zh.txt \ + --num-threads=2 \ + --sid=50 \ + --output-filename="./kokoro-1.0-sid-50-zh.wav" \ + "林美丽最美丽、最漂亮、最可爱!" + + build/bin/sherpa-onnx-offline-tts \ + --debug=0 \ + --kokoro-model=./kokoro-multi-lang-v1_0/model.onnx \ + --kokoro-voices=./kokoro-multi-lang-v1_0/voices.bin \ + --kokoro-tokens=./kokoro-multi-lang-v1_0/tokens.txt \ + --kokoro-data-dir=./kokoro-multi-lang-v1_0/espeak-ng-data \ + --kokoro-dict-dir=./kokoro-multi-lang-v1_0/dict \ + --kokoro-lexicon=./kokoro-multi-lang-v1_0/lexicon-us-en.txt,./kokoro-multi-lang-v1_0/lexicon-zh.txt \ + --num-threads=2 \ + --sid=50 \ + --output-filename="./kokoro-1.0-sid-50-zh-1.wav" \ + "当夜幕降临,星光点点,伴随着微风拂面,我在静谧中感受着时光的流转,思念如涟漪荡漾,梦境如画卷展开,我与自然融为一体,沉静在这片宁静的美丽之中,感受着生命的奇迹与温柔." + + build/bin/sherpa-onnx-offline-tts \ + --debug=0 \ + --kokoro-model=./kokoro-multi-lang-v1_0/model.onnx \ + --kokoro-voices=./kokoro-multi-lang-v1_0/voices.bin \ + --kokoro-tokens=./kokoro-multi-lang-v1_0/tokens.txt \ + --kokoro-data-dir=./kokoro-multi-lang-v1_0/espeak-ng-data \ + --kokoro-dict-dir=./kokoro-multi-lang-v1_0/dict \ + --kokoro-lexicon=./kokoro-multi-lang-v1_0/lexicon-us-en.txt,./kokoro-multi-lang-v1_0/lexicon-zh.txt \ + --num-threads=2 \ + --sid=51 \ + --output-filename="./kokoro-1.0-sid-51-zh.wav" \ + "当夜幕降临,星光点点,伴随着微风拂面,我在静谧中感受着时光的流转,思念如涟漪荡漾,梦境如画卷展开,我与自然融为一体,沉静在这片宁静的美丽之中,感受着生命的奇迹与温柔." 
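+
+      # Note (added for clarity): per the speaker map above, IDs 45-52 are the
+      # Chinese voices (zf_*/zm_*), while 0-19 are American English and 20-27
+      # are British English voices. The last three commands below feed the same
+      # mixed Chinese/English sentence to speakers 52, 1, and 18, so you can
+      # compare how a Chinese voice and two English voices handle bilingual input.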
+ + build/bin/sherpa-onnx-offline-tts \ + --debug=0 \ + --kokoro-model=./kokoro-multi-lang-v1_0/model.onnx \ + --kokoro-voices=./kokoro-multi-lang-v1_0/voices.bin \ + --kokoro-tokens=./kokoro-multi-lang-v1_0/tokens.txt \ + --kokoro-data-dir=./kokoro-multi-lang-v1_0/espeak-ng-data \ + --kokoro-dict-dir=./kokoro-multi-lang-v1_0/dict \ + --kokoro-lexicon=./kokoro-multi-lang-v1_0/lexicon-us-en.txt,./kokoro-multi-lang-v1_0/lexicon-zh.txt \ + --num-threads=2 \ + --sid=52 \ + --output-filename="./kokoro-1.0-sid-52-zh.wav" \ + "当夜幕降临,星光点点,伴随着微风拂面,我在静谧中感受着时光的流转,思念如涟漪荡漾,梦境如画卷展开,我与自然融为一体,沉静在这片宁静的美丽之中,感受着生命的奇迹与温柔." + + build/bin/sherpa-onnx-offline-tts \ + --debug=0 \ + --kokoro-model=./kokoro-multi-lang-v1_0/model.onnx \ + --kokoro-voices=./kokoro-multi-lang-v1_0/voices.bin \ + --kokoro-tokens=./kokoro-multi-lang-v1_0/tokens.txt \ + --kokoro-data-dir=./kokoro-multi-lang-v1_0/espeak-ng-data \ + --kokoro-dict-dir=./kokoro-multi-lang-v1_0/dict \ + --kokoro-lexicon=./kokoro-multi-lang-v1_0/lexicon-us-en.txt,./kokoro-multi-lang-v1_0/lexicon-zh.txt \ + --tts-rule-fsts=./kokoro-multi-lang-v1_0/date-zh.fst,./kokoro-multi-lang-v1_0/number-zh.fst \ + --num-threads=2 \ + --sid=52 \ + --output-filename="./kokoro-1.0-sid-52-zh-en.wav" \ + "Are you ok 是雷军2015年4月小米在印度举行新品发布会时说的。他还说过, I am very happy to be in China. 雷军事后在微博上表示 “万万没想到,视频火速传到国内,全国人民都笑了”. 现在国际米粉越来越多,我的确应该把英文学好,不让大家失望!加油!" + + build/bin/sherpa-onnx-offline-tts \ + --debug=0 \ + --kokoro-model=./kokoro-multi-lang-v1_0/model.onnx \ + --kokoro-voices=./kokoro-multi-lang-v1_0/voices.bin \ + --kokoro-tokens=./kokoro-multi-lang-v1_0/tokens.txt \ + --kokoro-data-dir=./kokoro-multi-lang-v1_0/espeak-ng-data \ + --kokoro-dict-dir=./kokoro-multi-lang-v1_0/dict \ + --kokoro-lexicon=./kokoro-multi-lang-v1_0/lexicon-us-en.txt,./kokoro-multi-lang-v1_0/lexicon-zh.txt \ + --tts-rule-fsts=./kokoro-multi-lang-v1_0/date-zh.fst,./kokoro-multi-lang-v1_0/number-zh.fst \ + --num-threads=2 \ + --sid=1 \ + --output-filename="./kokoro-1.0-sid-1-zh-en.wav" \ + "Are you ok 是雷军2015年4月小米在印度举行新品发布会时说的。他还说过, I am very happy to be in China. 雷军事后在微博上表示 “万万没想到,视频火速传到国内,全国人民都笑了”. 现在国际米粉越来越多,我的确应该把英文学好,不让大家失望!加油!" + + build/bin/sherpa-onnx-offline-tts \ + --debug=0 \ + --kokoro-model=./kokoro-multi-lang-v1_0/model.onnx \ + --kokoro-voices=./kokoro-multi-lang-v1_0/voices.bin \ + --kokoro-tokens=./kokoro-multi-lang-v1_0/tokens.txt \ + --kokoro-data-dir=./kokoro-multi-lang-v1_0/espeak-ng-data \ + --kokoro-dict-dir=./kokoro-multi-lang-v1_0/dict \ + --kokoro-lexicon=./kokoro-multi-lang-v1_0/lexicon-us-en.txt,./kokoro-multi-lang-v1_0/lexicon-zh.txt \ + --tts-rule-fsts=./kokoro-multi-lang-v1_0/date-zh.fst,./kokoro-multi-lang-v1_0/number-zh.fst \ + --num-threads=2 \ + --sid=18 \ + --output-filename="./kokoro-1.0-sid-18-zh-en.wav" \ + "Are you ok 是雷军2015年4月小米在印度举行新品发布会时说的。他还说过, I am very happy to be in China. 雷军事后在微博上表示 “万万没想到,视频火速传到国内,全国人民都笑了”. 现在国际米粉越来越多,我的确应该把英文学好,不让大家失望!加油!" + +After running, it will generate many ``.wav`` files in the +current directory. + +Audio samples +::::::::::::: + +An example is given below: + +.. container:: toggle + + .. container:: header + + Click ▶ to see it. + + .. code-block:: + + soxi ./kokoro-1.0-sid-1-zh-en.wav + + Input File : './kokoro-1.0-sid-1-zh-en.wav' + Channels : 1 + Sample Rate : 24000 + Precision : 16-bit + Duration : 00:00:26.00 = 624008 samples ~ 1950.02 CDDA sectors + File Size : 1.25M + Bit Rate : 384k + Sample Encoding: 16-bit Signed Integer PCM + + .. hint:: + + Sample rate of this model is fixed to ``24000 Hz``. + + .. 
raw:: html + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Wave filenameContentText
kokoro-1.0-sid-0-en-us.wav + + + "Friends fell out often because life was changing so fast. The easiest thing in the world was to lose touch with someone." +
kokoro-1.0-sid-1-en-us.wav + + + "Friends fell out often because life was changing so fast. The easiest thing in the world was to lose touch with someone." +
kokoro-1.0-sid-2-en-us.wav + + + "Friends fell out often because life was changing so fast. The easiest thing in the world was to lose touch with someone." +
kokoro-1.0-sid-3-en-us.wav + + + "Friends fell out often because life was changing so fast. The easiest thing in the world was to lose touch with someone." +
kokoro-1.0-sid-4-en-us.wav + + + "Friends fell out often because life was changing so fast. The easiest thing in the world was to lose touch with someone." +
kokoro-1.0-sid-5-en-us.wav + + + "Friends fell out often because life was changing so fast. The easiest thing in the world was to lose touch with someone." +
kokoro-1.0-sid-6-en-us.wav + + + "Friends fell out often because life was changing so fast. The easiest thing in the world was to lose touch with someone." +
kokoro-1.0-sid-7-en-us.wav + + + "Friends fell out often because life was changing so fast. The easiest thing in the world was to lose touch with someone." +
kokoro-1.0-sid-8-en-us.wav + + + "Friends fell out often because life was changing so fast. The easiest thing in the world was to lose touch with someone." +
kokoro-1.0-sid-9-en-us.wav + + + "Friends fell out often because life was changing so fast. The easiest thing in the world was to lose touch with someone." +
kokoro-1.0-sid-10-en-us.wav + + + "Friends fell out often because life was changing so fast. The easiest thing in the world was to lose touch with someone." +
kokoro-1.0-sid-11-en-us.wav + + + "Friends fell out often because life was changing so fast. The easiest thing in the world was to lose touch with someone." +
kokoro-1.0-sid-12-en-us.wav + + + "Friends fell out often because life was changing so fast. The easiest thing in the world was to lose touch with someone." +
kokoro-1.0-sid-13-en-us.wav + + + "Friends fell out often because life was changing so fast. The easiest thing in the world was to lose touch with someone." +
kokoro-1.0-sid-14-en-us.wav + + + "Friends fell out often because life was changing so fast. The easiest thing in the world was to lose touch with someone." +
kokoro-1.0-sid-15-en-us.wav + + + "Friends fell out often because life was changing so fast. The easiest thing in the world was to lose touch with someone." +
kokoro-1.0-sid-16-en-us.wav + + + "Friends fell out often because life was changing so fast. The easiest thing in the world was to lose touch with someone." +
kokoro-1.0-sid-17-en-us.wav + + + "Friends fell out often because life was changing so fast. The easiest thing in the world was to lose touch with someone." +
kokoro-1.0-sid-18-en-us.wav + + + "Friends fell out often because life was changing so fast. The easiest thing in the world was to lose touch with someone." +
kokoro-1.0-sid-19-en-us.wav + + + "Friends fell out often because life was changing so fast. The easiest thing in the world was to lose touch with someone." +
kokoro-1.0-sid-20-en-gb.wav + + + "Friends fell out often because life was changing so fast. The easiest thing in the world was to lose touch with someone." +
kokoro-1.0-sid-21-en-gb.wav + + + "Friends fell out often because life was changing so fast. The easiest thing in the world was to lose touch with someone." +
kokoro-1.0-sid-22-en-gb.wav + + + "Friends fell out often because life was changing so fast. The easiest thing in the world was to lose touch with someone." +
kokoro-1.0-sid-23-en-gb.wav + + + "Friends fell out often because life was changing so fast. The easiest thing in the world was to lose touch with someone." +
kokoro-1.0-sid-24-en-gb.wav + + + "Friends fell out often because life was changing so fast. The easiest thing in the world was to lose touch with someone." +
kokoro-1.0-sid-25-en-gb.wav + + + "Friends fell out often because life was changing so fast. The easiest thing in the world was to lose touch with someone." +
kokoro-1.0-sid-26-en-gb.wav + + + "Friends fell out often because life was changing so fast. The easiest thing in the world was to lose touch with someone." +
kokoro-1.0-sid-27-en-gb.wav + + + "Friends fell out often because life was changing so fast. The easiest thing in the world was to lose touch with someone." +
kokoro-1.0-sid-45-zh.wav + + + "小米的核心价值观是什么?答案是真诚热爱!" +
kokoro-1.0-sid-45-zh-1.wav + + + "当夜幕降临,星光点点,伴随着微风拂面,我在静谧中感受着时光的流转,思念如涟漪荡漾,梦境如画卷展开,我与自然融为一体,沉静在这片宁静的美丽之中,感受着生命的奇迹与温柔." +
kokoro-1.0-sid-46-zh.wav + + + "小米的使命是,始终坚持做感动人心、价格厚道的好产品,让全球每个人都能享受科技带来的美好生活。" +
kokoro-1.0-sid-46-zh-1.wav + + + "当夜幕降临,星光点点,伴随着微风拂面,我在静谧中感受着时光的流转,思念如涟漪荡漾,梦境如画卷展开,我与自然融为一体,沉静在这片宁静的美丽之中,感受着生命的奇迹与温柔." +
kokoro-1.0-sid-47-zh.wav + + + "35年前,他于长沙出生, 在长白山长大。9年前他当上了银行的领导,主管行政。" +
kokoro-1.0-sid-47-zh-1.wav + + + "当夜幕降临,星光点点,伴随着微风拂面,我在静谧中感受着时光的流转,思念如涟漪荡漾,梦境如画卷展开,我与自然融为一体,沉静在这片宁静的美丽之中,感受着生命的奇迹与温柔." +
kokoro-1.0-sid-48-zh-1.wav + + + "有困难,请拨打110 或者18601200909" +
kokoro-1.0-sid-48-zh-2.wav + + + "当夜幕降临,星光点点,伴随着微风拂面,我在静谧中感受着时光的流转,思念如涟漪荡漾,梦境如画卷展开,我与自然融为一体,沉静在这片宁静的美丽之中,感受着生命的奇迹与温柔." +
kokoro-1.0-sid-48-zh.wav + + + "现在是2025年12点55分, 星期5。明天是周6,不用上班, 太棒啦!" +
kokoro-1.0-sid-49-zh.wav + + + "根据第7次全国人口普查结果表明,我国总人口有1443497378人。普查登记的大陆31个省、自治区、直辖市和现役军人的人口共1411778724人。电话号码是110。手机号是13812345678" + +
kokoro-1.0-sid-49-zh-1.wav + + + "当夜幕降临,星光点点,伴随着微风拂面,我在静谧中感受着时光的流转,思念如涟漪荡漾,梦境如画卷展开,我与自然融为一体,沉静在这片宁静的美丽之中,感受着生命的奇迹与温柔." +
kokoro-1.0-sid-50-zh.wav + + + "林美丽最美丽、最漂亮、最可爱!" +
kokoro-1.0-sid-50-zh-1.wav + + + "当夜幕降临,星光点点,伴随着微风拂面,我在静谧中感受着时光的流转,思念如涟漪荡漾,梦境如画卷展开,我与自然融为一体,沉静在这片宁静的美丽之中,感受着生命的奇迹与温柔." +
kokoro-1.0-sid-51-zh.wav + + + "当夜幕降临,星光点点,伴随着微风拂面,我在静谧中感受着时光的流转,思念如涟漪荡漾,梦境如画卷展开,我与自然融为一体,沉静在这片宁静的美丽之中,感受着生命的奇迹与温柔." +
kokoro-1.0-sid-52-zh.wav + + + "当夜幕降临,星光点点,伴随着微风拂面,我在静谧中感受着时光的流转,思念如涟漪荡漾,梦境如画卷展开,我与自然融为一体,沉静在这片宁静的美丽之中,感受着生命的奇迹与温柔." +
kokoro-1.0-sid-52-zh-en.wav + + + "Are you ok 是雷军2015年4月小米在印度举行新品发布会时说的。他还说过, I am very happy to be in China. 雷军事后在微博上表示 “万万没想到,视频火速传到国内,全国人民都笑了”. 现在国际米粉越来越多,我的确应该把英文学好,不让大家失望!加油!" +
kokoro-1.0-sid-1-zh-en.wav + + + "Are you ok 是雷军2015年4月小米在印度举行新品发布会时说的。他还说过, I am very happy to be in China. 雷军事后在微博上表示 “万万没想到,视频火速传到国内,全国人民都笑了”. 现在国际米粉越来越多,我的确应该把英文学好,不让大家失望!加油!" + +
kokoro-1.0-sid-18-zh-en.wav + + + "Are you ok 是雷军2015年4月小米在印度举行新品发布会时说的。他还说过, I am very happy to be in China. 雷军事后在微博上表示 “万万没想到,视频火速传到国内,全国人民都笑了”. 现在国际米粉越来越多,我的确应该把英文学好,不让大家失望!加油!" + +
+ +Generate speech with Python script +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Please replace ``build/bin/sherpa-onnx-offline-tts`` in the above examples +with ``python3 ./python-api-examples/offline-tts.py``. +or with ``python3 ./python-api-examples/offline-tts-play.py``. + +.. hint:: + + - Download `offline-tts.py `_ + - Download `offline-tts-play.py `_ + +RTF on Raspberry Pi 4 Model B Rev 1.5 +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +We use the following command to test the RTF of this model on Raspberry Pi 4 Model B Rev 1.5: + +.. code-block:: bash + + + for t in 1 2 3 4; do + build/bin/sherpa-onnx-offline-tts \ + --num-threads=$t \ + --kokoro-model=./kokoro-multi-lang-v1_0/model.onnx \ + --kokoro-voices=./kokoro-multi-lang-v1_0/voices.bin \ + --kokoro-tokens=./kokoro-multi-lang-v1_0/tokens.txt \ + --kokoro-data-dir=./kokoro-multi-lang-v1_0/espeak-ng-data \ + --kokoro-dict-dir=./kokoro-multi-lang-v1_0/dict \ + --kokoro-lexicon=./kokoro-multi-lang-v1_0/lexicon-us-en.txt,./kokoro-multi-lang-v1_0/lexicon-zh.txt \ + --tts-rule-fsts=./kokoro-multi-lang-v1_0/date-zh.fst,./kokoro-multi-lang-v1_0/number-zh.fst \ + --sid=1 \ + --output-filename="./kokoro-1.0-sid-1-en.wav" \ + "你好吗?Friends fell out often because life was changing so fast. The easiest thing in the world was to lose touch with someone." + done + +The results are given below: + + +-------------+-------+-------+-------+-------+ + | num_threads | 1 | 2 | 3 | 4 | + +=============+=======+=======+=======+=======+ + | RTF | 7.635 | 4.470 | 3.430 | 3.191 | + +-------------+-------+-------+-------+-------+ + +.. _kokoro-en-v0_19: + +kokoro-en-v0_19 (English, 11 speakers) +-------------------------------------- + +This model contains 11 speakers. The ONNX model is from +``_ + +The script for adding meta data to the ONNX model can be found at +``_ + +In the following, we describe how to download it and use it with `sherpa-onnx`_. + +Download the model +~~~~~~~~~~~~~~~~~~ + +Please use the following commands to download it. + +.. code-block:: bash + + cd /path/to/sherpa-onnx + wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/kokoro-en-v0_19.tar.bz2 + tar xf kokoro-en-v0_19.tar.bz2 + rm kokoro-en-v0_19.tar.bz2 + +Please check that the file sizes of the pre-trained models are correct. See +the file sizes of ``*.onnx`` files below. + +.. code-block:: + + ls -lh kokoro-en-v0_19/ + + total 686208 + -rw-r--r-- 1 fangjun staff 11K Jan 15 16:23 LICENSE + -rw-r--r-- 1 fangjun staff 235B Jan 15 16:25 README.md + drwxr-xr-x 122 fangjun staff 3.8K Nov 28 2023 espeak-ng-data + -rw-r--r-- 1 fangjun staff 330M Jan 15 16:25 model.onnx + -rw-r--r-- 1 fangjun staff 1.1K Jan 15 16:25 tokens.txt + -rw-r--r-- 1 fangjun staff 5.5M Jan 15 16:25 voices.bin + +Map between speaker ID and speaker name +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The model contains 11 speakers and we use integer IDs ``0-10`` to represent. +each speaker. + +The map is given below: + +.. list-table:: + + * - Speaker ID + - 0 + - 1 + - 2 + - 3 + - 4 + - 5 + - 6 + - 7 + - 8 + - 9 + - 10 + * - Speaker Name + - af + - af_bella + - af_nicole + - af_sarah + - af_sky + - am_adam + - am_michael + - bf_emma + - bf_isabella + - bm_george + - bm_lewis + +.. raw:: html + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
IDnameTest wave
0af + +
1af_bella + +
2af_nicole + +
3af_sarah + +
4af_sky + +
5am_adam + +
6am_michael + +
7bf_emma + +
8bf_isabella + +
9bm_george + +
10bm_lewis + +
+ +Generate speech with executables compiled from C++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + ./build/bin/sherpa-onnx-offline-tts \ + --kokoro-model=./kokoro-en-v0_19/model.onnx \ + --kokoro-voices=./kokoro-en-v0_19/voices.bin \ + --kokoro-tokens=./kokoro-en-v0_19/tokens.txt \ + --kokoro-data-dir=./kokoro-en-v0_19/espeak-ng-data \ + --num-threads=2 \ + --sid=10 \ + --output-filename="./10-bm_lewis.wav" \ + "Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be, a statesman, a businessman, an official, or a scholar." + +After running, it will generate a file ``10-bm_lewis.wav`` in the +current directory. + +.. code-block:: bash + + soxi ./10-bm_lewis.wav + + Input File : './10-bm_lewis.wav' + Channels : 1 + Sample Rate : 24000 + Precision : 16-bit + Duration : 00:00:15.80 = 379200 samples ~ 1185 CDDA sectors + File Size : 758k + Bit Rate : 384k + Sample Encoding: 16-bit Signed Integer PCM + +.. hint:: + + Sample rate of this model is fixed to ``24000 Hz``. + +.. raw:: html + + + + + + + + + + + + +
Wave filenameContentText
10-bm_lewis.wav + + + "Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be, a statesman, a businessman, an official, or a scholar." +
+ +Generate speech with Python script +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + python3 ./python-api-examples/offline-tts.py \ + --kokoro-model=./kokoro-en-v0_19/model.onnx \ + --kokoro-voices=./kokoro-en-v0_19/voices.bin \ + --kokoro-tokens=./kokoro-en-v0_19/tokens.txt \ + --kokoro-data-dir=./kokoro-en-v0_19/espeak-ng-data \ + --num-threads=2 \ + --sid=2 \ + --output-filename=./2-af_nicole.wav \ + "Friends fell out often because life was changing so fast. The easiest thing in the world was to lose touch with someone." + +.. code-block:: bash + + soxi ./2-af_nicole.wav + + Input File : './2-af_nicole.wav' + Channels : 1 + Sample Rate : 24000 + Precision : 16-bit + Duration : 00:00:11.45 = 274800 samples ~ 858.75 CDDA sectors + File Size : 550k + Bit Rate : 384k + Sample Encoding: 16-bit Signed Integer PCM + +.. raw:: html + + + + + + + + + + + + +
Wave filenameContentText
2-af_nicole.wav + + + "Friends fell out often because life was changing so fast. The easiest thing in the world was to lose touch with someone." +
+ +RTF on Raspberry Pi 4 Model B Rev 1.5 +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +We use the following command to test the RTF of this model on Raspberry Pi 4 Model B Rev 1.5: + +.. code-block:: bash + + + for t in 1 2 3 4; do + build/bin/sherpa-onnx-offline-tts \ + --num-threads=$t \ + --kokoro-model=./kokoro-en-v0_19/model.onnx \ + --kokoro-voices=./kokoro-en-v0_19/voices.bin \ + --kokoro-tokens=./kokoro-en-v0_19/tokens.txt \ + --kokoro-data-dir=./kokoro-en-v0_19/espeak-ng-data \ + --sid=2 \ + --output-filename=./2-af_nicole.wav \ + "Friends fell out often because life was changing so fast. The easiest thing in the world was to lose touch with someone." + done + +The results are given below: + + +-------------+-------+-------+-------+-------+ + | num_threads | 1 | 2 | 3 | 4 | + +=============+=======+=======+=======+=======+ + | RTF | 6.629 | 3.870 | 2.999 | 2.774 | + +-------------+-------+-------+-------+-------+ diff --git a/docs/source/onnx/tts/pretrained_models/matcha.rst b/docs/source/onnx/tts/pretrained_models/matcha.rst new file mode 100644 index 000000000..9e8a06a0d --- /dev/null +++ b/docs/source/onnx/tts/pretrained_models/matcha.rst @@ -0,0 +1,432 @@ +Matcha +====== + +This page lists pre-trained models using `Matcha-TTS `_. + +.. caution:: + + Models are from `icefall `_. + + We don't support models from ``_. + +.. _matcha-icefall-en_US-ljspeech: + +matcha-icefall-en_US-ljspeech (American English, 1 female speaker) +------------------------------------------------------------------ + +This model is trained using + + ``_ + +The dataset used to train the model is from + + ``_. + +In the following, we describe how to download it and use it with `sherpa-onnx`_. + +Download the model +~~~~~~~~~~~~~~~~~~ + +Please use the following commands to download it. + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-en_US-ljspeech.tar.bz2 + tar xvf matcha-icefall-en_US-ljspeech.tar.bz2 + rm matcha-icefall-en_US-ljspeech.tar.bz2 + + wget https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/vocos-22khz-univ.onnx + +.. caution:: + + Remember to also download the vocoder model. We use `vocos-22khz-univ.onnx `_ in the example. + You can also select `hifigan_v1 `_, + `hifigan_v2 `_, or + `hifigan_v3 `_. + +Please check that the file sizes of the pre-trained models are correct. See +the file sizes of ``*.onnx`` files below. + +.. code-block:: bash + + ls -lh matcha-icefall-en_US-ljspeech/ + total 144856 + -rw-r--r-- 1 fangjun staff 251B Jan 2 11:05 README.md + drwxr-xr-x 122 fangjun staff 3.8K Nov 28 2023 espeak-ng-data + -rw-r--r--@ 1 fangjun staff 71M Jan 2 04:04 model-steps-3.onnx + -rw-r--r-- 1 fangjun staff 954B Jan 2 11:05 tokens.txt + + ls -lh vocos-22khz-univ.onnx + -rw-r--r-- 1 fangjun staff 51M Mar 17 15:28 vocos-22khz-univ.onnx + +Generate speech with executables compiled from C++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + ./build/bin/sherpa-onnx-offline-tts \ + --matcha-acoustic-model=./matcha-icefall-en_US-ljspeech/model-steps-3.onnx \ + --matcha-vocoder=./vocos-22khz-univ.onnx \ + --matcha-tokens=./matcha-icefall-en_US-ljspeech/tokens.txt \ + --matcha-data-dir=./matcha-icefall-en_US-ljspeech/espeak-ng-data \ + --num-threads=2 \ + --output-filename=./matcha-ljspeech-0.wav \ + --debug=1 \ + "Today as always, men fall into two groups: slaves and free men. 
Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar." + +After running, it will generate a file ``matcha-ljspeech-0.wav`` in the +current directory. + +.. code-block:: bash + + soxi ./matcha-ljspeech-0.wav + + Input File : './matcha-ljspeech-0.wav' + Channels : 1 + Sample Rate : 22050 + Precision : 16-bit + Duration : 00:00:15.06 = 332032 samples ~ 1129.36 CDDA sectors + File Size : 664k + Bit Rate : 353k + Sample Encoding: 16-bit Signed Integer PCM + +.. raw:: html + + + + + + + + + + + + +
Wave filenameContentText
matcha-ljspeech-0.wav + + + "Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar." +
+ +Generate speech with Python script +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + python3 ./python-api-examples/offline-tts.py \ + --matcha-acoustic-model=./matcha-icefall-en_US-ljspeech/model-steps-3.onnx \ + --matcha-vocoder=./vocos-22khz-univ.onnx \ + --matcha-tokens=./matcha-icefall-en_US-ljspeech/tokens.txt \ + --matcha-data-dir=./matcha-icefall-en_US-ljspeech/espeak-ng-data \ + --num-threads=2 \ + --output-filename=./matcha-ljspeech-1.wav \ + --debug=1 \ + "Friends fell out often because life was changing so fast. The easiest thing in the world was to lose touch with someone." + +.. code-block:: + + soxi ./matcha-ljspeech-1.wav + + Input File : './matcha-ljspeech-1.wav' + Channels : 1 + Sample Rate : 22050 + Precision : 16-bit + Duration : 00:00:07.92 = 174592 samples ~ 593.85 CDDA sectors + File Size : 349k + Bit Rate : 353k + Sample Encoding: 16-bit Signed Integer PCM + +.. raw:: html + + + + + + + + + + + + +
Wave filenameContentText
matcha-ljspeech-1.wav + + + "Friends fell out often because life was changing so fast. The easiest thing in the world was to lose touch with someone." +
+ +RTF on Raspberry Pi 4 Model B Rev 1.5 +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +We use the following command to test the RTF of this model on Raspberry Pi 4 Model B Rev 1.5: + +.. code-block:: bash + + + for t in 1 2 3 4; do + build/bin/sherpa-onnx-offline-tts \ + --num-threads=$t \ + --matcha-acoustic-model=./matcha-icefall-en_US-ljspeech/model-steps-3.onnx \ + --matcha-vocoder=./vocos-22khz-univ.onnx \ + --matcha-tokens=./matcha-icefall-en_US-ljspeech/tokens.txt \ + --matcha-data-dir=./matcha-icefall-en_US-ljspeech/espeak-ng-data \ + --output-filename=./matcha-ljspeech-1.wav \ + "Friends fell out often because life was changing so fast. The easiest thing in the world was to lose touch with someone." + done + +The results are given below: + + +-------------+-------+-------+-------+-------+ + | num_threads | 1 | 2 | 3 | 4 | + +=============+=======+=======+=======+=======+ + | RTF | 0.941 | 0.561 | 0.451 | 0.411 | + +-------------+-------+-------+-------+-------+ + + +.. _matcha-icefall-zh-baker: + +matcha-icefall-zh-baker (Chinese, 1 female speaker) +--------------------------------------------------- + +This model is trained using + + ``_ + +The dataset used to train the model is from + + ``_. + +.. caution:: + + The dataset is for ``non-commercial`` use only. + +In the following, we describe how to download it and use it with `sherpa-onnx`_. + +Download the model +~~~~~~~~~~~~~~~~~~ + +Please use the following commands to download it. + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-zh-baker.tar.bz2 + tar xvf matcha-icefall-zh-baker.tar.bz2 + rm matcha-icefall-zh-baker.tar.bz2 + + wget https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/vocos-22khz-univ.onnx + +.. caution:: + + Remember to also download the vocoder model. We use `vocos-22khz-univ.onnx `_ in the example. + You can also select `hifigan_v1 `_, + `hifigan_v2 `_, or + `hifigan_v3 `_. + +Please check that the file sizes of the pre-trained models are correct. See +the file sizes of ``*.onnx`` files below. + +.. code-block:: bash + + ls -lh matcha-icefall-zh-baker/ + total 167344 + -rw-r--r-- 1 fangjun staff 370B Dec 31 14:51 README.md + -rw-r--r-- 1 fangjun staff 58K Dec 31 14:51 date.fst + drwxr-xr-x 9 fangjun staff 288B Apr 19 2024 dict + -rw-r--r-- 1 fangjun staff 1.3M Dec 31 14:51 lexicon.txt + -rw-r--r-- 1 fangjun staff 72M Dec 31 14:51 model-steps-3.onnx + -rw-r--r-- 1 fangjun staff 63K Dec 31 14:51 number.fst + -rw-r--r-- 1 fangjun staff 87K Dec 31 14:51 phone.fst + -rw-r--r-- 1 fangjun staff 19K Dec 31 14:51 tokens.txt + + ls -lh vocos-22khz-univ.onnx + -rw-r--r-- 1 fangjun staff 51M Mar 17 15:28 vocos-22khz-univ.onnx + +Generate speech with executables compiled from C++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + ./build/bin/sherpa-onnx-offline-tts \ + --matcha-acoustic-model=./matcha-icefall-zh-baker/model-steps-3.onnx \ + --matcha-vocoder=./vocos-22khz-univ.onnx \ + --matcha-lexicon=./matcha-icefall-zh-baker/lexicon.txt \ + --matcha-tokens=./matcha-icefall-zh-baker/tokens.txt \ + --matcha-dict-dir=./matcha-icefall-zh-baker/dict \ + --num-threads=2 \ + --output-filename=./matcha-baker-0.wav \ + --debug=1 \ + "当夜幕降临,星光点点,伴随着微风拂面,我在静谧中感受着时光的流转,思念如涟漪荡漾,梦境如画卷展开,我与自然融为一体,沉静在这片宁静的美丽之中,感受着生命的奇迹与温柔." 
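+
+   # Note (added for clarity): the second command below additionally passes
+   # --tts-rule-fsts with phone.fst, date.fst, and number.fst from the model
+   # directory, so the date, phone numbers, and amount in the text are expanded
+   # into spoken Chinese before synthesis.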
+ + ./build/bin/sherpa-onnx-offline-tts \ + --matcha-acoustic-model=./matcha-icefall-zh-baker/model-steps-3.onnx \ + --matcha-vocoder=./vocos-22khz-univ.onnx \ + --matcha-lexicon=./matcha-icefall-zh-baker/lexicon.txt \ + --matcha-tokens=./matcha-icefall-zh-baker/tokens.txt \ + --tts-rule-fsts=./matcha-icefall-zh-baker/phone.fst,./matcha-icefall-zh-baker/date.fst,./matcha-icefall-zh-baker/number.fst \ + --matcha-dict-dir=./matcha-icefall-zh-baker/dict \ + --output-filename=./matcha-baker-1.wav \ + "某某银行的副行长和一些行政领导表示,他们去过长江和长白山; 经济不断增长。2024年12月31号,拨打110或者18920240511。123456块钱。" + +After running, it will generate two files, ``matcha-baker-0.wav`` and +``matcha-baker-1.wav``, in the current directory. + +.. code-block:: bash + + soxi matcha-baker-*.wav + + Input File : 'matcha-baker-0.wav' + Channels : 1 + Sample Rate : 22050 + Precision : 16-bit + Duration : 00:00:22.65 = 499456 samples ~ 1698.83 CDDA sectors + File Size : 999k + Bit Rate : 353k + Sample Encoding: 16-bit Signed Integer PCM + + + Input File : 'matcha-baker-1.wav' + Channels : 1 + Sample Rate : 22050 + Precision : 16-bit + Duration : 00:00:22.65 = 499456 samples ~ 1698.83 CDDA sectors + File Size : 999k + Bit Rate : 353k + Sample Encoding: 16-bit Signed Integer PCM + + Total Duration of 2 files: 00:00:45.30 + +.. raw:: html + + + + + + + + + + + + + + + + + + +
Wave filenameContentText
matcha-baker-0.wav + + + "当夜幕降临,星光点点,伴随着微风拂面,我在静谧中感受着时光的流转,思念如涟漪荡漾,梦境如画卷展开,我与自然融为一体,沉静在这片宁静的美丽之中,感受着生命的奇迹与温柔." +
matcha-baker-1.wav + + + "某某银行的副行长和一些行政领导表示,他们去过长江和长白山; 经济不断增长。2024年12月31号,拨打110或者18920240511。123456块钱。" +
+ +Generate speech with Python script +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + python3 ./python-api-examples/offline-tts.py \ + --matcha-acoustic-model=./matcha-icefall-zh-baker/model-steps-3.onnx \ + --matcha-vocoder=./vocos-22khz-univ.onnx \ + --matcha-lexicon=./matcha-icefall-zh-baker/lexicon.txt \ + --matcha-tokens=./matcha-icefall-zh-baker/tokens.txt \ + --tts-rule-fsts=./matcha-icefall-zh-baker/phone.fst,./matcha-icefall-zh-baker/date.fst,./matcha-icefall-zh-baker/number.fst \ + --matcha-dict-dir=./matcha-icefall-zh-baker/dict \ + --output-filename=./matcha-baker-2.wav \ + --debug=1 \ + "三百六十行,行行出状元。你行的!明天就是 2025年1月1号啦!银行卡被卡住了,你帮个忙,行不行?" + +After running, it will generate a file ``matcha-baker-2.wav`` in the current directory. + +.. code-block:: bash + + soxi matcha-baker-2.wav + + Input File : 'matcha-baker-2.wav' + Channels : 1 + Sample Rate : 22050 + Precision : 16-bit + Duration : 00:00:12.71 = 280320 samples ~ 953.469 CDDA sectors + File Size : 561k + Bit Rate : 353k + Sample Encoding: 16-bit Signed Integer PCM + +.. raw:: html + + + + + + + + + + + + +
Wave filenameContentText
matcha-baker-2.wav + + + "三百六十行,行行出状元。你行的!明天就是 2025年1月1号啦!银行卡被卡住了,你帮个忙,行不行?" +
+ +RTF on Raspberry Pi 4 Model B Rev 1.5 +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +We use the following command to test the RTF of this model on Raspberry Pi 4 Model B Rev 1.5: + +.. code-block:: bash + + + for t in 1 2 3 4; do + build/bin/sherpa-onnx-offline-tts \ + --num-threads=$t \ + --matcha-acoustic-model=./matcha-icefall-zh-baker/model-steps-3.onnx \ + --matcha-vocoder=./vocos-22khz-univ.onnx \ + --matcha-lexicon=./matcha-icefall-zh-baker/lexicon.txt \ + --matcha-tokens=./matcha-icefall-zh-baker/tokens.txt \ + --matcha-dict-dir=./matcha-icefall-zh-baker/dict \ + --output-filename=./matcha-baker-0.wav \ + "当夜幕降临,星光点点,伴随着微风拂面,我在静谧中感受着时光的流转,思念如涟漪荡漾,梦境如画卷展开,我与自然融为一体,沉静在这片宁静的美丽之中,感受 着生命的奇迹与温柔." + done + +The results are given below: + + +-------------+-------+-------+-------+-------+ + | num_threads | 1 | 2 | 3 | 4 | + +=============+=======+=======+=======+=======+ + | RTF | 0.892 | 0.536 | 0.432 | 0.391 | + +-------------+-------+-------+-------+-------+ + diff --git a/docs/source/onnx/tts/pretrained_models/rtf.rst b/docs/source/onnx/tts/pretrained_models/rtf.rst new file mode 100644 index 000000000..dafa70b2c --- /dev/null +++ b/docs/source/onnx/tts/pretrained_models/rtf.rst @@ -0,0 +1,112 @@ +RTF of pre-trained models +========================== + +The following table lists the RTF of pre-trained models on +``Raspberry Pi 4 Model B Rev 1.5``. + +.. list-table:: + + * - Number of threads + - 1 + - 2 + - 3 + - 4 + - + * - :ref:`vits-melo-tts-zh_en` + - 6.727 + - 3.877 + - 2.914 + - 2.518 + - 163 MB + * - :ref:`vits-piper-en_US-glados` + - 0.812 + - 0.480 + - 0.391 + - 0.349 + - 61 MB + * - :ref:`vits-piper-en_US-libritts_r-medium` + - 0.790 + - 0.493 + - 0.392 + - 0.357 + - 75 MB + * - :ref:`vits-model-vits-ljspeech` + - 6.057 + - 3.517 + - 2.535 + - 2.206 + - 109 MB + * - :ref:`vits-model-vits-vctk` + - 6.079 + - 3.483 + - 2.537 + - 2.226 + - 116 MB + * - :ref:`sherpa-onnx-vits-zh-ll` + - 4.275 + - 2.494 + - 1.840 + - 1.593 + - 116 MB + * - :ref:`vits-zh-hf-fanchen-C` + - 4.306 + - 2.451 + - 1.846 + - 1.600 + - 116 MB + * - :ref:`vits-zh-hf-fanchen-wnj` + - 4.276 + - 2.505 + - 1.827 + - 1.608 + - 116 MB + * - :ref:`vits-zh-hf-theresa` + - 6.032 + - 3.448 + - 2.566 + - 2.210 + - 117 MB + * - :ref:`vits-zh-hf-eula` + - 6.011 + - 3.473 + - 2.537 + - 2.231 + - 117 MB + * - :ref:`vits-model-aishell3` + - 0.365 + - 0.220 + - 0.171 + - 0.156 + - 30 MB + * - :ref:`vits-model-en_US-lessac-medium` + - 0.774 + - 0.482 + - 0.390 + - 0.357 + - 61 MB + * - :ref:`matcha-icefall-zh-baker` + - 0.892 + - 0.536 + - 0.432 + - 0.391 + - 73 MB + * - :ref:`matcha-icefall-en_US-ljspeech` + - 0.941 + - 0.561 + - 0.451 + - 0.411 + - 71 MB + * - :ref:`kokoro-en-v0_19` + - 6.629 + - 3.870 + - 2.999 + - 2.774 + - 330 MB + * - :ref:`kokoro-multi-lang-v1_0` + - 7.635 + - 4.470 + - 3.430 + - 3.191 + - 311 MB + + diff --git a/docs/source/onnx/tts/pretrained_models/vits.rst b/docs/source/onnx/tts/pretrained_models/vits.rst new file mode 100644 index 000000000..53fca2c15 --- /dev/null +++ b/docs/source/onnx/tts/pretrained_models/vits.rst @@ -0,0 +1,2450 @@ +vits +==== + +This page lists pre-trained `vits`_ models. + +All models in a single table +----------------------------- + +The following table summarizes the information of all models in this page. + +.. note:: + + Since there are more than ``100`` pre-trained models for over ``40`` languages, + we don't list all of them on this page. Please find them at + ``_. + + You can try all the models at the following huggingface space. + ``_. + +.. 
hint:: + + You can find Android APKs for each model at the following page + + ``_ + +.. list-table:: + + * - Model + - Language + - # Speakers + - Dataset + - Model filesize (MB) + - Sample rate (Hz) + * - :ref:`vits-melo-tts-zh_en` + - Chinese + English + - 1 + - N/A + - 163 + - 44100 + * - :ref:`vits-piper-en_US-libritts_r-medium` + - English + - 904 + - `LibriTTS-R`_ + - 75 + - 22050 + * - :ref:`vits-piper-en_US-glados` + - English + - 1 + - N/A + - 61 + - 22050 + * - :ref:`sherpa-onnx-vits-zh-ll` + - Chinese + - 5 + - N/A + - 115 + - 16000 + * - :ref:`vits-zh-hf-fanchen-C` + - Chinese + - 187 + - N/A + - 116 + - 16000 + * - :ref:`vits-zh-hf-fanchen-wnj` + - Chinese + - 1 + - N/A + - 116 + - 16000 + * - :ref:`vits-zh-hf-theresa` + - Chinese + - 804 + - N/A + - 117 + - 22050 + * - :ref:`vits-zh-hf-eula` + - Chinese + - 804 + - N/A + - 117 + - 22050 + * - :ref:`vits-model-aishell3` + - Chinese + - 174 + - `aishell3`_ + - 116 + - 8000 + * - :ref:`vits-model-vits-ljspeech` + - English (US) + - 1 (Female) + - `LJ Speech`_ + - 109 + - 22050 + * - :ref:`vits-model-vits-vctk` + - English + - 109 + - `VCTK`_ + - 116 + - 22050 + * - :ref:`vits-model-en_US-lessac-medium` + - English (US) + - 1 (Male) + - `lessac_blizzard2013`_ + - 61 + - 22050 + +.. _vits-melo-tts-zh_en: + +vits-melo-tts-zh_en (Chinese + English, 1 speaker) +-------------------------------------------------- + +This model is converted from ``_ +and it supports only 1 speaker. It supports both Chinese and English. + +Note that if you input English words, only those that are present in the ``lexicon.txt`` +can be pronounced. Please refer to +``_ +for how to add new words. + +.. hint:: + + The converting script is available at + ``_ + + You can convert more models from ``_ + by yourself. + +In the following, we describe how to download it and use it with `sherpa-onnx`_. + +Download the model +~~~~~~~~~~~~~~~~~~ + +Please use the following commands to download it. + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-melo-tts-zh_en.tar.bz2 + tar xvf vits-melo-tts-zh_en.tar.bz2 + rm vits-melo-tts-zh_en.tar.bz2 + +Please check that the file sizes of the pre-trained models are correct. See +the file sizes of ``*.onnx`` files below. + +.. code-block:: bash + + ls -lh vits-melo-tts-zh_en/ + total 346848 + -rw-r--r-- 1 fangjun staff 1.0K Jul 16 13:38 LICENSE + -rw-r--r-- 1 fangjun staff 156B Jul 16 13:38 README.md + -rw-r--r-- 1 fangjun staff 58K Jul 16 13:38 date.fst + drwxr-xr-x 9 fangjun staff 288B Apr 19 20:42 dict + -rw-r--r-- 1 fangjun staff 6.5M Jul 16 13:38 lexicon.txt + -rw-r--r-- 1 fangjun staff 163M Jul 16 13:38 model.onnx + -rw-r--r-- 1 fangjun staff 63K Jul 16 13:38 number.fst + -rw-r--r-- 1 fangjun staff 87K Jul 16 13:38 phone.fst + -rw-r--r-- 1 fangjun staff 655B Jul 16 13:38 tokens.txt + +Generate speech with executables compiled from C++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. 
code-block:: bash + + cd /path/to/sherpa-onnx + + ./build/bin/sherpa-onnx-offline-tts \ + --vits-model=./vits-melo-tts-zh_en/model.onnx \ + --vits-lexicon=./vits-melo-tts-zh_en/lexicon.txt \ + --vits-tokens=./vits-melo-tts-zh_en/tokens.txt \ + --vits-dict-dir=./vits-melo-tts-zh_en/dict \ + --output-filename=./zh-en-0.wav \ + "This is a 中英文的 text to speech 测试例子。" + + ./build/bin/sherpa-onnx-offline-tts \ + --vits-model=./vits-melo-tts-zh_en/model.onnx \ + --vits-lexicon=./vits-melo-tts-zh_en/lexicon.txt \ + --vits-tokens=./vits-melo-tts-zh_en/tokens.txt \ + --vits-dict-dir=./vits-melo-tts-zh_en/dict \ + --output-filename=./zh-en-1.wav \ + "我最近在学习machine learning,希望能够在未来的artificial intelligence领域有所建树。" + + ./build/bin/sherpa-onnx-offline-tts-play \ + --vits-model=./vits-melo-tts-zh_en/model.onnx \ + --vits-lexicon=./vits-melo-tts-zh_en/lexicon.txt \ + --vits-tokens=./vits-melo-tts-zh_en/tokens.txt \ + --tts-rule-fsts="./vits-melo-tts-zh_en/date.fst,./vits-melo-tts-zh_en/number.fst" \ + --vits-dict-dir=./vits-melo-tts-zh_en/dict \ + --output-filename=./zh-en-2.wav \ + "Are you ok 是雷军2015年4月小米在印度举行新品发布会时说的。他还说过 I am very happy to be in China.雷军事后在微博上表示「万万没想到,视频火速传到国内,全国人民都笑了」、「现在国际米粉越来越多,我的确应该把英文学好,不让大家失望!加油!」" + + +After running, it will generate three files ``zh-en-0.wav``, +``zh-en-1.wav``, and ``zh-en-2.wav`` in the current directory. + +.. code-block:: bash + + soxi zh-en-*.wav + + Input File : 'zh-en-0.wav' + Channels : 1 + Sample Rate : 44100 + Precision : 16-bit + Duration : 00:00:03.54 = 156160 samples = 265.578 CDDA sectors + File Size : 312k + Bit Rate : 706k + Sample Encoding: 16-bit Signed Integer PCM + + + Input File : 'zh-en-1.wav' + Channels : 1 + Sample Rate : 44100 + Precision : 16-bit + Duration : 00:00:05.98 = 263680 samples = 448.435 CDDA sectors + File Size : 527k + Bit Rate : 706k + Sample Encoding: 16-bit Signed Integer PCM + + + Input File : 'zh-en-2.wav' + Channels : 1 + Sample Rate : 44100 + Precision : 16-bit + Duration : 00:00:18.92 = 834560 samples = 1419.32 CDDA sectors + File Size : 1.67M + Bit Rate : 706k + Sample Encoding: 16-bit Signed Integer PCM + + Total Duration of 3 files: 00:00:28.44 + +.. raw:: html + + + + + + + + + + + + +
Wave filenameContentText
zh-en-0.wav + + + This is a 中英文的 text to speech 测试例子。 +
zh-en-1.wav + + + 我最近在学习machine learning,希望能够在未来的artificial intelligence领域有所建树。 +
zh-en-2.wav + + + Are you ok 是雷军2015年4月小米在印度举行新品发布会时说的。他还说过 I am very happy to be in China.雷军事后在微博上表示「万万没想到,视频火速传到国内,全国人民都笑了」、「现在国际米粉越来越多,我的确应该把英文学好,不让大家失望!加油!」 +
+ + +Generate speech with Python script +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + python3 ./python-api-examples/offline-tts-play.py \ + --vits-model=./vits-melo-tts-zh_en/model.onnx \ + --vits-lexicon=./vits-melo-tts-zh_en/lexicon.txt \ + --vits-tokens=./vits-melo-tts-zh_en/tokens.txt \ + --vits-dict-dir=./vits-melo-tts-zh_en/dict \ + --output-filename=./zh-en-3.wav \ + "它也支持繁体字. 我相信你們一定聽過愛迪生說過的這句話Genius is one percent inspiration and ninety-nine percent perspiration. " + +After running, it will generate a file ``zh-en-3.wav`` in the current directory. + +.. code-block:: bash + + soxi zh-en-3.wav + + Input File : 'zh-en-3.wav' + Channels : 1 + Sample Rate : 44100 + Precision : 16-bit + Duration : 00:00:09.83 = 433664 samples = 737.524 CDDA sectors + File Size : 867k + Bit Rate : 706k + Sample Encoding: 16-bit Signed Integer PCM + +.. raw:: html + + + + + + + + + + + + +
Wave filenameContentText
zh-en-3.wav + + + 它也支持繁体字. 我相信你們一定聽過愛迪生說過的這句話Genius is one percent inspiration and ninety-nine percent perspiration. +
+ +RTF on Raspberry Pi 4 Model B Rev 1.5 +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +We use the following command to test the RTF of this model on Raspberry Pi 4 Model B Rev 1.5: + +.. code-block:: bash + + for t in 1 2 3 4; do + ./build/bin/sherpa-onnx-offline-tts \ + --num-threads=$t \ + --vits-model=./vits-melo-tts-zh_en/model.onnx \ + --vits-lexicon=./vits-melo-tts-zh_en/lexicon.txt \ + --vits-tokens=./vits-melo-tts-zh_en/tokens.txt \ + --vits-dict-dir=./vits-melo-tts-zh_en/dict \ + "当夜幕降临,星光点点,伴随着微风拂面,我在静谧中感受着时光的流转,思念如涟漪荡漾,梦境如画卷展开,我与自然融为一体,沉静在这片宁静的美丽之中,感受着生命的奇迹与 温柔." + done + +The results are given below: + + +-------------+-------+-------+-------+-------+ + | num_threads | 1 | 2 | 3 | 4 | + +=============+=======+=======+=======+=======+ + | RTF | 6.727 | 3.877 | 2.914 | 2.518 | + +-------------+-------+-------+-------+-------+ + +.. _vits-piper-en_US-glados: + +vits-piper-en_US-glados (English, 1 speaker) +-------------------------------------------- + +This model is converted from ``_ +and it supports only English. + +See also ``_ . + +If you are interested in how the model is converted to `sherpa-onnx`_, please see +the following colab notebook: + + ``_ + +In the following, we describe how to download it and use it with `sherpa-onnx`_. + +Download the model +~~~~~~~~~~~~~~~~~~ + +Please use the following commands to download it. + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_US-glados.tar.bz2 + tar xvf vits-piper-en_US-glados.tar.bz2 + rm vits-piper-en_US-glados.tar.bz2 + +Please check that the file sizes of the pre-trained models are correct. See +the file sizes of ``*.onnx`` files below. + +.. code-block:: bash + + ls -lh vits-piper-en_US-glados/ + + -rw-r--r-- 1 fangjun staff 242B Dec 13 2023 README.md + -rw-r--r-- 1 fangjun staff 61M Dec 13 2023 en_US-glados.onnx + drwxr-xr-x 122 fangjun staff 3.8K Dec 13 2023 espeak-ng-data + -rw-r--r-- 1 fangjun staff 940B Dec 13 2023 tokens.txt + +Generate speech with executables compiled from C++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + ./build/bin/sherpa-onnx-offline-tts \ + --vits-model=./vits-piper-en_US-glados/en_US-glados.onnx\ + --vits-tokens=./vits-piper-en_US-glados/tokens.txt \ + --vits-data-dir=./vits-piper-en_US-glados/espeak-ng-data \ + --output-filename=./glados-liliana.wav \ + "liliana, the most beautiful and lovely assistant of our team!" + + ./build/bin/sherpa-onnx-offline-tts \ + --vits-model=./vits-piper-en_US-glados/en_US-glados.onnx\ + --vits-tokens=./vits-piper-en_US-glados/tokens.txt \ + --vits-data-dir=./vits-piper-en_US-glados/espeak-ng-data \ + --output-filename=./glados-code.wav \ + "Talk is cheap. Show me the code." + + ./build/bin/sherpa-onnx-offline-tts \ + --vits-model=./vits-piper-en_US-glados/en_US-glados.onnx\ + --vits-tokens=./vits-piper-en_US-glados/tokens.txt \ + --vits-data-dir=./vits-piper-en_US-glados/espeak-ng-data \ + --output-filename=./glados-men.wav \ + "Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar." + +After running, it will generate 3 files ``glados-liliana.wav``, +``glados-code.wav``, and ``glados-men.wav`` in the current directory. + +.. 
code-block:: bash + + soxi glados*.wav + + Input File : 'glados-code.wav' + Channels : 1 + Sample Rate : 22050 + Precision : 16-bit + Duration : 00:00:02.18 = 48128 samples ~ 163.701 CDDA sectors + File Size : 96.3k + Bit Rate : 353k + Sample Encoding: 16-bit Signed Integer PCM + + + Input File : 'glados-liliana.wav' + Channels : 1 + Sample Rate : 22050 + Precision : 16-bit + Duration : 00:00:03.97 = 87552 samples ~ 297.796 CDDA sectors + File Size : 175k + Bit Rate : 353k + Sample Encoding: 16-bit Signed Integer PCM + + + Input File : 'glados-men.wav' + Channels : 1 + Sample Rate : 22050 + Precision : 16-bit + Duration : 00:00:15.31 = 337664 samples ~ 1148.52 CDDA sectors + File Size : 675k + Bit Rate : 353k + Sample Encoding: 16-bit Signed Integer PCM + + Total Duration of 3 files: 00:00:21.47 + +.. raw:: html + + + + + + + + + + + + + + + + + + + + + + +
Wave filename | Text
glados-liliana.wav | liliana, the most beautiful and lovely assistant of our team!
glados-code.wav | Talk is cheap. Show me the code.
glados-men.wav | Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar.
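If you need to synthesize many sentences in one go, you can drive the same binary from a small shell loop. This is only a sketch: ``sentences.txt`` and its contents are made up, and the model flags are the ones used in the commands above.

.. code-block:: bash

   cat > sentences.txt << 'EOF'
   liliana, the most beautiful and lovely assistant of our team!
   Talk is cheap. Show me the code.
   EOF

   i=0
   while IFS= read -r line; do
     ./build/bin/sherpa-onnx-offline-tts \
       --vits-model=./vits-piper-en_US-glados/en_US-glados.onnx \
       --vits-tokens=./vits-piper-en_US-glados/tokens.txt \
       --vits-data-dir=./vits-piper-en_US-glados/espeak-ng-data \
       --output-filename=./glados-batch-$i.wav \
       "$line"
     i=$((i+1))
   done < sentences.txt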
+ +Generate speech with Python script +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + python3 ./python-api-examples/offline-tts.py \ + --vits-model=./vits-piper-en_US-glados/en_US-glados.onnx\ + --vits-tokens=./vits-piper-en_US-glados/tokens.txt \ + --vits-data-dir=./vits-piper-en_US-glados/espeak-ng-data \ + --output-filename=./glados-ship.wav \ + "A ship in port is safe, but that's not what ships are built for." + + python3 ./python-api-examples/offline-tts.py \ + --vits-model=./vits-piper-en_US-glados/en_US-glados.onnx\ + --vits-tokens=./vits-piper-en_US-glados/tokens.txt \ + --vits-data-dir=./vits-piper-en_US-glados/espeak-ng-data \ + --output-filename=./glados-bug.wav \ + "Given enough eyeballs, all bugs are shallow." + +After running, it will generate two files ``glados-ship.wav`` +and ``glados-bug.wav`` in the current directory. + +.. code-block:: bash + + soxi ./glados-{ship,bug}.wav + + Input File : './glados-ship.wav' + Channels : 1 + Sample Rate : 22050 + Precision : 16-bit + Duration : 00:00:03.74 = 82432 samples ~ 280.381 CDDA sectors + File Size : 165k + Bit Rate : 353k + Sample Encoding: 16-bit Signed Integer PCM + + + Input File : './glados-bug.wav' + Channels : 1 + Sample Rate : 22050 + Precision : 16-bit + Duration : 00:00:02.67 = 58880 samples ~ 200.272 CDDA sectors + File Size : 118k + Bit Rate : 353k + Sample Encoding: 16-bit Signed Integer PCM + + Total Duration of 2 files: 00:00:06.41 + +.. raw:: html + + + + + + + + + + + + + + + + + +
Wave filename | Text
glados-ship.wav | A ship in port is safe, but that's not what ships are built for.
glados-bug.wav | Given enough eyeballs, all bugs are shallow.
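As the ``soxi`` output above shows, the generated files are 22.05 kHz, 16-bit, mono. If a downstream tool expects a different sample rate, say 16 kHz, you can resample with sox. A sketch (it assumes sox is installed; the output filename is arbitrary):

.. code-block:: bash

   sox ./glados-ship.wav -r 16000 ./glados-ship-16k.wav
   soxi ./glados-ship-16k.wav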
+ +RTF on Raspberry Pi 4 Model B Rev 1.5 +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +We use the following command to test the RTF of this model on Raspberry Pi 4 Model B Rev 1.5: + +.. code-block:: bash + + for t in 1 2 3 4; do + ./build/bin/sherpa-onnx-offline-tts \ + --num-threads=$t \ + --vits-model=./vits-piper-en_US-glados/en_US-glados.onnx\ + --vits-tokens=./vits-piper-en_US-glados/tokens.txt \ + --vits-data-dir=./vits-piper-en_US-glados/espeak-ng-data \ + "Friends fell out often because life was changing so fast. The easiest thing in the world was to lose touch with someone." + done + +The results are given below: + + +-------------+-------+-------+-------+-------+ + | num_threads | 1 | 2 | 3 | 4 | + +=============+=======+=======+=======+=======+ + | RTF | 0.812 | 0.480 | 0.391 | 0.349 | + +-------------+-------+-------+-------+-------+ + +.. _vits-piper-en_US-libritts_r-medium: + +vits-piper-en_US-libritts_r-medium (English, 904 speakers) +---------------------------------------------------------- + +This model is converted from ``_ +and it supports 904 speakers. It supports only English. + +In the following, we describe how to download it and use it with `sherpa-onnx`_. + +Download the model +~~~~~~~~~~~~~~~~~~ + +Please use the following commands to download it. + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_US-libritts_r-medium.tar.bz2 + tar xvf vits-piper-en_US-libritts_r-medium.tar.bz2 + rm vits-piper-en_US-libritts_r-medium.tar.bz2 + +Please check that the file sizes of the pre-trained models are correct. See +the file sizes of ``*.onnx`` files below. + +.. code-block:: bash + + ls -lh vits-piper-en_US-libritts_r-medium/ + total 153552 + -rw-r--r-- 1 fangjun staff 279B Nov 29 2023 MODEL_CARD + -rw-r--r-- 1 fangjun staff 75M Nov 29 2023 en_US-libritts_r-medium.onnx + -rw-r--r-- 1 fangjun staff 20K Nov 29 2023 en_US-libritts_r-medium.onnx.json + drwxr-xr-x 122 fangjun staff 3.8K Nov 28 2023 espeak-ng-data + -rw-r--r-- 1 fangjun staff 954B Nov 29 2023 tokens.txt + -rwxr-xr-x 1 fangjun staff 1.8K Nov 29 2023 vits-piper-en_US.py + -rwxr-xr-x 1 fangjun staff 730B Nov 29 2023 vits-piper-en_US.sh + +Generate speech with executables compiled from C++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + ./build/bin/sherpa-onnx-offline-tts \ + --vits-model=./vits-piper-en_US-libritts_r-medium/en_US-libritts_r-medium.onnx \ + --vits-tokens=./vits-piper-en_US-libritts_r-medium/tokens.txt \ + --vits-data-dir=./vits-piper-en_US-libritts_r-medium/espeak-ng-data \ + --output-filename=./libritts-liliana-109.wav \ + --sid=109 \ + "liliana, the most beautiful and lovely assistant of our team!" + + ./build/bin/sherpa-onnx-offline-tts \ + --vits-model=./vits-piper-en_US-libritts_r-medium/en_US-libritts_r-medium.onnx \ + --vits-tokens=./vits-piper-en_US-libritts_r-medium/tokens.txt \ + --vits-data-dir=./vits-piper-en_US-libritts_r-medium/espeak-ng-data \ + --output-filename=./libritts-liliana-900.wav \ + --sid=900 \ + "liliana, the most beautiful and lovely assistant of our team!" + +After running, it will generate two files ``libritts-liliana-109.wav`` +and ``libritts-liliana-900.wav`` in the current directory. + +.. 
code-block:: bash + + soxi libritts-liliana-*.wav + + Input File : 'libritts-liliana-109.wav' + Channels : 1 + Sample Rate : 22050 + Precision : 16-bit + Duration : 00:00:02.73 = 60160 samples ~ 204.626 CDDA sectors + File Size : 120k + Bit Rate : 353k + Sample Encoding: 16-bit Signed Integer PCM + + + Input File : 'libritts-liliana-900.wav' + Channels : 1 + Sample Rate : 22050 + Precision : 16-bit + Duration : 00:00:03.36 = 73984 samples ~ 251.646 CDDA sectors + File Size : 148k + Bit Rate : 353k + Sample Encoding: 16-bit Signed Integer PCM + + Total Duration of 2 files: 00:00:06.08 + +.. raw:: html + + + + + + + + + + + + + + + + + +
Wave filename | Text
libritts-liliana-109.wav | liliana, the most beautiful and lovely assistant of our team!
libritts-liliana-900.wav | liliana, the most beautiful and lovely assistant of our team!
+ +Generate speech with Python script +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + python3 ./python-api-examples/offline-tts.py \ + --vits-model=./vits-piper-en_US-libritts_r-medium/en_US-libritts_r-medium.onnx \ + --vits-tokens=./vits-piper-en_US-libritts_r-medium/tokens.txt \ + --vits-data-dir=./vits-piper-en_US-libritts_r-medium/espeak-ng-data \ + --sid=200 \ + --output-filename=./libritts-armstrong-200.wav \ + "That's one small step for a man, a giant leap for mankind." + + python3 ./python-api-examples/offline-tts.py \ + --vits-model=./vits-piper-en_US-libritts_r-medium/en_US-libritts_r-medium.onnx \ + --vits-tokens=./vits-piper-en_US-libritts_r-medium/tokens.txt \ + --vits-data-dir=./vits-piper-en_US-libritts_r-medium/espeak-ng-data \ + --sid=500 \ + --output-filename=./libritts-armstrong-500.wav \ + "That's one small step for a man, a giant leap for mankind." + +After running, it will generate two files ``libritts-armstrong-200.wav`` +and ``libritts-armstrong-500.wav`` in the current directory. + +.. code-block:: bash + + soxi ./libritts-armstrong*.wav + + Input File : './libritts-armstrong-200.wav' + Channels : 1 + Sample Rate : 22050 + Precision : 16-bit + Duration : 00:00:03.11 = 68608 samples ~ 233.361 CDDA sectors + File Size : 137k + Bit Rate : 353k + Sample Encoding: 16-bit Signed Integer PCM + + + Input File : './libritts-armstrong-500.wav' + Channels : 1 + Sample Rate : 22050 + Precision : 16-bit + Duration : 00:00:03.42 = 75520 samples ~ 256.871 CDDA sectors + File Size : 151k + Bit Rate : 353k + Sample Encoding: 16-bit Signed Integer PCM + + Total Duration of 2 files: 00:00:06.54 + +.. raw:: html + + + + + + + + + + + + + + + + + +
Wave filename | Text
libritts-armstrong-200.wav | That's one small step for a man, a giant leap for mankind.
libritts-armstrong-500.wav | That's one small step for a man, a giant leap for mankind.
+ +RTF on Raspberry Pi 4 Model B Rev 1.5 +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +We use the following command to test the RTF of this model on Raspberry Pi 4 Model B Rev 1.5: + +.. code-block:: bash + + for t in 1 2 3 4; do + ./build/bin/sherpa-onnx-offline-tts \ + --num-threads=$t \ + --vits-model=./vits-piper-en_US-libritts_r-medium/en_US-libritts_r-medium.onnx \ + --vits-tokens=./vits-piper-en_US-libritts_r-medium/tokens.txt \ + --vits-data-dir=./vits-piper-en_US-libritts_r-medium/espeak-ng-data \ + "Friends fell out often because life was changing so fast. The easiest thing in the world was to lose touch with someone." + done + +The results are given below: + + +-------------+-------+-------+-------+-------+ + | num_threads | 1 | 2 | 3 | 4 | + +=============+=======+=======+=======+=======+ + | RTF | 0.790 | 0.493 | 0.392 | 0.357 | + +-------------+-------+-------+-------+-------+ + +.. _vits-model-vits-ljspeech: + +ljspeech (English, single-speaker) +---------------------------------- + +This model is converted from `pretrained_ljspeech.pth `_, +which is trained by the `vits`_ author `Jaehyeon Kim `_ on +the `LJ Speech`_ dataset. It supports only English and is a single-speaker model. + +.. note:: + + If you are interested in how the model is converted, please see + ``_ + +In the following, we describe how to download it and use it with `sherpa-onnx`_. + +Download the model +~~~~~~~~~~~~~~~~~~ + +Please use the following commands to download it. + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-ljs.tar.bz2 + tar xvf vits-ljs.tar.bz2 + rm vits-ljs.tar.bz2 + +Please check that the file sizes of the pre-trained models are correct. See +the file sizes of ``*.onnx`` files below. + +.. code-block:: bash + + -rw-r--r-- 1 1001 127 109M Apr 22 02:38 vits-ljs/vits-ljs.onnx + +Generate speech with executables compiled from C++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + ./build/bin/sherpa-onnx-offline-tts \ + --vits-model=./vits-ljs/vits-ljs.onnx \ + --vits-lexicon=./vits-ljs/lexicon.txt \ + --vits-tokens=./vits-ljs/tokens.txt \ + --output-filename=./liliana.wav \ + "liliana, the most beautiful and lovely assistant of our team!" + +After running, it will generate a file ``liliana.wav`` in the current directory. + +.. code-block:: bash + + soxi ./liliana.wav + + Input File : './liliana.wav' + Channels : 1 + Sample Rate : 22050 + Precision : 16-bit + Duration : 00:00:04.39 = 96768 samples ~ 329.143 CDDA sectors + File Size : 194k + Bit Rate : 353k + Sample Encoding: 16-bit Signed Integer PCM + +.. raw:: html + + + + + + + + + + + + +
Wave filename | Text
liliana.wav | liliana, the most beautiful and lovely assistant of our team!
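To listen to the result directly from the command line you can use ``play`` (part of sox) or ``aplay`` (from alsa-utils), whichever is available on your system:

.. code-block:: bash

   # either of the following works, depending on what is installed
   play ./liliana.wav
   aplay ./liliana.wav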
+ +Generate speech with Python script +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + python3 ./python-api-examples/offline-tts.py \ + --vits-model=./vits-ljs/vits-ljs.onnx \ + --vits-lexicon=./vits-ljs/lexicon.txt \ + --vits-tokens=./vits-ljs/tokens.txt \ + --output-filename=./armstrong.wav \ + "That's one small step for a man, a giant leap for mankind." + +After running, it will generate a file ``armstrong.wav`` in the current directory. + +.. code-block:: bash + + soxi ./armstrong.wav + + Input File : './armstrong.wav' + Channels : 1 + Sample Rate : 22050 + Precision : 16-bit + Duration : 00:00:04.81 = 105984 samples ~ 360.49 CDDA sectors + File Size : 212k + Bit Rate : 353k + Sample Encoding: 16-bit Signed Integer PCM + +.. raw:: html + + + + + + + + + + + + +
Wave filename | Text
armstrong.wav | That's one small step for a man, a giant leap for mankind.
+ +RTF on Raspberry Pi 4 Model B Rev 1.5 +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +We use the following command to test the RTF of this model on Raspberry Pi 4 Model B Rev 1.5: + +.. code-block:: bash + + for t in 1 2 3 4; do + ./build/bin/sherpa-onnx-offline-tts \ + --num-threads=$t \ + --vits-model=./vits-ljs/vits-ljs.onnx \ + --vits-lexicon=./vits-ljs/lexicon.txt \ + --vits-tokens=./vits-ljs/tokens.txt \ + "Friends fell out often because life was changing so fast. The easiest thing in the world was to lose touch with someone." + done + +The results are given below: + + +-------------+-------+-------+-------+-------+ + | num_threads | 1 | 2 | 3 | 4 | + +=============+=======+=======+=======+=======+ + | RTF | 6.057 | 3.517 | 2.535 | 2.206 | + +-------------+-------+-------+-------+-------+ + +.. _vits-model-vits-vctk: + +VCTK (English, multi-speaker, 109 speakers) +------------------------------------------- + +This model is converted from `pretrained_vctk.pth `_, +which is trained by the `vits`_ author `Jaehyeon Kim `_ on +the `VCTK`_ dataset. It supports only English and is a multi-speaker model. It contains +109 speakers. + +.. note:: + + If you are interested in how the model is converted, please see + ``_ + +In the following, we describe how to download it and use it with `sherpa-onnx`_. + +Download the model +~~~~~~~~~~~~~~~~~~ + +Please use the following commands to download it. + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-vctk.tar.bz2 + tar xvf vits-vctk.tar.bz2 + rm vits-vctk.tar.bz2 + +Please check that the file sizes of the pre-trained models are correct. See +the file sizes of ``*.onnx`` files below. + +.. code-block:: bash + + vits-vctk fangjun$ ls -lh *.onnx + -rw-r--r-- 1 fangjun staff 37M Oct 16 10:57 vits-vctk.int8.onnx + -rw-r--r-- 1 fangjun staff 116M Oct 16 10:57 vits-vctk.onnx + +Generate speech with executables compiled from C++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Since there are 109 speakers available, we can choose a speaker from 0 to 108. +The default speaker ID is 0. + +We use speaker ID 0, 10, and 108 below to generate audio for the same text. + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + ./build/bin/sherpa-onnx-offline-tts \ + --vits-model=./vits-vctk/vits-vctk.onnx \ + --vits-lexicon=./vits-vctk/lexicon.txt \ + --vits-tokens=./vits-vctk/tokens.txt \ + --sid=0 \ + --output-filename=./kennedy-0.wav \ + "Ask not what your country can do for you; ask what you can do for your country." + + ./build/bin/sherpa-onnx-offline-tts \ + --vits-model=./vits-vctk/vits-vctk.onnx \ + --vits-lexicon=./vits-vctk/lexicon.txt \ + --vits-tokens=./vits-vctk/tokens.txt \ + --sid=10 \ + --output-filename=./kennedy-10.wav \ + "Ask not what your country can do for you; ask what you can do for your country." + + ./build/bin/sherpa-onnx-offline-tts \ + --vits-model=./vits-vctk/vits-vctk.onnx \ + --vits-lexicon=./vits-vctk/lexicon.txt \ + --vits-tokens=./vits-vctk/tokens.txt \ + --sid=108 \ + --output-filename=./kennedy-108.wav \ + "Ask not what your country can do for you; ask what you can do for your country." + +It will generate 3 files: ``kennedy-0.wav``, ``kennedy-10.wav``, and ``kennedy-108.wav``. + +.. raw:: html +
Wave filename | Text
kennedy-0.wav | Ask not what your country can do for you; ask what you can do for your country.
kennedy-10.wav | Ask not what your country can do for you; ask what you can do for your country.
kennedy-108.wav | Ask not what your country can do for you; ask what you can do for your country.
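To compare several voices quickly, you can loop over a few speaker IDs; the IDs below are arbitrary examples within the valid range 0 to 108, and the flags are the same as in the commands above.

.. code-block:: bash

   for sid in 0 25 50 75 108; do
     ./build/bin/sherpa-onnx-offline-tts \
       --vits-model=./vits-vctk/vits-vctk.onnx \
       --vits-lexicon=./vits-vctk/lexicon.txt \
       --vits-tokens=./vits-vctk/tokens.txt \
       --sid=$sid \
       --output-filename=./kennedy-$sid.wav \
       "Ask not what your country can do for you; ask what you can do for your country."
   done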
+ +Generate speech with Python script +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +We use speaker ID 30, 66, and 99 below to generate audio for different transcripts. + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + python3 ./python-api-examples/offline-tts.py \ + --vits-model=./vits-vctk/vits-vctk.onnx \ + --vits-lexicon=./vits-vctk/lexicon.txt \ + --vits-tokens=./vits-vctk/tokens.txt \ + --sid=30 \ + --output-filename=./einstein-30.wav \ + "Life is like riding a bicycle. To keep your balance, you must keep moving." + + python3 ./python-api-examples/offline-tts.py \ + --vits-model=./vits-vctk/vits-vctk.onnx \ + --vits-lexicon=./vits-vctk/lexicon.txt \ + --vits-tokens=./vits-vctk/tokens.txt \ + --sid=66 \ + --output-filename=./franklin-66.wav \ + "Three can keep a secret, if two of them are dead." + + python3 ./python-api-examples/offline-tts.py \ + --vits-model=./vits-vctk/vits-vctk.onnx \ + --vits-lexicon=./vits-vctk/lexicon.txt \ + --vits-tokens=./vits-vctk/tokens.txt \ + --sid=99 \ + --output-filename=./martin-99.wav \ + "Darkness cannot drive out darkness: only light can do that. Hate cannot drive out hate: only love can do that" + +It will generate 3 files: ``einstein-30.wav``, ``franklin-66.wav``, and ``martin-99.wav``. + +.. raw:: html + + + + + + + + + + + + + + + + + + + + + + +
Wave filename | Text
einstein-30.wav | Life is like riding a bicycle. To keep your balance, you must keep moving.
franklin-66.wav | Three can keep a secret, if two of them are dead.
martin-99.wav | Darkness cannot drive out darkness: only light can do that. Hate cannot drive out hate: only love can do that
+ +RTF on Raspberry Pi 4 Model B Rev 1.5 +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +We use the following command to test the RTF of this model on Raspberry Pi 4 Model B Rev 1.5: + +.. code-block:: bash + + for t in 1 2 3 4; do + ./build/bin/sherpa-onnx-offline-tts \ + --num-threads=$t \ + --vits-model=./vits-vctk/vits-vctk.onnx \ + --vits-lexicon=./vits-vctk/lexicon.txt \ + --vits-tokens=./vits-vctk/tokens.txt \ + "Friends fell out often because life was changing so fast. The easiest thing in the world was to lose touch with someone." + done + +The results are given below: + + +-------------+-------+-------+-------+-------+ + | num_threads | 1 | 2 | 3 | 4 | + +=============+=======+=======+=======+=======+ + | RTF | 6.079 | 3.483 | 2.537 | 2.226 | + +-------------+-------+-------+-------+-------+ + +.. _sherpa-onnx-vits-zh-ll: + +csukuangfj/sherpa-onnx-vits-zh-ll (Chinese, 5 speakers) +------------------------------------------------------- + +You can download the model using the following commands:: + + wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/sherpa-onnx-vits-zh-ll.tar.bz2 + tar xvf sherpa-onnx-vits-zh-ll.tar.bz2 + rm sherpa-onnx-vits-zh-ll.tar.bz2 + +.. hint:: + + This model is trained with the following framework + + ``_ + +Please check the file sizes of the downloaded model: + +.. code-block:: bash + + ls -lh sherpa-onnx-vits-zh-ll/ + + -rw-r--r-- 1 fangjun staff 2.3K Apr 25 17:58 G_multisperaker_latest.json + -rw-r-----@ 1 fangjun staff 2.2K Apr 25 17:22 G_multisperaker_latest_low.json + -rw-r--r-- 1 fangjun staff 127B Apr 25 17:58 README.md + -rw-r--r-- 1 fangjun staff 58K Apr 25 17:58 date.fst + drwxr-xr-x 9 fangjun staff 288B Jun 21 16:32 dict + -rw-r--r-- 1 fangjun staff 368K Apr 25 17:58 lexicon.txt + -rw-r--r-- 1 fangjun staff 115M Apr 25 17:58 model.onnx + -rw-r--r-- 1 fangjun staff 21K Apr 25 17:58 new_heteronym.fst + -rw-r--r-- 1 fangjun staff 63K Apr 25 17:58 number.fst + -rw-r--r-- 1 fangjun staff 87K Apr 25 17:58 phone.fst + -rw-r--r-- 1 fangjun staff 331B Apr 25 17:58 tokens.txt + +**usage**: + +.. code-block:: bash + + sherpa-onnx-offline-tts \ + --vits-model=./sherpa-onnx-vits-zh-ll/model.onnx \ + --vits-dict-dir=./sherpa-onnx-vits-zh-ll/dict \ + --vits-lexicon=./sherpa-onnx-vits-zh-ll/lexicon.txt \ + --vits-tokens=./sherpa-onnx-vits-zh-ll/tokens.txt \ + --vits-length-scale=0.5 \ + --sid=0 \ + --output-filename="./0-value-2x.wav" \ + "小米的核心价值观是什么?答案是真诚热爱!" 
+ + + sherpa-onnx-offline-tts \ + --vits-model=./sherpa-onnx-vits-zh-ll/model.onnx \ + --vits-dict-dir=./sherpa-onnx-vits-zh-ll/dict \ + --vits-lexicon=./sherpa-onnx-vits-zh-ll/lexicon.txt \ + --vits-tokens=./sherpa-onnx-vits-zh-ll/tokens.txt \ + --sid=1 \ + --tts-rule-fsts=./sherpa-onnx-vits-zh-ll/number.fst \ + --output-filename="./1-numbers.wav" \ + "小米有14岁了" + + sherpa-onnx-offline-tts \ + --vits-model=./sherpa-onnx-vits-zh-ll/model.onnx \ + --vits-dict-dir=./sherpa-onnx-vits-zh-ll/dict \ + --vits-lexicon=./sherpa-onnx-vits-zh-ll/lexicon.txt \ + --vits-tokens=./sherpa-onnx-vits-zh-ll/tokens.txt \ + --tts-rule-fsts=./sherpa-onnx-vits-zh-ll/phone.fst,./sherpa-onnx-vits-zh-ll/number.fst \ + --sid=2 \ + --output-filename="./2-numbers.wav" \ + "有困难,请拨打110 或者18601200909" + + sherpa-onnx-offline-tts \ + --vits-model=./sherpa-onnx-vits-zh-ll/model.onnx \ + --vits-dict-dir=./sherpa-onnx-vits-zh-ll/dict \ + --vits-lexicon=./sherpa-onnx-vits-zh-ll/lexicon.txt \ + --vits-tokens=./sherpa-onnx-vits-zh-ll/tokens.txt \ + --sid=3 \ + --output-filename="./3-wo-mi.wav" \ + "小米的使命是,始终坚持做感动人心、价格厚道的好产品,让全球每个人都能享受科技带来的美好生活。" + + sherpa-onnx-offline-tts \ + --vits-model=./sherpa-onnx-vits-zh-ll/model.onnx \ + --vits-dict-dir=./sherpa-onnx-vits-zh-ll/dict \ + --vits-lexicon=./sherpa-onnx-vits-zh-ll/lexicon.txt \ + --vits-tokens=./sherpa-onnx-vits-zh-ll/tokens.txt \ + --tts-rule-fsts=./sherpa-onnx-vits-zh-ll/number.fst \ + --sid=4 \ + --output-filename="./4-heteronym.wav" \ + "35年前,他于长沙出生, 在长白山长大。9年前他当上了银行的领导,主管行政。" + +.. raw:: html + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Wave filename | Text
0-value-2x.wav | 小米的核心价值观是什么?答案是真诚热爱!
1-numbers.wav | 小米有14岁了
2-numbers.wav | 有困难,请拨打110 或者18601200909
3-wo-mi.wav | 小米的使命是,始终坚持做感动人心、价格厚道的好产品,让全球每个人都能享受科技带来的美好生活。
4-heteronym.wav | 35年前,他于长沙出生, 在长白山长大。9年前他当上了银行的领导,主管行政。
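As the ``2x`` in the first filename suggests, ``--vits-length-scale`` controls the speaking rate: 0.5 makes the speech roughly twice as fast, while values above 1 slow it down. The sketch below generates the same sentence at two other (arbitrary) settings so you can compare them:

.. code-block:: bash

   for scale in 0.8 1.2; do
     sherpa-onnx-offline-tts \
       --vits-model=./sherpa-onnx-vits-zh-ll/model.onnx \
       --vits-dict-dir=./sherpa-onnx-vits-zh-ll/dict \
       --vits-lexicon=./sherpa-onnx-vits-zh-ll/lexicon.txt \
       --vits-tokens=./sherpa-onnx-vits-zh-ll/tokens.txt \
       --vits-length-scale=$scale \
       --sid=0 \
       --output-filename=./0-value-scale-$scale.wav \
       "小米的核心价值观是什么?答案是真诚热爱!"
   done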
+ +RTF on Raspberry Pi 4 Model B Rev 1.5 +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +We use the following command to test the RTF of this model on Raspberry Pi 4 Model B Rev 1.5: + +.. code-block:: bash + + for t in 1 2 3 4; do + ./build/bin/sherpa-onnx-offline-tts \ + --num-threads=$t \ + --vits-model=./sherpa-onnx-vits-zh-ll/model.onnx \ + --vits-dict-dir=./sherpa-onnx-vits-zh-ll/dict \ + --vits-lexicon=./sherpa-onnx-vits-zh-ll/lexicon.txt \ + --vits-tokens=./sherpa-onnx-vits-zh-ll/tokens.txt \ + '当夜幕降临,星光点点,伴随着微风拂面,我在静谧中感受着时光的流转,思念如涟漪荡漾,梦境如画卷展开,我与自然融为一体,沉静在这片宁静的美丽之中,感受着生命的奇迹与温柔.' + done + +The results are given below: + + +-------------+-------+-------+-------+-------+ + | num_threads | 1 | 2 | 3 | 4 | + +=============+=======+=======+=======+=======+ + | RTF | 4.275 | 2.494 | 1.840 | 1.593 | + +-------------+-------+-------+-------+-------+ + +.. _vits-zh-hf-fanchen-C: + +csukuangfj/vits-zh-hf-fanchen-C (Chinese, 187 speakers) +------------------------------------------------------- + +You can download the model using the following commands:: + + wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-zh-hf-fanchen-C.tar.bz2 + tar xvf vits-zh-hf-fanchen-C.tar.bz2 + rm vits-zh-hf-fanchen-C.tar.bz2 + +.. hint:: + + This model is converted from + ``_ + +.. code-block:: bash + + # information about model files + + total 291M + -rw-r--r-- 1 1001 127 58K Apr 21 05:40 date.fst + drwxr-xr-x 3 1001 127 4.0K Apr 19 12:42 dict + -rwxr-xr-x 1 1001 127 4.0K Apr 21 05:40 export-onnx-zh-hf-fanchen-models.py + -rwxr-xr-x 1 1001 127 2.5K Apr 21 05:40 generate-lexicon-zh-hf-fanchen-models.py + -rw-r--r-- 1 1001 127 2.4M Apr 21 05:40 lexicon.txt + -rw-r--r-- 1 1001 127 22K Apr 21 05:40 new_heteronym.fst + -rw-r--r-- 1 1001 127 63K Apr 21 05:40 number.fst + -rw-r--r-- 1 1001 127 87K Apr 21 05:40 phone.fst + -rw-r--r-- 1 1001 127 173M Apr 21 05:40 rule.far + -rw-r--r-- 1 1001 127 331 Apr 21 05:40 tokens.txt + -rw-r--r-- 1 1001 127 116M Apr 21 05:40 vits-zh-hf-fanchen-C.onnx + -rwxr-xr-x 1 1001 127 2.0K Apr 21 05:40 vits-zh-hf-fanchen-models.sh + +**usage**: + +.. code-block:: bash + + sherpa-onnx-offline-tts \ + --vits-model=./vits-zh-hf-fanchen-C/vits-zh-hf-fanchen-C.onnx \ + --vits-dict-dir=./vits-zh-hf-fanchen-C/dict \ + --vits-lexicon=./vits-zh-hf-fanchen-C/lexicon.txt \ + --vits-tokens=./vits-zh-hf-fanchen-C/tokens.txt \ + --vits-length-scale=0.5 \ + --output-filename="./value-2x.wav" \ + "小米的核心价值观是什么?答案是真诚热爱!" 
+ + + sherpa-onnx-offline-tts \ + --vits-model=./vits-zh-hf-fanchen-C/vits-zh-hf-fanchen-C.onnx \ + --vits-dict-dir=./vits-zh-hf-fanchen-C/dict \ + --vits-lexicon=./vits-zh-hf-fanchen-C/lexicon.txt \ + --vits-tokens=./vits-zh-hf-fanchen-C/tokens.txt \ + --vits-length-scale=1.0 \ + --tts-rule-fsts=./vits-zh-hf-fanchen-C/number.fst \ + --output-filename="./numbers.wav" \ + "小米有14岁了" + + sherpa-onnx-offline-tts \ + --sid=100 \ + --vits-model=./vits-zh-hf-fanchen-C/vits-zh-hf-fanchen-C.onnx \ + --vits-dict-dir=./vits-zh-hf-fanchen-C/dict \ + --vits-lexicon=./vits-zh-hf-fanchen-C/lexicon.txt \ + --vits-tokens=./vits-zh-hf-fanchen-C/tokens.txt \ + --vits-length-scale=1.0 \ + --tts-rule-fsts=./vits-zh-hf-fanchen-C/phone.fst,./vits-zh-hf-fanchen-C/number.fst \ + --output-filename="./numbers-100.wav" \ + "有困难,请拨打110 或者18601200909" + + sherpa-onnx-offline-tts \ + --sid=14 \ + --vits-model=./vits-zh-hf-fanchen-C/vits-zh-hf-fanchen-C.onnx \ + --vits-dict-dir=./vits-zh-hf-fanchen-C/dict \ + --vits-lexicon=./vits-zh-hf-fanchen-C/lexicon.txt \ + --vits-tokens=./vits-zh-hf-fanchen-C/tokens.txt \ + --vits-length-scale=1.0 \ + --output-filename="./wo-mi-14.wav" \ + "小米的使命是,始终坚持做感动人心、价格厚道的好产品,让全球每个人都能享受科技带来的美好生活。" + + sherpa-onnx-offline-tts \ + --sid=102 \ + --vits-model=./vits-zh-hf-fanchen-C/vits-zh-hf-fanchen-C.onnx \ + --vits-dict-dir=./vits-zh-hf-fanchen-C/dict \ + --vits-lexicon=./vits-zh-hf-fanchen-C/lexicon.txt \ + --vits-tokens=./vits-zh-hf-fanchen-C/tokens.txt \ + --tts-rule-fsts=./vits-zh-hf-fanchen-C/number.fst \ + --vits-length-scale=1.0 \ + --output-filename="./heteronym-102.wav" \ + "35年前,他于长沙出生, 在长白山长大。9年前他当上了银行的领导,主管行政。1天前莅临我行指导工作。" + +.. raw:: html + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Wave filename | Text
value-2x.wav | 小米的核心价值观是什么?答案是真诚热爱!
numbers.wav | 小米有14岁了
numbers-100.wav | 有困难,请拨打110 或者18601200909
wo-mi-14.wav | 小米的使命是,始终坚持做感动人心、价格厚道的好产品,让全球每个人都能享受科技带来的美好生活。
heteronym-102.wav | 35年前,他于长沙出生, 在长白山长大。9年前他当上了银行的领导,主管行政。1天前莅临我行指导工作。
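The file listing above also contains a ``date.fst``. If your input contains dates, you can chain it with the other rule FSTs via ``--tts-rule-fsts`` (comma-separated, as in the examples above). The command below is only a sketch: the input sentence and output filename are made up.

.. code-block:: bash

   sherpa-onnx-offline-tts \
     --vits-model=./vits-zh-hf-fanchen-C/vits-zh-hf-fanchen-C.onnx \
     --vits-dict-dir=./vits-zh-hf-fanchen-C/dict \
     --vits-lexicon=./vits-zh-hf-fanchen-C/lexicon.txt \
     --vits-tokens=./vits-zh-hf-fanchen-C/tokens.txt \
     --tts-rule-fsts=./vits-zh-hf-fanchen-C/date.fst,./vits-zh-hf-fanchen-C/number.fst \
     --output-filename="./date-example.wav" \
     "他出生于2013年5月2日"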
+ +RTF on Raspberry Pi 4 Model B Rev 1.5 +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +We use the following command to test the RTF of this model on Raspberry Pi 4 Model B Rev 1.5: + +.. code-block:: bash + + for t in 1 2 3 4; do + ./build/bin/sherpa-onnx-offline-tts \ + --num-threads=$t \ + --vits-model=./vits-zh-hf-fanchen-C/vits-zh-hf-fanchen-C.onnx \ + --vits-dict-dir=./vits-zh-hf-fanchen-C/dict \ + --vits-lexicon=./vits-zh-hf-fanchen-C/lexicon.txt \ + --vits-tokens=./vits-zh-hf-fanchen-C/tokens.txt \ + "当夜幕降临,星光点点,伴随着微风拂面,我在静谧中感受着时光的流转,思念如涟漪荡漾,梦境如画卷展开,我与自然融为一体,沉静在这片宁静的美丽之中,感受着生命的奇迹与 温柔." + done + +The results are given below: + + +-------------+-------+-------+-------+-------+ + | num_threads | 1 | 2 | 3 | 4 | + +=============+=======+=======+=======+=======+ + | RTF | 4.306 | 2.451 | 1.846 | 1.600 | + +-------------+-------+-------+-------+-------+ + +.. _vits-zh-hf-fanchen-wnj: + +csukuangfj/vits-zh-hf-fanchen-wnj (Chinese, 1 male) +--------------------------------------------------- + +You can download the model using the following commands:: + + wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-zh-hf-fanchen-wnj.tar.bz2 + tar xvf vits-zh-hf-fanchen-wnj.tar.bz2 + rm vits-zh-hf-fanchen-wnj.tar.bz2 + +.. hint:: + + This model is converted from + ``_ + +.. code-block:: bash + + # information about model files + total 594760 + -rw-r--r-- 1 fangjun staff 58K Apr 21 13:40 date.fst + drwxr-xr-x 9 fangjun staff 288B Apr 19 20:42 dict + -rwxr-xr-x 1 fangjun staff 3.9K Apr 21 13:40 export-onnx-zh-hf-fanchen-models.py + -rwxr-xr-x 1 fangjun staff 2.4K Apr 21 13:40 generate-lexicon-zh-hf-fanchen-models.py + -rw-r--r-- 1 fangjun staff 2.3M Apr 21 13:40 lexicon.txt + -rw-r--r-- 1 fangjun staff 21K Apr 21 13:40 new_heteronym.fst + -rw-r--r-- 1 fangjun staff 63K Apr 21 13:40 number.fst + -rw-r--r-- 1 fangjun staff 87K Apr 21 13:40 phone.fst + -rw-r--r-- 1 fangjun staff 172M Apr 21 13:40 rule.far + -rw-r--r-- 1 fangjun staff 331B Apr 21 13:40 tokens.txt + -rwxr-xr-x 1 fangjun staff 1.9K Apr 21 13:40 vits-zh-hf-fanchen-models.sh + -rw-r--r-- 1 fangjun staff 115M Apr 21 13:40 vits-zh-hf-fanchen-wnj.onnx + +**usage**: + +.. code-block:: bash + + sherpa-onnx-offline-tts \ + --vits-model=./vits-zh-hf-fanchen-wnj/vits-zh-hf-fanchen-wnj.onnx \ + --vits-dict-dir=./vits-zh-hf-fanchen-wnj/dict \ + --vits-lexicon=./vits-zh-hf-fanchen-wnj/lexicon.txt \ + --vits-tokens=./vits-zh-hf-fanchen-wnj/tokens.txt \ + --output-filename="./kuayue.wav" \ + "升级人车家全生态,小米迎跨越时刻。" + + sherpa-onnx-offline-tts \ + --vits-model=./vits-zh-hf-fanchen-wnj/vits-zh-hf-fanchen-wnj.onnx \ + --vits-dict-dir=./vits-zh-hf-fanchen-wnj/dict \ + --vits-lexicon=./vits-zh-hf-fanchen-wnj/lexicon.txt \ + --vits-tokens=./vits-zh-hf-fanchen-wnj/tokens.txt \ + --tts-rule-fsts=./vits-zh-hf-fanchen-wnj/number.fst \ + --output-filename="./os.wav" \ + "这一全新操作系统,是小米14年来技术积淀的结晶。" + +.. raw:: html + + + + + + + + + + + + + + + + + +
Wave filename | Text
kuayue.wav | 升级人车家全生态,小米迎跨越时刻。
os.wav | 这一全新操作系统,是小米14年来技术积淀的结晶。
+ +RTF on Raspberry Pi 4 Model B Rev 1.5 +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +We use the following command to test the RTF of this model on Raspberry Pi 4 Model B Rev 1.5: + +.. code-block:: bash + + for t in 1 2 3 4; do + ./build/bin/sherpa-onnx-offline-tts \ + --num-threads=$t \ + --vits-model=./vits-zh-hf-fanchen-wnj/vits-zh-hf-fanchen-wnj.onnx \ + --vits-dict-dir=./vits-zh-hf-fanchen-wnj/dict \ + --vits-lexicon=./vits-zh-hf-fanchen-wnj/lexicon.txt \ + --vits-tokens=./vits-zh-hf-fanchen-wnj/tokens.txt \ + "当夜幕 降临,星光点点,伴随着微风拂面,我在静谧中感受着时光的流转,思念如涟漪荡漾,梦境如画卷展开,我与自然融为一体,沉静在这片宁静的美丽之中,感受着生命的 奇迹与温柔." + done + +The results are given below: + + +-------------+-------+-------+-------+-------+ + | num_threads | 1 | 2 | 3 | 4 | + +=============+=======+=======+=======+=======+ + | RTF | 4.276 | 2.505 | 1.827 | 1.608 | + +-------------+-------+-------+-------+-------+ + +.. _vits-zh-hf-theresa: + +csukuangfj/vits-zh-hf-theresa (Chinese, 804 speakers) +----------------------------------------------------- + +You can download the model with the following commands:: + + wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-zh-hf-theresa.tar.bz2 + tar xvf vits-zh-hf-theresa.tar.bz2 + rm vits-zh-hf-theresa.tar.bz2 + +.. hint:: + + This model is converted from + ``_ + +.. code-block:: bash + + # information about model files + + total 596992 + -rw-r--r-- 1 fangjun staff 58K Apr 21 13:39 date.fst + drwxr-xr-x 9 fangjun staff 288B Apr 19 20:42 dict + -rw-r--r-- 1 fangjun staff 2.6M Apr 21 13:39 lexicon.txt + -rw-r--r-- 1 fangjun staff 21K Apr 21 13:39 new_heteronym.fst + -rw-r--r-- 1 fangjun staff 63K Apr 21 13:39 number.fst + -rw-r--r-- 1 fangjun staff 87K Apr 21 13:39 phone.fst + -rw-r--r-- 1 fangjun staff 172M Apr 21 13:39 rule.far + -rw-r--r-- 1 fangjun staff 116M Apr 21 13:39 theresa.onnx + -rw-r--r-- 1 fangjun staff 268B Apr 21 13:39 tokens.txt + -rwxr-xr-x 1 fangjun staff 5.3K Apr 21 13:39 vits-zh-hf-models.py + -rwxr-xr-x 1 fangjun staff 571B Apr 21 13:39 vits-zh-hf-models.sh + +**usage**: + +.. code-block:: bash + + sherpa-onnx-offline-tts \ + --vits-model=./vits-zh-hf-theresa/theresa.onnx \ + --vits-dict-dir=./vits-zh-hf-theresa/dict \ + --vits-lexicon=./vits-zh-hf-theresa/lexicon.txt \ + --vits-tokens=./vits-zh-hf-theresa/tokens.txt \ + --sid=0 \ + --output-filename="./reai-0.wav" \ + "真诚就是不欺人也不自欺。热爱就是全心投入并享受其中。" + + sherpa-onnx-offline-tts \ + --vits-model=./vits-zh-hf-theresa/theresa.onnx \ + --vits-dict-dir=./vits-zh-hf-theresa/dict \ + --vits-lexicon=./vits-zh-hf-theresa/lexicon.txt \ + --vits-tokens=./vits-zh-hf-theresa/tokens.txt \ + --tts-rule-fsts=./vits-zh-hf-theresa/number.fst \ + --debug=1 \ + --sid=88 \ + --output-filename="./mi14-88.wav" \ + "小米14一周销量破1000000!" + +.. raw:: html + + + + + + + + + + + + + + + + + +
Wave filename | Text
reai-0.wav | 真诚就是不欺人也不自欺。热爱就是全心投入并享受其中。
mi14-88.wav | 小米14一周销量破1000000!
+ + +RTF on Raspberry Pi 4 Model B Rev 1.5 +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +We use the following command to test the RTF of this model on Raspberry Pi 4 Model B Rev 1.5: + +.. code-block:: bash + + for t in 1 2 3 4; do + ./build/bin/sherpa-onnx-offline-tts \ + --num-threads=$t \ + --vits-model=./vits-zh-hf-theresa/theresa.onnx \ + --vits-dict-dir=./vits-zh-hf-theresa/dict \ + --vits-lexicon=./vits-zh-hf-theresa/lexicon.txt \ + --vits-tokens=./vits-zh-hf-theresa/tokens.txt \ + "当夜幕降临,星光点点,伴随着微风拂面,我在静谧中感受着时光的流转,思念如涟漪荡漾,梦境如画卷展开,我与自然融为一体,沉静在这片宁静的美丽之中,感受着生命的奇迹与 温柔." + done + +The results are given below: + + +-------------+-------+-------+-------+-------+ + | num_threads | 1 | 2 | 3 | 4 | + +=============+=======+=======+=======+=======+ + | RTF | 6.032 | 3.448 | 2.566 | 2.210 | + +-------------+-------+-------+-------+-------+ + +.. _vits-zh-hf-eula: + +csukuangfj/vits-zh-hf-eula (Chinese, 804 speakers) +-------------------------------------------------- + +You can download the model using the following commands:: + + wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-zh-hf-eula.tar.bz2 + tar xvf vits-zh-hf-eula.tar.bz2 + rm vits-zh-hf-eula.tar.bz2 + +.. hint:: + + This model is converted from + ``_ + +.. code-block:: bash + + # information about model files + + total 596992 + -rw-r--r-- 1 fangjun staff 58K Apr 21 13:39 date.fst + drwxr-xr-x 9 fangjun staff 288B Apr 19 20:42 dict + -rw-r--r-- 1 fangjun staff 116M Apr 21 13:39 eula.onnx + -rw-r--r-- 1 fangjun staff 2.6M Apr 21 13:39 lexicon.txt + -rw-r--r-- 1 fangjun staff 21K Apr 21 13:39 new_heteronym.fst + -rw-r--r-- 1 fangjun staff 63K Apr 21 13:39 number.fst + -rw-r--r-- 1 fangjun staff 87K Apr 21 13:39 phone.fst + -rw-r--r-- 1 fangjun staff 172M Apr 21 13:39 rule.far + -rw-r--r-- 1 fangjun staff 268B Apr 21 13:39 tokens.txt + -rwxr-xr-x 1 fangjun staff 5.3K Apr 21 13:39 vits-zh-hf-models.py + -rwxr-xr-x 1 fangjun staff 571B Apr 21 13:39 vits-zh-hf-models.sh + + +**usage**: + +.. code-block:: bash + + sherpa-onnx-offline-tts \ + --vits-model=./vits-zh-hf-eula/eula.onnx \ + --vits-dict-dir=./vits-zh-hf-eula/dict \ + --vits-lexicon=./vits-zh-hf-eula/lexicon.txt \ + --vits-tokens=./vits-zh-hf-eula/tokens.txt \ + --debug=1 \ + --sid=666 \ + --output-filename="./news-666.wav" \ + "小米在今天上午举办的核心干部大会上,公布了新十年的奋斗目标和科技战略,并发布了小米价值观的八条诠释。" + + sherpa-onnx-offline-tts \ + --vits-model=./vits-zh-hf-eula/eula.onnx \ + --vits-dict-dir=./vits-zh-hf-eula/dict \ + --vits-lexicon=./vits-zh-hf-eula/lexicon.txt \ + --vits-tokens=./vits-zh-hf-eula/tokens.txt \ + --tts-rule-fsts=./vits-zh-hf-eula/number.fst \ + --sid=99 \ + --output-filename="./news-99.wav" \ + "9月25日消息,雷军今日在微博发文称" + +.. raw:: html + + + + + + + + + + + + + + + + + +
Wave filename | Text
news-666.wav | 小米在今天上午举办的核心干部大会上,公布了新十年的奋斗目标和科技战略,并发布了小米价值观的八条诠释。
news-99.wav | 9月25日消息,雷军今日在微博发文称
+ +RTF on Raspberry Pi 4 Model B Rev 1.5 +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +We use the following command to test the RTF of this model on Raspberry Pi 4 Model B Rev 1.5: + +.. code-block:: bash + + for t in 1 2 3 4; do + ./build/bin/sherpa-onnx-offline-tts \ + --num-threads=$t \ + --vits-model=./vits-zh-hf-eula/eula.onnx \ + --vits-dict-dir=./vits-zh-hf-eula/dict \ + --vits-lexicon=./vits-zh-hf-eula/lexicon.txt \ + --vits-tokens=./vits-zh-hf-eula/tokens.txt \ + "当夜幕降临,星光点点,伴随着微风拂面,我在静谧中感受着时光的流转,思念如涟漪荡漾,梦境如画卷展开,我与自然融为一体,沉静在这片宁静的美丽之中,感受着生命的奇迹与 温柔." + done + +The results are given below: + + +-------------+-------+-------+-------+-------+ + | num_threads | 1 | 2 | 3 | 4 | + +=============+=======+=======+=======+=======+ + | RTF | 6.011 | 3.473 | 2.537 | 2.231 | + +-------------+-------+-------+-------+-------+ + +.. _vits-model-aishell3: + +aishell3 (Chinese, multi-speaker, 174 speakers) +----------------------------------------------- + +This model is trained on the `aishell3`_ dataset using `icefall`_. + +It supports only Chinese and it's a multi-speaker model and contains 174 speakers. + +.. hint:: + + You can download the Android APK for this model at + + ``_ + + (Please search for ``vits-icefall-zh-aishell3`` in the above Android APK page) + +.. note:: + + If you are interested in how the model is converted, please see + the documentation of `icefall`_. + + If you are interested in training your own model, please also refer to + `icefall`_. + + `icefall`_ is also developed by us. + +In the following, we describe how to download it and use it with `sherpa-onnx`_. + +Download the model +~~~~~~~~~~~~~~~~~~ + +Please use the following commands to download it. + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-icefall-zh-aishell3.tar.bz2 + tar xvf vits-icefall-zh-aishell3.tar.bz2 + rm vits-icefall-zh-aishell3.tar.bz2 + +Please check that the file sizes of the pre-trained models are correct. See +the file sizes of ``*.onnx`` files below. + +.. code-block:: bash + + vits-icefall-zh-aishell3 fangjun$ ls -lh *.onnx + -rw-r--r-- 1 fangjun staff 29M Mar 20 22:50 model.onnx + +Generate speech with executables compiled from C++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Since there are 174 speakers available, we can choose a speaker from 0 to 173. +The default speaker ID is 0. + +We use speaker ID 10, 33, and 99 below to generate audio for the same text. + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + ./build/bin/sherpa-onnx-offline-tts \ + --vits-model=./vits-icefall-zh-aishell3/model.onnx \ + --vits-lexicon=./vits-icefall-zh-aishell3/lexicon.txt \ + --vits-tokens=./vits-icefall-zh-aishell3/tokens.txt \ + --tts-rule-fsts=./vits-icefall-zh-aishell3/phone.fst,./vits-icefall-zh-aishell3/date.fst,./vits-icefall-zh-aishell3/number.fst \ + --sid=10 \ + --output-filename=./liliana-10.wav \ + "林美丽最美丽、最漂亮、最可爱!" + + ./build/bin/sherpa-onnx-offline-tts \ + --vits-model=./vits-icefall-zh-aishell3/model.onnx \ + --vits-lexicon=./vits-icefall-zh-aishell3/lexicon.txt \ + --vits-tokens=./vits-icefall-zh-aishell3/tokens.txt \ + --tts-rule-fsts=./vits-icefall-zh-aishell3/phone.fst,./vits-icefall-zh-aishell3/date.fst,./vits-icefall-zh-aishell3/number.fst \ + --sid=33 \ + --output-filename=./liliana-33.wav \ + "林美丽最美丽、最漂亮、最可爱!" 
+ + ./build/bin/sherpa-onnx-offline-tts \ + --vits-model=./vits-icefall-zh-aishell3/model.onnx \ + --vits-lexicon=./vits-icefall-zh-aishell3/lexicon.txt \ + --vits-tokens=./vits-icefall-zh-aishell3/tokens.txt \ + --tts-rule-fsts=./vits-icefall-zh-aishell3/phone.fst,./vits-icefall-zh-aishell3/date.fst,./vits-icefall-zh-aishell3/number.fst \ + --sid=99 \ + --output-filename=./liliana-99.wav \ + "林美丽最美丽、最漂亮、最可爱!" + +It will generate 3 files: ``liliana-10.wav``, ``liliana-33.wav``, and ``liliana-99.wav``. + +We also support rule-based text normalization, which is implemented with `OpenFst`_. +Currently, only number normalization is supported. + +.. hint:: + + We will support other normalization rules later. + +The following is an example: + +.. code-block:: bash + + ./build/bin/sherpa-onnx-offline-tts \ + --vits-model=./vits-icefall-zh-aishell3/model.onnx \ + --vits-lexicon=./vits-icefall-zh-aishell3/lexicon.txt \ + --vits-tokens=./vits-icefall-zh-aishell3/tokens.txt \ + --tts-rule-fsts=./vits-icefall-zh-aishell3/phone.fst,./vits-icefall-zh-aishell3/date.fst,./vits-icefall-zh-aishell3/number.fst \ + --sid=66 \ + --output-filename=./rule-66.wav \ + "35年前,他于长沙出生, 在长白山长大。9年前他当上了银行的领导,主管行政。1天前莅临我行指导工作。" + +.. raw:: html + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Wave filename | Text
liliana-10.wav | 林美丽最美丽、最漂亮、最可爱!
liliana-33.wav | 林美丽最美丽、最漂亮、最可爱!
liliana-99.wav | 林美丽最美丽、最漂亮、最可爱!
rule-66.wav | 35年前,他于长沙出生, 在长白山长大。9年前他当上了银行的领导,主管行政。1天前莅临我行指导工作。
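To hear what the rule-based normalization actually does, you can synthesize the same sentence once without and once with ``number.fst`` and compare the two files. This is only a sketch; the output filenames are arbitrary.

.. code-block:: bash

   # without number normalization
   ./build/bin/sherpa-onnx-offline-tts \
     --vits-model=./vits-icefall-zh-aishell3/model.onnx \
     --vits-lexicon=./vits-icefall-zh-aishell3/lexicon.txt \
     --vits-tokens=./vits-icefall-zh-aishell3/tokens.txt \
     --sid=66 \
     --output-filename=./no-rule-66.wav \
     "35年前,他于长沙出生"

   # with number normalization
   ./build/bin/sherpa-onnx-offline-tts \
     --vits-model=./vits-icefall-zh-aishell3/model.onnx \
     --vits-lexicon=./vits-icefall-zh-aishell3/lexicon.txt \
     --vits-tokens=./vits-icefall-zh-aishell3/tokens.txt \
     --tts-rule-fsts=./vits-icefall-zh-aishell3/number.fst \
     --sid=66 \
     --output-filename=./with-number-66.wav \
     "35年前,他于长沙出生"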
+ +Generate speech with Python script +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +We use speaker ID 21, 41, and 45 below to generate audio for different transcripts. + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + python3 ./python-api-examples/offline-tts.py \ + --vits-model=./vits-icefall-zh-aishell3/model.onnx \ + --vits-lexicon=./vits-icefall-zh-aishell3/lexicon.txt \ + --vits-tokens=./vits-icefall-zh-aishell3/tokens.txt \ + --tts-rule-fsts=./vits-icefall-zh-aishell3/phone.fst,./vits-icefall-zh-aishell3/date.fst,./vits-icefall-zh-aishell3/number.fst \ + --sid=21 \ + --output-filename=./liubei-21.wav \ + "勿以恶小而为之,勿以善小而不为。惟贤惟德,能服于人。" + + python3 ./python-api-examples/offline-tts.py \ + --vits-model=./vits-icefall-zh-aishell3/model.onnx \ + --vits-lexicon=./vits-icefall-zh-aishell3/lexicon.txt \ + --vits-tokens=./vits-icefall-zh-aishell3/tokens.txt \ + --tts-rule-fsts=./vits-icefall-zh-aishell3/phone.fst,./vits-icefall-zh-aishell3/date.fst,./vits-icefall-zh-aishell3/number.fst \ + --sid=41 \ + --output-filename=./demokelite-41.wav \ + "要留心,即使当你独自一人时,也不要说坏话或做坏事,而要学得在你自己面前比在别人面前更知耻。" + + python3 ./python-api-examples/offline-tts.py \ + --vits-model=./vits-icefall-zh-aishell3/model.onnx \ + --vits-lexicon=./vits-icefall-zh-aishell3/lexicon.txt \ + --vits-tokens=./vits-icefall-zh-aishell3/tokens.txt \ + --tts-rule-fsts=./vits-icefall-zh-aishell3/phone.fst,./vits-icefall-zh-aishell3/date.fst,./vits-icefall-zh-aishell3/number.fst \ + --sid=45 \ + --output-filename=./zhugeliang-45.wav \ + "夫君子之行,静以修身,俭以养德,非淡泊无以明志,非宁静无以致远。" + + +It will generate 3 files: ``liubei-21.wav``, ``demokelite-41.wav``, and ``zhugeliang-45.wav``. + +The Python script also supports rule-based text normalization. + +.. code-block:: bash + + python3 ./python-api-examples/offline-tts.py \ + --vits-model=./vits-icefall-zh-aishell3/model.onnx \ + --vits-lexicon=./vits-icefall-zh-aishell3/lexicon.txt \ + --vits-tokens=./vits-icefall-zh-aishell3/tokens.txt \ + --tts-rule-fsts=./vits-icefall-zh-aishell3/phone.fst,./vits-icefall-zh-aishell3/date.fst,./vits-icefall-zh-aishell3/number.fst \ + --sid=103 \ + --output-filename=./rule-103.wav \ + "根据第7次全国人口普查结果表明,我国总人口有1443497378人。普查登记的大陆31个省、自治区、直辖市和现役军人的人口共1411778724人。电话号码是110。手机号是13812345678" + +.. raw:: html + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Wave filename | Text
liubei-21.wav | 勿以恶小而为之,勿以善小而不为。惟贤惟德,能服于人。
demokelite-41.wav | 要留心,即使当你独自一人时,也不要说坏话或做坏事,而要学得在你自己面前比在别人面前更知耻。
zhugeliang-45.wav | 夫君子之行,静以修身,俭以养德,非淡泊无以明志,非宁静无以致远。
rule-103.wav | 根据第7次全国人口普查结果表明,我国总人口有1443497378人。普查登记的大陆31个省、自治区、直辖市和现役军人的人口共1411778724人。电话号码是110。手机号是13812345678
+ +RTF on Raspberry Pi 4 Model B Rev 1.5 +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +We use the following command to test the RTF of this model on Raspberry Pi 4 Model B Rev 1.5: + +.. code-block:: bash + + for t in 1 2 3 4; do + build/bin/sherpa-onnx-offline-tts \ + --num-threads=$t \ + --vits-model=./vits-icefall-zh-aishell3/model.onnx \ + --vits-lexicon=./vits-icefall-zh-aishell3/lexicon.txt \ + --vits-tokens=./vits-icefall-zh-aishell3/tokens.txt \ + --sid=66 \ + "当夜幕降临,星光点点,伴随着微风拂面,我在静谧中感受着时光的流转,思念如涟漪荡漾,梦境如画卷展开,我与自然融为一体,沉静在这片宁静的美丽之中,感受着生命的奇迹与温柔." + done + + +The results are given below: + + +-------------+-------+-------+-------+-------+ + | num_threads | 1 | 2 | 3 | 4 | + +=============+=======+=======+=======+=======+ + | RTF | 0.365 | 0.220 | 0.171 | 0.156 | + +-------------+-------+-------+-------+-------+ + +.. _vits-model-en_US-lessac-medium: + +en_US-lessac-medium (English, single-speaker) +--------------------------------------------- + +This model is converted from ``_. + +The dataset used to train the model is `lessac_blizzard2013`_. + +.. hint:: + + The model is from `piper`_. + +In the following, we describe how to download it and use it with `sherpa-onnx`_. + +Download the model +~~~~~~~~~~~~~~~~~~ + +Please use the following commands to download it. + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_US-lessac-medium.tar.bz2 + tar xf vits-piper-en_US-lessac-medium.tar.bz2 + +.. hint:: + + You can find a lot of pre-trained models for over 40 languages at + ``. + +Generate speech with executables compiled from C++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + ./build/bin/sherpa-onnx-offline-tts \ + --vits-model=./vits-piper-en_US-lessac-medium/en_US-lessac-medium.onnx \ + --vits-data-dir=./vits-piper-en_US-lessac-medium/espeak-ng-data \ + --vits-tokens=./vits-piper-en_US-lessac-medium/tokens.txt \ + --output-filename=./liliana-piper-en_US-lessac-medium.wav \ + "liliana, the most beautiful and lovely assistant of our team!" + +.. hint:: + + You can also use + + .. code-block:: bash + + cd /path/to/sherpa-onnx + + ./build/bin/sherpa-onnx-offline-tts-play \ + --vits-model=./vits-piper-en_US-lessac-medium/en_US-lessac-medium.onnx \ + --vits-data-dir=./vits-piper-en_US-lessac-medium/espeak-ng-data \ + --vits-tokens=./vits-piper-en_US-lessac-medium/tokens.txt \ + --output-filename=./liliana-piper-en_US-lessac-medium.wav \ + "liliana, the most beautiful and lovely assistant of our team!" + + which will play the audio as it is generating. + + +After running, it will generate a file ``liliana-piper-en_US-lessac-medium.wav`` in the current directory. + +.. code-block:: bash + + soxi ./liliana-piper-en_US-lessac-medium.wav + + Input File : './liliana-piper-en_US-lessac-medium.wav' + Channels : 1 + Sample Rate : 22050 + Precision : 16-bit + Duration : 00:00:03.48 = 76800 samples ~ 261.224 CDDA sectors + File Size : 154k + Bit Rate : 353k + Sample Encoding: 16-bit Signed Integer PCM + +.. raw:: html +
Wave filename | Text
liliana-piper-en_US-lessac-medium.wav | liliana, the most beautiful and lovely assistant of our team!
+ +Generate speech with Python script +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: bash + + cd /path/to/sherpa-onnx + + python3 ./python-api-examples/offline-tts.py \ + --vits-model=./vits-piper-en_US-lessac-medium/en_US-lessac-medium.onnx \ + --vits-data-dir=./vits-piper-en_US-lessac-medium/espeak-ng-data \ + --vits-tokens=./vits-piper-en_US-lessac-medium/tokens.txt \ + --output-filename=./armstrong-piper-en_US-lessac-medium.wav \ + "That's one small step for a man, a giant leap for mankind." + +.. hint:: + + You can also use + + .. code-block:: bash + + cd /path/to/sherpa-onnx + + python3 ./python-api-examples/offline-tts-play.py \ + --vits-model=./vits-piper-en_US-lessac-medium/en_US-lessac-medium.onnx \ + --vits-data-dir=./vits-piper-en_US-lessac-medium/espeak-ng-data \ + --vits-tokens=./vits-piper-en_US-lessac-medium/tokens.txt \ + --output-filename=./armstrong-piper-en_US-lessac-medium.wav \ + "That's one small step for a man, a giant leap for mankind." + + which will play the audio as it is generating. + +After running, it will generate a file ``armstrong-piper-en_US-lessac-medium.wav`` in the current directory. + +.. code-block:: bash + + soxi ./armstrong-piper-en_US-lessac-medium.wav + + Input File : './armstrong-piper-en_US-lessac-medium.wav' + Channels : 1 + Sample Rate : 22050 + Precision : 16-bit + Duration : 00:00:03.74 = 82432 samples ~ 280.381 CDDA sectors + File Size : 165k + Bit Rate : 353k + Sample Encoding: 16-bit Signed Integer PCM + +.. raw:: html + + + + + + + + + + + + +
Wave filename | Text
armstrong-piper-en_US-lessac-medium.wav | That's one small step for a man, a giant leap for mankind.
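If you want to stitch several generated utterances into a single file, sox can concatenate wav files that share the same format (all outputs here are 22.05 kHz, 16-bit mono). A sketch, with an arbitrary output name:

.. code-block:: bash

   sox ./liliana-piper-en_US-lessac-medium.wav \
       ./armstrong-piper-en_US-lessac-medium.wav \
       ./combined.wav

   soxi ./combined.wav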
+ +RTF on Raspberry Pi 4 Model B Rev 1.5 +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +We use the following command to test the RTF of this model on Raspberry Pi 4 Model B Rev 1.5: + +.. code-block:: bash + + for t in 1 2 3 4; do + ./build/bin/sherpa-onnx-offline-tts \ + --num-threads=$t \ + --vits-model=./vits-piper-en_US-lessac-medium/en_US-lessac-medium.onnx \ + --vits-data-dir=./vits-piper-en_US-lessac-medium/espeak-ng-data \ + --vits-tokens=./vits-piper-en_US-lessac-medium/tokens.txt \ + "Friends fell out often because life was changing so fast. The easiest thing in the world was to lose touch with someone." + done + +The results are given below: + + +-------------+-------+-------+-------+-------+ + | num_threads | 1 | 2 | 3 | 4 | + +=============+=======+=======+=======+=======+ + | RTF | 0.774 | 0.482 | 0.390 | 0.357 | + +-------------+-------+-------+-------+-------+ diff --git a/docs/source/onnx/tts/wasm/build.rst b/docs/source/onnx/tts/wasm/build.rst new file mode 100644 index 000000000..9f1d32c84 --- /dev/null +++ b/docs/source/onnx/tts/wasm/build.rst @@ -0,0 +1,99 @@ +Build +===== + +After installing `emscripten`_, we can build text-to-speech from +`sherpa-onnx`_ for `WebAssembly`_ now. + +Please use the following command to build it: + +.. code-block:: bash + + git clone https://github.com/k2-fsa/sherpa-onnx + cd sherpa-onnx + + cd wasm/tts/assets + + wget -q https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_US-libritts_r-medium.tar.bz2 + tar xf vits-piper-en_US-libritts_r-medium.tar.bz2 + rm vits-piper-en_US-libritts_r-medium.tar.bz2 + mv vits-piper-en_US-libritts_r-medium/en_US-libritts_r-medium.onnx ./model.onnx + mv vits-piper-en_US-libritts_r-medium/tokens.txt ./ + mv vits-piper-en_US-libritts_r-medium/espeak-ng-data ./ + rm -rf vits-piper-en_US-libritts_r-medium + + cd ../../.. + + ./build-wasm-simd-tts.sh + +.. hint:: + + You can visit ``_ + to download a different model. + +After building, you should see the following output: + +.. code-block:: bash + + Install the project... 
+ -- Install configuration: "Release" + -- Installing: /Users/fangjun/open-source/sherpa-onnx/build-wasm-simd-tts/install/lib/libkaldi-native-fbank-core.a + -- Installing: /Users/fangjun/open-source/sherpa-onnx/build-wasm-simd-tts/install/lib/libkaldi-decoder-core.a + -- Installing: /Users/fangjun/open-source/sherpa-onnx/build-wasm-simd-tts/install/lib/libsherpa-onnx-kaldifst-core.a + -- Installing: /Users/fangjun/open-source/sherpa-onnx/build-wasm-simd-tts/install/lib/libsherpa-onnx-fst.a + -- Installing: /Users/fangjun/open-source/sherpa-onnx/build-wasm-simd-tts/install/lib/libonnxruntime.a + -- Installing: /Users/fangjun/open-source/sherpa-onnx/build-wasm-simd-tts/install/lib/libespeak-ng.a + -- Installing: /Users/fangjun/open-source/sherpa-onnx/build-wasm-simd-tts/install/lib/libucd.a + -- Up-to-date: /Users/fangjun/open-source/sherpa-onnx/build-wasm-simd-tts/install/lib/libucd.a + -- Installing: /Users/fangjun/open-source/sherpa-onnx/build-wasm-simd-tts/install/lib/libpiper_phonemize.a + -- Installing: /Users/fangjun/open-source/sherpa-onnx/build-wasm-simd-tts/install/./sherpa-onnx.pc + -- Installing: /Users/fangjun/open-source/sherpa-onnx/build-wasm-simd-tts/install/lib/pkgconfig/espeak-ng.pc + -- Installing: /Users/fangjun/open-source/sherpa-onnx/build-wasm-simd-tts/install/share/vim/vimfiles/ftdetect + -- Installing: /Users/fangjun/open-source/sherpa-onnx/build-wasm-simd-tts/install/share/vim/vimfiles/ftdetect/espeakfiletype.vim + -- Installing: /Users/fangjun/open-source/sherpa-onnx/build-wasm-simd-tts/install/share/vim/vimfiles/syntax + -- Installing: /Users/fangjun/open-source/sherpa-onnx/build-wasm-simd-tts/install/share/vim/vimfiles/syntax/espeakrules.vim + -- Installing: /Users/fangjun/open-source/sherpa-onnx/build-wasm-simd-tts/install/share/vim/vimfiles/syntax/espeaklist.vim + -- Up-to-date: /Users/fangjun/open-source/sherpa-onnx/build-wasm-simd-tts/install/lib/libucd.a + -- Up-to-date: /Users/fangjun/open-source/sherpa-onnx/build-wasm-simd-tts/install/lib/libespeak-ng.a + -- Installing: /Users/fangjun/open-source/sherpa-onnx/build-wasm-simd-tts/install/lib/libsherpa-onnx-core.a + -- Installing: /Users/fangjun/open-source/sherpa-onnx/build-wasm-simd-tts/install/lib/libsherpa-onnx-c-api.a + -- Installing: /Users/fangjun/open-source/sherpa-onnx/build-wasm-simd-tts/install/include/sherpa-onnx/c-api/c-api.h + -- Installing: /Users/fangjun/open-source/sherpa-onnx/build-wasm-simd-tts/install/bin/wasm/tts/sherpa-onnx-wasm-main.js + -- Up-to-date: /Users/fangjun/open-source/sherpa-onnx/build-wasm-simd-tts/install/bin/wasm/tts/sherpa-onnx-wasm-main.js + -- Installing: /Users/fangjun/open-source/sherpa-onnx/build-wasm-simd-tts/install/bin/wasm/tts/index.html + -- Installing: /Users/fangjun/open-source/sherpa-onnx/build-wasm-simd-tts/install/bin/wasm/tts/sherpa-onnx.js + -- Installing: /Users/fangjun/open-source/sherpa-onnx/build-wasm-simd-tts/install/bin/wasm/tts/app.js + -- Installing: /Users/fangjun/open-source/sherpa-onnx/build-wasm-simd-tts/install/bin/wasm/tts/sherpa-onnx-wasm-main.wasm + -- Installing: /Users/fangjun/open-source/sherpa-onnx/build-wasm-simd-tts/install/bin/wasm/tts/sherpa-onnx-wasm-main.data + + ls -lh install/bin/wasm/tts + total 211248 + -rw-r--r-- 1 fangjun staff 5.3K Feb 22 09:18 app.js + -rw-r--r-- 1 fangjun staff 1.3K Feb 22 09:18 index.html + -rw-r--r-- 1 fangjun staff 92M Feb 22 10:35 sherpa-onnx-wasm-main.data + -rw-r--r-- 1 fangjun staff 117K Feb 22 10:39 sherpa-onnx-wasm-main.js + -rw-r--r-- 1 fangjun staff 11M Feb 22 10:39 sherpa-onnx-wasm-main.wasm 
+ -rw-r--r-- 1 fangjun staff 4.5K Feb 22 09:18 sherpa-onnx.js + +Now you can use the following command to run it: + +.. code-block:: bash + + cd build-wasm-simd-tts/install/bin/wasm/tts + python3 -m http.server 6008 + +Start your browser and visit ``_; you should see the following +page: + +.. figure:: ./pic/wasm-sherpa-onnx-tts-1.png + :alt: start page of wasm + :width: 800 + +Now you can enter some text and click ``Generate`` + +A screenshot is given below: + +.. figure:: ./pic/wasm-sherpa-onnx-tts-2.png + :alt: tts result + :width: 800 + +Congratulations! You have successfully run text-to-speech with `WebAssembly`_ +in your browser. diff --git a/docs/source/onnx/tts/wasm/hf-spaces.rst b/docs/source/onnx/tts/wasm/hf-spaces.rst new file mode 100644 index 000000000..26ef61c7f --- /dev/null +++ b/docs/source/onnx/tts/wasm/hf-spaces.rst @@ -0,0 +1,49 @@ +.. _try sherpa onnx wasm with huggingface: + +Huggingface Spaces (WebAssembly) +================================ + +We provide two `Huggingface`_ spaces so that you can try text-to-speech +with `WebAssembly`_ in your browser. + +English TTS +----------- + +``_ + +.. hint:: + + If you don't have access to `Huggingface`_, please visit the following mirror: + + ``_ + +.. figure:: ./pic/wasm-hf-tts-en.png + :alt: start page of wasm + :width: 800 + :target: https://huggingface.co/spaces/k2-fsa/web-assembly-tts-sherpa-onnx-en + +.. note:: + + The script for building this space can be found at + ``_ + +German TTS +---------- + +``_ + +.. hint:: + + If you don't have access to `Huggingface`_, please visit the following mirror: + + ``_ + +.. figure:: ./pic/wasm-hf-tts-de.png + :alt: start page of wasm + :width: 800 + :target: https://huggingface.co/spaces/k2-fsa/web-assembly-tts-sherpa-onnx-de + +.. note:: + + The script for building this space can be found at + ``_ diff --git a/docs/source/onnx/tts/wasm/index.rst b/docs/source/onnx/tts/wasm/index.rst new file mode 100644 index 000000000..f6197a344 --- /dev/null +++ b/docs/source/onnx/tts/wasm/index.rst @@ -0,0 +1,26 @@ +WebAssembly +=========== + +In this section, we describe how to build text-to-speech from `sherpa-onnx`_ for `WebAssembly`_ +so that you can run text-to-speech with `WebAssembly`_. + +Please follow the steps below to build and run `sherpa-onnx`_ for `WebAssembly`_. + +.. hint:: + + We provide a colab notebook + |build sherpa-onnx WebAssembly TTS colab| + for you to try this section step by step. + + If you are using Windows or you don't want to setup your local environment + to build WebAssembly support, please use the above colab notebook. + +.. |build sherpa-onnx WebAssembly TTS colab| image:: https://colab.research.google.com/assets/colab-badge.svg + :target: https://github.com/k2-fsa/colab/blob/master/sherpa-onnx/sherpa_onnx_wasm_tts.ipynb + +.. toctree:: + :maxdepth: 3 + + ./install-emscripten.rst + ./build.rst + ./hf-spaces.rst diff --git a/docs/source/onnx/tts/wasm/install-emscripten.rst b/docs/source/onnx/tts/wasm/install-emscripten.rst new file mode 100644 index 000000000..dcd573e07 --- /dev/null +++ b/docs/source/onnx/tts/wasm/install-emscripten.rst @@ -0,0 +1,45 @@ +Install Emscripten +================== + +We need to compile the C/C++ files in `sherpa-onnx`_ with the help of +`emscripten`_. + +Please refer to ``_ +for detailed installation instructions. + +The following is an example to show you how to install it on Linux/macOS. + +.. 
code-block:: bash + + git clone https://github.com/emscripten-core/emsdk.git + cd emsdk + git pull + + # Hint: Please use a version <= 3.1.64 + # Other versions, like 3.1.70, are known not to work + + ./emsdk install 3.1.48 + ./emsdk activate 3.1.48 + + source ./emsdk_env.sh + +To check that you have installed `emscripten`_ successfully, please run: + +.. code-block:: bash + + emcc -v + +The above command should print something like below: + +.. code-block:: + + emcc (Emscripten gcc/clang-like replacement + linker emulating GNU ld) 3.1.48 (e967e20b4727956a30592165a3c1cde5c67fa0a8) + shared:INFO: (Emscripten: Running sanity checks) + (py38) fangjuns-MacBook-Pro:open-source fangjun$ emcc -v + emcc (Emscripten gcc/clang-like replacement + linker emulating GNU ld) 3.1.48 (e967e20b4727956a30592165a3c1cde5c67fa0a8) + clang version 18.0.0 (https://github.com/llvm/llvm-project a54545ba6514802178cf7cf1c1dd9f7efbf3cde7) + Target: wasm32-unknown-emscripten + Thread model: posix + InstalledDir: /Users/fangjun/open-source/emsdk/upstream/bin + +Congratulations! You have successfully installed `emscripten`_. diff --git a/docs/source/onnx/tts/wasm/pic/wasm-hf-tts-de.png b/docs/source/onnx/tts/wasm/pic/wasm-hf-tts-de.png new file mode 100644 index 000000000..0394e3131 Binary files /dev/null and b/docs/source/onnx/tts/wasm/pic/wasm-hf-tts-de.png differ diff --git a/docs/source/onnx/tts/wasm/pic/wasm-hf-tts-en.png b/docs/source/onnx/tts/wasm/pic/wasm-hf-tts-en.png new file mode 100644 index 000000000..f0279c218 Binary files /dev/null and b/docs/source/onnx/tts/wasm/pic/wasm-hf-tts-en.png differ diff --git a/docs/source/onnx/tts/wasm/pic/wasm-sherpa-onnx-tts-1.png b/docs/source/onnx/tts/wasm/pic/wasm-sherpa-onnx-tts-1.png new file mode 100644 index 000000000..14d5bd396 Binary files /dev/null and b/docs/source/onnx/tts/wasm/pic/wasm-sherpa-onnx-tts-1.png differ diff --git a/docs/source/onnx/tts/wasm/pic/wasm-sherpa-onnx-tts-2.png b/docs/source/onnx/tts/wasm/pic/wasm-sherpa-onnx-tts-2.png new file mode 100644 index 000000000..735f20cd9 Binary files /dev/null and b/docs/source/onnx/tts/wasm/pic/wasm-sherpa-onnx-tts-2.png differ diff --git a/docs/source/onnx/tutorials/cn.rst b/docs/source/onnx/tutorials/cn.rst new file mode 100644 index 000000000..132c73f6d --- /dev/null +++ b/docs/source/onnx/tutorials/cn.rst @@ -0,0 +1,60 @@ +中文资料 (Chinese tutorials) +============================ +2024-10-09【基于sherpa的本地智能语音助手入门-Java Api版 +-------------------------------------------------------------------------------------------------------------- + +详细地址为 ``_ + +面向java开发者:使用sherpa-onnx来实现一个本地智能语音助手。它将支持流式的关键词唤醒和语音识别、文本转语音、热词等,且整个过程中无需互联网,可以没有GPU,适合部署在边缘侧/用户侧设备上。 + +2024-07-03【🆓 語音辨識引擎sherpa-onnx CPU上篇】讓您輕鬆體驗語音辨識功能(Docker架設) +-------------------------------------------------------------------------------------------------------------- + +详细地址为 ``_ + +使用了繁体字。有用 docker。 + + +2024-06-10 SherpaOnnxTtsEngine - Android 本地 TTS 语言转文本引擎 +------------------------------------------------------------------------------------------ + +详细地址为 ``_ + +2024-06-10 用LLM搭建100个应用(一):从0到1搭建自己的Windows贾维斯(1) +-------------------------------------------------------------------------------------------- + +详细地址为 ``_ + +演示了如何使用 Python API 结合 LLM。 + +2024-05-09 记录一下sherpa-onnx的安装及使用 +------------------------------------------ + +详细地址为 ``_ + +演示了如何在 Windows 上使用及如何解决乱码问题。 + + +2024-04-09 rv1106&rv1109&rv1126移植sherpa-onnx 实现离线TTS功能 +-------------------------------------------------------------- + +详细地址为 ``_. 
+ +介绍了如何使用高版本的 gcc 编译 32-bit arm 的 sherpa-onnx, 然后在 +在老系统的 arm 开发板上运行。 + + +2023-08-08 snowboy+新一代kaldi(k2-fsa)sherpa-onnx实现离线语音识别【语音助手】 +------------------------------------------------------------------------------------- + +详细地址为 ``_. + +使用了 Python API。 + + +2023-03-16 k2语音识别:如何使用sherpa-onnx +------------------------------------------ + +详细地址为 ``_. + + diff --git a/docs/source/onnx/tutorials/index.rst b/docs/source/onnx/tutorials/index.rst new file mode 100644 index 000000000..68a4b8278 --- /dev/null +++ b/docs/source/onnx/tutorials/index.rst @@ -0,0 +1,14 @@ +Tutorials +========= + +This page contains links to tutorials written by our users. + +.. caution:: + + The tutorials are not necessarily written in English. + + +.. toctree:: + :maxdepth: 2 + + ./cn.rst diff --git a/docs/source/onnx/vad/index.rst b/docs/source/onnx/vad/index.rst new file mode 100644 index 000000000..2c469dee8 --- /dev/null +++ b/docs/source/onnx/vad/index.rst @@ -0,0 +1,14 @@ +VAD +=== + +We support `silero-vad`_ for voice activity detection. + +You can find pre-built Android APKs for VAD at: + + ``_ + + +APKs for VAD + speech recognition can be found at: + + ``_ + diff --git a/docs/source/onnx/wasm/build.rst b/docs/source/onnx/wasm/build.rst new file mode 100644 index 000000000..73ad04934 --- /dev/null +++ b/docs/source/onnx/wasm/build.rst @@ -0,0 +1,89 @@ +Build +===== + +After installing `emscripten`_, we can build `sherpa-onnx`_ for `WebAssembly`_ now. + +Please use the following command to build it: + +.. code-block:: bash + + git clone https://github.com/k2-fsa/sherpa-onnx + cd sherpa-onnx + + cd wasm/asr/assets + + wget -q https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20.tar.bz2 + tar xvf sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20.tar.bz2 + rm sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20.tar.bz2 + + # Note it is not an error that we rename encoder.int8.onnx to encoder.onnx + + mv sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/encoder-epoch-99-avg-1.int8.onnx encoder.onnx + mv sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/decoder-epoch-99-avg-1.onnx decoder.onnx + mv sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/joiner-epoch-99-avg-1.int8.onnx joiner.onnx + mv sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/tokens.txt ./ + rm -rf sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/ + + cd ../../.. + + ./build-wasm-simd-asr.sh + +.. hint:: + + You can visit ``_ + to download a different model. + + If you want to use a streaming Paraformer model, please see + ``_ + +After building, you should see the following output: + +.. code-block:: bash + + Install the project... 
+ -- Install configuration: "Release" + -- Installing: /Users/fangjun/open-source/sherpa-onnx/build-wasm-simd-asr/install/bin/wasm/asr/sherpa-onnx-wasm-asr-main.js + -- Up-to-date: /Users/fangjun/open-source/sherpa-onnx/build-wasm-simd-asr/install/bin/wasm/asr/sherpa-onnx-wasm-asr-main.js + -- Installing: /Users/fangjun/open-source/sherpa-onnx/build-wasm-simd-asr/install/bin/wasm/asr/index.html + -- Installing: /Users/fangjun/open-source/sherpa-onnx/build-wasm-simd-asr/install/bin/wasm/asr/sherpa-onnx.js + -- Installing: /Users/fangjun/open-source/sherpa-onnx/build-wasm-simd-asr/install/bin/wasm/asr/app.js + -- Installing: /Users/fangjun/open-source/sherpa-onnx/build-wasm-simd-asr/install/bin/wasm/asr/sherpa-onnx-wasm-asr-main.wasm + -- Installing: /Users/fangjun/open-source/sherpa-onnx/build-wasm-simd-asr/install/bin/wasm/asr/sherpa-onnx-wasm-asr-main.data + + ls -lh install/bin/wasm/asr + total 440080 + -rw-r--r-- 1 fangjun staff 9.0K Feb 23 17:39 app.js + -rw-r--r-- 1 fangjun staff 978B Feb 23 17:39 index.html + -rw-r--r-- 1 fangjun staff 199M Feb 23 18:34 sherpa-onnx-wasm-asr-main.data + -rw-r--r-- 1 fangjun staff 90K Feb 23 18:38 sherpa-onnx-wasm-asr-main.js + -rw-r--r-- 1 fangjun staff 10M Feb 23 18:38 sherpa-onnx-wasm-asr-main.wasm + -rw-r--r-- 1 fangjun staff 9.1K Feb 23 17:39 sherpa-onnx.js + +Now you can use the following command to run it: + +.. code-block:: bash + + cd build-wasm-simd-asr/install/bin/wasm/asr + python3 -m http.server 6006 + +Start your browser and visit ``_; you should see the following +page: + +.. figure:: ./pic/wasm-asr-sherpa-onnx-1.png + :alt: start page of wasm + :width: 800 + +Now click start and speak! You should see the recognition results in the text box. + +.. warning:: + + We are using a bilingual model (Chinese + English) in the above example, which means + you can only speak Chinese or English in this case. + +A screenshot is given below: + +.. figure:: ./pic/wasm-asr-sherpa-onnx-2.png + :alt: recognition result + :width: 800 + +Congratulations! You have successfully run real-time speech recognition with `WebAssembly`_ +in your browser. diff --git a/docs/source/onnx/wasm/hf-spaces.rst b/docs/source/onnx/wasm/hf-spaces.rst new file mode 100644 index 000000000..96077691d --- /dev/null +++ b/docs/source/onnx/wasm/hf-spaces.rst @@ -0,0 +1,92 @@ +.. _try sherpa ncnn wasm with huggingface: + +Huggingface Spaces (WebAssembly) +================================ + +We provide four `Huggingface`_ spaces so that you can try real-time +speech recognition with `WebAssembly`_ in your browser. + +English only (Zipformer) +------------------------ + +``_ + +.. hint:: + + If you don't have access to `Huggingface`_, please visit the following mirror: + + ``_ + +.. figure:: ./pic/wasm-hf-en.png + :alt: start page of wasm + :width: 800 + :target: https://huggingface.co/spaces/k2-fsa/web-assembly-asr-sherpa-onnx-en + +.. note:: + + The script for building this space can be found at + ``_ + + +Chinese + English (Zipformer) +----------------------------- + +``_ + +.. hint:: + + If you don't have access to `Huggingface`_, please visit the following mirror: + + ``_ + +.. figure:: ./pic/wasm-hf-zh-en-zipformer.png + :alt: start page of wasm + :width: 800 + :target: https://huggingface.co/spaces/k2-fsa/web-assembly-asr-sherpa-onnx-zh-en + +.. note:: + + The script for building this space can be found at + ``_ + +Chinese + English (Paraformer) +------------------------------ + +``_ + +.. 
hint:: + + If you don't have access to `Huggingface`_, please visit the following mirror: + + ``_ + +.. figure:: ./pic/wasm-hf-zh-en-paraformer.png + :alt: start page of wasm + :width: 800 + :target: https://huggingface.co/spaces/k2-fsa/web-assembly-asr-sherpa-onnx-zh-en-paraformer + +.. note:: + + The script for building this space can be found at + ``_ + +Chinese + English + Cantonese (Paraformer) +------------------------------------------ + +``_ + +.. hint:: + + If you don't have access to `Huggingface`_, please visit the following mirror: + + ``_ + +.. figure:: ./pic/wasm-hf-zh-yue-en-paraformer.png + :alt: start page of wasm + :width: 800 + :target: https://huggingface.co/spaces/k2-fsa/web-assembly-asr-sherpa-onnx-zh-en-paraformer + +.. note:: + + The script for building this space can be found at + ``_ diff --git a/docs/source/onnx/wasm/index.rst b/docs/source/onnx/wasm/index.rst new file mode 100644 index 000000000..aef6c736a --- /dev/null +++ b/docs/source/onnx/wasm/index.rst @@ -0,0 +1,28 @@ +.. _sherpa-onnx-wasm: + +WebAssembly +=========== + +In this section, we describe how to build `sherpa-onnx`_ for `WebAssembly`_ +so that you can run real-time speech recognition with `WebAssembly`_. + +Please follow the steps below to build and run `sherpa-onnx`_ for `WebAssembly`_. + +.. hint:: + + We provide a colab notebook + |build sherpa-onnx WebAssembly ASR colab| + for you to try this section step by step. + + If you are using Windows or you don't want to setup your local environment + to build WebAssembly support, please use the above colab notebook. + +.. |build sherpa-onnx WebAssembly ASR colab| image:: https://colab.research.google.com/assets/colab-badge.svg + :target: https://github.com/k2-fsa/colab/blob/master/sherpa-onnx/sherpa_onnx_wasm_asr.ipynb + +.. toctree:: + :maxdepth: 3 + + ./install-emscripten.rst + ./build.rst + ./hf-spaces.rst diff --git a/docs/source/onnx/wasm/install-emscripten.rst b/docs/source/onnx/wasm/install-emscripten.rst new file mode 100644 index 000000000..dcd573e07 --- /dev/null +++ b/docs/source/onnx/wasm/install-emscripten.rst @@ -0,0 +1,45 @@ +Install Emscripten +================== + +We need to compile the C/C++ files in `sherpa-onnx`_ with the help of +`emscripten`_. + +Please refer to ``_ +for detailed installation instructions. + +The following is an example to show you how to install it on Linux/macOS. + +.. code-block:: bash + + git clone https://github.com/emscripten-core/emsdk.git + cd emsdk + git pull + + # Hint: Please use a version <= 3.1.64 + # Other versions, like 3.1.70, are known not to work + + ./emsdk install 3.1.48 + ./emsdk activate 3.1.48 + + source ./emsdk_env.sh + +To check that you have installed `emscripten`_ successfully, please run: + +.. code-block:: bash + + emcc -v + +The above command should print something like below: + +.. code-block:: + + emcc (Emscripten gcc/clang-like replacement + linker emulating GNU ld) 3.1.48 (e967e20b4727956a30592165a3c1cde5c67fa0a8) + shared:INFO: (Emscripten: Running sanity checks) + (py38) fangjuns-MacBook-Pro:open-source fangjun$ emcc -v + emcc (Emscripten gcc/clang-like replacement + linker emulating GNU ld) 3.1.48 (e967e20b4727956a30592165a3c1cde5c67fa0a8) + clang version 18.0.0 (https://github.com/llvm/llvm-project a54545ba6514802178cf7cf1c1dd9f7efbf3cde7) + Target: wasm32-unknown-emscripten + Thread model: posix + InstalledDir: /Users/fangjun/open-source/emsdk/upstream/bin + +Congratulations! You have successfully installed `emscripten`_. 
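+
+If you want an extra, optional sanity check that the toolchain can actually
+produce a runnable WebAssembly module (this is not required for building
+`sherpa-onnx`_), you can compile a trivial C program. The file name
+``hello.c`` below is arbitrary, and we assume ``node`` is available on your
+``PATH`` (the emsdk bundles one; a system-wide ``node`` also works):
+
+.. code-block:: bash
+
+   cat > hello.c <<'EOF'
+   #include <stdio.h>
+
+   int main() {
+     printf("Hello from WebAssembly\n");
+     return 0;
+   }
+   EOF
+
+   # emcc produces hello.js (a JavaScript loader) and hello.wasm
+   emcc hello.c -o hello.js
+
+   # Run the generated module with node; it should print: Hello from WebAssembly
+   node hello.js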
diff --git a/docs/source/onnx/wasm/pic/wasm-asr-sherpa-onnx-1.png b/docs/source/onnx/wasm/pic/wasm-asr-sherpa-onnx-1.png new file mode 100644 index 000000000..2069386ad Binary files /dev/null and b/docs/source/onnx/wasm/pic/wasm-asr-sherpa-onnx-1.png differ diff --git a/docs/source/onnx/wasm/pic/wasm-asr-sherpa-onnx-2.png b/docs/source/onnx/wasm/pic/wasm-asr-sherpa-onnx-2.png new file mode 100644 index 000000000..0c5fac82f Binary files /dev/null and b/docs/source/onnx/wasm/pic/wasm-asr-sherpa-onnx-2.png differ diff --git a/docs/source/onnx/wasm/pic/wasm-hf-en.png b/docs/source/onnx/wasm/pic/wasm-hf-en.png new file mode 100644 index 000000000..29d7790b4 Binary files /dev/null and b/docs/source/onnx/wasm/pic/wasm-hf-en.png differ diff --git a/docs/source/onnx/wasm/pic/wasm-hf-zh-en-paraformer.png b/docs/source/onnx/wasm/pic/wasm-hf-zh-en-paraformer.png new file mode 100644 index 000000000..4706da1a0 Binary files /dev/null and b/docs/source/onnx/wasm/pic/wasm-hf-zh-en-paraformer.png differ diff --git a/docs/source/onnx/wasm/pic/wasm-hf-zh-en-zipformer.png b/docs/source/onnx/wasm/pic/wasm-hf-zh-en-zipformer.png new file mode 100644 index 000000000..54d281cb6 Binary files /dev/null and b/docs/source/onnx/wasm/pic/wasm-hf-zh-en-zipformer.png differ diff --git a/docs/source/onnx/wasm/pic/wasm-hf-zh-yue-en-paraformer.png b/docs/source/onnx/wasm/pic/wasm-hf-zh-yue-en-paraformer.png new file mode 100644 index 000000000..ea53103a9 Binary files /dev/null and b/docs/source/onnx/wasm/pic/wasm-hf-zh-yue-en-paraformer.png differ diff --git a/docs/source/onnx/websocket/code/python-online-websocket-client-decode-a-file.txt b/docs/source/onnx/websocket/code/python-online-websocket-client-decode-a-file.txt new file mode 100644 index 000000000..b82eb4ea1 --- /dev/null +++ b/docs/source/onnx/websocket/code/python-online-websocket-client-decode-a-file.txt @@ -0,0 +1,21 @@ +usage: online-websocket-client-decode-file.py [-h] [--server-addr SERVER_ADDR] + [--server-port SERVER_PORT] + [--samples-per-message SAMPLES_PER_MESSAGE] + [--seconds-per-message SECONDS_PER_MESSAGE] + sound_file + +positional arguments: + sound_file The input sound file. Must be wave with a single + channel, 16kHz sampling rate, 16-bit of each sample. + +optional arguments: + -h, --help show this help message and exit + --server-addr SERVER_ADDR + Address of the server (default: localhost) + --server-port SERVER_PORT + Port of the server (default: 6006) + --samples-per-message SAMPLES_PER_MESSAGE + Number of samples per message (default: 8000) + --seconds-per-message SECONDS_PER_MESSAGE + We will simulate that the duration of two messages is + of this value (default: 0.1) diff --git a/docs/source/onnx/websocket/code/sherpa-onnx-offline-websocket-server-help.txt b/docs/source/onnx/websocket/code/sherpa-onnx-offline-websocket-server-help.txt new file mode 100644 index 000000000..893e5930d --- /dev/null +++ b/docs/source/onnx/websocket/code/sherpa-onnx-offline-websocket-server-help.txt @@ -0,0 +1,55 @@ +Automatic speech recognition with sherpa-onnx using websocket. 
+ +Usage: + +./bin/sherpa-onnx-offline-websocket-server --help + +(1) For transducer models + +./bin/sherpa-onnx-offline-websocket-server \ + --port=6006 \ + --num-work-threads=5 \ + --tokens=/path/to/tokens.txt \ + --encoder=/path/to/encoder.onnx \ + --decoder=/path/to/decoder.onnx \ + --joiner=/path/to/joiner.onnx \ + --log-file=./log.txt \ + --max-batch-size=5 + +(2) For Paraformer + +./bin/sherpa-onnx-offline-websocket-server \ + --port=6006 \ + --num-work-threads=5 \ + --tokens=/path/to/tokens.txt \ + --paraformer=/path/to/model.onnx \ + --log-file=./log.txt \ + --max-batch-size=5 + +Please refer to +https://k2-fsa.github.io/sherpa/onnx/pretrained_models/index.html +for a list of pre-trained models to download. + +Options: + --log-file : Path to the log file. Logs are appended to this file (string, default = "./log.txt") + --max-utterance-length : Max utterance length in seconds. If we receive an utterance longer than this value, we will reject the connection. If you have enough memory, you can select a large value for it. (float, default = 300) + --decoding-method : decoding method,Valid values: greedy_search. (string, default = "greedy_search") + --num-threads : Number of threads to run the neural network (int, default = 2) + --feat-dim : Feature dimension. Must match the one expected by the model. (int, default = 80) + --port : The port on which the server will listen. (int, default = 6006) + --debug : true to print model information while loading it. (bool, default = false) + --joiner : Path to joiner.onnx (string, default = "") + --tokens : Path to tokens.txt (string, default = "") + --encoder : Path to encoder.onnx (string, default = "") + --num-work-threads : Thread pool size for for neural network computation and decoding. (int, default = 3) + --paraformer : Path to model.onnx of paraformer. (string, default = "") + --num-io-threads : Thread pool size for network connections. (int, default = 1) + --max-batch-size : Max batch size for decoding. (int, default = 5) + --decoder : Path to decoder.onnx (string, default = "") + +Standard options: + --help : Print out usage message (bool, default = false) + --print-args : Print the command line arguments (to stderr) (bool, default = true) + --config : Configuration file to read (this option may be repeated) (string, default = "") + + diff --git a/docs/source/onnx/websocket/code/sherpa-onnx-online-websocket-client-help-info.txt b/docs/source/onnx/websocket/code/sherpa-onnx-online-websocket-client-help-info.txt new file mode 100644 index 000000000..87d2a30d0 --- /dev/null +++ b/docs/source/onnx/websocket/code/sherpa-onnx-online-websocket-client-help-info.txt @@ -0,0 +1,30 @@ +[I] /Users/fangjun/open-source/sherpa-onnx/sherpa-onnx/csrc/parse-options.cc:484:int sherpa_onnx::ParseOptions::Read(int, const char *const *) ./build/bin/sherpa-onnx-online-websocket-client +[I] /Users/fangjun/open-source/sherpa-onnx/sherpa-onnx/csrc/parse-options.cc:525:void sherpa_onnx::ParseOptions::PrintUsage(bool) const + +Automatic speech recognition with sherpa-onnx using websocket. + +Usage: + +./bin/sherpa-onnx-online-websocket-client --help + +./bin/sherpa-onnx-online-websocket-client \ + --server-ip=127.0.0.1 \ + --server-port=6006 \ + --samples-per-message=8000 \ + --seconds-per-message=0.2 \ + /path/to/foo.wav + +It support only wave of with a single channel, 16kHz, 16-bit samples. + +Options: + --seconds-per-message : We will simulate that each message takes this number of seconds to send. 
If you select a very large value, it will take a long time to send all the samples (float, default = 0.2) + --samples-per-message : Send this number of samples per message. (int, default = 8000) + --sample-rate : Sample rate of the input wave. Should be the one expected by the server (int, default = 16000) + --server-port : Port of the websocket server (int, default = 6006) + --server-ip : IP address of the websocket server (string, default = "127.0.0.1") + +Standard options: + --help : Print out usage message (bool, default = false) + --print-args : Print the command line arguments (to stderr) (bool, default = true) + --config : Configuration file to read (this option may be repeated) (string, default = "") + diff --git a/docs/source/onnx/websocket/code/sherpa-onnx-online-websocket-server-help.txt b/docs/source/onnx/websocket/code/sherpa-onnx-online-websocket-server-help.txt new file mode 100644 index 000000000..f735f6ea0 --- /dev/null +++ b/docs/source/onnx/websocket/code/sherpa-onnx-online-websocket-server-help.txt @@ -0,0 +1,57 @@ + + +Automatic speech recognition with sherpa-onnx using websocket. + +Usage: + +./bin/sherpa-onnx-online-websocket-server --help + +./bin/sherpa-onnx-online-websocket-server \ + --port=6006 \ + --num-work-threads=5 \ + --tokens=/path/to/tokens.txt \ + --encoder=/path/to/encoder.onnx \ + --decoder=/path/to/decoder.onnx \ + --joiner=/path/to/joiner.onnx \ + --log-file=./log.txt \ + --max-batch-size=5 \ + --loop-interval-ms=10 + +Please refer to +https://k2-fsa.github.io/sherpa/onnx/pretrained_models/index.html +for a list of pre-trained models to download. + +Options: + --max-batch-size : Max batch size for recognition. (int, default = 5) + --loop-interval-ms : It determines how often the decoder loop runs. (int, default = 10) + --max-active-paths : beam size used in modified beam search. (int, default = 4) + --decoding-method : decoding method,now support greedy_search and modified_beam_search. (string, default = "greedy_search") + --rule3-min-utterance-length : This endpointing rule3 requires utterance-length (in seconds) to be >= this value. (float, default = 20) + --rule3-min-trailing-silence : This endpointing rule3 requires duration of trailing silence in seconds) to be >= this value. (float, default = 0) + --rule3-must-contain-nonsilence : If True, for this endpointing rule3 to apply there must be nonsilence in the best-path traceback. For decoding, a non-blank token is considered as non-silence (bool, default = false) + --rule2-min-utterance-length : This endpointing rule2 requires utterance-length (in seconds) to be >= this value. (float, default = 0) + --rule1-min-trailing-silence : This endpointing rule1 requires duration of trailing silence in seconds) to be >= this value. (float, default = 2.4) + --feat-dim : Feature dimension. Must match the one expected by the model. (int, default = 80) + --rule1-must-contain-nonsilence : If True, for this endpointing rule1 to apply there must be nonsilence in the best-path traceback. For decoding, a non-blank token is considered as non-silence (bool, default = false) + --enable-endpoint : True to enable endpoint detection. False to disable it. (bool, default = true) + --num_threads : Number of threads to run the neural network (int, default = 2) + --debug : true to print model information while loading it. (bool, default = false) + --port : The port on which the server will listen. (int, default = 6006) + --num-io-threads : Thread pool size for network connections. 
(int, default = 1) + --rule2-must-contain-nonsilence : If True, for this endpointing rule2 to apply there must be nonsilence in the best-path traceback. For decoding, a non-blank token is considered as non-silence (bool, default = true) + --joiner : Path to joiner.onnx (string, default = "") + --tokens : Path to tokens.txt (string, default = "") + --num-work-threads : Thread pool size for for neural network computation and decoding. (int, default = 3) + --encoder : Path to encoder.onnx (string, default = "") + --sample-rate : Sampling rate of the input waveform. Note: You can have a different sample rate for the input waveform. We will do resampling inside the feature extractor (int, default = 16000) + --rule2-min-trailing-silence : This endpointing rule2 requires duration of trailing silence in seconds) to be >= this value. (float, default = 1.2) + --log-file : Path to the log file. Logs are appended to this file (string, default = "./log.txt") + --rule1-min-utterance-length : This endpointing rule1 requires utterance-length (in seconds) to be >= this value. (float, default = 0) + --decoder : Path to decoder.onnx (string, default = "") + +Standard options: + --config : Configuration file to read (this option may be repeated) (string, default = "") + --help : Print out usage message (bool, default = false) + --print-args : Print the command line arguments (to stderr) (bool, default = true) + + diff --git a/docs/source/onnx/websocket/index.rst b/docs/source/onnx/websocket/index.rst new file mode 100644 index 000000000..c1daee961 --- /dev/null +++ b/docs/source/onnx/websocket/index.rst @@ -0,0 +1,25 @@ +.. _sherpa-onnx-websocket: + +WebSocket +========= + +In this section, we describe how to use the `WebSocket`_ server and client +for real-time speech recognition with `sherpa-onnx`_. + +The `WebSocket`_ server is implemented in C++ with the help of +`websocketpp`_ and `asio`_. + +.. hint:: + + It does not depend on `boost`_. + + It does not depend on `boost`_. + + It does not depend on `boost`_. + + +.. toctree:: + :maxdepth: 3 + + ./online-websocket.rst + ./offline-websocket.rst diff --git a/docs/source/onnx/websocket/offline-websocket.rst b/docs/source/onnx/websocket/offline-websocket.rst new file mode 100644 index 000000000..9dddd4a12 --- /dev/null +++ b/docs/source/onnx/websocket/offline-websocket.rst @@ -0,0 +1,135 @@ +.. _onnx_non_streaming_websocket_server_and_client: + +Non-streaming WebSocket server and client +========================================= + +.. hint:: + + Please refer to :ref:`install_sherpa_onnx` to install `sherpa-onnx`_ + before you read this section. + +Build `sherpa-onnx` with WebSocket support +------------------------------------------ + +By default, it will generate the following binaries after :ref:`install_sherpa_onnx`: + +.. code-block:: bash + + sherpa-onnx fangjun$ ls -lh build/bin/*websocket* + -rwxr-xr-x 1 fangjun staff 1.1M Mar 31 22:09 build/bin/sherpa-onnx-offline-websocket-server + -rwxr-xr-x 1 fangjun staff 1.0M Mar 31 22:09 build/bin/sherpa-onnx-online-websocket-client + -rwxr-xr-x 1 fangjun staff 1.2M Mar 31 22:09 build/bin/sherpa-onnx-online-websocket-server + +Please refer to :ref:`onnx_streaming_websocket_server_and_client` +for the usage of ``sherpa-onnx-online-websocket-server`` +and ``sherpa-onnx-online-websocket-client``. + +View the server usage +--------------------- + +Before starting the server, let us view the help message of ``sherpa-onnx-offline-websocket-server``: + +.. 
code-block:: bash + + build/bin/sherpa-onnx-offline-websocket-server + +The above command will print the following help information: + +.. literalinclude:: ./code/sherpa-onnx-offline-websocket-server-help.txt + +Start the server +---------------- + +.. hint:: + + Please refer to :ref:`sherpa-onnx-pre-trained-models` + for a list of pre-trained models. + +Start the server with a transducer model +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. code-block:: bash + + ./build/bin/sherpa-onnx-offline-websocket-server \ + --port=6006 \ + --num-work-threads=5 \ + --tokens=./sherpa-onnx-zipformer-en-2023-03-30/tokens.txt \ + --encoder=./sherpa-onnx-zipformer-en-2023-03-30/encoder-epoch-99-avg-1.onnx \ + --decoder=./sherpa-onnx-zipformer-en-2023-03-30/decoder-epoch-99-avg-1.onnx \ + --joiner=./sherpa-onnx-zipformer-en-2023-03-30/joiner-epoch-99-avg-1.onnx \ + --log-file=./log.txt \ + --max-batch-size=5 + +.. caution:: + + The arguments are in the form ``--key=value``. + + It does not support ``--key value``. + + It does not support ``--key value``. + + It does not support ``--key value``. + +.. hint:: + + In the above demo, the model files are + from :ref:`sherpa_onnx_zipformer_en_2023_03_30`. + +.. note:: + + Note that the server supports processing multiple clients in a batch in parallel. + You can use ``--max-batch-size`` to limit the batch size. + +Start the server with a paraformer model +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. code-block:: bash + + ./build/bin/sherpa-onnx-offline-websocket-server \ + --port=6006 \ + --num-work-threads=5 \ + --tokens=./sherpa-onnx-paraformer-zh-2023-03-28/tokens.txt \ + --paraformer=./sherpa-onnx-paraformer-zh-2023-03-28/model.onnx \ + --log-file=./log.txt \ + --max-batch-size=5 + +.. hint:: + + In the above demo, the model files are + from :ref:`sherpa_onnx_offline_paraformer_zh_2023_03_28_chinese`. + +Start the client (Python) +------------------------- + +We provide two clients written in Python: + + - `offline-websocket-client-decode-files-paralell.py `_: It decodes multiple files in parallel + by creating a separate connection for each file + - `offline-websocket-client-decode-files-sequential.py `_: It decodes multiple files sequentially + by creating only a single connection + +offline-websocket-client-decode-files-paralell.py +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. code-block:: bash + + python3 ./python-api-examples/offline-websocket-client-decode-files-paralell.py \ + --server-addr localhost \ + --server-port 6006 \ + ./sherpa-onnx-paraformer-zh-2023-03-28/test_wavs/0.wav \ + ./sherpa-onnx-paraformer-zh-2023-03-28/test_wavs/1.wav \ + ./sherpa-onnx-paraformer-zh-2023-03-28/test_wavs/2.wav \ + ./sherpa-onnx-paraformer-zh-2023-03-28/test_wavs/8k.wav + +offline-websocket-client-decode-files-sequential.py +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. code-block:: bash + + python3 ./python-api-examples/offline-websocket-client-decode-files-sequential.py \ + --server-addr localhost \ + --server-port 6006 \ + ./sherpa-onnx-paraformer-zh-2023-03-28/test_wavs/0.wav \ + ./sherpa-onnx-paraformer-zh-2023-03-28/test_wavs/1.wav \ + ./sherpa-onnx-paraformer-zh-2023-03-28/test_wavs/2.wav \ + ./sherpa-onnx-paraformer-zh-2023-03-28/test_wavs/8k.wav diff --git a/docs/source/onnx/websocket/online-websocket.rst b/docs/source/onnx/websocket/online-websocket.rst new file mode 100644 index 000000000..792a1bde5 --- /dev/null +++ b/docs/source/onnx/websocket/online-websocket.rst @@ -0,0 +1,181 @@ +.. 
_onnx_streaming_websocket_server_and_client: + +Streaming WebSocket server and client +===================================== + +.. hint:: + + Please refer to :ref:`install_sherpa_onnx` to install `sherpa-onnx`_ + before you read this section. + +Build `sherpa-onnx` with WebSocket support +------------------------------------------ + +By default, it will generate the following binaries after :ref:`install_sherpa_onnx`: + +.. code-block:: bash + + sherpa-onnx fangjun$ ls -lh build/bin/*websocket* + -rwxr-xr-x 1 fangjun staff 1.1M Mar 31 22:09 build/bin/sherpa-onnx-offline-websocket-server + -rwxr-xr-x 1 fangjun staff 1.0M Mar 31 22:09 build/bin/sherpa-onnx-online-websocket-client + -rwxr-xr-x 1 fangjun staff 1.2M Mar 31 22:09 build/bin/sherpa-onnx-online-websocket-server + +Please refer to :ref:`onnx_non_streaming_websocket_server_and_client` +for the usage of ``sherpa-onnx-offline-websocket-server``. + +View the server usage +--------------------- + +Before starting the server, let us view the help message of ``sherpa-onnx-online-websocket-server``: + +.. code-block:: bash + + build/bin/sherpa-onnx-online-websocket-server + +The above command will print the following help information: + +.. literalinclude:: ./code/sherpa-onnx-online-websocket-server-help.txt + +Start the server +---------------- + +.. hint:: + + Please refer to :ref:`sherpa-onnx-pre-trained-models` + for a list of pre-trained models. + +.. code-block:: bash + + ./build/bin/sherpa-onnx-online-websocket-server \ + --port=6006 \ + --num-work-threads=3 \ + --num-io-threads=2 \ + --tokens=./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/tokens.txt \ + --encoder=./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/encoder-epoch-99-avg-1.onnx \ + --decoder=./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/decoder-epoch-99-avg-1.onnx \ + --joiner=./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/joiner-epoch-99-avg-1.onnx \ + --log-file=./log.txt \ + --max-batch-size=5 \ + --loop-interval-ms=20 + +.. caution:: + + The arguments are in the form ``--key=value``. + + It does not support ``--key value``. + + It does not support ``--key value``. + + It does not support ``--key value``. + +.. hint:: + + In the above demo, the model files are + from :ref:`sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20`. + +.. note:: + + Note that the server supports processing multiple clients in a batch in parallel. + You can use ``--max-batch-size`` to limit the batch size. + +View the usage of the client (C++) +---------------------------------- + +Let us view the usage of the C++ `WebSocket`_ client: + +.. code-block:: bash + + ./build/bin/sherpa-onnx-online-websocket-client + +The above command will print the following help information: + +.. literalinclude:: ./code/sherpa-onnx-online-websocket-client-help-info.txt + +.. caution:: + + We only support using IP address for ``--server-ip``. + + For instance, please don't use ``--server-ip=localhost``, use ``--server-ip=127.0.0.1`` instead. + + For instance, please don't use ``--server-ip=localhost``, use ``--server-ip=127.0.0.1`` instead. + + For instance, please don't use ``--server-ip=localhost``, use ``--server-ip=127.0.0.1`` instead. + +Start the client (C++) +---------------------- + +To start the C++ `WebSocket`_ client, use: + +.. 
code-block:: bash
+
+   build/bin/sherpa-onnx-online-websocket-client \
+     --seconds-per-message=0.1 \
+     --server-port=6006 \
+     ./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/test_wavs/0.wav
+
+Since the server can process multiple clients at the same time, you can use
+the following command to start several clients at once:
+
+.. code-block:: bash
+
+   for i in $(seq 0 10); do
+     k=$(expr $i % 5)
+     build/bin/sherpa-onnx-online-websocket-client \
+       --seconds-per-message=0.1 \
+       --server-port=6006 \
+       ./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/test_wavs/${k}.wav &
+   done
+
+   wait
+
+   echo "done"
+
+View the usage of the client (Python)
+-------------------------------------
+
+Use the following command to view the usage:
+
+.. code-block:: bash
+
+   python3 ./python-api-examples/online-websocket-client-decode-file.py --help
+
+.. hint::
+
+   ``online-websocket-client-decode-file.py`` is from
+   ``_
+
+It will print:
+
+.. literalinclude:: ./code/python-online-websocket-client-decode-a-file.txt
+
+.. hint::
+
+   For the Python client, you can use either a domain name or an IP address
+   for ``--server-addr``. For instance, you can use either
+   ``--server-addr localhost`` or ``--server-addr 127.0.0.1``.
+
+   For the input argument, you can use either ``--key=value`` or ``--key value``.
+
+
+Start the client (Python)
+-------------------------
+
+.. code-block:: bash
+
+   python3 ./python-api-examples/online-websocket-client-decode-file.py \
+     --server-addr localhost \
+     --server-port 6006 \
+     --seconds-per-message 0.1 \
+     ./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/test_wavs/4.wav
+
+Start the client (Python, with microphone)
+------------------------------------------
+
+.. code-block:: bash
+
+   python3 ./python-api-examples/online-websocket-client-microphone.py \
+     --server-addr localhost \
+     --server-port 6006
+
+.. hint::
+
+   ``online-websocket-client-microphone.py`` is from
+   ``_
diff --git a/docs/source/pdf.rst b/docs/source/pdf.rst
new file mode 100644
index 000000000..c6fec9f3a
--- /dev/null
+++ b/docs/source/pdf.rst
@@ -0,0 +1,22 @@
+Download pdf
+============
+
+We provide a single pdf file containing all the documentation.
+
+.. hint::
+
+   Chinese-related content is not included in the pdf file.
+
+Please download it from the following address:
+
+  ``_
+
+.. note::
+
+   For Chinese users, you can use the following mirror:
+
+   ``_
+
+Please always download the latest version.
+
+The pdf file is updated automatically whenever the doc is changed.
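+
+If you prefer the command line, the download itself is a single command. The
+URL below is only a placeholder to illustrate the command; please replace it
+with the actual address from the links above:
+
+.. code-block:: bash
+
+   # Placeholder URL: substitute the actual address of the pdf file
+   wget -O sherpa-doc.pdf https://example.com/path/to/sherpa-doc.pdf
+
+The output file name ``sherpa-doc.pdf`` is arbitrary; choose any name you like.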
diff --git a/docs/source/pic/pre-trained-model/visit.png b/docs/source/pic/pre-trained-model/visit.png new file mode 100644 index 000000000..0dbc24029 Binary files /dev/null and b/docs/source/pic/pre-trained-model/visit.png differ diff --git a/docs/source/pic/pre-trained-model/visit2.png b/docs/source/pic/pre-trained-model/visit2.png new file mode 100644 index 000000000..f173e8827 Binary files /dev/null and b/docs/source/pic/pre-trained-model/visit2.png differ diff --git a/docs/source/pic/qq-group-for-next-gen-kaldi.jpg b/docs/source/pic/qq-group-for-next-gen-kaldi.jpg new file mode 100644 index 000000000..3ec4a9877 Binary files /dev/null and b/docs/source/pic/qq-group-for-next-gen-kaldi.jpg differ diff --git a/docs/source/pic/wechat-group-for-next-gen-kaldi.jpg b/docs/source/pic/wechat-group-for-next-gen-kaldi.jpg new file mode 100644 index 000000000..e11a84211 Binary files /dev/null and b/docs/source/pic/wechat-group-for-next-gen-kaldi.jpg differ diff --git a/docs/source/pretrained-models.rst b/docs/source/pretrained-models.rst new file mode 100644 index 000000000..7248c1fab --- /dev/null +++ b/docs/source/pretrained-models.rst @@ -0,0 +1,125 @@ +Pre-trained models +================== + +Pre-trained models for different projects +----------------------------------------- +.. list-table:: + + * - Project + - Pretrained models + * - `k2-fsa/sherpa`_ + - :ref:`Click here ` + * - `k2-fsa/sherpa-onnx`_ + - :ref:`Click here ` + * - `k2-fsa/sherpa-ncnn`_ + - :ref:`Click here ` + +How to download +--------------- + +We are hosting our pre-trained models on `Huggingface`_ as git repositories +managed by `Git LFS `_. + +There are at least two methods for downloading: + + - Using ``git lfs`` + - Using ``wget`` + +In the following, we use the pre-trained model :ref:`sherpa-onnx-wenetspeech-2023-06-15-streaming` +as an example. + +Using git lfs +~~~~~~~~~~~~~ + +Please first install ``git-lfs`` by following ``_. + +.. tabs:: + + .. tab:: Linux + + .. code-block:: bash + + # apt/deb + sudo apt-get install git-lfs + + # yum/rpm + sudo yum install git-lfs + + Please see ``_ + for details. + + .. tab:: macOS + + .. code-block:: bash + + brew install git-lfs + + .. tab:: Windows + + Please visit ``_ + to download and install ``git-lfs``. + +Then use the following commands to download pre-trained models: + +.. tabs:: + + .. tab:: Linux/macOS + + .. code-block:: bash + + GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/pkufool/icefall-asr-zipformer-streaming-wenetspeech-20230615 + cd icefall-asr-zipformer-streaming-wenetspeech-20230615 + git lfs pull --include "exp/*chunk-16-left-128.*onnx" + + .. tab:: Windows (Powershell) + + .. code-block:: bash + + $env:GIT_LFS_SKIP_SMUDGE="1" + git clone https://huggingface.co/pkufool/icefall-asr-zipformer-streaming-wenetspeech-20230615 + cd icefall-asr-zipformer-streaming-wenetspeech-20230615 + git lfs pull --include "exp/*chunk-16-left-128.*onnx" + + .. tab:: Windows (cmd) + + .. code-block:: bash + + set GIT_LFS_SKIP_SMUDGE="1" + git clone https://huggingface.co/pkufool/icefall-asr-zipformer-streaming-wenetspeech-20230615 + cd icefall-asr-zipformer-streaming-wenetspeech-20230615 + git lfs pull --include "exp/*chunk-16-left-128.*onnx" + +.. note:: + + It is very important to set the environment variable ``GIT_LFS_SKIP_SMUDGE`` to ``1``. + We don't recommend using ``git lfs install`` as it will download many large files that + we don't need. + + +Using wget +~~~~~~~~~~ + +First, let us visit the huggingface `git repository `_ of the pre-trained model: + +.. 
image:: pic/pre-trained-model/visit.png + :width: 600 + :align: center + :alt: Visit the pre-traind model git repo + +Click ``Files and versions`` and navigate to the directory containing files +for downloading: + +.. image:: pic/pre-trained-model/visit2.png + :width: 600 + :align: center + :alt: Get the URL for downloading + +Right click the arrow that indicates downloading and copy the link address. +After that, you can use, for instance, ``wget`` to download the file with the following +command: + +.. code-block:: bash + + wget https://huggingface.co/pkufool/icefall-asr-zipformer-streaming-wenetspeech-20230615/resolve/main/exp/decoder-epoch-12-avg-4-chunk-16-left-128.int8.onnx + +Repeat the process until you have downloaded all the required files. diff --git a/docs/source/python/faq.rst b/docs/source/python/faq.rst index 9083d2794..439c0f0ac 100644 --- a/docs/source/python/faq.rst +++ b/docs/source/python/faq.rst @@ -22,6 +22,42 @@ How to install sherpa with CUDA support All you need to do is to install a CUDA version of PyTorch before installing ``sherpa`` +Could not find PyTorch +---------------------- + +If you have the following error while installing ``sherpa``: + +.. code-block:: + + CMake Error at cmake/torch.cmake:14 (find_package): + By not providing "FindTorch.cmake" in CMAKE_MODULE_PATH this project has + asked CMake to find a package configuration file provided by "Torch", but + CMake did not find one. + + Could not find a package configuration file provided by "Torch" with any of + the following names: + + TorchConfig.cmake + torch-config.cmake + + Add the installation prefix of "Torch" to CMAKE_PREFIX_PATH or set + "Torch_DIR" to a directory containing one of the above files. If "Torch" + provides a separate development package or SDK, be sure it has been + installed. + Call Stack (most recent call first): + CMakeLists.txt:120 (include) + +The fix is to install ``PyTorch`` first and retry. + +If it still does not work, please make sure you have used the same +(virtual) environment where ``PyTorch`` is installed to compile ``sherpa``. + +.. hint:: + + You can look for the path to the ``python3`` executable in the output of + cmake to find out which environment ``cmake`` is using. + + .. _fix cuDNN not found: How to fix `Caffe2: Cannot find cuDNN library` @@ -42,7 +78,23 @@ before running ``pip install --verbose k2-sherpa`` or ``python3 setup.py install .. hint:: - The above command assumes that you have installed cuDNN to ``/path/to/cudnn``. + The above command assumes that you have installed cuDNN to ``/path/to/cudnn`` + and you can find the following files: + + - ``/path/to/cudnn/lib/libcudnn.so`` + - ``/path/to/cudnn/include/cudnn.h`` + +.. hint:: + + If you are using ``conda``, you can use: + + .. code-block:: bash + + conda install cudnn + + to install ``cudnn``. And possibly you don't need to set the above + environment variable ``SHERPA_CMAKE_ARGS`` after you ran + ``conda install cudnn``. How to uninstall sherpa ----------------------- diff --git a/docs/source/python/installation/from-source.rst b/docs/source/python/installation/from-source.rst index 641434880..3315c6076 100644 --- a/docs/source/python/installation/from-source.rst +++ b/docs/source/python/installation/from-source.rst @@ -1,3 +1,5 @@ +.. 
_install_sherpa_from_source: + Install from source =================== diff --git a/docs/source/python/offline_asr/index.rst b/docs/source/python/offline_asr/index.rst index a6d4c0459..529b853de 100644 --- a/docs/source/python/offline_asr/index.rst +++ b/docs/source/python/offline_asr/index.rst @@ -1,12 +1,22 @@ -Non-streaming ASR -================= +Non-streaming speech recognition +================================ -This part describes how to use `sherpa`_ for non-streaming ASR. +This part describes how to use `sherpa`_ for non-streaming (i.e., offline) +speech recognition. + +Two types of models are supported: + + - `CTC`_ + - `transducer`_ + + +We support standalone speech recognition as well as server/client based +speech recognition using `WebSocket`_. -At present, only non-streaming ASR based on `Conformer`_ -`transducer`_ (i.e., Conformer-T) is implemented. .. toctree:: :maxdepth: 2 + standalone/index + conformer/index diff --git a/docs/source/python/offline_asr/standalone/ctc.rst b/docs/source/python/offline_asr/standalone/ctc.rst new file mode 100644 index 000000000..deea59c45 --- /dev/null +++ b/docs/source/python/offline_asr/standalone/ctc.rst @@ -0,0 +1,8 @@ +CTC +=== + +In this section, we describe how to use pre-trained `CTC`_ +models for offline (i.e., non-streaming) speech recognition. + +TODO: Add more content. + diff --git a/docs/source/python/offline_asr/standalone/index.rst b/docs/source/python/offline_asr/standalone/index.rst new file mode 100644 index 000000000..e49ca133b --- /dev/null +++ b/docs/source/python/offline_asr/standalone/index.rst @@ -0,0 +1,12 @@ +Standalone +========== + +In this section, we describe how to decode files with a standalone executable. +You don't need to start a server and a client for speech recognition. + + +.. 
toctree:: + :maxdepth: 2 + + transducer + ctc diff --git a/docs/source/python/offline_asr/standalone/log/transducer-fast-beam-search-with-LG.txt b/docs/source/python/offline_asr/standalone/log/transducer-fast-beam-search-with-LG.txt new file mode 100644 index 000000000..6220f8e0f --- /dev/null +++ b/docs/source/python/offline_asr/standalone/log/transducer-fast-beam-search-with-LG.txt @@ -0,0 +1,9 @@ +2023-01-05 08:53:40,615 INFO [offline_transducer_asr.py:317] {'nn_model': './icefall-asr-librispeech-pruned-transducer-stateless8-2022-12-02/exp/cpu_jit-torch-1.10.pt', 'tokens': './icefall-asr-librispeech-pruned-transducer-stateless8-2022-12-02/data/lang_bpe_500/tokens.txt', 'decoding_method': 'fast_beam_search', 'num_active_paths': 4, 'max_contexts': 8, 'max_states': 64, 'allow_partial': True, 'LG': './icefall-asr-librispeech-pruned-transducer-stateless8-2022-12-02/data/lang_bpe_500/LG.pt', 'ngram_lm_scale': 0.01, 'beam': 4.0, 'use_gpu': False, 'sound_files': ['./icefall-asr-librispeech-pruned-transducer-stateless8-2022-12-02/test_wavs/1089-134686-0001.wav', './icefall-asr-librispeech-pruned-transducer-stateless8-2022-12-02/test_wavs/1221-135766-0001.wav', './icefall-asr-librispeech-pruned-transducer-stateless8-2022-12-02/test_wavs/1221-135766-0002.wav']} +[I] /content/sherpa/sherpa/cpp_api/offline-recognizer-transducer-impl.h:127:void sherpa::OfflineRecognizerTransducerImpl::WarmUp() 2023-01-05 08:53:41 WarmUp begins +[I] /content/sherpa/sherpa/cpp_api/offline-recognizer-transducer-impl.h:140:void sherpa::OfflineRecognizerTransducerImpl::WarmUp() 2023-01-05 08:53:41 WarmUp ended +./icefall-asr-librispeech-pruned-transducer-stateless8-2022-12-02/test_wavs/1089-134686-0001.wav +{"text":" AFTER EARLY NIGHTFALL THE YELLOW LAMPS WOULD LIGHT UP HERE AND THERE THE SQUALID QUARTER OF THE BROTHELS","timestamps":"[0.00,0.40,0.52,0.68,0.96,1.28,1.36,1.44,1.60,1.76,1.88,1.96,2.16,2.28,2.36,2.48,2.60,2.76,3.04,3.24,3.40,3.56,3.76,4.04,4.20,4.32,4.48,4.64,4.80,4.84,5.00,5.04,5.32,5.44,5.60,5.68,5.84,6.04,6.24]","tokens":[" AFTER"," E","AR","LY"," NIGHT","F","A","LL"," THE"," YE","LL","OW"," LA","M","P","S"," WOULD"," LIGHT"," UP"," HE","RE"," AND"," THERE"," THE"," S","QUA","LI","D"," ","QUA","R","TER"," OF"," THE"," B","RO","TH","EL","S"]} +./icefall-asr-librispeech-pruned-transducer-stateless8-2022-12-02/test_wavs/1221-135766-0001.wav +{"text":" GOD AS A DIRECT CONSEQUENCE OF THE SIN WHICH MAN THUS PUNISHED HAD GIVEN HER A LOVELY CHILD WHOSE PLACE WAS ON THAT SAME DISHONOURED BOSOM TO CONNECT HER PARENT FOR EVER WITH THE RACE AND DESCENT OF MORTALS AND TO BE FINALLY A BLESSED SOUL IN HEAVEN","timestamps":"[0.00,0.12,0.40,0.64,0.80,0.96,1.04,1.12,1.28,1.44,1.64,1.72,1.84,1.96,2.12,2.28,2.36,2.60,2.84,3.16,3.28,3.48,3.60,3.76,3.92,4.12,4.36,4.52,4.72,4.92,5.12,5.40,5.68,6.04,6.24,6.48,6.84,7.08,7.32,7.56,7.84,8.12,8.24,8.32,8.44,8.56,8.76,8.88,9.08,9.28,9.44,9.56,9.64,9.76,9.96,10.04,10.20,10.40,10.64,10.76,11.00,11.20,11.36,11.56,11.76,12.00,12.08,12.28,12.32,12.52,12.68,12.84,12.96,13.04,13.20,13.36,13.60,13.76,13.96,14.12,14.24,14.36,14.52,14.68,14.76,15.04,15.28,15.52,15.76,16.00,16.16,16.24,16.32]","tokens":[" GO","D"," AS"," A"," DI","RE","C","T"," CON","SE","QUE","N","CE"," OF"," THE"," S","IN"," WHICH"," MAN"," TH","US"," P","UN","ISH","ED"," HAD"," GIVE","N"," HER"," A"," LOVE","LY"," CHILD"," WHO","SE"," PLACE"," WAS"," ON"," THAT"," SAME"," DIS","HO","N","OUR","ED"," BO","S","OM"," TO"," CON","NE","C","T"," HER"," P","AR","ENT"," FOR"," E","VER"," WITH"," THE"," RA","CE"," AND"," 
DE","S","C","ENT"," OF"," MO","R","T","AL","S"," AND"," TO"," BE"," FI","N","AL","LY"," A"," B","LESS","ED"," SO","UL"," IN"," HE","A","VE","N"]} +./icefall-asr-librispeech-pruned-transducer-stateless8-2022-12-02/test_wavs/1221-135766-0002.wav +{"text":" YET THESE THOUGHTS AFFECTED HESTER PRYNNE LESS WITH HOPE THAN APPREHENSION","timestamps":"[0.00,0.08,0.32,0.44,0.64,0.96,1.08,1.20,1.28,1.40,1.44,1.64,1.76,1.84,2.04,2.12,2.24,2.28,2.48,2.52,2.84,3.08,3.28,3.52,3.76,3.88,4.00,4.08,4.20,4.36,4.52]","tokens":[" YE","T"," THE","SE"," THOUGHT","S"," A","FF","E","C","TED"," HE","S","TER"," P","RY","N","NE"," ","LESS"," WITH"," HO","PE"," THAN"," A","PP","RE","HE","N","S","ION"]} diff --git a/docs/source/python/offline_asr/standalone/log/transducer-fast-beam-search.txt b/docs/source/python/offline_asr/standalone/log/transducer-fast-beam-search.txt new file mode 100644 index 000000000..315fbf610 --- /dev/null +++ b/docs/source/python/offline_asr/standalone/log/transducer-fast-beam-search.txt @@ -0,0 +1,9 @@ +2023-01-05 08:53:33,089 INFO [offline_transducer_asr.py:317] {'nn_model': './icefall-asr-librispeech-pruned-transducer-stateless8-2022-12-02/exp/cpu_jit-torch-1.10.pt', 'tokens': './icefall-asr-librispeech-pruned-transducer-stateless8-2022-12-02/data/lang_bpe_500/tokens.txt', 'decoding_method': 'fast_beam_search', 'num_active_paths': 4, 'max_contexts': 8, 'max_states': 64, 'allow_partial': True, 'LG': '', 'ngram_lm_scale': 0.01, 'beam': 4.0, 'use_gpu': False, 'sound_files': ['./icefall-asr-librispeech-pruned-transducer-stateless8-2022-12-02/test_wavs/1089-134686-0001.wav', './icefall-asr-librispeech-pruned-transducer-stateless8-2022-12-02/test_wavs/1221-135766-0001.wav', './icefall-asr-librispeech-pruned-transducer-stateless8-2022-12-02/test_wavs/1221-135766-0002.wav']} +[I] /content/sherpa/sherpa/cpp_api/offline-recognizer-transducer-impl.h:127:void sherpa::OfflineRecognizerTransducerImpl::WarmUp() 2023-01-05 08:53:34 WarmUp begins +[I] /content/sherpa/sherpa/cpp_api/offline-recognizer-transducer-impl.h:140:void sherpa::OfflineRecognizerTransducerImpl::WarmUp() 2023-01-05 08:53:34 WarmUp ended +./icefall-asr-librispeech-pruned-transducer-stateless8-2022-12-02/test_wavs/1089-134686-0001.wav +{"text":" AFTER EARLY NIGHTFALL THE YELLOW LAMPS WOULD LIGHT UP HERE AND THERE THE SQUALID QUARTER OF THE BROTHELS","timestamps":"[0.00,0.40,0.52,0.68,0.96,1.28,1.36,1.44,1.60,1.76,1.88,1.96,2.16,2.28,2.36,2.48,2.60,2.76,3.04,3.24,3.40,3.56,3.76,4.04,4.20,4.32,4.48,4.64,4.80,4.84,5.00,5.04,5.32,5.44,5.60,5.68,5.84,6.04,6.24]","tokens":[" AFTER"," E","AR","LY"," NIGHT","F","A","LL"," THE"," YE","LL","OW"," LA","M","P","S"," WOULD"," LIGHT"," UP"," HE","RE"," AND"," THERE"," THE"," S","QUA","LI","D"," ","QUA","R","TER"," OF"," THE"," B","RO","TH","EL","S"]} +./icefall-asr-librispeech-pruned-transducer-stateless8-2022-12-02/test_wavs/1221-135766-0001.wav +{"text":" GOD AS A DIRECT CONSEQUENCE OF THE SIN WHICH MAN THUS PUNISHED HAD GIVEN HER A LOVELY CHILD WHOSE PLACE WAS ON THAT SAME DISHONOURED BOSOM TO CONNECT HER PARENT FOR EVER WITH THE RACE AND DESCENT OF MORTALS AND TO BE FINALLY A BLESSED SOUL IN 
HEAVEN","timestamps":"[0.00,0.12,0.40,0.64,0.80,0.96,1.04,1.12,1.28,1.44,1.64,1.72,1.84,1.96,2.12,2.28,2.36,2.60,2.84,3.16,3.28,3.48,3.60,3.76,3.92,4.12,4.36,4.52,4.72,4.92,5.12,5.40,5.68,6.04,6.24,6.48,6.84,7.08,7.32,7.56,7.84,8.12,8.24,8.32,8.44,8.56,8.76,8.88,9.08,9.28,9.44,9.56,9.64,9.76,9.96,10.04,10.20,10.40,10.64,10.76,11.00,11.20,11.36,11.56,11.76,12.00,12.08,12.28,12.32,12.52,12.68,12.84,12.96,13.04,13.20,13.36,13.60,13.76,13.96,14.12,14.24,14.36,14.52,14.68,14.76,15.04,15.28,15.52,15.76,16.00,16.16,16.24,16.32]","tokens":[" GO","D"," AS"," A"," DI","RE","C","T"," CON","SE","QUE","N","CE"," OF"," THE"," S","IN"," WHICH"," MAN"," TH","US"," P","UN","ISH","ED"," HAD"," GIVE","N"," HER"," A"," LOVE","LY"," CHILD"," WHO","SE"," PLACE"," WAS"," ON"," THAT"," SAME"," DIS","HO","N","OUR","ED"," BO","S","OM"," TO"," CON","NE","C","T"," HER"," P","AR","ENT"," FOR"," E","VER"," WITH"," THE"," RA","CE"," AND"," DE","S","C","ENT"," OF"," MO","R","T","AL","S"," AND"," TO"," BE"," FI","N","AL","LY"," A"," B","LESS","ED"," SO","UL"," IN"," HE","A","VE","N"]} +./icefall-asr-librispeech-pruned-transducer-stateless8-2022-12-02/test_wavs/1221-135766-0002.wav +{"text":" YET THESE THOUGHTS AFFECTED HESTER PRYNNE LESS WITH HOPE THAN APPREHENSION","timestamps":"[0.00,0.08,0.32,0.44,0.64,0.96,1.08,1.20,1.28,1.40,1.44,1.64,1.76,1.84,2.04,2.12,2.24,2.28,2.48,2.52,2.84,3.08,3.28,3.52,3.76,3.88,4.00,4.08,4.20,4.36,4.52]","tokens":[" YE","T"," THE","SE"," THOUGHT","S"," A","FF","E","C","TED"," HE","S","TER"," P","RY","N","NE"," ","LESS"," WITH"," HO","PE"," THAN"," A","PP","RE","HE","N","S","ION"]} diff --git a/docs/source/python/offline_asr/standalone/log/transducer-greedy-search.txt b/docs/source/python/offline_asr/standalone/log/transducer-greedy-search.txt new file mode 100644 index 000000000..3594db0e9 --- /dev/null +++ b/docs/source/python/offline_asr/standalone/log/transducer-greedy-search.txt @@ -0,0 +1,9 @@ +2023-01-05 08:53:16,842 INFO [offline_transducer_asr.py:317] {'nn_model': './icefall-asr-librispeech-pruned-transducer-stateless8-2022-12-02/exp/cpu_jit-torch-1.10.pt', 'tokens': './icefall-asr-librispeech-pruned-transducer-stateless8-2022-12-02/data/lang_bpe_500/tokens.txt', 'decoding_method': 'greedy_search', 'num_active_paths': 4, 'max_contexts': 8, 'max_states': 64, 'allow_partial': True, 'LG': '', 'ngram_lm_scale': 0.01, 'beam': 4, 'use_gpu': False, 'sound_files': ['./icefall-asr-librispeech-pruned-transducer-stateless8-2022-12-02/test_wavs/1089-134686-0001.wav', './icefall-asr-librispeech-pruned-transducer-stateless8-2022-12-02/test_wavs/1221-135766-0001.wav', './icefall-asr-librispeech-pruned-transducer-stateless8-2022-12-02/test_wavs/1221-135766-0002.wav']} +[I] /content/sherpa/sherpa/cpp_api/offline-recognizer-transducer-impl.h:127:void sherpa::OfflineRecognizerTransducerImpl::WarmUp() 2023-01-05 08:53:17 WarmUp begins +[I] /content/sherpa/sherpa/cpp_api/offline-recognizer-transducer-impl.h:140:void sherpa::OfflineRecognizerTransducerImpl::WarmUp() 2023-01-05 08:53:18 WarmUp ended +./icefall-asr-librispeech-pruned-transducer-stateless8-2022-12-02/test_wavs/1089-134686-0001.wav +{"text":" AFTER EARLY NIGHTFALL THE YELLOW LAMPS WOULD LIGHT UP HERE AND THERE THE SQUALID QUARTER OF THE BROTHELS","timestamps":"[0.00,0.40,0.52,0.68,0.96,1.28,1.36,1.44,1.60,1.76,1.88,1.96,2.16,2.28,2.36,2.48,2.60,2.76,3.04,3.24,3.40,3.56,3.76,4.04,4.20,4.32,4.48,4.64,4.80,4.84,5.00,5.04,5.32,5.44,5.60,5.68,5.84,6.04,6.24]","tokens":[" AFTER"," E","AR","LY"," NIGHT","F","A","LL"," THE"," YE","LL","OW"," 
LA","M","P","S"," WOULD"," LIGHT"," UP"," HE","RE"," AND"," THERE"," THE"," S","QUA","LI","D"," ","QUA","R","TER"," OF"," THE"," B","RO","TH","EL","S"]} +./icefall-asr-librispeech-pruned-transducer-stateless8-2022-12-02/test_wavs/1221-135766-0001.wav +{"text":" GOD AS A DIRECT CONSEQUENCE OF THE SIN WHICH MAN THUS PUNISHED HAD GIVEN HER A LOVELY CHILD WHOSE PLACE WAS ON THAT SAME DISHONOURED BOSOM TO CONNECT HER PARENT FOR EVER WITH THE RACE AND DESCENT OF MORTALS AND TO BE FINALLY A BLESSED SOUL IN HEAVEN","timestamps":"[0.00,0.12,0.40,0.64,0.80,0.96,1.04,1.12,1.28,1.44,1.64,1.72,1.84,1.96,2.12,2.28,2.36,2.60,2.84,3.16,3.28,3.48,3.60,3.76,3.92,4.12,4.36,4.52,4.72,4.92,5.12,5.40,5.68,6.04,6.24,6.48,6.84,7.08,7.32,7.56,7.84,8.12,8.24,8.32,8.44,8.56,8.76,8.88,9.08,9.28,9.44,9.56,9.64,9.76,9.96,10.04,10.20,10.40,10.64,10.76,11.00,11.20,11.36,11.56,11.76,12.00,12.08,12.28,12.32,12.52,12.68,12.84,12.96,13.04,13.20,13.36,13.60,13.76,13.96,14.12,14.24,14.36,14.52,14.68,14.76,15.04,15.28,15.52,15.76,16.00,16.16,16.24,16.32]","tokens":[" GO","D"," AS"," A"," DI","RE","C","T"," CON","SE","QUE","N","CE"," OF"," THE"," S","IN"," WHICH"," MAN"," TH","US"," P","UN","ISH","ED"," HAD"," GIVE","N"," HER"," A"," LOVE","LY"," CHILD"," WHO","SE"," PLACE"," WAS"," ON"," THAT"," SAME"," DIS","HO","N","OUR","ED"," BO","S","OM"," TO"," CON","NE","C","T"," HER"," P","AR","ENT"," FOR"," E","VER"," WITH"," THE"," RA","CE"," AND"," DE","S","C","ENT"," OF"," MO","R","T","AL","S"," AND"," TO"," BE"," FI","N","AL","LY"," A"," B","LESS","ED"," SO","UL"," IN"," HE","A","VE","N"]} +./icefall-asr-librispeech-pruned-transducer-stateless8-2022-12-02/test_wavs/1221-135766-0002.wav +{"text":" YET THESE THOUGHTS AFFECTED HESTER PRYNNE LESS WITH HOPE THAN APPREHENSION","timestamps":"[0.00,0.08,0.32,0.44,0.64,0.96,1.08,1.20,1.28,1.40,1.44,1.64,1.76,1.84,2.04,2.12,2.24,2.28,2.48,2.52,2.84,3.08,3.28,3.52,3.76,3.88,4.00,4.08,4.20,4.36,4.52]","tokens":[" YE","T"," THE","SE"," THOUGHT","S"," A","FF","E","C","TED"," HE","S","TER"," P","RY","N","NE"," ","LESS"," WITH"," HO","PE"," THAN"," A","PP","RE","HE","N","S","ION"]} diff --git a/docs/source/python/offline_asr/standalone/log/transducer-modified-beam-search.txt b/docs/source/python/offline_asr/standalone/log/transducer-modified-beam-search.txt new file mode 100644 index 000000000..44279f1ea --- /dev/null +++ b/docs/source/python/offline_asr/standalone/log/transducer-modified-beam-search.txt @@ -0,0 +1,9 @@ +2023-01-05 08:53:25,515 INFO [offline_transducer_asr.py:317] {'nn_model': './icefall-asr-librispeech-pruned-transducer-stateless8-2022-12-02/exp/cpu_jit-torch-1.10.pt', 'tokens': './icefall-asr-librispeech-pruned-transducer-stateless8-2022-12-02/data/lang_bpe_500/tokens.txt', 'decoding_method': 'modified_beam_search', 'num_active_paths': 4, 'max_contexts': 8, 'max_states': 64, 'allow_partial': True, 'LG': '', 'ngram_lm_scale': 0.01, 'beam': 4, 'use_gpu': False, 'sound_files': ['./icefall-asr-librispeech-pruned-transducer-stateless8-2022-12-02/test_wavs/1089-134686-0001.wav', './icefall-asr-librispeech-pruned-transducer-stateless8-2022-12-02/test_wavs/1221-135766-0001.wav', './icefall-asr-librispeech-pruned-transducer-stateless8-2022-12-02/test_wavs/1221-135766-0002.wav']} +[I] /content/sherpa/sherpa/cpp_api/offline-recognizer-transducer-impl.h:127:void sherpa::OfflineRecognizerTransducerImpl::WarmUp() 2023-01-05 08:53:26 WarmUp begins +[I] /content/sherpa/sherpa/cpp_api/offline-recognizer-transducer-impl.h:140:void sherpa::OfflineRecognizerTransducerImpl::WarmUp() 2023-01-05 
08:53:26 WarmUp ended +./icefall-asr-librispeech-pruned-transducer-stateless8-2022-12-02/test_wavs/1089-134686-0001.wav +{"text":" AFTER EARLY NIGHTFALL THE YELLOW LAMPS WOULD LIGHT UP HERE AND THERE THE SQUALID QUARTER OF THE BROTHELS","timestamps":"[0.00,0.40,0.52,0.68,0.96,1.24,1.36,1.44,1.60,1.76,1.88,1.96,2.16,2.28,2.36,2.48,2.60,2.76,3.04,3.24,3.40,3.56,3.76,4.04,4.20,4.32,4.48,4.64,4.80,4.84,5.00,5.04,5.32,5.44,5.60,5.68,5.84,6.04,6.24]","tokens":[" AFTER"," E","AR","LY"," NIGHT","F","A","LL"," THE"," YE","LL","OW"," LA","M","P","S"," WOULD"," LIGHT"," UP"," HE","RE"," AND"," THERE"," THE"," S","QUA","LI","D"," ","QUA","R","TER"," OF"," THE"," B","RO","TH","EL","S"]} +./icefall-asr-librispeech-pruned-transducer-stateless8-2022-12-02/test_wavs/1221-135766-0001.wav +{"text":" GOD AS A DIRECT CONSEQUENCE OF THE SIN WHICH MAN THUS PUNISHED HAD GIVEN HER A LOVELY CHILD WHOSE PLACE WAS ON THAT SAME DISHONOURED BOSOM TO CONNECT HER PARENT FOR EVER WITH THE RACE AND DESCENT OF MORTALS AND TO BE FINALLY A BLESSED SOUL IN HEAVEN","timestamps":"[0.00,0.12,0.40,0.64,0.80,0.96,1.04,1.12,1.28,1.44,1.64,1.72,1.84,1.96,2.12,2.28,2.36,2.60,2.84,3.16,3.28,3.48,3.60,3.76,3.92,4.12,4.36,4.52,4.72,4.92,5.12,5.40,5.68,6.04,6.24,6.48,6.84,7.08,7.32,7.56,7.84,8.12,8.24,8.32,8.44,8.56,8.76,8.88,9.08,9.28,9.44,9.56,9.64,9.76,9.96,10.04,10.20,10.40,10.64,10.76,11.00,11.20,11.36,11.56,11.76,12.00,12.08,12.28,12.32,12.52,12.68,12.84,12.96,13.04,13.20,13.36,13.60,13.76,13.96,14.12,14.24,14.36,14.52,14.68,14.76,15.04,15.28,15.52,15.76,16.00,16.16,16.24,16.32]","tokens":[" GO","D"," AS"," A"," DI","RE","C","T"," CON","SE","QUE","N","CE"," OF"," THE"," S","IN"," WHICH"," MAN"," TH","US"," P","UN","ISH","ED"," HAD"," GIVE","N"," HER"," A"," LOVE","LY"," CHILD"," WHO","SE"," PLACE"," WAS"," ON"," THAT"," SAME"," DIS","HO","N","OUR","ED"," BO","S","OM"," TO"," CON","NE","C","T"," HER"," P","AR","ENT"," FOR"," E","VER"," WITH"," THE"," RA","CE"," AND"," DE","S","C","ENT"," OF"," MO","R","T","AL","S"," AND"," TO"," BE"," FI","N","AL","LY"," A"," B","LESS","ED"," SO","UL"," IN"," HE","A","VE","N"]} +./icefall-asr-librispeech-pruned-transducer-stateless8-2022-12-02/test_wavs/1221-135766-0002.wav +{"text":" YET THESE THOUGHTS AFFECTED HESTER PRYNNE LESS WITH HOPE THAN APPREHENSION","timestamps":"[0.00,0.08,0.32,0.44,0.64,0.96,1.08,1.20,1.28,1.40,1.44,1.64,1.76,1.84,2.04,2.12,2.24,2.32,2.48,2.52,2.84,3.08,3.28,3.52,3.76,3.88,4.00,4.08,4.20,4.36,4.52]","tokens":[" YE","T"," THE","SE"," THOUGHT","S"," A","FF","E","C","TED"," HE","S","TER"," P","RY","N","NE"," ","LESS"," WITH"," HO","PE"," THAN"," A","PP","RE","HE","N","S","ION"]} diff --git a/docs/source/python/offline_asr/standalone/transducer.rst b/docs/source/python/offline_asr/standalone/transducer.rst new file mode 100644 index 000000000..2da3ff4cc --- /dev/null +++ b/docs/source/python/offline_asr/standalone/transducer.rst @@ -0,0 +1,158 @@ +Transducer +========== + +In this section, we describe how to use pre-trained `transducer`_ +models for offline (i.e., non-streaming) speech recognition. + +.. hint:: + + Please refer to :ref:`offline_transducer_pretrained_models` for a list of + available pre-trained `transducer`_ models to download. + +.. hint:: + + We have a colab notebook for this section: |sherpa python offline transducer standalone recognizer colab notebook| + + .. 
|sherpa python offline transducer standalone recognizer colab notebook| image:: https://colab.research.google.com/assets/colab-badge.svg + :target: https://github.com/k2-fsa/colab/blob/master/sherpa/sherpa_standalone_offline_transducer_speech_recognition.ipynb + + You can find the following in the above colab notebook: + + - how to setup the environment + - how to download pre-trained models + - how to use sherpa for speech recognition + + If you don't have access to Google, please find below the colab notebook + in our GitHub repo. For instance, the above colab notebook can be found + at ``_ + +.. note:: + + Please visit ``_ for a list of available + colab notebooks about the next-gen Kaldi project. + +In the following, we use the pre-trained model +:ref:`icefall-asr-librispeech-pruned-transducer-stateless8-2022-12-02` +to demonstrate how to decode sound files. + +.. caution:: + + Make sure you have installed `sherpa`_ before you continue. + + Please refer to :ref:`install_sherpa_from_source` to install `sherpa`_ + from source. + +Download the pre-trained model +------------------------------ + +Please refer to :ref:`icefall-asr-librispeech-pruned-transducer-stateless8-2022-12-02` +for detailed instructions. + +For ease of reference, we duplicate the download commands below: + +.. code-block:: bash + + cd /path/to/sherpa + + GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/WeijiZhuang/icefall-asr-librispeech-pruned-transducer-stateless8-2022-12-02 + cd icefall-asr-librispeech-pruned-transducer-stateless8-2022-12-02 + git lfs pull --include "exp/cpu_jit-torch-1.10.pt" + git lfs pull --include "data/lang_bpe_500/LG.pt" + +In the following, we describe different decoding methods. + +greedy search +------------- + +.. code-block:: bash + + cd /path/to/sherpa + + python3 ./sherpa/bin/offline_transducer_asr.py \ + --nn-model ./icefall-asr-librispeech-pruned-transducer-stateless8-2022-12-02/exp/cpu_jit-torch-1.10.pt \ + --tokens ./icefall-asr-librispeech-pruned-transducer-stateless8-2022-12-02/data/lang_bpe_500/tokens.txt \ + --decoding-method greedy_search \ + --num-active-paths 4 \ + --use-gpu false \ + ./icefall-asr-librispeech-pruned-transducer-stateless8-2022-12-02/test_wavs/1089-134686-0001.wav \ + ./icefall-asr-librispeech-pruned-transducer-stateless8-2022-12-02/test_wavs/1221-135766-0001.wav \ + ./icefall-asr-librispeech-pruned-transducer-stateless8-2022-12-02/test_wavs/1221-135766-0002.wav + +The output of the above command is given below: + +.. literalinclude:: ./log/transducer-greedy-search.txt + :caption: Output of greedy search + +modified beam search +-------------------- + +.. code-block:: bash + + cd /path/to/sherpa + + python3 ./sherpa/bin/offline_transducer_asr.py \ + --nn-model ./icefall-asr-librispeech-pruned-transducer-stateless8-2022-12-02/exp/cpu_jit-torch-1.10.pt \ + --tokens ./icefall-asr-librispeech-pruned-transducer-stateless8-2022-12-02/data/lang_bpe_500/tokens.txt \ + --decoding-method modified_beam_search \ + --num-active-paths 4 \ + --use-gpu false \ + ./icefall-asr-librispeech-pruned-transducer-stateless8-2022-12-02/test_wavs/1089-134686-0001.wav \ + ./icefall-asr-librispeech-pruned-transducer-stateless8-2022-12-02/test_wavs/1221-135766-0001.wav \ + ./icefall-asr-librispeech-pruned-transducer-stateless8-2022-12-02/test_wavs/1221-135766-0002.wav + +The output of the above command is given below: + +.. 
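note::

   Each sound file is reported as one JSON object with the fields ``text``,
   ``timestamps`` (token start times in seconds, stored as a string), and
   ``tokens``. The following is a minimal, illustrative sketch of how such a
   line could be post-processed; the JSON literal is truncated and only for
   demonstration:

   .. code-block:: python

      import json

      # One JSON object per decoded file, as in the log shown below (truncated)
      line = '{"text": " YET THESE THOUGHTS", "timestamps": "[0.00,0.08,0.32]", "tokens": [" YE", "T", " THE"]}'

      result = json.loads(line)
      print(result["text"])                    # recognized text
      print(json.loads(result["timestamps"]))  # timestamps as a list of floats
      print(result["tokens"])                  # BPE tokens

.. 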
literalinclude:: ./log/transducer-modified-beam-search.txt + :caption: Output of modified beam search + +fast beam search (without LG) +----------------------------- + +.. code-block:: bash + + cd /path/to/sherpa + + python3 ./sherpa/bin/offline_transducer_asr.py \ + --nn-model ./icefall-asr-librispeech-pruned-transducer-stateless8-2022-12-02/exp/cpu_jit-torch-1.10.pt \ + --tokens ./icefall-asr-librispeech-pruned-transducer-stateless8-2022-12-02/data/lang_bpe_500/tokens.txt \ + --decoding-method fast_beam_search \ + --max-contexts 8 \ + --max-states 64 \ + --allow-partial true \ + --beam 4 \ + --use-gpu false \ + ./icefall-asr-librispeech-pruned-transducer-stateless8-2022-12-02/test_wavs/1089-134686-0001.wav \ + ./icefall-asr-librispeech-pruned-transducer-stateless8-2022-12-02/test_wavs/1221-135766-0001.wav \ + ./icefall-asr-librispeech-pruned-transducer-stateless8-2022-12-02/test_wavs/1221-135766-0002.wav + +The output of the above command is given below: + +.. literalinclude:: ./log/transducer-fast-beam-search.txt + :caption: Output of fast beam search (without LG) + +fast beam search (with LG) +-------------------------- + +.. code-block:: bash + + cd /path/to/sherpa + + python3 ./sherpa/bin/offline_transducer_asr.py \ + --nn-model ./icefall-asr-librispeech-pruned-transducer-stateless8-2022-12-02/exp/cpu_jit-torch-1.10.pt \ + --tokens ./icefall-asr-librispeech-pruned-transducer-stateless8-2022-12-02/data/lang_bpe_500/tokens.txt \ + --decoding-method fast_beam_search \ + --max-contexts 8 \ + --max-states 64 \ + --allow-partial true \ + --beam 4 \ + --LG ./icefall-asr-librispeech-pruned-transducer-stateless8-2022-12-02/data/lang_bpe_500/LG.pt \ + --ngram-lm-scale 0.01 \ + --use-gpu false \ + ./icefall-asr-librispeech-pruned-transducer-stateless8-2022-12-02/test_wavs/1089-134686-0001.wav \ + ./icefall-asr-librispeech-pruned-transducer-stateless8-2022-12-02/test_wavs/1221-135766-0001.wav \ + ./icefall-asr-librispeech-pruned-transducer-stateless8-2022-12-02/test_wavs/1221-135766-0002.wav + +The output of the above command is given below: + +.. literalinclude:: ./log/transducer-fast-beam-search-with-LG.txt + :caption: Output of fast beam search (with LG) diff --git a/docs/source/python/streaming_asr/conformer/conformer_rnnt_for_Chinese/client.rst b/docs/source/python/streaming_asr/conformer/conformer_rnnt_for_Chinese/client.rst index 10b7abb1b..549efebde 100644 --- a/docs/source/python/streaming_asr/conformer/conformer_rnnt_for_Chinese/client.rst +++ b/docs/source/python/streaming_asr/conformer/conformer_rnnt_for_Chinese/client.rst @@ -14,7 +14,7 @@ We provide a web client for this purpose. Also, we have hard coded the server port to 6006. Please either pass ``--port 6006`` when starting the server or change the client - ``_ + ``_ to use whaterver the port the server is using. Usage diff --git a/docs/source/python/streaming_asr/conformer/conformer_rnnt_for_Chinese/server.rst b/docs/source/python/streaming_asr/conformer/conformer_rnnt_for_Chinese/server.rst index b2fd7eb0e..311e03b55 100644 --- a/docs/source/python/streaming_asr/conformer/conformer_rnnt_for_Chinese/server.rst +++ b/docs/source/python/streaming_asr/conformer/conformer_rnnt_for_Chinese/server.rst @@ -52,6 +52,7 @@ The following shows you how to start the server with the above pretrained model. 
git clone https://huggingface.co/luomingshuang/icefall_asr_wenetspeech_pruned_transducer_stateless5_streaming ./sherpa/bin/streaming_pruned_transducer_statelessX/streaming_server.py \ + --endpoint.rule3.min-utterance-length 1000.0 \ --port 6006 \ --max-batch-size 50 \ --max-wait-ms 5 \ diff --git a/docs/source/python/streaming_asr/conformer/conformer_rnnt_for_English/client.rst b/docs/source/python/streaming_asr/conformer/conformer_rnnt_for_English/client.rst index b22315c8f..126a266f9 100644 --- a/docs/source/python/streaming_asr/conformer/conformer_rnnt_for_English/client.rst +++ b/docs/source/python/streaming_asr/conformer/conformer_rnnt_for_English/client.rst @@ -14,7 +14,7 @@ We provide a web client for this purpose. Also, we have hard coded the server port to 6006. Please either pass ``--port 6006`` when starting the server or change the client - ``_ + ``_ to use whaterver the port the server is using. Usage diff --git a/docs/source/python/streaming_asr/conformer/conformer_rnnt_for_English/server.rst b/docs/source/python/streaming_asr/conformer/conformer_rnnt_for_English/server.rst index db162bc81..46fe04f7e 100644 --- a/docs/source/python/streaming_asr/conformer/conformer_rnnt_for_English/server.rst +++ b/docs/source/python/streaming_asr/conformer/conformer_rnnt_for_English/server.rst @@ -52,6 +52,7 @@ The following shows you how to start the server with the above pretrained model. git clone https://huggingface.co/pkufool/icefall_librispeech_streaming_pruned_transducer_stateless4_20220625 ./sherpa/bin/streaming_pruned_transducer_statelessX/streaming_server.py \ + --endpoint.rule3.min-utterance-length 1000.0 \ --port 6006 \ --max-batch-size 50 \ --max-wait-ms 5 \ diff --git a/docs/source/python/streaming_asr/conv_emformer/client.rst b/docs/source/python/streaming_asr/conv_emformer/client.rst index a96bd5963..0fffe9908 100644 --- a/docs/source/python/streaming_asr/conv_emformer/client.rst +++ b/docs/source/python/streaming_asr/conv_emformer/client.rst @@ -14,7 +14,7 @@ We provide a web client for this purpose. Also, we have hard coded the server port to 6006. Please either pass ``--port 6006`` when starting the server or change the client - ``_ + ``_ to use whaterver the port the server is using. Usage diff --git a/docs/source/python/streaming_asr/conv_emformer/server.rst b/docs/source/python/streaming_asr/conv_emformer/server.rst index a509fbf08..1fd39ee0f 100644 --- a/docs/source/python/streaming_asr/conv_emformer/server.rst +++ b/docs/source/python/streaming_asr/conv_emformer/server.rst @@ -52,6 +52,7 @@ The following shows you how to start the server with the above pretrained model. git clone https://huggingface.co/Zengwei/icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05 ./sherpa/bin/conv_emformer_transducer_stateless2/streaming_server.py \ + --endpoint.rule3.min-utterance-length 1000.0 \ --port 6007 \ --max-batch-size 50 \ --max-wait-ms 5 \ diff --git a/docs/source/python/streaming_asr/emformer/client.rst b/docs/source/python/streaming_asr/emformer/client.rst index 150899ae0..0a9c0fc02 100644 --- a/docs/source/python/streaming_asr/emformer/client.rst +++ b/docs/source/python/streaming_asr/emformer/client.rst @@ -14,7 +14,7 @@ We provide a web client for this purpose. Also, we have hard coded the server port to 6006. Please either pass ``--port 6006`` when starting the server or change the client - ``_ + ``_ to use whaterver the port the server is using. 
Usage diff --git a/docs/source/python/streaming_asr/emformer/server.rst b/docs/source/python/streaming_asr/emformer/server.rst index d77d16f54..36b9a1173 100644 --- a/docs/source/python/streaming_asr/emformer/server.rst +++ b/docs/source/python/streaming_asr/emformer/server.rst @@ -52,6 +52,7 @@ The following shows you how to start the server with the above pretrained model. git clone https://huggingface.co/csukuangfj/icefall-asr-librispeech-pruned-stateless-emformer-rnnt2-2022-06-01 ./sherpa/bin/pruned_stateless_emformer_rnnt2/streaming_server.py \ + --endpoint.rule3.min-utterance-length 1000.0 \ --port 6007 \ --max-batch-size 50 \ --max-wait-ms 5 \ diff --git a/docs/source/python/streaming_asr/endpointing.rst b/docs/source/python/streaming_asr/endpointing.rst index 58d5200c8..f0b13691d 100644 --- a/docs/source/python/streaming_asr/endpointing.rst +++ b/docs/source/python/streaming_asr/endpointing.rst @@ -80,3 +80,38 @@ we say that an endpoint is detected. --endpoint.rule3.must-contain-nonsilence=false --endpoint.rule3.min-trailing-silence=0.0 --endpoint.rule3.min-utterance-length=20.0 + +Endpointing Demo (English) +-------------------------- + +The following video shows an endpointing demo using a pretrained model +on the `LibriSpeech`_ dataset. You can find the usage in +:ref:`lstm_server_english`. + +.. youtube:: 4XsTXt_9_SY + :width: 120% + +Endpointing Demo (Chinese) +-------------------------- + +The following two videos show endpointing demos using a pretrained model +on the `WenetSpeech`_ dataset. You can find the usage in +:ref:`lstm_server_chinese`. + +Short Demo +^^^^^^^^^^ + +.. youtube:: sRQPGMZFun4 + :width: 120% + +Long Demo +^^^^^^^^^ + +.. youtube:: LJtPJmX5jpE + :width: 120% + +Endpointing Demo (Arabic) +------------------------- + +.. youtube:: t2SlrzgMd_k + :width: 120% diff --git a/docs/source/python/streaming_asr/images/secure-connections/1.png b/docs/source/python/streaming_asr/images/secure-connections/1.png new file mode 100644 index 000000000..f0b8730f1 Binary files /dev/null and b/docs/source/python/streaming_asr/images/secure-connections/1.png differ diff --git a/docs/source/python/streaming_asr/images/secure-connections/2.png b/docs/source/python/streaming_asr/images/secure-connections/2.png new file mode 100644 index 000000000..ac968e7fb Binary files /dev/null and b/docs/source/python/streaming_asr/images/secure-connections/2.png differ diff --git a/docs/source/python/streaming_asr/images/secure-connections/3.png b/docs/source/python/streaming_asr/images/secure-connections/3.png new file mode 100644 index 000000000..7c4a9fc9a Binary files /dev/null and b/docs/source/python/streaming_asr/images/secure-connections/3.png differ diff --git a/docs/source/python/streaming_asr/index.rst b/docs/source/python/streaming_asr/index.rst index 6f9612aa0..daace7716 100644 --- a/docs/source/python/streaming_asr/index.rst +++ b/docs/source/python/streaming_asr/index.rst @@ -3,14 +3,26 @@ Streaming ASR This page describes how to use `sherpa`_ for streaming ASR. -Currently, implemented streaming ASR models include: -`Emformer`_ `transducer`_ (i.e., Emformer-T) -and `ConvEmformer`_ `transducer`_ (i.e., ConvEmformer-T). +The following types of transducer models are supported: + + - `Emformer`_ + - `ConvEmformer`_ + - ``LSTM`` + - `Zipformer`_ + + +We support standalone speech recognition as well as server/client based +speech recognition using `WebSocket`_. + .. 
toctree:: :maxdepth: 3 + standalone/index + endpointing + secure-connections emformer/index conv_emformer/index conformer/index + lstm/index diff --git a/docs/source/python/streaming_asr/lstm/chinese/client.rst b/docs/source/python/streaming_asr/lstm/chinese/client.rst new file mode 100644 index 000000000..af758cb05 --- /dev/null +++ b/docs/source/python/streaming_asr/lstm/chinese/client.rst @@ -0,0 +1,55 @@ +.. _lstm_client_chinese: + +Client +====== + +With the client you can record your voice in real-time, send it to the +server, and get the recognition results back from the server. + +We provide a web client for this purpose. + +.. caution:: + + Please first start the :ref:`lstm_server_chinese` before you start the client. + + Also, we have hard coded the server port to 6006. Please either pass + ``--port 6006`` when starting the server or change the client + ``_ + to use whaterver the port the server is using. + +Usage +----- + +.. code-block:: bash + + cd /path/to/sherpa + cd ./sherpa/bin/web + python3 -m http.server 6008 + +Then open your browser, and visit ``_. + +You will see a UI like the following screenshot. Click the ``Record`` button +and speak! You should see the recognition results from the server. + + +.. image:: /_static/emformer-streaming-asr-web-client.png + :alt: Screen shot of the web client user interface + +.. note:: + + If you are unable to click the ``Record`` button, please make sure + the server port is 6006. + +.. caution:: + + You have to visit ``_, not + ``_. Otherwise, you will not be able + to use the microphone in the browser. One way to avoid this is to use ``https``, + but that needs a certificate. + +.. hint:: + + If you are using Chrome, you can right click the page, and then click + ``inspect`` in the popup menu, and then click ``console``. You will see + some diagnostic message. This helps you to debug if you are unable to click + the ``Record`` button. diff --git a/docs/source/python/streaming_asr/lstm/chinese/server.rst b/docs/source/python/streaming_asr/lstm/chinese/server.rst new file mode 100644 index 000000000..c5ac4ef8a --- /dev/null +++ b/docs/source/python/streaming_asr/lstm/chinese/server.rst @@ -0,0 +1,67 @@ +.. _lstm_server_chinese: + +Server +====== + +.. hint:: + + Please first refer to :ref:`installation` to install `sherpa`_ + before proceeding. + +The server is responsible for accepting audio samples from the client, +decoding it, and sending the recognition results back to the client. + + +Usage +----- + +.. code-block:: + + cd /path/to/sherpa + ./sherpa/bin/lstm_transducer_stateless/streaming_server.py --help + +shows the usage message. + +You need the following files to start the server: + + 1. The neural network model + 2. The ``tokens.txt``. + +The neural network model has three parts, the encoder, the decoder, and +the joiner, which are all exported using ``torch.jit.trace``. + +The above files can be obtained after training your model +with ``_. + +If you don't want to train a model by yourself, you can try the +pretrained model: ``_ + +The following shows you how to start the server with the above pretrained model. + +.. 
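hint::

   The encoder, decoder, and joiner mentioned above are ``torch.jit.trace``
   exports. After cloning the repository in the command below, you can
   optionally check that the three files load correctly. This is just a
   sanity-check sketch; the file names are the ones used in that command:

   .. code-block:: python

      import torch

      d = "./icefall-asr-wenetspeech-lstm-transducer-stateless-2022-09-19/exp"
      for name in [
          "encoder_jit_trace-iter-420000-avg-10.pt",
          "decoder_jit_trace-iter-420000-avg-10.pt",
          "joiner_jit_trace-iter-420000-avg-10.pt",
      ]:
          # torch.jit.load can load the traced model on CPU
          torch.jit.load(f"{d}/{name}", map_location="cpu")
          print(name, "loaded OK")

.. 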
code-block:: bash + + cd /path/to/sherpa + + git lfs install + git clone https://huggingface.co/csukuangfj/icefall-asr-wenetspeech-lstm-transducer-stateless-2022-09-19 + + ./sherpa/bin/lstm_transducer_stateless/streaming_server.py \ + --endpoint.rule3.min-utterance-length 1000.0 \ + --port 6007 \ + --max-batch-size 50 \ + --max-wait-ms 5 \ + --nn-pool-size 1 \ + --nn-encoder-filename ./icefall-asr-wenetspeech-lstm-transducer-stateless-2022-09-19/exp/encoder_jit_trace-iter-420000-avg-10.pt \ + --nn-decoder-filename ./icefall-asr-wenetspeech-lstm-transducer-stateless-2022-09-19/exp/decoder_jit_trace-iter-420000-avg-10.pt \ + --nn-joiner-filename ./icefall-asr-wenetspeech-lstm-transducer-stateless-2022-09-19/exp/joiner_jit_trace-iter-420000-avg-10.pt \ + --token-filename ./icefall-asr-wenetspeech-lstm-transducer-stateless-2022-09-19/data/lang_char/tokens.txt + +That's it! + +Now you can start the :ref:`lstm_client_chinese`, record your voice in real-time, +and check the recognition results from the server. + +.. warning:: + + The above pretrained model has been trained only for 6 epochs. We will + update it in the following days. diff --git a/docs/source/python/streaming_asr/lstm/english/client.rst b/docs/source/python/streaming_asr/lstm/english/client.rst new file mode 100644 index 000000000..50cb890d3 --- /dev/null +++ b/docs/source/python/streaming_asr/lstm/english/client.rst @@ -0,0 +1,55 @@ +.. _lstm_client_english: + +Client +====== + +With the client you can record your voice in real-time, send it to the +server, and get the recognition results back from the server. + +We provide a web client for this purpose. + +.. caution:: + + Please first start the :ref:`lstm_server_english` before you start the client. + + Also, we have hard coded the server port to 6006. Please either pass + ``--port 6006`` when starting the server or change the client + ``_ + to use whaterver the port the server is using. + +Usage +----- + +.. code-block:: bash + + cd /path/to/sherpa + cd ./sherpa/bin/web + python3 -m http.server 6008 + +Then open your browser, and visit ``_. + +You will see a UI like the following screenshot. Click the ``Record`` button +and speak! You should see the recognition results from the server. + + +.. image:: /_static/emformer-streaming-asr-web-client.png + :alt: Screen shot of the web client user interface + +.. note:: + + If you are unable to click the ``Record`` button, please make sure + the server port is 6006. + +.. caution:: + + You have to visit ``_, not + ``_. Otherwise, you will not be able + to use the microphone in the browser. One way to avoid this is to use ``https``, + but that needs a certificate. + +.. hint:: + + If you are using Chrome, you can right click the page, and then click + ``inspect`` in the popup menu, and then click ``console``. You will see + some diagnostic message. This helps you to debug if you are unable to click + the ``Record`` button. diff --git a/docs/source/python/streaming_asr/lstm/english/server.rst b/docs/source/python/streaming_asr/lstm/english/server.rst new file mode 100644 index 000000000..fb309b0c8 --- /dev/null +++ b/docs/source/python/streaming_asr/lstm/english/server.rst @@ -0,0 +1,102 @@ +.. _lstm_server_english: + +Server +====== + +.. hint:: + + Please first refer to :ref:`installation` to install `sherpa`_ + before proceeding. + +The server is responsible for accepting audio samples from the client, +decoding it, and sending the recognition results back to the client. + + +Usage +----- + +.. 
code-block:: + + cd /path/to/sherpa + ./sherpa/bin/lstm_transducer_stateless/streaming_server.py --help + +shows the usage message. + +You need the following files to start the server: + + 1. The neural network model + 2. The BPE model ``bpe.model``. + +The neural network model has three parts, the encoder, the decoder, and +the joiner, which are all exported using ``torch.jit.trace``. + +The above two files can be obtained after training your model +with ``_. + +If you don't want to train a model by yourself, you can try the +pretrained model: ``_ + + +.. hint:: + + You can find pretrained models in ``RESULTS.md`` for all the recipes in + `icefall `_. + + For instance, the pretrained models for the LibriSpeech dataset can be + found at ``_. + +The following shows you how to start the server with the above pretrained model. + +.. code-block:: bash + + cd /path/to/sherpa + + git lfs install + git clone https://huggingface.co/Zengwei/icefall-asr-librispeech-lstm-transducer-stateless-2022-08-18 + + ./sherpa/bin/lstm_transducer_stateless/streaming_server.py \ + --endpoint.rule3.min-utterance-length 1000.0 \ + --port 6007 \ + --max-batch-size 50 \ + --max-wait-ms 5 \ + --nn-pool-size 1 \ + --nn-encoder-filename ./icefall-asr-librispeech-lstm-transducer-stateless-2022-08-18/exp/encoder_jit_trace.pt \ + --nn-decoder-filename ./icefall-asr-librispeech-lstm-transducer-stateless-2022-08-18/exp/decoder_jit_trace.pt \ + --nn-joiner-filename ./icefall-asr-librispeech-lstm-transducer-stateless-2022-08-18/exp/joiner_jit_trace.pt \ + --bpe-model-filename ./icefall-asr-librispeech-lstm-transducer-stateless-2022-08-18/data/lang_bpe_500/bpe.model + +That's it! + +Now you can start the :ref:`lstm_client_english`, record your voice in real-time, +and check the recognition results from the server. + +.. hint:: + + You can also try the following pretrained model trained using `GigaSpeech`_ + and `LibriSpeech`_ and has a lower WER than the above one: + + .. 
code-block:: bash + + git clone https://huggingface.co/csukuangfj/icefall-asr-librispeech-lstm-transducer-stateless2-2022-09-03 + + nn_encoder_filename=./icefall-asr-librispeech-lstm-transducer-stateless2-2022-09-03/exp/encoder_jit_trace-iter-468000-avg-16.pt + nn_decoder_filename=./icefall-asr-librispeech-lstm-transducer-stateless2-2022-09-03/exp/decoder_jit_trace-iter-468000-avg-16.pt + nn_joiner_filename=./icefall-asr-librispeech-lstm-transducer-stateless2-2022-09-03/exp/joiner_jit_trace-iter-468000-avg-16.pt + + bpe_model_filename=./icefall-asr-librispeech-lstm-transducer-stateless2-2022-09-03/data/lang_bpe_500/bpe.model + + ./sherpa/bin/lstm_transducer_stateless/streaming_server.py \ + --endpoint.rule1.must-contain-nonsilence=false \ + --endpoint.rule1.min-trailing-silence=5.0 \ + --endpoint.rule2.min-trailing-silence=2.0 \ + --endpoint.rule3.min-utterance-length=50.0 \ + --port 6006 \ + --decoding-method greedy_search \ + --max-batch-size 50 \ + --max-wait-ms 5 \ + --nn-pool-size 1 \ + --max-active-connections 10 \ + --nn-encoder-filename $nn_encoder_filename \ + --nn-decoder-filename $nn_decoder_filename \ + --nn-joiner-filename $nn_joiner_filename \ + --bpe-model-filename $bpe_model_filename
diff --git a/docs/source/python/streaming_asr/lstm/index.rst b/docs/source/python/streaming_asr/lstm/index.rst new file mode 100644 index 000000000..172cc50a5 --- /dev/null +++ b/docs/source/python/streaming_asr/lstm/index.rst @@ -0,0 +1,49 @@ +LSTM transducer based streaming ASR +=========================================== + +This page describes how to use `sherpa`_ for streaming +ASR with ``LSTM`` transducer models +trained with `pruned stateless transducer `_. + +.. hint:: + + To be specific, the pre-trained model for English is trained on the `LibriSpeech`_ + dataset using the code from + ``_. + + The pre-trained model for English can be downloaded from + ``_ + + The pre-trained model for Chinese is trained on the `WenetSpeech`_ + dataset. It can be downloaded from + ``_ + +Apart from the ``LSTM`` encoder, there are no **recurrent** modules in the transducer model: + + - The encoder network (i.e., the transcription network) is an LSTM model + - The decoder network (i.e., the prediction network) is a + `stateless network `_, + consisting of an ``nn.Embedding()`` and a ``nn.Conv1d()``. + - The joiner network (i.e., the joint network) contains an adder, + a ``tanh`` activation, and a ``nn.Linear()``. + + +We provide examples for the following two languages: + +English +------- + +.. toctree:: + :maxdepth: 2 + + english/server + english/client + +Chinese +------- + +.. toctree:: + :maxdepth: 2 + + chinese/server + chinese/client
diff --git a/docs/source/python/streaming_asr/secure-connections.rst b/docs/source/python/streaming_asr/secure-connections.rst new file mode 100644 index 000000000..81b959231 --- /dev/null +++ b/docs/source/python/streaming_asr/secure-connections.rst @@ -0,0 +1,123 @@ +Secure connections +================== + +In this section, we describe how to use ``https`` and +secure websockets with ``sherpa``. + +.. hint:: + + If you don't use ``https``, you have to use ``http://localhost:port``. + Otherwise, you won't be able to use the microphone within your browser. + + +Generate a certificate +---------------------- + +First, you need to have an `X.509 certificate `_. +If you don't have one, we provide the following command to generate a +``self-signed`` certificate for you: + +.. 
code-block:: + + cd sherpa/bin/web + ./generate-certificate.py + +It will generate a file ``cert.pem``, which is the self-signed certificate. + +.. caution:: + + You have to make your browser trust the self-signed certificate later + for both the https server and the websocket server. The reason to trust + both servers is that they are running on different ports. + +Start the https server +---------------------- + + +.. code-block:: bash + + cd sherpa/bin/web + ./start-https-server.py \ + --server-address 0.0.0.0 \ + --server-port 6007 \ + --certificate cert.pem + +``0.0.0.0`` means the https server will listen on all IP addresses of the +current machine. + +If you are using Firefox, you can visit ``_, which +will show you the following page: + +.. figure:: ./images/secure-connections/1.png + :alt: Your-connection-is-not-secure + :align: center + :figwidth: 600px + + Click the ``Advanced`` button. + +.. hint:: + + You get the above message because you are using a self-signed certificate. + Also, you can use one of the public IP addresses of your machine to + replace ``0.0.0.0`` in ``_. + +After clicking the button ``Advanced``, you will see the following page: + +.. figure:: ./images/secure-connections/2.png + :alt: After-clicking-the-advanced-button + :align: center + :figwidth: 600px + + After clicking the ``Advanced`` button. + +Now click ``Add exception`` and then click "Confirm security exception" below: + +.. figure:: ./images/secure-connections/3.png + :alt: Click-confirm-security-exception + :align: center + :figwidth: 600px + + Click ``Confirm security exception``. + After clicking the advanced button + +At this point, your browser should trust your self-signed certificate +for the host ``0.0.0.0:6007``. + +One thing left is that you should also make your browser trust the +certificate of the websocket server. Otherwise, you won't be able +to make a connection to the websocket server. + +Assume your websocket server runs on the same machine as your https +server but uses the port ``6006``. You can start a https server on +port ``6006`` and repeat the above steps to make your browser +trust the certificate for the host ``0.0.0.0:6006`` and then kill +the https server running on port ``6006``. + +Start the websocket server +-------------------------- + +.. note:: + + We use ``sherpa/bin/conv_emformer_transducer_stateless2/streaming_server.py`` + as a demo below. The steps should be similar for starting other + streaming servers. + +.. code-block:: bash + + cd /path/to/sherpa + + git lfs install + git clone https://huggingface.co/Zengwei/icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05 + + ./sherpa/bin/conv_emformer_transducer_stateless2/streaming_server.py \ + --endpoint.rule3.min-utterance-length 1000.0 \ + --port 6006 \ + --max-batch-size 50 \ + --max-wait-ms 5 \ + --nn-pool-size 1 \ + --nn-model-filename ./icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/cpu-jit-epoch-30-avg-10-torch-1.10.0.pt \ + --bpe-model-filename ./icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/data/lang_bpe_500/bpe.model \ + --certificate ./sherpa/bin/web/cert.pem + +Now visit ``_ and you should be able to make a secure +connection to the websocket server ``wss://0.0.0.0:6006``. 
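If you want to check the secure websocket connection from the command line
instead of from a browser, the following is a minimal sketch that uses the
third-party ``websockets`` package (``pip install websockets``). It only
verifies that the TLS handshake against the self-signed ``cert.pem``
succeeds; it does not implement the audio protocol spoken by the streaming
server. Adjust the host, port, and certificate path to match your setup.

.. code-block:: python

   import asyncio
   import ssl

   import websockets


   async def main():
       # Trust the self-signed certificate generated by ./generate-certificate.py
       # (run this script from /path/to/sherpa)
       ssl_ctx = ssl.create_default_context(cafile="./sherpa/bin/web/cert.pem")
       ssl_ctx.check_hostname = False  # the certificate is not tied to a hostname

       async with websockets.connect("wss://localhost:6006", ssl=ssl_ctx):
           print("Secure websocket connection established")


   asyncio.run(main())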
diff --git a/docs/source/python/streaming_asr/standalone/index.rst b/docs/source/python/streaming_asr/standalone/index.rst new file mode 100644 index 000000000..2a2437db9 --- /dev/null +++ b/docs/source/python/streaming_asr/standalone/index.rst @@ -0,0 +1,12 @@ +Standalone +========== + +In this section, we describe how to decode files with a standalone executable. +You don't need to start a server and a client for speech recognition. + + +.. toctree:: + :maxdepth: 2 + + transducer + diff --git a/docs/source/python/streaming_asr/standalone/transducer.rst b/docs/source/python/streaming_asr/standalone/transducer.rst new file mode 100644 index 000000000..ead2575b8 --- /dev/null +++ b/docs/source/python/streaming_asr/standalone/transducer.rst @@ -0,0 +1,106 @@ +Transducer +========== + + +In this section, we describe how to use pre-trained `transducer`_ +models for online (i.e., streaming) speech recognition. + +.. hint:: + + Please refer to :ref:`online_transducer_pretrained_models` for a list of + available pre-trained `transducer`_ models to download. + +In the following, we use the pre-trained model +:ref:`icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29` +to demonstrate how to decode sound files. + +.. caution:: + + Make sure you have installed `sherpa`_ before you continue. + + Please refer to :ref:`install_sherpa_from_source` to install `sherpa`_ + from source. + +Download the pre-trained model +------------------------------ + +Please refer to :ref:`icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29` +for detailed instructions. + +For ease of reference, we duplicate the download commands below: + +.. code-block:: bash + + # This model is trained using LibriSpeech with streaming zipformer transducer + # + # See https://github.com/k2-fsa/icefall/pull/787 + # + GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/Zengwei/icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29 + cd icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29 + + git lfs pull --include "exp/cpu_jit.pt" + git lfs pull --include "data/lang_bpe_500/LG.pt" + + +In the following, we describe different decoding methods. + +greedy search +------------- + +.. code-block:: bash + + cd /path/to/sherpa + + python3 ./sherpa/bin/online_transducer_asr.py \ + --decoding-method="greedy_search" \ + --nn-model=./icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/exp/cpu_jit.pt \ + --tokens=./icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/data/lang_bpe_500/tokens.txt \ + ./icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/test_wavs/1089-134686-0001.wav \ + ./icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/test_wavs/1221-135766-0001.wav \ + ./icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/test_wavs/1221-135766-0002.wav + +modified beam search +-------------------- + +.. 
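hint::

   Compared with ``greedy_search``, ``modified_beam_search`` keeps several
   competing hypotheses alive while decoding and returns the best one at the
   end. The sketch below is purely illustrative and is not taken from the
   sherpa code base; it only shows the pruning idea, i.e., after the active
   hypotheses are expanded at a frame, only the highest-scoring ones are kept:

   .. code-block:: python

      def prune(hyps, num_active_paths=4):
          """Keep the best num_active_paths (hypothesis, log_prob) pairs."""
          return sorted(hyps, key=lambda h: h[1], reverse=True)[:num_active_paths]

      hyps = [("HE", -0.3), ("THE", -0.9), ("A", -2.1), ("I", -2.5), ("IT", -3.0)]
      print(prune(hyps))  # the lowest-scoring hypothesis is dropped

.. 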
code-block:: bash + + cd /path/to/sherpa + + python3 ./sherpa/bin/online_transducer_asr.py \ + --decoding-method="modified_beam_search" \ + --nn-model=./icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/exp/cpu_jit.pt \ + --tokens=./icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/data/lang_bpe_500/tokens.txt \ + ./icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/test_wavs/1089-134686-0001.wav \ + ./icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/test_wavs/1221-135766-0001.wav \ + ./icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/test_wavs/1221-135766-0002.wav + +fast_beam_search +---------------- + +.. code-block:: bash + + cd /path/to/sherpa + + python3 ./sherpa/bin/online_transducer_asr.py \ + --decoding-method="fast_beam_search" \ + --nn-model=./icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/exp/cpu_jit.pt \ + --tokens=./icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/data/lang_bpe_500/tokens.txt \ + ./icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/test_wavs/1089-134686-0001.wav \ + ./icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/test_wavs/1221-135766-0001.wav \ + ./icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/test_wavs/1221-135766-0002.wav + +fast_beam_search with LG +------------------------ + +.. code-block:: bash + + cd /path/to/sherpa + + python3 ./sherpa/bin/online_transducer_asr.py \ + --decoding-method="fast_beam_search" \ + --LG=./icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/data/lang_bpe_500/LG.pt \ + --nn-model=./icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/exp/cpu_jit.pt \ + --tokens=./icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/data/lang_bpe_500/tokens.txt \ + ./icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/test_wavs/1089-134686-0001.wav \ + ./icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/test_wavs/1221-135766-0001.wav \ + ./icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/test_wavs/1221-135766-0002.wav
diff --git a/docs/source/sherpa/index.rst b/docs/source/sherpa/index.rst new file mode 100644 index 000000000..a467bcd75 --- /dev/null +++ b/docs/source/sherpa/index.rst @@ -0,0 +1,21 @@ +sherpa +====== + +.. hint:: + + During speech recognition, `k2-fsa/sherpa`_ does not need to access the Internet. + Everything is processed locally on your device. + +`k2-fsa/sherpa`_ uses `PyTorch`_ for neural network computation. + +Please refer to `<https://k2-fsa.github.io/icefall/model-export/export-with-torch-jit-script.html>`_ +for how to export models. + +In the following, we describe how to use `k2-fsa/sherpa`_. + +.. toctree:: + :maxdepth: 2 + + ./install/index.rst + ./pretrained_models/index.rst + 
diff --git a/docs/source/sherpa/install/check_your_installation.rst b/docs/source/sherpa/install/check_your_installation.rst new file mode 100644 index 000000000..b744bb518 --- /dev/null +++ b/docs/source/sherpa/install/check_your_installation.rst @@ -0,0 +1,29 @@ +.. _check_sherpa_installation: + +Check your installation +======================= + +To check that you have installed ``k2-fsa/sherpa`` successfully, please run: + +.. 
code-block:: bash + + python3 -c "import sherpa; print(sherpa.__file__); print(sherpa.__version__)" + + sherpa-online --help + sherpa-offline --help + + sherpa-online-microphone --help + sherpa-offline-microphone --help + + sherpa-online-websocket-server --help + sherpa-online-websocket-client --help + sherpa-online-websocket-client-microphone --help + + sherpa-offline-websocket-server --help + sherpa-offline-websocket-client --help + + +Congratulations! You have installed `k2-fsa/sherpa`_ successfully. Please +refer to :ref:`k2_fsa_sherpa_pretrained_models` to download pre-trained models. + +Have fun with `k2-fsa/sherpa`_! diff --git a/docs/source/sherpa/install/from_source.rst b/docs/source/sherpa/install/from_source.rst new file mode 100644 index 000000000..018a01625 --- /dev/null +++ b/docs/source/sherpa/install/from_source.rst @@ -0,0 +1,114 @@ +.. _install_sherpa_from_source: + +From source +=========== + +This section describe how to install ``k2-fsa/sherpa`` from source. + + +Install dependencies +-------------------- + +Before installing ``k2-fsa/sherpa`` from source, we have to install the following +dependencies. + + - `PyTorch`_ + - `k2`_ + - `kaldifeat`_ + +.. tabs:: + + .. tab:: CPU + + Suppose that we select ``torch==2.0.1``. We can use the following + commands to install the dependencies: + + .. tabs:: + + .. tab:: Linux + + .. code-block:: bash + + pip install torch==2.0.1+cpu -f https://download.pytorch.org/whl/torch_stable.html + pip install k2==1.24.4.dev20231220+cpu.torch2.0.1 -f https://k2-fsa.github.io/k2/cpu.html + pip install kaldifeat==1.25.3.dev20231221+cpu.torch2.0.1 -f https://csukuangfj.github.io/kaldifeat/cpu.html + + .. tab:: macOS + + .. code-block:: bash + + pip install torch==2.0.1+cpu -f https://download.pytorch.org/whl/torch_stable.html + pip install k2==1.24.4.dev20231220+cpu.torch2.0.1 -f https://k2-fsa.github.io/k2/cpu.html + pip install kaldifeat==1.25.3.dev20231221+cpu.torch2.0.1 -f https://csukuangfj.github.io/kaldifeat/cpu.html + + .. tab:: Windows + + To be done. + + .. tab:: CUDA + + Suppose that we select ``torch==2.0.1+cu117``. We can use the following + commands to install the dependencies: + + .. code-block:: bash + + pip install torch==2.0.1+cu117 -f https://download.pytorch.org/whl/torch_stable.html + pip install k2==1.24.4.dev20231220+cuda11.7.torch2.0.1 -f https://k2-fsa.github.io/k2/cuda.html + pip install kaldifeat==1.25.3.dev20231221+cuda11.7.torch2.0.1 -f https://csukuangfj.github.io/kaldifeat/cuda.html + + Next, please follow ``_ to install CUDA toolkit. + +Now we can start to build ``k2-fsa/sherpa`` from source. + +For general users +----------------- + +You can use the following commands to install `k2-fsa/sherpa`_: + +.. code-block:: bash + + # Please make sure you have installed PyTorch, k2, and kaldifeat + # before you continue + # + git clone http://github.com/k2-fsa/sherpa + cd sherpa + python3 -m pip install --verbose . + +To uninstall `k2-fsa/sherpa`_, please use + +.. code-block:: bash + + # Please run it outside of the k2-fsa/sherpa repo + # + pip uninstall k2-sherpa + +Please see :ref:`check_sherpa_installation`. + +For developers and advanced users +--------------------------------- + +You can also use the following commands to install `k2-fsa/sherpa`_. + +The advantage is that you can have several versions of `k2-fsa/sherpa`_ +in a single environment. + +.. 
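hint::

   Once you have set ``PATH`` and ``PYTHONPATH`` as shown in the commands
   below, you can confirm which copy of `k2-fsa/sherpa`_ Python actually
   picks up by printing its location and version:

   .. code-block:: python

      import sherpa

      print(sherpa.__file__)     # which installation is active
      print(sherpa.__version__)  # its version

.. 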
code-block:: bash + + git clone http://github.com/k2-fsa/sherpa + cd sherpa + mkdir build + cd build + + # For torch >= 2.0, please use + # + # cmake -DCMAKE_CXX_STANDARD=17 .. + # + + cmake .. + make -j + + export PATH=$PWD/bin:$PATH + export PYTHONPATH=$PWD/lib:$PWD/../sherpa/python:$PYTHONPATH + +Please see :ref:`check_sherpa_installation`. + diff --git a/docs/source/sherpa/install/from_wheels.rst b/docs/source/sherpa/install/from_wheels.rst new file mode 100644 index 000000000..a89373096 --- /dev/null +++ b/docs/source/sherpa/install/from_wheels.rst @@ -0,0 +1,93 @@ +.. _install_sherpa_from_pre_compiled_wheels: + +From pre-compiled wheels +======================== + +.. note:: + + This method supports only Linux and macOS for now. If you want to + use Windows, please refer to :ref:`install_sherpa_from_source`. + +You can find a list of pre-compiled wheels at the following URLs: + + - CPU: ``_ + - CUDA: ``_ + +In the following, we demonstrate how to install ``k2-fsa/sherpa`` from +pre-compiled wheels. + +Linux (CPU) +----------- + +Suppose that we want to install the following wheel + +.. code-block:: bash + + https://huggingface.co/csukuangfj/kaldifeat/resolve/main/ubuntu-cpu/k2_sherpa-1.3.dev20230725+cpu.torch2.0.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl + +we can use the following methods: + +.. code-block:: bash + + # Before installing k2-fsa/sherpa, we have to install the following dependencies: + # torch, k2, and kaldifeat + + pip install torch==2.0.1+cpu -f https://download.pytorch.org/whl/torch_stable.html + pip install k2==1.24.4.dev20231220+cpu.torch2.0.1 -f https://k2-fsa.github.io/k2/cpu.html + pip install kaldifeat==1.25.3.dev20231221+cpu.torch2.0.1 -f https://csukuangfj.github.io/kaldifeat/cpu.html + + # Now we can install k2-fsa/sherpa + pip install k2_sherpa==1.3.dev20230725+cpu.torch2.0.1 -f https://k2-fsa.github.io/sherpa/cpu.html + +Please see :ref:`check_sherpa_installation`. + +macOS (CPU) +----------- + +Suppose that we want to install the following wheel + +.. code-block:: bash + + https://huggingface.co/csukuangfj/kaldifeat/resolve/main/macos/k2_sherpa-1.3.dev20230725+cpu.torch2.0.1-cp311-cp311-macosx_10_9_x86_64.whl + +we can use the following methods: + +.. code-block:: bash + + # Before installing k2-fsa/sherpa, we have to install the following dependencies: + # torch, k2, and kaldifeat + + pip install torch==2.0.1+cpu -f https://download.pytorch.org/whl/torch_stable.html + pip install k2==1.24.4.dev20231220+cpu.torch2.0.1 -f https://k2-fsa.github.io/k2/cpu.html + pip install kaldifeat==1.25.3.dev20231221+cpu.torch2.0.1 -f https://csukuangfj.github.io/kaldifeat/cpu.html + + # Now we can install k2-fsa/sherpa + pip install k2_sherpa==1.3.dev20230725+cpu.torch2.0.1 -f https://k2-fsa.github.io/sherpa/cpu.html + +Please see :ref:`check_sherpa_installation`. + +Linux (CUDA) +------------ + +Suppose that we want to install the following wheel + +.. code-block:: bash + + https://huggingface.co/csukuangfj/kaldifeat/resolve/main/ubuntu-cuda/k2_sherpa-1.3.dev20230725+cuda11.7.torch2.0.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl + +we can use the following methods: + +.. 
code-block:: bash + + # Before installing k2-fsa/sherpa, we have to install the following dependencies: + # torch, k2, and kaldifeat + + pip install torch==2.0.1+cu117 -f https://download.pytorch.org/whl/torch_stable.html + pip install k2==1.24.4.dev20231220+cuda11.7.torch2.0.1 -f https://k2-fsa.github.io/k2/cuda.html + pip install kaldifeat==1.25.3.dev20231221+cuda11.7.torch2.0.1 -f https://csukuangfj.github.io/kaldifeat/cuda.html + + + # Now we can install k2-fsa/sherpa + pip install k2_sherpa==1.3.dev20230725+cuda11.7.torch2.0.1 -f https://k2-fsa.github.io/sherpa/cuda.html + +Please see :ref:`check_sherpa_installation`. diff --git a/docs/source/sherpa/install/index.rst b/docs/source/sherpa/install/index.rst new file mode 100644 index 000000000..83013711a --- /dev/null +++ b/docs/source/sherpa/install/index.rst @@ -0,0 +1,22 @@ +.. _sherpa_installation: + +Installation +============ + +.. toctree:: + :maxdepth: 3 + + ./from_wheels.rst + ./from_source.rst + ./check_your_installation.rst + +We suggest that you install ``k2-fsa/sherpa`` by following +:ref:`install_sherpa_from_pre_compiled_wheels`. + +Where to get help +----------------- + +If you have any issues about the installation, please create an issue +at the following address: + + ``_ diff --git a/docs/source/sherpa/pretrained_models/index.rst b/docs/source/sherpa/pretrained_models/index.rst new file mode 100644 index 000000000..0e2643138 --- /dev/null +++ b/docs/source/sherpa/pretrained_models/index.rst @@ -0,0 +1,60 @@ +.. _k2_fsa_sherpa_pretrained_models: + +Pre-trained models +================== + +Two kinds of end-to-end (E2E) models are supported by `k2-fsa/sherpa`_: + +- CTC +- Transducer + +.. hint:: + + For transducer-based models, we only support stateless transducers. + To the best of our knowledge, only `icefall`_ supports that. In other words, + only transducer models from `icefall`_ are currently supported. + + For CTC-based models, we support any type of models trained using CTC loss + as long as you can export the model via torchscript. Models from the following + frameworks are currently supported: `icefall`_, `WeNet`_, and `torchaudio`_ (Wav2Vec 2.0). + If you have a CTC model and want it to be supported in `k2-fsa/sherpa`_, please + create an issue at ``_. + +.. hint:: + + You can try the pre-trained models in your browser without installing + anything. See ``_. + + +This page lists all available pre-trained models that you can download. + +.. hint:: + + We provide pre-trained models for the following languages: + + - Arabic + - Chinese + - English + - German + - Tibetan + + +.. hint:: + + We provide a colab notebook + |Sherpa offline recognition python api colab notebook| + for you to try offline recognition step by step. + + It shows how to install sherpa and use it as offline recognizer, + which supports the models from icefall, the `WeNet`_ framework and torchaudio. + +.. |Sherpa offline recognition python api colab notebook| image:: https://colab.research.google.com/assets/colab-badge.svg + :target: https://github.com/k2-fsa/colab/blob/master/sherpa/sherpa_offline_recognition_python_api_demo.ipynb + +.. 
toctree:: + :maxdepth: 5 + :caption: Pretrained models + + offline_ctc/index + offline_transducer + online_transducer diff --git a/docs/source/sherpa/pretrained_models/offline_ctc/icefall.rst b/docs/source/sherpa/pretrained_models/offline_ctc/icefall.rst new file mode 100644 index 000000000..072aaf292 --- /dev/null +++ b/docs/source/sherpa/pretrained_models/offline_ctc/icefall.rst @@ -0,0 +1,215 @@ +icefall +======= + +.. hint:: + + We use the binary ``sherpa-offline`` below for demonstration. + You can replace ``sherpa-offline`` with ``sherpa-offline-websocket-server``. + +In this section, we list all pre-trained CTC models from `icefall`_. + +icefall-asr-gigaspeech-conformer-ctc (English) +---------------------------------------------- + +.. code-block:: bash + + # This model is trained using GigaSpeech + GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/wgb14/icefall-asr-gigaspeech-conformer-ctc + cd icefall-asr-gigaspeech-conformer-ctc + git lfs pull --include "exp/cpu_jit.pt" + git lfs pull --include "data/lang_bpe_500/HLG.pt" + git lfs pull --include "data/lang_bpe_500/tokens.txt" + mkdir test_wavs + cd test_wavs + wget https://huggingface.co/csukuangfj/wav2vec2.0-torchaudio/resolve/main/test_wavs/1089-134686-0001.wav + wget https://huggingface.co/csukuangfj/wav2vec2.0-torchaudio/resolve/main/test_wavs/1221-135766-0001.wav + wget https://huggingface.co/csukuangfj/wav2vec2.0-torchaudio/resolve/main/test_wavs/1221-135766-0002.wav + cd .. + + # Decode with H + sherpa-offline \ + --nn-model=./exp/cpu_jit.pt \ + --tokens=./data/lang_bpe_500/tokens.txt \ + ./test_wavs/1089-134686-0001.wav \ + ./test_wavs/1221-135766-0001.wav \ + ./test_wavs/1221-135766-0002.wav + + # Decode with HLG + sherpa-offline \ + --nn-model=./exp/cpu_jit.pt \ + --hlg=./data/lang_bpe_500/HLG.pt \ + --tokens=./data/lang_bpe_500/tokens.txt \ + ./test_wavs/1089-134686-0001.wav \ + ./test_wavs/1221-135766-0001.wav \ + ./test_wavs/1221-135766-0002.wav + +icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09 (English) +---------------------------------------------------------------------- + +.. code-block:: bash + + GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/csukuangfj/icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09 + cd icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09 + + git lfs pull --include "exp/cpu_jit.pt" + git lfs pull --include "data/lang_bpe_500/tokens.txt" + git lfs pull --include "data/lang_bpe_500/HLG.pt" + git lfs pull --include "data/lang_bpe_500/HLG_modified.pt" + + # Decode with H + sherpa-offline \ + --nn-model=./exp/cpu_jit.pt \ + --tokens=./data/lang_bpe_500/tokens.txt \ + --use-gpu=false \ + ./test_wavs/1089-134686-0001.wav \ + ./test_wavs/1221-135766-0001.wav \ + ./test_wavs/1221-135766-0002.wav + + # Decode with HLG + sherpa-offline \ + --nn-model=./exp/cpu_jit.pt \ + --tokens=./data/lang_bpe_500/tokens.txt \ + --hlg=./data/lang_bpe_500/HLG.pt \ + --use-gpu=false \ + ./test_wavs/1089-134686-0001.wav \ + ./test_wavs/1221-135766-0001.wav \ + ./test_wavs/1221-135766-0002.wav + + # Decode with HLG (modified) + sherpa-offline \ + --nn-model=./exp/cpu_jit.pt \ + --tokens=./data/lang_bpe_500/tokens.txt \ + --hlg=./data/lang_bpe_500/HLG_modified.pt \ + --use-gpu=false \ + ./test_wavs/1089-134686-0001.wav \ + ./test_wavs/1221-135766-0001.wav \ + ./test_wavs/1221-135766-0002.wav + +icefall-asr-tedlium3-conformer-ctc2 (English) +--------------------------------------------- + +.. 
code-block:: bash + + # This model is trained using Tedlium3 + # + # See https://github.com/k2-fsa/icefall/pull/696 + # + + GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/videodanchik/icefall-asr-tedlium3-conformer-ctc2 + cd icefall-asr-tedlium3-conformer-ctc2 + git lfs pull --include "exp/cpu_jit.pt" + + git lfs pull --include "data/lang_bpe/HLG.pt" + git lfs pull --include "data/lang_bpe/tokens.txt" + + git lfs pull --include "test_wavs/DanBarber_2010-219.wav" + git lfs pull --include "test_wavs/DanielKahneman_2010-157.wav" + git lfs pull --include "test_wavs/RobertGupta_2010U-15.wav" + + # Decode with H + sherpa-offline \ + --nn-model=./exp/cpu_jit.pt \ + --tokens=./data/lang_bpe/tokens.txt \ + ./test_wavs/DanBarber_2010-219.wav \ + ./test_wavs/DanielKahneman_2010-157.wav \ + ./test_wavs/RobertGupta_2010U-15.wav + + # Decode with HLG + sherpa-offline \ + --nn-model=./exp/cpu_jit.pt \ + --hlg=./data/lang_bpe/HLG.pt \ + --tokens=./data/lang_bpe/tokens.txt \ + ./test_wavs/DanBarber_2010-219.wav \ + ./test_wavs/DanielKahneman_2010-157.wav \ + ./test_wavs/RobertGupta_2010U-15.wav + +icefall_asr_librispeech_conformer_ctc (English) +----------------------------------------------- + +.. code-block:: bash + + # This model is trained using LibriSpeech + # + # See https://github.com/k2-fsa/icefall/pull/13 + # + + GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/pkufool/icefall_asr_librispeech_conformer_ctc + cd icefall_asr_librispeech_conformer_ctc + + git lfs pull --include "exp/cpu_jit.pt" + git lfs pull --include "data/lang_bpe/HLG.pt" + + # Decode with H + sherpa-offline \ + --nn-model=./exp/cpu_jit.pt \ + --tokens=./data/lang_bpe/tokens.txt \ + ./test_wavs/1089-134686-0001.wav \ + ./test_wavs/1221-135766-0001.wav \ + ./test_wavs/1221-135766-0002.wav + + # Decode with HLG + sherpa-offline \ + --nn-model=./exp/cpu_jit.pt \ + --hlg=./data/lang_bpe/HLG.pt \ + --tokens=./data/lang_bpe/tokens.txt \ + ./test_wavs/1089-134686-0001.wav \ + ./test_wavs/1221-135766-0001.wav \ + ./test_wavs/1221-135766-0002.wav + +.. - ``_ + +icefall_asr_aishell_conformer_ctc (Chinese) +------------------------------------------- + +.. code-block:: bash + + GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/pkufool/icefall_asr_aishell_conformer_ctc + cd icefall_asr_aishell_conformer_ctc + git lfs pull --include "exp/cpu_jit.pt" + git lfs pull --include "data/lang_char/HLG.pt" + + # Decode with an H graph + sherpa-offline \ + --nn-model=./exp/cpu_jit.pt \ + --tokens=./data/lang_char/tokens.txt \ + ./test_waves/BAC009S0764W0121.wav \ + ./test_waves/BAC009S0764W0122.wav \ + ./test_waves/BAC009S0764W0123.wav + + # Decode with an HLG graph + sherpa-offline \ + --nn-model=./exp/cpu_jit.pt \ + --tokens=./data/lang_char/tokens.txt \ + --hlg=./data/lang_char/HLG.pt \ + ./test_waves/BAC009S0764W0121.wav \ + ./test_waves/BAC009S0764W0122.wav \ + ./test_waves/BAC009S0764W0123.wav + + +icefall-asr-mgb2-conformer_ctc-2022-27-06 (Arabic) +-------------------------------------------------- + +.. 
code-block:: bash + + GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/AmirHussein/icefall-asr-mgb2-conformer_ctc-2022-27-06 + cd icefall-asr-mgb2-conformer_ctc-2022-27-06 + git lfs pull --include "exp/cpu_jit.pt" + git lfs pull --include "data/lang_bpe_5000/HLG.pt" + git lfs pull --include "data/lang_bpe_5000/tokens.txt" + + # Decode with an H graph + sherpa-offline \ + --nn-model=./exp/cpu_jit.pt \ + --tokens=./data/lang_bpe_5000/tokens.txt \ + ./test_wavs/94D37D38-B203-4FC0-9F3A-538F5C174920_spk-0001_seg-0053813:0054281.wav \ + ./test_wavs/94D37D38-B203-4FC0-9F3A-538F5C174920_spk-0001_seg-0051454:0052244.wav \ + ./test_wavs/94D37D38-B203-4FC0-9F3A-538F5C174920_spk-0001_seg-0052244:0053004.wav + + # Decode with an HLG graph + sherpa-offline \ + --nn-model=./exp/cpu_jit.pt \ + --tokens=./data/lang_bpe_5000/tokens.txt \ + --hlg=./data/lang_bpe_5000/HLG.pt \ + ./test_wavs/94D37D38-B203-4FC0-9F3A-538F5C174920_spk-0001_seg-0053813:0054281.wav \ + ./test_wavs/94D37D38-B203-4FC0-9F3A-538F5C174920_spk-0001_seg-0051454:0052244.wav \ + ./test_wavs/94D37D38-B203-4FC0-9F3A-538F5C174920_spk-0001_seg-0052244:0053004.wav diff --git a/docs/source/sherpa/pretrained_models/offline_ctc/index.rst b/docs/source/sherpa/pretrained_models/offline_ctc/index.rst new file mode 100644 index 000000000..ca2a4f7d7 --- /dev/null +++ b/docs/source/sherpa/pretrained_models/offline_ctc/index.rst @@ -0,0 +1,13 @@ +Offline CTC models +================== + +This sections list pre-trained CTC models from the following frameworks: + +.. toctree:: + :maxdepth: 5 + + icefall + wenet + torchaudio + nemo + diff --git a/docs/source/sherpa/pretrained_models/offline_ctc/nemo.rst b/docs/source/sherpa/pretrained_models/offline_ctc/nemo.rst new file mode 100644 index 000000000..1acb46b9a --- /dev/null +++ b/docs/source/sherpa/pretrained_models/offline_ctc/nemo.rst @@ -0,0 +1,315 @@ +NeMo +==== + +This section lists models from `NeMo`_. + + +sherpa-nemo-ctc-en-citrinet-512 (English) +----------------------------------------- + +This model is converted from + + ``_ + +.. code-block:: bash + + GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/csukuangfj/sherpa-nemo-ctc-en-citrinet-512 + cd sherpa-nemo-ctc-en-citrinet-512 + git lfs pull --include "model.pt" + + sherpa-offline \ + --nn-model=./model.pt \ + --tokens=./tokens.txt \ + --use-gpu=false \ + --modified=false \ + --nemo-normalize=per_feature \ + ./test_wavs/0.wav \ + ./test_wavs/1.wav \ + ./test_wavs/2.wav + +.. code-block:: bash + + ls -lh model.pt + -rw-r--r-- 1 kuangfangjun root 142M Mar 9 21:23 model.pt + +.. caution:: + + It is of paramount importance to specify ``--nemo-normalize=per_feature``. + +sherpa-nemo-ctc-zh-citrinet-512 (Chinese) +----------------------------------------- + +This model is converted from + + ``_ + +.. code-block:: bash + + GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/csukuangfj/sherpa-nemo-ctc-zh-citrinet-512 + cd sherpa-nemo-ctc-zh-citrinet-512 + git lfs pull --include "model.pt" + + sherpa-offline \ + --nn-model=./model.pt \ + --tokens=./tokens.txt \ + --use-gpu=false \ + --modified=true \ + --nemo-normalize=per_feature \ + ./test_wavs/0.wav \ + ./test_wavs/1.wav \ + ./test_wavs/2.wav + +.. code-block:: bash + + ls -lh model.pt + -rw-r--r-- 1 kuangfangjun root 153M Mar 10 15:07 model.pt + +.. hint:: + + Since the vocabulary size of this model is very large, i.e, 5207, we use + ``--modified=true`` to use a + `modified CTC topology `_ + +.. caution:: + + It is of paramount importance to specify ``--nemo-normalize=per_feature``. 
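.. hint::

   ``--nemo-normalize=per_feature`` mirrors the ``per_feature`` feature
   normalization used by `NeMo`_: each feature dimension of an utterance is
   normalized with its own mean and standard deviation computed over time.
   The following is only an illustrative sketch of that idea, not the exact
   implementation used by `sherpa`_ or `NeMo`_:

   .. code-block:: python

      import torch

      def per_feature_normalize(features: torch.Tensor) -> torch.Tensor:
          """features has shape (num_frames, feature_dim)."""
          mean = features.mean(dim=0, keepdim=True)
          std = features.std(dim=0, keepdim=True)
          return (features - mean) / (std + 1e-5)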
+ +sherpa-nemo-ctc-zh-citrinet-1024-gamma-0-25 (Chinese) +----------------------------------------------------- + +This model is converted from + + ``_ + +.. code-block:: bash + + GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/csukuangfj/sherpa-nemo-ctc-zh-citrinet-1024-gamma-0-25 + cd sherpa-nemo-ctc-zh-citrinet-1024-gamma-0-25 + git lfs pull --include "model.pt" + + sherpa-offline \ + --nn-model=./model.pt \ + --tokens=./tokens.txt \ + --use-gpu=false \ + --modified=true \ + --nemo-normalize=per_feature \ + ./test_wavs/0.wav \ + ./test_wavs/1.wav \ + ./test_wavs/2.wav + +.. code-block:: bash + + ls -lh model.pt + -rw-r--r-- 1 kuangfangjun root 557M Mar 10 16:29 model.pt + +.. hint:: + + Since the vocabulary size of this model is very large, i.e, 5207, we use + ``--modified=true`` to use a + `modified CTC topology `_ + +.. caution:: + + It is of paramount importance to specify ``--nemo-normalize=per_feature``. + +sherpa-nemo-ctc-de-citrinet-1024 (German) +----------------------------------------- + +This model is converted from + + ``_ + +.. code-block:: bash + + GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/csukuangfj/sherpa-nemo-ctc-de-citrinet-1024 + cd sherpa-nemo-ctc-de-citrinet-1024 + git lfs pull --include "model.pt" + + sherpa-offline \ + --nn-model=./model.pt \ + --tokens=./tokens.txt \ + --use-gpu=false \ + --modified=false \ + --nemo-normalize=per_feature \ + ./test_wavs/0.wav \ + ./test_wavs/1.wav \ + ./test_wavs/2.wav + +.. code-block:: bash + + ls -lh model.pt + -rw-r--r-- 1 kuangfangjun root 541M Mar 10 16:55 model.pt + +.. caution:: + + It is of paramount importance to specify ``--nemo-normalize=per_feature``. + + +sherpa-nemo-ctc-en-conformer-small (English) +-------------------------------------------- + +This model is converted from + + ``_ + +.. code-block:: + + GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/csukuangfj/sherpa-nemo-ctc-en-conformer-small + cd sherpa-nemo-ctc-en-conformer-small + git lfs pull --include "model.pt" + + sherpa-offline \ + --nn-model=./model.pt \ + --tokens=./tokens.txt \ + --use-gpu=false \ + --modified=false \ + --nemo-normalize=per_feature \ + ./test_wavs/0.wav \ + ./test_wavs/1.wav \ + ./test_wavs/2.wav + +.. code-block:: bash + + ls -lh model.pt + -rw-r--r-- 1 fangjun staff 82M Mar 10 19:55 model.pt + +.. caution:: + + It is of paramount importance to specify ``--nemo-normalize=per_feature``. + +sherpa-nemo-ctc-en-conformer-medium (English) +--------------------------------------------- + +This model is converted from + + ``_ + +.. code-block:: + + GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/csukuangfj/sherpa-nemo-ctc-en-conformer-medium + cd sherpa-nemo-ctc-en-conformer-medium + git lfs pull --include "model.pt" + + sherpa-offline \ + --nn-model=./model.pt \ + --tokens=./tokens.txt \ + --use-gpu=false \ + --modified=false \ + --nemo-normalize=per_feature \ + ./test_wavs/0.wav \ + ./test_wavs/1.wav \ + ./test_wavs/2.wav + +.. code-block:: bash + + ls -lh model.pt + -rw-r--r-- 1 fangjun staff 152M Mar 10 20:26 model.pt + +.. caution:: + + It is of paramount importance to specify ``--nemo-normalize=per_feature``. + +sherpa-nemo-ctc-en-conformer-large (English) +-------------------------------------------- + +This model is converted from + + ``_ + +.. hint:: + + The vocabulary size is 129 + +.. 
code-block:: + + GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/csukuangfj/sherpa-nemo-ctc-en-conformer-large + cd sherpa-nemo-ctc-en-conformer-large + git lfs pull --include "model.pt" + + sherpa-offline \ + --nn-model=./model.pt \ + --tokens=./tokens.txt \ + --use-gpu=false \ + --modified=false \ + --nemo-normalize=per_feature \ + ./test_wavs/0.wav \ + ./test_wavs/1.wav \ + ./test_wavs/2.wav + +.. code-block:: bash + + ls -lh model.pt + -rw-r--r-- 1 fangjun staff 508M Mar 10 20:44 model.pt + +.. caution:: + + It is of paramount importance to specify ``--nemo-normalize=per_feature``. + +sherpa-nemo-ctc-de-conformer-large (German) +------------------------------------------- + +This model is converted from + + ``_ + +.. hint:: + + The vocabulary size is 129 + +.. code-block:: + + GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/csukuangfj/sherpa-nemo-ctc-de-conformer-large + cd sherpa-nemo-ctc-de-conformer-large + git lfs pull --include "model.pt" + + sherpa-offline \ + --nn-model=./model.pt \ + --tokens=./tokens.txt \ + --use-gpu=false \ + --modified=false \ + --nemo-normalize=per_feature \ + ./test_wavs/0.wav \ + ./test_wavs/1.wav \ + ./test_wavs/2.wav + +.. code-block:: bash + + ls -lh model.pt + -rw-r--r-- 1 fangjun staff 508M Mar 10 21:34 model.pt + +.. caution:: + + It is of paramount importance to specify ``--nemo-normalize=per_feature``. + +How to convert NeMo models to sherpa +------------------------------------ + +This section describes how to export `NeMo`_ pre-trained CTC models to `sherpa`_. + +You can find a list of pre-trained models from `NeMo`_ by visiting: + + ``_. + +Let us take ``stt_en_conformer_ctc_small`` as an example. + +You can use the following code to obtain ``model.pt`` and ``tokens.txt``: + +.. code-block:: bash + + import nemo.collections.asr as nemo_asr + m = nemo_asr.models.EncDecCTCModelBPE.from_pretrained('stt_en_conformer_ctc_small') + m.export("model.pt") + + with open('tokens.txt', 'w', encoding='utf-8') as f: + f.write(" 0\n") + for i, s in enumerate(m.decoder.vocabulary): + f.write(f"{s} {i+1}\n") + +One thing to note is that the blank token has the largest token ID in ``NeMo``. +However, it is always ``0`` in `sherpa`_. During network computation, we shift +the last column of the ``log_prob`` tensor to the first column so that +it matches the convention about using 0 for the blank in `sherpa`_. + +You can find the exported ``model.pt`` and ``tokens.txt`` by visiting + + ``_ diff --git a/docs/source/sherpa/pretrained_models/offline_ctc/torchaudio.rst b/docs/source/sherpa/pretrained_models/offline_ctc/torchaudio.rst new file mode 100644 index 000000000..65e7381f1 --- /dev/null +++ b/docs/source/sherpa/pretrained_models/offline_ctc/torchaudio.rst @@ -0,0 +1,42 @@ +torchaudio +========== + +This section lists models from `torchaudio`_. + + +wav2vec2_asr_base (English) +--------------------------- + +.. code-block:: bash + + GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/csukuangfj/wav2vec2.0-torchaudio + cd wav2vec2.0-torchaudio + + # Note: There are other kinds of models fine-tuned with different + # amount of data. We use a model that is fine-tuned with 10 minutes of data. + + git lfs pull --include "wav2vec2_asr_base_10m.pt" + + sherpa-offline \ + --nn-model=wav2vec2_asr_base_10m.pt \ + --tokens=tokens.txt \ + --use-gpu=false \ + ./test_wavs/1089-134686-0001.wav \ + ./test_wavs/1221-135766-0001.wav \ + ./test_wavs/1221-135766-0002.wav + +voxpopuli_asr_base (German) +--------------------------- + +.. 
code-block:: bash + + GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/csukuangfj/wav2vec2.0-torchaudio + cd wav2vec2.0-torchaudio + git lfs pull --include "voxpopuli_asr_base_10k_de.pt" + + sherpa-offline \ + --nn-model=voxpopuli_asr_base_10k_de.pt \ + --tokens=tokens-de.txt \ + --use-gpu=false \ + ./test_wavs/20120315-0900-PLENARY-14-de_20120315.wav \ + ./test_wavs/20170517-0900-PLENARY-16-de_20170517.wav diff --git a/docs/source/sherpa/pretrained_models/offline_ctc/wenet.rst b/docs/source/sherpa/pretrained_models/offline_ctc/wenet.rst new file mode 100644 index 000000000..ccf0bcb6a --- /dev/null +++ b/docs/source/sherpa/pretrained_models/offline_ctc/wenet.rst @@ -0,0 +1,44 @@ +WeNet +===== + +This section lists models from `WeNet`_. + +wenet-english-model (English) +----------------------------- + +.. code-block:: bash + + GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/csukuangfj/wenet-english-model + cd wenet-english-model + git lfs pull --include "final.zip" + + sherpa-offline \ + --normalize-samples=false \ + --modified=true \ + --nn-model=./final.zip \ + --tokens=./units.txt \ + --use-gpu=false \ + ./test_wavs/1089-134686-0001.wav \ + ./test_wavs/1221-135766-0001.wav \ + ./test_wavs/1221-135766-0002.wav + +wenet-chinese-model (Chinese) +----------------------------- + +.. code-block:: bash + + GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/csukuangfj/wenet-chinese-model + cd wenet-chinese-model + git lfs pull --include "final.zip" + + sherpa-offline \ + --normalize-samples=false \ + --modified=true \ + --nn-model=./final.zip \ + --tokens=./units.txt \ + ./test_wavs/BAC009S0764W0121.wav \ + ./test_wavs/BAC009S0764W0122.wav \ + ./test_wavs/BAC009S0764W0123.wav \ + ./test_wavs/DEV_T0000000000.wav \ + ./test_wavs/DEV_T0000000001.wav \ + ./test_wavs/DEV_T0000000002.wav diff --git a/docs/source/sherpa/pretrained_models/offline_transducer.rst b/docs/source/sherpa/pretrained_models/offline_transducer.rst new file mode 100644 index 000000000..742f89c0f --- /dev/null +++ b/docs/source/sherpa/pretrained_models/offline_transducer.rst @@ -0,0 +1,545 @@ +.. _offline_transducer_pretrained_models: + +Offline transducer models +========================= + +.. hint:: + + We use the binary ``sherpa-offline`` below for demonstration. + You can replace ``sherpa-offline`` with ``sherpa-offline-websocket-server``. + +.. hint:: + + Please visit ``_ + to try the pre-trained models in your browser. You don't need to install + anything. + +icefall +------- + +This sections lists models trained using `icefall`_. + +English +^^^^^^^ + +.. _icefall-asr-librispeech-zipformer-2023-05-15: + +icefall-asr-librispeech-zipformer-2023-05-15 +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. 
code-block:: bash + + # This model is trained using LibriSpeech with zipformer transducer + # + # See https://github.com/k2-fsa/icefall/pull/1058 + # + # normal-scaled model, number of model parameters: 65549011, i.e., 65.55 M + # + GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/Zengwei/icefall-asr-librispeech-zipformer-2023-05-15 + cd icefall-asr-librispeech-zipformer-2023-05-15 + + git lfs pull --include "exp/jit_script.pt" + git lfs pull --include "data/lang_bpe_500/LG.pt" + + for m in greedy_search modified_beam_search fast_beam_search; do + sherpa-offline \ + --decoding-method=$m \ + --nn-model=./exp/jit_script.pt \ + --tokens=./data/lang_bpe_500/tokens.txt \ + ./test_wavs/1089-134686-0001.wav \ + ./test_wavs/1221-135766-0001.wav \ + ./test_wavs/1221-135766-0002.wav + done + + sherpa-offline \ + --decoding-method=fast_beam_search \ + --nn-model=./exp/jit_script.pt \ + --lg=./data/lang_bpe_500/LG.pt \ + --tokens=./data/lang_bpe_500/tokens.txt \ + ./test_wavs/1089-134686-0001.wav \ + ./test_wavs/1221-135766-0001.wav \ + ./test_wavs/1221-135766-0002.wav + +.. _icefall-asr-librispeech-zipformer-small-2023-05-16: + +icefall-asr-librispeech-zipformer-small-2023-05-16 +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: bash + + # This model is trained using LibriSpeech with zipformer transducer + # + # See https://github.com/k2-fsa/icefall/pull/1058 + # + # small-scaled model, number of model parameters: 23285615, i.e., 23.3 M + # + GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/Zengwei/icefall-asr-librispeech-zipformer-small-2023-05-16 + cd icefall-asr-librispeech-zipformer-small-2023-05-16 + + git lfs pull --include "exp/jit_script.pt" + git lfs pull --include "data/lang_bpe_500/LG.pt" + + for m in greedy_search modified_beam_search fast_beam_search; do + sherpa-offline \ + --decoding-method=$m \ + --nn-model=./exp/jit_script.pt \ + --tokens=./data/lang_bpe_500/tokens.txt \ + ./test_wavs/1089-134686-0001.wav \ + ./test_wavs/1221-135766-0001.wav \ + ./test_wavs/1221-135766-0002.wav + done + + sherpa-offline \ + --decoding-method=fast_beam_search \ + --nn-model=./exp/jit_script.pt \ + --lg=./data/lang_bpe_500/LG.pt \ + --tokens=./data/lang_bpe_500/tokens.txt \ + ./test_wavs/1089-134686-0001.wav \ + ./test_wavs/1221-135766-0001.wav \ + ./test_wavs/1221-135766-0002.wav + + +.. _icefall-asr-librispeech-zipformer-large-2023-05-16: + +icefall-asr-librispeech-zipformer-large-2023-05-16 +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. 
code-block:: bash + + # This model is trained using LibriSpeech with zipformer transducer + # + # See https://github.com/k2-fsa/icefall/pull/1058 + # + # large-scaled model, number of model parameters: 148439574, i.e., 148.4 M + # + GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/Zengwei/icefall-asr-librispeech-zipformer-large-2023-05-16 + cd icefall-asr-librispeech-zipformer-large-2023-05-16 + + git lfs pull --include "exp/jit_script.pt" + git lfs pull --include "data/lang_bpe_500/LG.pt" + + for m in greedy_search modified_beam_search fast_beam_search; do + sherpa-offline \ + --decoding-method=$m \ + --nn-model=./exp/jit_script.pt \ + --tokens=./data/lang_bpe_500/tokens.txt \ + ./test_wavs/1089-134686-0001.wav \ + ./test_wavs/1221-135766-0001.wav \ + ./test_wavs/1221-135766-0002.wav + done + + sherpa-offline \ + --decoding-method=fast_beam_search \ + --nn-model=./exp/jit_script.pt \ + --lg=./data/lang_bpe_500/LG.pt \ + --tokens=./data/lang_bpe_500/tokens.txt \ + ./test_wavs/1089-134686-0001.wav \ + ./test_wavs/1221-135766-0001.wav \ + ./test_wavs/1221-135766-0002.wav + +.. _icefall-asr-multidataset-pruned_transducer_stateless7-2023-05-04: + +icefall-asr-multidataset-pruned_transducer_stateless7-2023-05-04 +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: bash + + # This model is trained using GigaSpeech + LibriSpeech + Common Voice 13.0 with zipformer + # + # See https://github.com/k2-fsa/icefall/pull/1010 + # + GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/yfyeung/icefall-asr-multidataset-pruned_transducer_stateless7-2023-05-04 + cd icefall-asr-multidataset-pruned_transducer_stateless7-2023-05-04 + git lfs pull --include "exp/cpu_jit-epoch-30-avg-4.pt" + cd exp + ln -s cpu_jit-epoch-30-avg-4.pt cpu_jit.pt + cd .. + + for m in greedy_search modified_beam_search fast_beam_search; do + sherpa-offline \ + --decoding-method=$m \ + --nn-model=./exp/cpu_jit.pt \ + --tokens=./data/lang_bpe_500/tokens.txt \ + ./test_wavs/1089-134686-0001.wav \ + ./test_wavs/1221-135766-0001.wav \ + ./test_wavs/1221-135766-0002.wav + done + +.. _icefall-asr-librispeech-pruned-transducer-stateless8-2022-12-02: + +icefall-asr-librispeech-pruned-transducer-stateless8-2022-12-02 +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: bash + + # This model is trained using GigaSpeech + LibriSpeech with zipformer + # + # See https://github.com/k2-fsa/icefall/pull/728 + # + GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/WeijiZhuang/icefall-asr-librispeech-pruned-transducer-stateless8-2022-12-02 + cd icefall-asr-librispeech-pruned-transducer-stateless8-2022-12-02 + git lfs pull --include "exp/cpu_jit-torch-1.10.pt" + git lfs pull --include "data/lang_bpe_500/LG.pt" + + cd exp + rm cpu_jit.pt + ln -sv cpu_jit-torch-1.10.pt cpu_jit.pt + cd .. 
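+
+   # The decoding commands below load the model from ./exp/cpu_jit.pt, which is
+   # why the old symlink is removed and re-created to point at cpu_jit-torch-1.10.pt.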
+ + for m in greedy_search modified_beam_search fast_beam_search; do + sherpa-offline \ + --decoding-method=$m \ + --nn-model=./exp/cpu_jit.pt \ + --tokens=./data/lang_bpe_500/tokens.txt \ + ./test_wavs/1089-134686-0001.wav \ + ./test_wavs/1221-135766-0001.wav \ + ./test_wavs/1221-135766-0002.wav + done + + sherpa-offline \ + --decoding-method=fast_beam_search \ + --nn-model=./exp/cpu_jit.pt \ + --lg=./data/lang_bpe_500/LG.pt \ + --tokens=./data/lang_bpe_500/tokens.txt \ + ./test_wavs/1089-134686-0001.wav \ + ./test_wavs/1221-135766-0001.wav \ + ./test_wavs/1221-135766-0002.wav + +icefall-asr-librispeech-pruned-transducer-stateless8-2022-11-14 +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: bash + + # This model is trained using GigaSpeech + LibriSpeech with zipformer + # + # See https://github.com/k2-fsa/icefall/pull/675 + # + GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless8-2022-11-14 + cd icefall-asr-librispeech-pruned-transducer-stateless8-2022-11-14 + git lfs pull --include "exp/cpu_jit.pt" + git lfs pull --include "data/lang_bpe_500/LG.pt" + + for m in greedy_search modified_beam_search fast_beam_search; do + sherpa-offline \ + --decoding-method=$m \ + --nn-model=./exp/cpu_jit.pt \ + --tokens=./data/lang_bpe_500/tokens.txt \ + ./test_wavs/1089-134686-0001.wav \ + ./test_wavs/1221-135766-0001.wav \ + ./test_wavs/1221-135766-0002.wav + done + + sherpa-offline \ + --decoding-method=fast_beam_search \ + --nn-model=./exp/cpu_jit.pt \ + --lg=./data/lang_bpe_500/LG.pt \ + --tokens=./data/lang_bpe_500/tokens.txt \ + ./test_wavs/1089-134686-0001.wav \ + ./test_wavs/1221-135766-0001.wav \ + ./test_wavs/1221-135766-0002.wav + +icefall-asr-librispeech-pruned-transducer-stateless7-2022-11-11 +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: bash + + # This model is trained using LibriSpeech with zipformer + # + # See https://github.com/k2-fsa/icefall/pull/672 + # + GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless7-2022-11-11 + cd icefall-asr-librispeech-pruned-transducer-stateless7-2022-11-11 + git lfs pull --include "exp/cpu_jit-torch-1.10.0.pt" + git lfs pull --include "data/lang_bpe_500/LG.pt" + cd exp + ln -s cpu_jit-torch-1.10.0.pt cpu_jit.pt + cd .. + + for m in greedy_search modified_beam_search fast_beam_search; do + sherpa-offline \ + --decoding-method=$m \ + --nn-model=./exp/cpu_jit.pt \ + --tokens=./data/lang_bpe_500/tokens.txt \ + ./test_wavs/1089-134686-0001.wav \ + ./test_wavs/1221-135766-0001.wav \ + ./test_wavs/1221-135766-0002.wav + done + + sherpa-offline \ + --decoding-method=fast_beam_search \ + --nn-model=./exp/cpu_jit.pt \ + --lg=./data/lang_bpe_500/LG.pt \ + --tokens=./data/lang_bpe_500/tokens.txt \ + ./test_wavs/1089-134686-0001.wav \ + ./test_wavs/1221-135766-0001.wav \ + ./test_wavs/1221-135766-0002.wav + +icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13 +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. 
code-block:: + + # This model is trained using LibriSpeech + GigaSpeech + # + # See https://github.com/k2-fsa/icefall/pull/363 + # + GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13 + cd icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13 + git lfs pull --include "exp/cpu_jit.pt" + git lfs pull --include "data/lang_bpe_500/LG.pt" + + for m in greedy_search modified_beam_search fast_beam_search; do + sherpa-offline \ + --decoding-method=$m \ + --nn-model=./exp/cpu_jit.pt \ + --tokens=./data/lang_bpe_500/tokens.txt \ + ./test_wavs/1089-134686-0001.wav \ + ./test_wavs/1221-135766-0001.wav \ + ./test_wavs/1221-135766-0002.wav + done + + sherpa-offline \ + --decoding-method=fast_beam_search \ + --nn-model=./exp/cpu_jit.pt \ + --lg=./data/lang_bpe_500/LG.pt \ + --tokens=./data/lang_bpe_500/tokens.txt \ + ./test_wavs/1089-134686-0001.wav \ + ./test_wavs/1221-135766-0001.wav \ + ./test_wavs/1221-135766-0002.wav + + +icefall-asr-gigaspeech-pruned-transducer-stateless2 +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: + + # This model is trained using GigaSpeech + # + # See https://github.com/k2-fsa/icefall/pull/318 + # + GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/wgb14/icefall-asr-gigaspeech-pruned-transducer-stateless2 + cd icefall-asr-gigaspeech-pruned-transducer-stateless2 + git lfs pull --include "exp/cpu_jit-iter-3488000-avg-15.pt" + git lfs pull --include "data/lang_bpe_500/bpe.model" + + cd ../exp + ln -s cpu_jit-iter-3488000-avg-15.pt cpu_jit.pt + cd .. + + # Since this repo does not provide tokens.txt, we generate it from bpe.model + # by ourselves + /path/to/sherpa/scripts/bpe_model_to_tokens.py ./data/lang_bpe_500/bpe.model > ./data/lang_bpe_500/tokens.txt + + mkdir test_wavs + cd test_wavs + wget https://huggingface.co/csukuangfj/wav2vec2.0-torchaudio/resolve/main/test_wavs/1089-134686-0001.wav + wget https://huggingface.co/csukuangfj/wav2vec2.0-torchaudio/resolve/main/test_wavs/1221-135766-0001.wav + wget https://huggingface.co/csukuangfj/wav2vec2.0-torchaudio/resolve/main/test_wavs/1221-135766-0002.wav + + for m in greedy_search modified_beam_search fast_beam_search; do + sherpa-offline \ + --decoding-method=$m \ + --nn-model=./exp/cpu_jit.pt \ + --tokens=./data/lang_bpe_500/tokens.txt \ + ./test_wavs/1089-134686-0001.wav \ + ./test_wavs/1221-135766-0001.wav \ + ./test_wavs/1221-135766-0002.wav + done + +Chinese +^^^^^^^ + +icefall_asr_wenetspeech_pruned_transducer_stateless2 +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: bash + + # This models is trained using WenetSpeech + # + # See https://github.com/k2-fsa/icefall/pull/349 + # + GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/luomingshuang/icefall_asr_wenetspeech_pruned_transducer_stateless2 + + cd icefall_asr_wenetspeech_pruned_transducer_stateless2 + git lfs pull --include "exp/cpu_jit_epoch_10_avg_2_torch_1.7.1.pt" + git lfs pull --include "data/lang_char/LG.pt" + cd exp + ln -s cpu_jit_epoch_10_avg_2_torch_1.7.1.pt cpu_jit.pt + cd .. 
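+
+   # Note: data/lang_char/tokens.txt is a small plain-text file, so the
+   # "git clone" above already fetched it; no extra "git lfs pull" is needed for it.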
+ + for m in greedy_search modified_beam_search fast_beam_search; do + sherpa-offline \ + --decoding-method=$m \ + --nn-model=./exp/cpu_jit.pt \ + --tokens=./data/lang_char/tokens.txt \ + ./test_wavs/DEV_T0000000000.wav \ + ./test_wavs/DEV_T0000000001.wav \ + ./test_wavs/DEV_T0000000002.wav + done + + sherpa-offline \ + --decoding-method=$m \ + --nn-model=./exp/cpu_jit.pt \ + --lg=./data/lang_char/LG.pt \ + --tokens=./data/lang_char/tokens.txt \ + ./test_wavs/DEV_T0000000000.wav \ + ./test_wavs/DEV_T0000000001.wav \ + ./test_wavs/DEV_T0000000002.wav + +icefall_asr_aidatatang-200zh_pruned_transducer_stateless2 +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: bash + + # This models is trained using aidatatang_200zh + # + # See https://github.com/k2-fsa/icefall/pull/355 + # + GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/luomingshuang/icefall_asr_aidatatang-200zh_pruned_transducer_stateless2 + cd icefall_asr_aidatatang-200zh_pruned_transducer_stateless2 + git lfs pull --include "exp/cpu_jit_torch.1.7.1.pt" + + cd exp + ln -sv cpu_jit_torch.1.7.1.pt cpu_jit.pt + cd .. + + for m in greedy_search modified_beam_search fast_beam_search; do + sherpa-offline \ + --decoding-method=$m \ + --nn-model=./exp/cpu_jit.pt \ + --tokens=./data/lang_char/tokens.txt \ + ./test_wavs/T0055G0036S0002.wav \ + ./test_wavs/T0055G0036S0003.wav \ + ./test_wavs/T0055G0036S0004.wav + done + +icefall-asr-alimeeting-pruned-transducer-stateless7 +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: bash + + # This models is trained using alimeeting (https://www.openslr.org/119/) + # + # See https://github.com/k2-fsa/icefall/pull/751 + # + GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/desh2608/icefall-asr-alimeeting-pruned-transducer-stateless7 + cd icefall-asr-alimeeting-pruned-transducer-stateless7 + + git lfs pull --include "exp/cpu_jit.pt" + + for m in greedy_search modified_beam_search fast_beam_search; do + sherpa-offline \ + --decoding-method=$m \ + --nn-model=./exp/cpu_jit.pt \ + --tokens=./data/lang_char/tokens.txt \ + ./test_wavs/165.wav \ + ./test_wavs/74.wav \ + ./test_wavs/209.wav + done + +Chinese + English +^^^^^^^^^^^^^^^^^ + +icefall_asr_tal-csasr_pruned_transducer_stateless5 +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: bash + + # This models is trained using TAL_CSASR dataset from + # https://ai.100tal.com/dataset + # where each utterance contains both English and Chinese. + # + # See https://github.com/k2-fsa/icefall/pull/428 + # + GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/luomingshuang/icefall_asr_tal-csasr_pruned_transducer_stateless5 + cd icefall_asr_tal-csasr_pruned_transducer_stateless5 + git lfs pull --include "exp/cpu_jit.pt" + + for m in greedy_search modified_beam_search fast_beam_search; do + sherpa-offline \ + --decoding-method=$m \ + --nn-model=./exp/cpu_jit.pt \ + --tokens=./data/lang_char/tokens.txt \ + ./test_wavs/210_36476_210_8341_1_1533271973_7057520_132.wav \ + ./test_wavs/210_36476_210_8341_1_1533271973_7057520_138.wav \ + ./test_wavs/210_36476_210_8341_1_1533271973_7057520_145.wav \ + ./test_wavs/210_36476_210_8341_1_1533271973_7057520_148.wav + done + +Tibetan +^^^^^^^ + +icefall-asr-xbmu-amdo31-pruned-transducer-stateless7-2022-12-02 +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. 
code-block:: bash + + # This model is trained using the XBMU-AMDO31 corpus + # + # See https://github.com/k2-fsa/icefall/pull/706 + # + GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/syzym/icefall-asr-xbmu-amdo31-pruned-transducer-stateless7-2022-12-02 + cd icefall-asr-xbmu-amdo31-pruned-transducer-stateless7-2022-12-02 + git lfs pull --include "exp/cpu_jit.pt" + git lfs pull --include "data/lang_bpe_500/LG.pt" + + for m in greedy_search modified_beam_search fast_beam_search; do + sherpa-offline \ + --decoding-method=$m \ + --nn-model=./exp/cpu_jit.pt \ + --tokens=./data/lang_bpe_500/tokens.txt \ + ./test_wavs/a_0_cacm-A70_31116.wav \ + ./test_wavs/a_0_cacm-A70_31117.wav \ + ./test_wavs/a_0_cacm-A70_31118.wav + done + + sherpa-offline \ + --decoding-method=fast_beam_search \ + --nn-model=./exp/cpu_jit.pt \ + --lg=./data/lang_bpe_500/LG.pt \ + --tokens=./data/lang_bpe_500/tokens.txt \ + ./test_wavs/a_0_cacm-A70_31116.wav \ + ./test_wavs/a_0_cacm-A70_31117.wav \ + ./test_wavs/a_0_cacm-A70_31118.wav + +icefall-asr-xbmu-amdo31-pruned-transducer-stateless5-2022-11-29 +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: bash + + # This model is trained using the XBMU-AMDO31 corpus + # + # See https://github.com/k2-fsa/icefall/pull/706 + # + GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/syzym/icefall-asr-xbmu-amdo31-pruned-transducer-stateless5-2022-11-29 + cd icefall-asr-xbmu-amdo31-pruned-transducer-stateless5-2022-11-29 + git lfs pull --include "data/lang_bpe_500/LG.pt" + git lfs pull --include "data/lang_bpe_500/tokens.txt" + git lfs pull --include "exp/cpu_jit-epoch-28-avg-23-torch-1.10.0.pt" + git lfs pull --include "test_wavs/a_0_cacm-A70_31116.wav" + git lfs pull --include "test_wavs/a_0_cacm-A70_31117.wav" + git lfs pull --include "test_wavs/a_0_cacm-A70_31118.wav" + + cd exp + rm cpu_jit.pt + ln -sv cpu_jit-epoch-28-avg-23-torch-1.10.0.pt cpu_jit.pt + cd .. + + for m in greedy_search modified_beam_search fast_beam_search; do + sherpa-offline \ + --decoding-method=$m \ + --nn-model=./exp/cpu_jit.pt \ + --tokens=./data/lang_bpe_500/tokens.txt \ + ./test_wavs/a_0_cacm-A70_31116.wav \ + ./test_wavs/a_0_cacm-A70_31117.wav \ + ./test_wavs/a_0_cacm-A70_31118.wav + done + + sherpa-offline \ + --decoding-method=fast_beam_search \ + --nn-model=./exp/cpu_jit.pt \ + --lg=./data/lang_bpe_500/LG.pt \ + --tokens=./data/lang_bpe_500/tokens.txt \ + ./test_wavs/a_0_cacm-A70_31116.wav \ + ./test_wavs/a_0_cacm-A70_31117.wav \ + ./test_wavs/a_0_cacm-A70_31118.wav diff --git a/docs/source/sherpa/pretrained_models/online_transducer.rst b/docs/source/sherpa/pretrained_models/online_transducer.rst new file mode 100644 index 000000000..f304d1bc7 --- /dev/null +++ b/docs/source/sherpa/pretrained_models/online_transducer.rst @@ -0,0 +1,359 @@ +.. _online_transducer_pretrained_models: + +Online transducer models +======================== + +.. hint:: + + We use the binary ``sherpa-online`` below for demonstration. + You can replace ``sherpa-online`` with ``sherpa-online-websocket-server`` + and ``sherpa-online-microphone``. + +.. hint:: + + At present, only streaming transducer models from `icefall`_ are supported. + +icefall +------- + +This sections lists models trained using `icefall`_. + + +English +^^^^^^^ + +.. _icefall-asr-librispeech-streaming-zipformer-2023-05-17: + +icefall-asr-librispeech-streaming-zipformer-2023-05-17 +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. 
code-block:: bash + + # This model is trained using LibriSpeech with zipformer transducer + # + # See https://github.com/k2-fsa/icefall/pull/1058 + # + # normal-scaled model, number of model parameters: 66110931, i.e., 66.11 M + # + GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/Zengwei/icefall-asr-librispeech-streaming-zipformer-2023-05-17 + cd icefall-asr-librispeech-streaming-zipformer-2023-05-17 + + git lfs pull --include "exp/jit_script_chunk_16_left_128.pt" + + for m in greedy_search modified_beam_search fast_beam_search; do + sherpa-online \ + --decoding-method=$m \ + --nn-model=./exp/jit_script_chunk_16_left_128.pt \ + --tokens=./data/lang_bpe_500/tokens.txt \ + ./test_wavs/1089-134686-0001.wav \ + ./test_wavs/1221-135766-0001.wav \ + ./test_wavs/1221-135766-0002.wav + done + + # For fast_beam_search with LG + sherpa-online \ + --decoding-method=fast_beam_search \ + --nn-model=./exp/jit_script_chunk_16_left_128.pt \ + --lg=./data/lang_bpe_500/LG.pt \ + --tokens=./data/lang_bpe_500/tokens.txt \ + ./test_wavs/1089-134686-0001.wav \ + ./test_wavs/1221-135766-0001.wav \ + ./test_wavs/1221-135766-0002.wav + +.. _icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29: + +icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29 +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: bash + + # This model is trained using LibriSpeech with streaming zipformer transducer + # + # See https://github.com/k2-fsa/icefall/pull/787 + # + GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/Zengwei/icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29 + cd icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29 + + git lfs pull --include "exp/cpu_jit.pt" + git lfs pull --include "data/lang_bpe_500/LG.pt" + + for m in greedy_search modified_beam_search fast_beam_search; do + sherpa-online \ + --decoding-method=$m \ + --nn-model=./exp/cpu_jit.pt \ + --tokens=./data/lang_bpe_500/tokens.txt \ + ./test_wavs/1089-134686-0001.wav \ + ./test_wavs/1221-135766-0001.wav \ + ./test_wavs/1221-135766-0002.wav + done + + # For fast_beam_search with LG + sherpa-online \ + --decoding-method=fast_beam_search \ + --nn-model=./exp/cpu_jit.pt \ + --lg=./data/lang_bpe_500/LG.pt \ + --tokens=./data/lang_bpe_500/tokens.txt \ + ./test_wavs/1089-134686-0001.wav \ + ./test_wavs/1221-135766-0001.wav \ + ./test_wavs/1221-135766-0002.wav + +icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05 +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: bash + + # This model is trained using LibriSpeech with ConvEmformer transducer + # + # See https://github.com/k2-fsa/icefall/pull/440 + # + GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/Zengwei/icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05 + cd icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05 + + git lfs pull --include "exp/cpu-jit-epoch-30-avg-10-torch-1.10.0.pt" + git lfs pull --include "data/lang_bpe_500/LG.pt" + cd exp + ln -sv cpu-jit-epoch-30-avg-10-torch-1.10.0.pt cpu_jit.pt + cd .. 
+ + for m in greedy_search modified_beam_search fast_beam_search; do + sherpa-online \ + --decoding-method=$m \ + --nn-model=./exp/cpu_jit.pt \ + --tokens=./data/lang_bpe_500/tokens.txt \ + ./test_wavs/1089-134686-0001.wav \ + ./test_wavs/1221-135766-0001.wav \ + ./test_wavs/1221-135766-0002.wav + done + + # For fast_beam_search with LG + + ./build/bin/sherpa-online \ + --decoding-method=fast_beam_search \ + --nn-model=./exp/cpu_jit.pt \ + --lg=./data/lang_bpe_500/LG.pt \ + --tokens=./data/lang_bpe_500/tokens.txt \ + ./test_wavs/1089-134686-0001.wav \ + ./test_wavs/1221-135766-0001.wav \ + ./test_wavs/1221-135766-0002.wav + +icefall-asr-librispeech-lstm-transducer-stateless2-2022-09-03 +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: bash + + # This model is trained using LibriSpeech with LSTM transducer + # + # See https://github.com/k2-fsa/icefall/pull/558 + # + GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/csukuangfj/icefall-asr-librispeech-lstm-transducer-stateless2-2022-09-03 + cd icefall-asr-librispeech-lstm-transducer-stateless2-2022-09-03 + + git lfs pull --include "exp/encoder_jit_trace-iter-468000-avg-16.pt" + git lfs pull --include "exp/decoder_jit_trace-iter-468000-avg-16.pt" + git lfs pull --include "exp/joiner_jit_trace-iter-468000-avg-16.pt" + git lfs pull --include "data/lang_bpe_500/LG.pt" + + cd exp + ln -sv encoder_jit_trace-iter-468000-avg-16.pt encoder_jit_trace.pt + ln -sv decoder_jit_trace-iter-468000-avg-16.pt decoder_jit_trace.pt + ln -sv joiner_jit_trace-iter-468000-avg-16.pt joiner_jit_trace.pt + cd .. + + for m in greedy_search modified_beam_search fast_beam_search; do + sherpa-online \ + --decoding-method=$m \ + --encoder-model=./exp/encoder_jit_trace.pt \ + --decoder-model=./exp/decoder_jit_trace.pt \ + --joiner-model=./exp/joiner_jit_trace.pt \ + --tokens=./data/lang_bpe_500/tokens.txt \ + ./test_wavs/1089-134686-0001.wav \ + ./test_wavs/1221-135766-0001.wav \ + ./test_wavs/1221-135766-0002.wav + done + + # For fast_beam_search with LG + sherpa-online \ + --decoding-method=fast_beam_search \ + --encoder-model=./exp/encoder_jit_trace.pt \ + --decoder-model=./exp/decoder_jit_trace.pt \ + --joiner-model=./exp/joiner_jit_trace.pt \ + --lg=./data/lang_bpe_500/LG.pt \ + --tokens=./data/lang_bpe_500/tokens.txt \ + ./test_wavs/1089-134686-0001.wav \ + ./test_wavs/1221-135766-0001.wav \ + ./test_wavs/1221-135766-0002.wav + +icefall-asr-librispeech-pruned-stateless-emformer-rnnt2-2022-06-01 +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: bash + + # This model is trained using LibriSpeech with Emformer transducer + # + # See https://github.com/k2-fsa/icefall/pull/390 + # + GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/csukuangfj/icefall-asr-librispeech-pruned-stateless-emformer-rnnt2-2022-06-01 + cd icefall-asr-librispeech-pruned-stateless-emformer-rnnt2-2022-06-01 + + git lfs pull --include "exp/cpu_jit-epoch-39-avg-6-use-averaged-model-1.pt" + git lfs pull --include "data/lang_bpe_500/LG.pt" + cd exp + ln -sv cpu_jit-epoch-39-avg-6-use-averaged-model-1.pt cpu_jit.pt + cd .. 
+ + for m in greedy_search modified_beam_search fast_beam_search; do + sherpa-online \ + --decoding-method=$m \ + --nn-model=./exp/cpu_jit.pt \ + --tokens=./data/lang_bpe_500/tokens.txt \ + ./test_wavs/1089-134686-0001.wav \ + ./test_wavs/1221-135766-0001.wav \ + ./test_wavs/1221-135766-0002.wav + done + + # For fast_beam_search with LG + + sherpa-online \ + --decoding-method=fast_beam_search \ + --nn-model=./exp/cpu_jit.pt \ + --lg=./data/lang_bpe_500/LG.pt \ + --tokens=./data/lang_bpe_500/tokens.txt \ + ./test_wavs/1089-134686-0001.wav \ + ./test_wavs/1221-135766-0001.wav \ + ./test_wavs/1221-135766-0002.wav + + +icefall_librispeech_streaming_pruned_transducer_stateless4_20220625 +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: bash + + # This model is trained using LibriSpeech with Conformer transducer + # + # See https://github.com/k2-fsa/icefall/pull/440 + # + GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/pkufool/icefall_librispeech_streaming_pruned_transducer_stateless4_20220625 + cd icefall_librispeech_streaming_pruned_transducer_stateless4_20220625 + + git lfs pull --include "exp/cpu_jit-epoch-25-avg-3.pt" + git lfs pull --include "data/lang_bpe_500/LG.pt" + cd exp + ln -sv cpu_jit-epoch-25-avg-3.pt cpu_jit.pt + cd .. + + for m in greedy_search modified_beam_search fast_beam_search; do + sherpa-online \ + --decoding-method=$m \ + --nn-model=./exp/cpu_jit.pt \ + --tokens=./data/lang_bpe_500/tokens.txt \ + ./test_waves/1089-134686-0001.wav \ + ./test_waves/1221-135766-0001.wav \ + ./test_waves/1221-135766-0002.wav + done + + # For fast_beam_search with LG + + sherpa-online \ + --decoding-method=fast_beam_search \ + --nn-model=./exp/cpu_jit.pt \ + --lg=./data/lang_bpe_500/LG.pt \ + --tokens=./data/lang_bpe_500/tokens.txt \ + ./test_waves/1089-134686-0001.wav \ + ./test_waves/1221-135766-0001.wav \ + ./test_waves/1221-135766-0002.wav + +Chinese +^^^^^^^ + +icefall_asr_wenetspeech_pruned_transducer_stateless5_streaming +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: bash + + # This model is trained using WenetSpeech with Conformer transducer + # + # See https://github.com/k2-fsa/icefall/pull/447 + # + GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/luomingshuang/icefall_asr_wenetspeech_pruned_transducer_stateless5_streaming + cd icefall_asr_wenetspeech_pruned_transducer_stateless5_streaming + + git lfs pull --include "exp/cpu_jit_epoch_7_avg_1_torch.1.7.1.pt" + git lfs pull --include "data/lang_char/LG.pt" + cd exp + ln -sv cpu_jit_epoch_7_avg_1_torch.1.7.1.pt cpu_jit.pt + cd .. + + for m in greedy_search modified_beam_search fast_beam_search; do + sherpa-online \ + --decoding-method=$m \ + --nn-model=./exp/cpu_jit.pt \ + --tokens=./data/lang_char/tokens.txt \ + ./test_wavs/DEV_T0000000000.wav \ + ./test_wavs/DEV_T0000000001.wav \ + ./test_wavs/DEV_T0000000002.wav + done + + # For fast_beam_search with LG + + sherpa-online \ + --decoding-method=fast_beam_search \ + --nn-model=./exp/cpu_jit.pt \ + --lg=./data/lang_char/LG.pt \ + --tokens=./data/lang_char/tokens.txt \ + ./test_wavs/DEV_T0000000000.wav \ + ./test_wavs/DEV_T0000000001.wav \ + ./test_wavs/DEV_T0000000002.wav + +Chinese + English (all-in-one) +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +pfluo/k2fsa-zipformer-chinese-english-mixed +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +It is a `streaming zipformer model `_ + +.. 
code-block:: bash + + # This model supports both Chinese and English + GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/pfluo/k2fsa-zipformer-chinese-english-mixed + cd k2fsa-zipformer-chinese-english-mixed + git lfs pull --include "exp/cpu_jit.pt" + + for m in greedy_search modified_beam_search fast_beam_search; do + sherpa-online \ + --decoding-method=$m \ + --nn-model=./exp/cpu_jit.pt \ + --tokens=./data/lang_char_bpe/tokens.txt \ + ./test_wavs/0.wav \ + ./test_wavs/1.wav \ + ./test_wavs/2.wav \ + ./test_wavs/3.wav \ + ./test_wavs/4.wav + done + +icefall-asr-conv-emformer-transducer-stateless2-zh +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +It is a `ConvEmformer model `_ + +.. code-block:: bash + + # This model supports both Chinese and English + GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/ptrnull/icefall-asr-conv-emformer-transducer-stateless2-zh + cd icefall-asr-conv-emformer-transducer-stateless2-zh + git lfs pull --include "exp/cpu_jit-epoch-11-avg-1.pt" + cd exp + ln -sv cpu_jit-epoch-11-avg-1.pt cpu_jit.pt + cd .. + + for m in greedy_search modified_beam_search fast_beam_search; do + sherpa-online \ + --decoding-method=$m \ + --nn-model=./exp/cpu_jit.pt \ + --tokens=./data/lang_char_bpe/tokens.txt \ + ./test_wavs/0.wav \ + ./test_wavs/1.wav \ + ./test_wavs/2.wav \ + ./test_wavs/3.wav \ + ./test_wavs/4.wav + done diff --git a/docs/source/social-groups.rst b/docs/source/social-groups.rst new file mode 100644 index 000000000..9aca7c06f --- /dev/null +++ b/docs/source/social-groups.rst @@ -0,0 +1,44 @@ +Social groups +============= + +WeChat +------ + + +If you have a `WeChat `_ account, you can scan +the following QR code to join the WeChat group of next-gen Kaldi to get +help. + +.. image:: pic/wechat-group-for-next-gen-kaldi.jpg + :width: 200 + :align: center + :alt: WeChat group of next-gen Kaldi + +QQ +-- + +The QQ group is also given below: + +.. image:: pic/qq-group-for-next-gen-kaldi.jpg + :width: 200 + :align: center + :alt: QQ group of next-gen Kaldi + +.. hint:: + + The QQ group number is ``744602236``. + +Bilibili (B 站) +--------------- + +Please visit ``_ +for various demo videos of Next-gen Kaldi. + +YouTube +------- + +To get the latest news of `next-gen Kaldi `_, please subscribe +the following YouTube channel by `Nadira Povey `_: + + ``_ + diff --git a/docs/source/triton/client/index.rst b/docs/source/triton/client/index.rst new file mode 100755 index 000000000..5e888681b --- /dev/null +++ b/docs/source/triton/client/index.rst @@ -0,0 +1,50 @@ +Triton-client +============== + +Send requests using client +-------------------------------------------------------------------- +In the docker container, run the client script to do ASR inference: + +.. code-block:: bash + + cd sherpa/triton/client + # Test one audio using offline ASR + python3 client.py --audio_file=./test_wavs/1089-134686-0001.wav --url=localhost:8001 + + # Test one audio using streaming ASR + python3 client.py --audio_file=./test_wavs/1089-134686-0001.wav --url=localhost:8001 --streaming + + +The above command sends a single audio ``1089-134686-0001.wav`` to the server and get the result. ``--url`` option specifies the IP and port of the server, +in this example, we set the server and client on the same machine, therefore IP is ``localhost``, and we use port ``8001`` since it is the default port for gRPC in Triton. + +You can also test a bunch of audios together with the client. 
Just specify the path of ``wav.scp`` with ``--wavscp`` option, +set the path of test set directory with ``--data_dir`` option, and set the path of ground-truth transcript file with ``--trans`` option, +the client will infer all the audios in test set and calculate the WER upon the test set. + +Decode manifests +------------------ +You could also decode a whole dataset to benchmark metrics e.g. RTF, WER. + +.. caution:: + Decode manifests in simulation streaming mode would be supported in the future. + +.. code-block:: bash + + cd sherpa/triton/client + + # For aishell manifests: + git lfs install + git clone https://huggingface.co/csukuangfj/aishell-test-dev-manifests + sudo mkdir -p /root/fangjun/open-source/icefall-aishell/egs/aishell/ASR/download/aishell + tar xf ./aishell-test-dev-manifests/data_aishell.tar.gz -C /root/fangjun/open-source/icefall-aishell/egs/aishell/ASR/download/aishell/ + # dev set: ./aishell-test-dev-manifests/data/fbank/aishell_cuts_test.jsonl.gz + # test set: ./aishell-test-dev-manifests/data/fbank/aishell_cuts_test.jsonl.gz + + python3 decode_manifest_triton.py \ + --server-addr localhost \ + --num-tasks 300 \ + --log-interval 20 \ + --model-name transducer \ + --manifest-filename ./aishell-test-dev-manifests/data/fbank/aishell_cuts_test.jsonl.gz \ + --compute-cer diff --git a/docs/source/triton/installation/index.rst b/docs/source/triton/installation/index.rst new file mode 100755 index 000000000..e41f3e75b --- /dev/null +++ b/docs/source/triton/installation/index.rst @@ -0,0 +1,30 @@ +Installation +============ + +We prepare a dockerfile based on official triton docker containers. The customized dockerfile intergrates `Triton-server`_, `Triton-client`_ and +sherpa-related requirements into a single image. You need to install `Docker`_ first before starting installation. + +.. hint:: + + For your production environment, you could build triton manually to reduce the size of container. + +Build Triton Image +------------------------------------- + +.. code-block:: bash + + git clone https://github.com/k2-fsa/sherpa + cd sherpa/triton + docker build . -f Dockerfile/Dockerfile.server -t sherpa_triton_server:latest + +.. note:: + It may take a lot of time since we build k2 from source. If you only need to use greedy search scorer, you could comment k2-related lines. + +Launch a inference container +----------------------------- + +.. code-block:: bash + + docker run --gpus all --name sherpa_server --net host --shm-size=1g -it sherpa_triton_server:latest + +Now, you should enter into the container successfully. \ No newline at end of file diff --git a/docs/source/triton/overview.rst b/docs/source/triton/overview.rst new file mode 100644 index 000000000..492b6d33a --- /dev/null +++ b/docs/source/triton/overview.rst @@ -0,0 +1,44 @@ + +.. _triton_overview: + +Triton +====== + +Nvidia `Triton`_ Inference Server provides a cloud and edge inferencing solution optimized for both CPUs and GPUs. + + +The following content describes how to deploy ASR models trained by `icefall`_ using Triton. + + +.. toctree:: + :maxdepth: 2 + :caption: Environment Preparetion + + ./installation/index + + +.. toctree:: + :maxdepth: 2 + :caption: Triton Server + + ./server/index + + +.. toctree:: + :maxdepth: 2 + :caption: Triton Client + + ./client/index + + +.. toctree:: + :maxdepth: 2 + :caption: Benchmark with Perf Analyzer + + ./perf/index + +.. 
toctree:: + :maxdepth: 2 + :caption: TensorRT acceleration + + ./trt/index diff --git a/docs/source/triton/perf/index.rst b/docs/source/triton/perf/index.rst new file mode 100755 index 000000000..f09c57ee5 --- /dev/null +++ b/docs/source/triton/perf/index.rst @@ -0,0 +1,55 @@ +Perf Analyzer +============= + +We can use perf_analyzer provided by Triton to test the performance of the service. + +Generate Input Data from Audio Files +------------------------------------- + +For offline ASR server: + +.. code-block:: bash + + cd sherpa/triton/client + # en + python3 generate_perf_input.py --audio_file=test_wavs/1089-134686-0001.wav + # zh + python3 generate_perf_input.py --audio_file=test_wavs/zh/mid.wav + + +It will generate a ``offline_input.json`` file ``sherpa/triton/client``. + +For streaming ASR server, you need to add a ``--streaming`` option: + +.. code-block:: bash + + python3 generate_perf_input.py --audio_file=test_wavs/1089-134686-0001.wav --streaming + +A ``online_input.json`` file would be generated. + +Test Throughput using Perf Analyzer +------------------------------------ + +.. code-block:: bash + + # Offline ASR Test with grpc + perf_analyzer -m transducer -b 1 -a -p 20000 --concurrency-range 100:200:50 -i gRPC --input-data=offline_input.json -u localhost:8001 + + # Streaming ASR Test with grpc + perf_analyzer -m transducer -b 1 -a -p 20000 --concurrency-range 100:200:50 -i gRPC --input-data=online_input.json -u localhost:8001 --streaming + + +You could save the below results with a ``-f log.txt`` option. + ++--------------+--------------------+--------------+---------------------------+---------------+-----------------------+-----------------------+------------------------+--------------+--------------+--------------+--------------+--------------+ +| Concurrency | Inferences/Second | Client Send | Network+Server Send/Recv | Server Queue | Server Compute Input | Server Compute Infer | Server Compute Output | Client Recv | p50 latency | p90 latency | p95 latency | p99 latency | ++==============+====================+==============+===========================+===============+=======================+=======================+========================+==============+==============+==============+==============+==============+ +| 300 | 226.24 | 109 | 230434 | 1 | 9314 | 1068792 | 14512 | 1 | 1254206 | 1616224 | 1958246 | 3551406 | ++--------------+--------------------+--------------+---------------------------+---------------+-----------------------+-----------------------+------------------------+--------------+--------------+--------------+--------------+--------------+ + + +.. note:: + + Please refer to + ``_ + for advanced usuage. diff --git a/docs/source/triton/server/index.rst b/docs/source/triton/server/index.rst new file mode 100755 index 000000000..47b082823 --- /dev/null +++ b/docs/source/triton/server/index.rst @@ -0,0 +1,84 @@ +Triton-server +============= +This page gives serveral examples to deploy streaming and offline ASR pretrained models with Triton server. + +Deploy streaming ASR models with Onnx +------------------------------------- + +First, we need to export pretrained models with Onnx. + +.. 
code-block:: bash + + export SHERPA_SRC=./sherpa + export ICEFALL_SRC=/workspace/icefall + # copy essentials + cp $SHERPA_SRC/triton/scripts/*onnx*.py $ICEFALL_DIR/egs/wenetspeech/ASR/pruned_stateless_transducer5/ + cd $ICEFALL_SRC/egs/wenetspeech/ASR/ + # download pretrained models + GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/luomingshuang/icefall_asr_wenetspeech_pruned_transducer_stateless5_streaming + cd ./icefall_asr_wenetspeech_pruned_transducer_stateless5_streaming + git lfs pull --include "exp/pretrained_epoch_7_avg_1.pt" + cd - + # export to onnx fp16 + ln -s ./icefall_asr_wenetspeech_pruned_transducer_stateless5_streaming/exp/pretrained_epoch_7_avg_1.pt ./icefall_asr_wenetspeech_pruned_transducer_stateless5_streaming/exp/epoch-999.pt + ./pruned_transducer_stateless5/export_onnx.py \ + --exp-dir ./icefall_asr_wenetspeech_pruned_transducer_stateless5_streaming/exp \ + --tokenizer-file ./icefall_asr_wenetspeech_pruned_transducer_stateless5_streaming/data/lang_char \ + --epoch 999 \ + --avg 1 \ + --streaming-model 1 \ + --causal-convolution 1 \ + --onnx 1 \ + --left-context 64 \ + --right-context 4 \ + --fp16 + +.. note:: + + For Chinese models, ``--tokenizer-file`` points to ``/data/lang_char``. While for English models, it points to ``/data/lang_bpe_500/bpe.model`` file. + +Then, in the docker container, you could start the service with: + +.. code-block:: bash + + cd sherpa/triton/ + bash scripts/start_streaming_server.sh + + + +Deploy offline ASR models with torchscript +------------------------------------------ +.. caution:: + Currently, we only support FP32 offline ASR inference for torchscript backend. Streaming ASR and FP16 inference are not supported. + +First, we need to export pretrained models using jit. + +.. code-block:: bash + + export SHERPA_SRC=./sherpa + export ICEFALL_SRC=/workspace/icefall + # Download pretrained models + GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless3-2022-04-29 $ICEFALL_DIR/egs/librispeech/ASR/pruned_stateless_transducer3/ + cd icefall-asr-librispeech-pruned-transducer-stateless3-2022-04-29 + git lfs pull --include "exp/pretrained-epoch-25-avg-7.pt" + # export them to three jit models: encoder_jit.pt, decoder_jit.pt, joiner_jit.pt + cp $SHERPA_SRC/triton/scripts/conformer_triton.py $ICEFALL_DIR/egs/librispeech/ASR/pruned_stateless_transducer3/ + cp $SHERPA_SRC/triton/scripts/export_jit.py $ICEFALL_DIR/egs/librispeech/ASR/pruned_stateless_transducer3/ + cd $ICEFALL_DIR/egs/librispeech/ASR/pruned_stateless_transducer3 + python3 export_jit.py \ + --pretrained-model $ICEFALL_DIR/egs/librispeech/ASR/pruned_stateless_transducer3/icefall-asr-librispeech-pruned-transducer-stateless3-2022-04-29 \ + --output-dir --bpe-model + # copy bpe file to , later we would mount to the triton docker container + cp + +.. note:: + + If you export models outside the docker container, you could mount the exported ```` with + ``-v :`` when lauching the container. + +Then, in the docker container, you could start the service with: + +.. 
code-block:: bash + + cd sherpa/triton/ + bash scripts/start_offline_server_jit.sh diff --git a/docs/source/triton/trt/index.rst b/docs/source/triton/trt/index.rst new file mode 100755 index 000000000..c2a4cc478 --- /dev/null +++ b/docs/source/triton/trt/index.rst @@ -0,0 +1,81 @@ +TensorRT acceleration +===================== + +This page shows how to use TensorRT engine to accelerate inference speed for K2 models + +Preparation +----------- + +First of all, you have to install the TensorRT. Here we suggest you to use docker container to run TRT. Just run the following command: + +.. code-block:: bash + + docker run --gpus '"device=0"' -it --rm --net host -v $PWD/:/k2 nvcr.io/nvidia/tensorrt:22.12-py3 + + +You can also see `here `_ to build TRT on your machine. + +.. note:: + + Please pay attention that, the TRT version must have to >= 8.5.3!!! + + +If your TRT version is < 8.5.3, you can download the desired TRT version and then run the following command inside the docker container to use the TRT you just download: + + +.. code-block:: bash + + # inside the container + bash tools/install.sh + + + +Model export +------------ + +You have to prepare the ONNX model by referring +`here `_ to export your models into ONNX format. +Assume you have put your ONNX model in the ``$model_dir`` directory. +Then, just run the command: + +.. code-block:: bash + + bash tools/build.sh $model_dir + cp $model_dir/encoder.trt model_repo_offline_fast_beam_trt/encoder/1 + + +The generated TRT model will be saved into ``$model_dir/encoder.trt``. +We also give an example of ``model_repo`` of TRT model. You can follow the same procedure as described +`here `_ to deploy the pipeline using triton. + + +Benchmark for Conformer TRT encoder vs ONNX +------------------------------------------- + ++-------+------------+-----------------+--------+ +| Model | Batch size | Avg latency(ms) | QPS | ++=======+============+=================+========+ +| ONNX | 1 | 7.44 | 134.48 | ++-------+------------+-----------------+--------+ +| | 8 | 14.92 | 536.09 | ++-------+------------+-----------------+--------+ +| | 16 | 22.84 | 700.67 | ++-------+------------+-----------------+--------+ +| | 32 | 41.62 | 768.84 | ++-------+------------+-----------------+--------+ +| | 64 | 80.48 | 795.27 | ++-------+------------+-----------------+--------+ +| | 128 | 171.97 | 744.32 | ++-------+------------+-----------------+--------+ +| TRT | 1 | 5.21834 | 193.93 | ++-------+------------+-----------------+--------+ +| | 8 | 11.7826 | 703.49 | ++-------+------------+-----------------+--------+ +| | 16 | 20.4444 | 815.79 | ++-------+------------+-----------------+--------+ +| | 32 | 37.583 | 893.56 | ++-------+------------+-----------------+--------+ +| | 64 | 69.8312 | 965.40 | ++-------+------------+-----------------+--------+ +| | 128 | 139.702 | 964.57 | ++-------+------------+-----------------+--------+ diff --git a/get_version.py b/get_version.py new file mode 100755 index 000000000..15bd8ae1f --- /dev/null +++ b/get_version.py @@ -0,0 +1,107 @@ +#!/usr/bin/env python3 + +import datetime +import os +import platform +import re +import shutil + +import torch + + +def is_macos(): + return platform.system() == "Darwin" + + +def is_windows(): + return platform.system() == "Windows" + + +def with_cuda(): + if shutil.which("nvcc") is None: + return False + + if is_macos(): + return False + + return True + + +def get_pytorch_version(): + # if it is 1.7.1+cuda101, then strip +cuda101 + return torch.__version__.split("+")[0] + + +def get_cuda_version(): 
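+    # torch.version.cuda is the CUDA version PyTorch was compiled with; the
+    # assert below checks that it agrees with the CUDA version detected at runtime.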
+ from torch.utils import collect_env + + running_cuda_version = collect_env.get_running_cuda_version(collect_env.run) + cuda_version = torch.version.cuda + if running_cuda_version is not None and cuda_version is not None: + assert cuda_version in running_cuda_version, ( + f"PyTorch is built with CUDA version: {cuda_version}.\n" + f"The current running CUDA version is: {running_cuda_version}" + ) + return cuda_version + + +def is_for_pypi(): + ans = os.environ.get("KALDIFEAT_IS_FOR_PYPI", None) + return ans is not None + + +def is_stable(): + ans = os.environ.get("KALDIFEAT_IS_STABLE", None) + return ans is not None + + +def is_for_conda(): + ans = os.environ.get("KALDIFEAT_IS_FOR_CONDA", None) + return ans is not None + + +def get_package_version(): + # Set a default CUDA version here so that `pip install kaldifeat` + # uses the default CUDA version. + # + default_cuda_version = "10.1" # CUDA 10.1 + + if with_cuda(): + cuda_version = get_cuda_version() + if is_for_pypi() and default_cuda_version == cuda_version: + cuda_version = "" + pytorch_version = "" + local_version = "" + else: + cuda_version = f"+cuda{cuda_version}" + pytorch_version = get_pytorch_version() + local_version = f"{cuda_version}.torch{pytorch_version}" + else: + pytorch_version = get_pytorch_version() + local_version = f"+cpu.torch{pytorch_version}" + + if is_for_conda(): + local_version = "" + + if is_for_pypi() and is_macos(): + local_version = "" + + with open("CMakeLists.txt") as f: + content = f.read() + + latest_version = re.search(r"set\(SHERPA_VERSION (.*)\)", content).group(1) + latest_version = latest_version.strip('"') + + if not is_stable(): + dt = datetime.datetime.utcnow() + package_version = ( + f"{latest_version}.dev{dt.year}{dt.month:02d}{dt.day:02d}" + f"{local_version}" + ) + else: + package_version = f"{latest_version}" + return package_version + + +if __name__ == "__main__": + print(get_package_version()) diff --git a/pyproject.toml b/pyproject.toml deleted file mode 100644 index c953454c3..000000000 --- a/pyproject.toml +++ /dev/null @@ -1,14 +0,0 @@ -[tool.isort] -profile = "black" - -[tool.black] -line-length = 80 -exclude = ''' -/( - \.git - | \.github - | cmake - | triton - | build -)/ -''' diff --git a/python-api-examples/compute-speaker-simiarlity.py b/python-api-examples/compute-speaker-simiarlity.py new file mode 100755 index 000000000..e73fc0f61 --- /dev/null +++ b/python-api-examples/compute-speaker-simiarlity.py @@ -0,0 +1,109 @@ +#!/usr/bin/env python3 +# Copyright (c) 2025 Xiaomi Corporation + +""" +Please download model files from +https://github.com/k2-fsa/sherpa/releases/ + +E.g. 
+ +wget https://github.com/k2-fsa/sherpa/releases/download/speaker-recognition-models/3d_speaker-speech_eres2netv2_sv_zh-cn_16k-common.pt + +Please download test files from +https://github.com/csukuangfj/sr-data/tree/main/test/3d-speaker + +""" + +import time +from typing import Tuple +import torch + +import librosa +import numpy as np +import soundfile as sf + +import sherpa + + +def load_audio(filename: str) -> Tuple[np.ndarray, int]: + data, sample_rate = sf.read( + filename, + always_2d=True, + dtype="float32", + ) + data = data[:, 0] # use only the first channel + samples = np.ascontiguousarray(data) + return samples, sample_rate + + +def create_extractor(): + config = sherpa.SpeakerEmbeddingExtractorConfig( + model="./3d_speaker-speech_eres2netv2_sv_zh-cn_16k-common.pt", + ) + print(config) + return sherpa.SpeakerEmbeddingExtractor(config) + + +def main(): + extractor = create_extractor() + + file1 = "./speaker1_a_cn_16k.wav" + file2 = "./speaker1_b_cn_16k.wav" + file3 = "./speaker2_a_cn_16k.wav" + + samples1, sample_rate1 = load_audio(file1) + if sample_rate1 != 16000: + samples1 = librosa.resample(samples1, orig_sr=sample_rate1, target_sr=16000) + sample_rate1 = 16000 + + samples2, sample_rate2 = load_audio(file2) + if sample_rate2 != 16000: + samples2 = librosa.resample(samples2, orig_sr=sample_rate2, target_sr=16000) + sample_rate2 = 16000 + + samples3, sample_rate3 = load_audio(file3) + if sample_rate3 != 16000: + samples3 = librosa.resample(samples3, orig_sr=sample_rate3, target_sr=16000) + sample_rate3 = 16000 + + start = time.time() + stream1 = extractor.create_stream() + stream2 = extractor.create_stream() + stream3 = extractor.create_stream() + + stream1.accept_waveform(samples1) + stream2.accept_waveform(samples2) + stream3.accept_waveform(samples3) + + embeddings = extractor.compute([stream1, stream2, stream3]) + # embeddings: (batch_size, dim) + + x12 = torch.nn.functional.cosine_similarity(embeddings[0], embeddings[1], dim=0) + x13 = torch.nn.functional.cosine_similarity(embeddings[0], embeddings[2], dim=0) + x23 = torch.nn.functional.cosine_similarity(embeddings[1], embeddings[2], dim=0) + + end = time.time() + + elapsed_seconds = end - start + + print(x12, x13, x23) + + audio_duration = ( + len(samples1) / sample_rate1 + + len(samples2) / sample_rate2 + + len(samples3) / sample_rate3 + ) + real_time_factor = elapsed_seconds / audio_duration + print(f"Elapsed seconds: {elapsed_seconds:.3f}") + print(f"Audio duration in seconds: {audio_duration:.3f}") + print(f"RTF: {elapsed_seconds:.3f}/{audio_duration:.3f} = {real_time_factor:.3f}") + + +if __name__ == "__main__": + torch._C._jit_set_profiling_executor(False) + torch._C._jit_set_profiling_mode(False) + torch._C._set_graph_executor_optimize(False) + + torch.set_num_threads(1) + torch.set_num_interop_threads(1) + main() diff --git a/python-api-examples/sense-voice.py b/python-api-examples/sense-voice.py new file mode 100755 index 000000000..2d305627e --- /dev/null +++ b/python-api-examples/sense-voice.py @@ -0,0 +1,129 @@ +#!/usr/bin/env python3 +# +# Copyright (c) 2025 Xiaomi Corporation + +""" +Please download sense voice model from +https://github.com/k2-fsa/sherpa/releases/tag/asr-models + +E.g., +wget https://github.com/k2-fsa/sherpa/releases/download/asr-models/sherpa-sense-voice-zh-en-ja-ko-yue-2025-01-06.tar.bz2 +""" +import time +from typing import Tuple + +import librosa +import numpy as np +import sherpa +import soundfile as sf + + +def load_audio(filename: str) -> Tuple[np.ndarray, int]: + data, sample_rate 
= sf.read( + filename, + always_2d=True, + dtype="float32", + ) + data = data[:, 0] # use only the first channel + samples = np.ascontiguousarray(data) + return samples, sample_rate + + +def create_recognizer(): + config = sherpa.OfflineRecognizerConfig( + model=sherpa.OfflineModelConfig( + sense_voice=sherpa.OfflineSenseVoiceModelConfig( + model="./sherpa-sense-voice-zh-en-ja-ko-yue-2025-01-06/model.pt", + use_itn=True, + language="auto", + ), + debug=False, + ), + tokens="./sherpa-sense-voice-zh-en-ja-ko-yue-2025-01-06/tokens.txt", + use_gpu=False, + ) + + # You have to call config.Validate() to make it work! + config.validate() + return sherpa.OfflineRecognizer(config) + + +def test_decoding_single_file(recognizer): + print("----------Test a single file----------") + test_wave_file = "./sherpa-sense-voice-zh-en-ja-ko-yue-2025-01-06/test_wavs/zh.wav" + + samples, sample_rate = load_audio(test_wave_file) + if sample_rate != 16000: + samples = librosa.resample(samples, orig_sr=sample_rate, target_sr=16000) + sample_rate = 16000 + + start = time.time() + + stream = recognizer.create_stream() + stream.accept_waveform(samples) + recognizer.decode_stream(stream) + text = stream.result.text + + end = time.time() + + elapsed_seconds = end - start + audio_duration = len(samples) / sample_rate + real_time_factor = elapsed_seconds / audio_duration + + print(text) + print(f"Elapsed seconds: {elapsed_seconds:.3f}") + print(f"Audio duration in seconds: {audio_duration:.3f}") + print(f"RTF: {elapsed_seconds:.3f}/{audio_duration:.3f} = {real_time_factor:.3f}") + + +def test_decoding_multipl_files(recognizer): + print("----------Test decoding multiple files----------") + test_wave_file1 = "./sherpa-sense-voice-zh-en-ja-ko-yue-2025-01-06/test_wavs/zh.wav" + test_wave_file2 = "./sherpa-sense-voice-zh-en-ja-ko-yue-2025-01-06/test_wavs/en.wav" + + samples1, sample_rate1 = load_audio(test_wave_file1) + if sample_rate1 != 16000: + samples1 = librosa.resample(samples1, orig_sr=sample_rate1, target_sr=16000) + sample_rate1 = 16000 + + samples2, sample_rate2 = load_audio(test_wave_file2) + if sample_rate2 != 16000: + samples2 = librosa.resample(samples2, orig_sr=sample_rate2, target_sr=16000) + sample_rate2 = 16000 + + start = time.time() + stream1 = recognizer.create_stream() + stream1.accept_waveform(samples1) + + stream2 = recognizer.create_stream() + stream2.accept_waveform(samples2) + + recognizer.decode_streams([stream1, stream2]) + text1 = stream1.result.text + text2 = stream2.result.text + + end = time.time() + + elapsed_seconds = end - start + audio_duration = len(samples1) / sample_rate1 + len(samples2) / sample_rate2 + real_time_factor = elapsed_seconds / audio_duration + + print(f"{test_wave_file1}\n {text1}") + print() + print(f"{test_wave_file2}\n {text2}") + + print() + + print(f"Elapsed seconds: {elapsed_seconds:.3f}") + print(f"Audio duration in seconds: {audio_duration:.3f}") + print(f"RTF: {elapsed_seconds:.3f}/{audio_duration:.3f} = {real_time_factor:.3f}") + + +def main(): + recognizer = create_recognizer() + test_decoding_single_file(recognizer) + test_decoding_multipl_files(recognizer) + + +if __name__ == "__main__": + main() diff --git a/python-api-examples/vad-with-sense-voice.py b/python-api-examples/vad-with-sense-voice.py new file mode 100755 index 000000000..336a2b916 --- /dev/null +++ b/python-api-examples/vad-with-sense-voice.py @@ -0,0 +1,97 @@ +#!/usr/bin/env python3 +# Copyright (c) 2025 Xiaomi Corporation + +""" +Please download sense voice model from 
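[Editor's note] The single-file and two-file tests above generalize naturally; below is a small, untested sketch of decoding an arbitrary list of wave files in batches with the same create_stream()/decode_streams() API, reusing the load_audio() helper defined in sense-voice.py. The batch size of 2 is an arbitrary choice.

import librosa

def transcribe_files(recognizer, filenames, batch_size=2):
    """Decode a list of wave files in small batches and return their texts."""
    results = []
    for i in range(0, len(filenames), batch_size):
        streams = []
        for name in filenames[i : i + batch_size]:
            samples, sample_rate = load_audio(name)  # helper from sense-voice.py above
            if sample_rate != 16000:
                samples = librosa.resample(samples, orig_sr=sample_rate, target_sr=16000)
            stream = recognizer.create_stream()
            stream.accept_waveform(samples)
            streams.append(stream)
        recognizer.decode_streams(streams)
        results.extend(s.result.text for s in streams)
    return results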
+https://github.com/k2-fsa/sherpa/releases/tag/asr-models + +E.g., +wget https://github.com/k2-fsa/sherpa/releases/download/asr-models/sherpa-sense-voice-zh-en-ja-ko-yue-2025-01-06.tar.bz2 + + +Please download VAD models from +https://github.com/k2-fsa/sherpa/releases/tag/vad-models + +E.g., +wget https://github.com/k2-fsa/sherpa/releases/download/vad-models/silero-vad-v4.pt +""" +from typing import Tuple + +import librosa +import numpy as np +import sherpa +import soundfile as sf +import torch + + +def load_audio(filename: str) -> Tuple[np.ndarray, int]: + data, sample_rate = sf.read( + filename, + always_2d=True, + dtype="float32", + ) + data = data[:, 0] # use only the first channel + samples = np.ascontiguousarray(data) + return samples, sample_rate + + +def create_recognizer(): + config = sherpa.OfflineRecognizerConfig( + model=sherpa.OfflineModelConfig( + sense_voice=sherpa.OfflineSenseVoiceModelConfig( + model="./sherpa-sense-voice-zh-en-ja-ko-yue-2025-01-06/model.pt", + use_itn=True, + language="auto", + ), + debug=False, + ), + tokens="./sherpa-sense-voice-zh-en-ja-ko-yue-2025-01-06/tokens.txt", + use_gpu=False, + ) + + # You have to call config.Validate() to make it work! + config.validate() + return sherpa.OfflineRecognizer(config) + + +def create_vad(): + config = sherpa.VoiceActivityDetectorConfig( + segment_size=20, + model=sherpa.VadModelConfig( + silero_vad=sherpa.SileroVadModelConfig( + model="./silero-vad-v4.pt", + threshold=0.5, + min_speech_duration=0.25, + min_silence_duration=0.5, + ), + sample_rate=16000, + ), + ) + return sherpa.VoiceActivityDetector(config) + + +def main(): + vad = create_vad() + recognizer = create_recognizer() + + test_wave_file = "./lei-jun-test.wav" + + samples, sample_rate = load_audio(test_wave_file) + if sample_rate != 16000: + samples = librosa.resample(samples, orig_sr=sample_rate, target_sr=16000) + sample_rate = 16000 + + segments = vad.process(torch.from_numpy(samples)) + for s in segments: + start_sample = int(s.start * sample_rate) + end_sample = int(s.end * sample_rate) + stream = recognizer.create_stream() + stream.accept_waveform(samples[start_sample:end_sample]) + recognizer.decode_stream(stream) + text = stream.result.text + + print(f"{s.start:.3f} -- {s.end:.3f} {text}") + + +if __name__ == "__main__": + main() diff --git a/python-api-examples/whisper.py b/python-api-examples/whisper.py new file mode 100755 index 000000000..607a69673 --- /dev/null +++ b/python-api-examples/whisper.py @@ -0,0 +1,127 @@ +#!/usr/bin/env python3 +# +# Copyright (c) 2025 Xiaomi Corporation + +""" +Please download a whisper model from +https://github.com/k2-fsa/sherpa/releases/tag/asr-models + +E.g., +wget https://github.com/k2-fsa/sherpa/releases/download/asr-models/sherpa-whisper-tiny.en.tar.bz2 +""" +import time +from typing import Tuple + +import librosa +import numpy as np +import sherpa +import soundfile as sf + + +def load_audio(filename: str) -> Tuple[np.ndarray, int]: + data, sample_rate = sf.read( + filename, + always_2d=True, + dtype="float32", + ) + data = data[:, 0] # use only the first channel + samples = np.ascontiguousarray(data) + return samples, sample_rate + + +def create_recognizer(): + config = sherpa.OfflineRecognizerConfig( + model=sherpa.OfflineModelConfig( + whisper=sherpa.OfflineWhisperModelConfig( + model="./sherpa-whisper-tiny.en/model.pt", + ), + debug=False, + ), + tokens="./sherpa-whisper-tiny.en/tokens.txt", + use_gpu=False, + ) + + # You have to call config.Validate() to make it work! 
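[Editor's note] One practical refinement of the VAD-plus-recognition loop above is to merge speech segments separated by very short pauses before decoding them. A rough sketch, assuming the segments are time-ordered and expose the same start/end fields used in vad-with-sense-voice.py; the 0.3 s gap is an arbitrary choice.

def merge_segments(segments, max_gap: float = 0.3):
    """Merge adjacent segments whose silence gap is below max_gap seconds."""
    merged = []
    for s in segments:
        if merged and s.start - merged[-1][1] < max_gap:
            merged[-1][1] = s.end  # extend the previous segment
        else:
            merged.append([s.start, s.end])
    return [(start, end) for start, end in merged]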
+ config.validate() + return sherpa.OfflineRecognizer(config) + + +def test_decoding_single_file(recognizer): + print("----------Test a single file----------") + test_wave_file = "./sherpa-whisper-tiny.en/test_wavs/0.wav" + + samples, sample_rate = load_audio(test_wave_file) + if sample_rate != 16000: + samples = librosa.resample(samples, orig_sr=sample_rate, target_sr=16000) + sample_rate = 16000 + + start = time.time() + + stream = recognizer.create_stream() + stream.accept_waveform(samples) + recognizer.decode_stream(stream) + text = stream.result.text + + end = time.time() + + elapsed_seconds = end - start + audio_duration = len(samples) / sample_rate + real_time_factor = elapsed_seconds / audio_duration + + print(text) + print(f"Elapsed seconds: {elapsed_seconds:.3f}") + print(f"Audio duration in seconds: {audio_duration:.3f}") + print(f"RTF: {elapsed_seconds:.3f}/{audio_duration:.3f} = {real_time_factor:.3f}") + + +def test_decoding_multipl_files(recognizer): + print("----------Test decoding multiple files----------") + test_wave_file1 = "./sherpa-whisper-tiny.en/test_wavs/0.wav" + test_wave_file2 = "./sherpa-whisper-tiny.en/test_wavs/1.wav" + + samples1, sample_rate1 = load_audio(test_wave_file1) + if sample_rate1 != 16000: + samples1 = librosa.resample(samples1, orig_sr=sample_rate1, target_sr=16000) + sample_rate1 = 16000 + + samples2, sample_rate2 = load_audio(test_wave_file2) + if sample_rate2 != 16000: + samples2 = librosa.resample(samples2, orig_sr=sample_rate2, target_sr=16000) + sample_rate2 = 16000 + + start = time.time() + stream1 = recognizer.create_stream() + stream1.accept_waveform(samples1) + + stream2 = recognizer.create_stream() + stream2.accept_waveform(samples2) + + recognizer.decode_streams([stream1, stream2]) + text1 = stream1.result.text + text2 = stream2.result.text + + end = time.time() + + elapsed_seconds = end - start + audio_duration = len(samples1) / sample_rate1 + len(samples2) / sample_rate2 + real_time_factor = elapsed_seconds / audio_duration + + print(f"{test_wave_file1}\n {text1}") + print() + print(f"{test_wave_file2}\n {text2}") + + print() + + print(f"Elapsed seconds: {elapsed_seconds:.3f}") + print(f"Audio duration in seconds: {audio_duration:.3f}") + print(f"RTF: {elapsed_seconds:.3f}/{audio_duration:.3f} = {real_time_factor:.3f}") + + +def main(): + recognizer = create_recognizer() + test_decoding_single_file(recognizer) + test_decoding_multipl_files(recognizer) + + +if __name__ == "__main__": + main() diff --git a/requirements.txt b/requirements.txt index 534518f08..bffdeae4c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,4 +2,3 @@ websockets kaldifeat sentencepiece>=0.1.96 numpy -kaldi_native_io diff --git a/scripts/3d-speaker/README.md b/scripts/3d-speaker/README.md new file mode 100644 index 000000000..e084fbf1b --- /dev/null +++ b/scripts/3d-speaker/README.md @@ -0,0 +1,104 @@ +# Introduction + +This folder contains scripts for exporting models from +https://github.com/modelscope/3D-Speaker + + +Some of the exported models are listed below: + +``` +-rw-r--r-- 1 runner staff 29M Jan 10 02:50 3d_speaker-speech_campplus_sv_en_voxceleb_16k.pt +-rw-r--r-- 1 runner staff 28M Jan 10 02:50 3d_speaker-speech_campplus_sv_zh-cn_16k-common.pt +-rw-r--r-- 1 runner staff 28M Jan 10 02:50 3d_speaker-speech_campplus_sv_zh_en_16k-common_advanced.pt +-rw-r--r-- 1 runner staff 80M Jan 10 02:52 3d_speaker-speech_ecapa-tdnn_sv_en_voxceleb_16k.pt +-rw-r--r-- 1 runner staff 80M Jan 10 02:52 3d_speaker-speech_ecapa-tdnn_sv_zh-cn_3dspeaker_16k.pt 
+-rw-r--r-- 1 runner staff 80M Jan 10 02:51 3d_speaker-speech_ecapa-tdnn_sv_zh-cn_cnceleb_16k.pt +-rw-r--r-- 1 runner staff 38M Jan 10 02:50 3d_speaker-speech_eres2net_base_200k_sv_zh-cn_16k-common.pt +-rw-r--r-- 1 runner staff 38M Jan 10 02:51 3d_speaker-speech_eres2net_base_sv_zh-cn_3dspeaker_16k.pt +-rw-r--r-- 1 runner staff 112M Jan 10 02:51 3d_speaker-speech_eres2net_large_sv_zh-cn_3dspeaker_16k.pt +-rw-r--r-- 1 runner staff 26M Jan 10 02:51 3d_speaker-speech_eres2net_sv_en_voxceleb_16k.pt +-rw-r--r-- 1 runner staff 212M Jan 10 02:50 3d_speaker-speech_eres2net_sv_zh-cn_16k-common.pt +-rw-r--r-- 1 runner staff 69M Jan 10 02:50 3d_speaker-speech_eres2netv2_sv_zh-cn_16k-common.pt +-rw-r--r-- 1 runner staff 206M Jan 10 02:50 3d_speaker-speech_eres2netv2w24s4ep4_sv_zh-cn_16k-common.pt +``` + +``` +./test.py --model 3d_speaker-speech_campplus_sv_en_voxceleb_16k.pt +----------testing 3d_speaker-speech_campplus_sv_en_voxceleb_16k.pt---------- +embedding shape torch.Size([512]) +tensor(0.6211) tensor(0.0356) tensor(0.0948) +----------testing 3d_speaker-speech_campplus_sv_en_voxceleb_16k.pt done---------- + + +./test.py --model 3d_speaker-speech_campplus_sv_zh-cn_16k-common.pt +----------testing 3d_speaker-speech_campplus_sv_zh-cn_16k-common.pt---------- +embedding shape torch.Size([192]) +tensor(0.6936) tensor(-0.0842) tensor(0.0072) +----------testing 3d_speaker-speech_campplus_sv_zh-cn_16k-common.pt done---------- + +./test.py --model 3d_speaker-speech_campplus_sv_zh_en_16k-common_advanced.pt +----------testing 3d_speaker-speech_campplus_sv_zh_en_16k-common_advanced.pt---------- +embedding shape torch.Size([192]) +tensor(0.6668) tensor(0.0670) tensor(0.0569) +----------testing 3d_speaker-speech_campplus_sv_zh_en_16k-common_advanced.pt done---------- + +./test.py --model 3d_speaker-speech_ecapa-tdnn_sv_en_voxceleb_16k.pt +----------testing 3d_speaker-speech_ecapa-tdnn_sv_en_voxceleb_16k.pt---------- +embedding shape torch.Size([192]) +tensor(0.6733) tensor(-0.0007) tensor(0.0611) +----------testing 3d_speaker-speech_ecapa-tdnn_sv_en_voxceleb_16k.pt done---------- + +./test.py --model 3d_speaker-speech_ecapa-tdnn_sv_zh-cn_3dspeaker_16k.pt +----------testing 3d_speaker-speech_ecapa-tdnn_sv_zh-cn_3dspeaker_16k.pt---------- +embedding shape torch.Size([192]) +tensor(0.5880) tensor(0.1363) tensor(0.0885) +----------testing 3d_speaker-speech_ecapa-tdnn_sv_zh-cn_3dspeaker_16k.pt done---------- + +./test.py --model 3d_speaker-speech_ecapa-tdnn_sv_zh-cn_cnceleb_16k.pt +----------testing 3d_speaker-speech_ecapa-tdnn_sv_zh-cn_cnceleb_16k.pt---------- +embedding shape torch.Size([192]) +tensor(0.7074) tensor(0.0289) tensor(0.1022) +----------testing 3d_speaker-speech_ecapa-tdnn_sv_zh-cn_cnceleb_16k.pt done---------- + +./test.py --model 3d_speaker-speech_eres2net_base_200k_sv_zh-cn_16k-common.pt +----------testing 3d_speaker-speech_eres2net_base_200k_sv_zh-cn_16k-common.pt---------- +embedding shape torch.Size([512]) +tensor(0.6675) tensor(0.0066) tensor(0.0576) +----------testing 3d_speaker-speech_eres2net_base_200k_sv_zh-cn_16k-common.pt done---------- + +./test.py --model 3d_speaker-speech_eres2net_base_sv_zh-cn_3dspeaker_16k.pt +----------testing 3d_speaker-speech_eres2net_base_sv_zh-cn_3dspeaker_16k.pt---------- +embedding shape torch.Size([512]) +tensor(0.6411) tensor(0.1044) tensor(0.0209) +----------testing 3d_speaker-speech_eres2net_base_sv_zh-cn_3dspeaker_16k.pt done---------- + +./test.py --model 3d_speaker-speech_eres2net_large_sv_zh-cn_3dspeaker_16k.pt +----------testing 
3d_speaker-speech_eres2net_large_sv_zh-cn_3dspeaker_16k.pt---------- +embedding shape torch.Size([512]) +tensor(0.6336) tensor(0.0829) tensor(0.0681) +----------testing 3d_speaker-speech_eres2net_large_sv_zh-cn_3dspeaker_16k.pt done---------- + +./test.py --model 3d_speaker-speech_eres2net_sv_en_voxceleb_16k.pt +----------testing 3d_speaker-speech_eres2net_sv_en_voxceleb_16k.pt---------- +embedding shape torch.Size([192]) +tensor(0.6554) tensor(-0.0092) tensor(0.0551) +----------testing 3d_speaker-speech_eres2net_sv_en_voxceleb_16k.pt done---------- + +./test.py --model 3d_speaker-speech_eres2net_sv_zh-cn_16k-common.pt +----------testing 3d_speaker-speech_eres2net_sv_zh-cn_16k-common.pt---------- +embedding shape torch.Size([192]) +tensor(0.7127) tensor(0.0287) tensor(0.1308) +----------testing 3d_speaker-speech_eres2net_sv_zh-cn_16k-common.pt done---------- + +./test.py --model 3d_speaker-speech_eres2netv2_sv_zh-cn_16k-common.pt +----------testing 3d_speaker-speech_eres2netv2_sv_zh-cn_16k-common.pt---------- +embedding shape torch.Size([192]) +tensor(0.7194) tensor(0.0904) tensor(0.1441) +----------testing 3d_speaker-speech_eres2netv2_sv_zh-cn_16k-common.pt done---------- + +./test.py --model 3d_speaker-speech_eres2netv2w24s4ep4_sv_zh-cn_16k-common.pt +----------testing 3d_speaker-speech_eres2netv2w24s4ep4_sv_zh-cn_16k-common.pt---------- +embedding shape torch.Size([192]) +tensor(0.7625) tensor(-0.0190) tensor(0.1121) +----------testing 3d_speaker-speech_eres2netv2w24s4ep4_sv_zh-cn_16k-common.pt done---------- +``` diff --git a/scripts/3d-speaker/export.py b/scripts/3d-speaker/export.py new file mode 100755 index 000000000..408ec5f72 --- /dev/null +++ b/scripts/3d-speaker/export.py @@ -0,0 +1,68 @@ +#!/usr/bin/env python3 +# Copyright 2025 Xiaomi Corp. 
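[Editor's note] In each block above, the first score compares two utterances of the same speaker and the other two compare different speakers, which is why the first score is consistently much larger. A trivial sketch of turning such a cosine score into a decision; the 0.4 threshold is purely illustrative and should be tuned per model on held-out trials.

def is_same_speaker(score: float, threshold: float = 0.4) -> bool:
    # purely illustrative threshold, not taken from any model card
    return score > threshold

# e.g. with the speech_campplus_sv_en_voxceleb_16k scores above:
# is_same_speaker(0.6211) -> True; is_same_speaker(0.0356) -> False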
(authors: Fangjun Kuang) + +import argparse +import os +import pathlib +import re + +import torch +from modelscope.hub.snapshot_download import snapshot_download +from speakerlab.bin.infer_sv import supports +from speakerlab.utils.builder import dynamic_import + + +def convert(model_id): + local_model_dir = "pretrained" + save_dir = os.path.join(local_model_dir, model_id.split("/")[1]) + save_dir = pathlib.Path(save_dir) + save_dir.mkdir(exist_ok=True, parents=True) + + conf = supports[model_id] + # download models from modelscope according to model_id + cache_dir = snapshot_download( + model_id, + revision=conf["revision"], + ) + cache_dir = pathlib.Path(cache_dir) + + download_files = ["examples", conf["model_pt"]] + for src in cache_dir.glob("*"): + if re.search("|".join(download_files), src.name): + dst = save_dir / src.name + try: + dst.unlink() + except FileNotFoundError: + pass + dst.symlink_to(src) + + pretrained_model = save_dir / conf["model_pt"] + pretrained_state = torch.load(pretrained_model, map_location="cpu") + + model = conf["model"] + embedding_model = dynamic_import(model["obj"])(**model["args"]) + embedding_model.load_state_dict(pretrained_state) + embedding_model.to("cpu") + embedding_model.eval() + + x = torch.randn(1, 345, 80) + + m = torch.jit.trace(embedding_model, x) + + meta_data = { + "model_type": "3d-speaker", + "version": "1", + "model_id": model_id, + } + m.save(f"3d_speaker-{model_id.split('/')[-1]}.pt", _extra_files=meta_data) + print(meta_data) + + +def main(): + for model_id in supports: + print(f"----------{model_id}----------") + convert(model_id) + + +if __name__ == "__main__": + main() diff --git a/scripts/3d-speaker/run.sh b/scripts/3d-speaker/run.sh new file mode 100755 index 000000000..b02dadabe --- /dev/null +++ b/scripts/3d-speaker/run.sh @@ -0,0 +1,24 @@ +#!/usr/bin/env bash +# Copyright 2025 Xiaomi Corp. (authors: Fangjun Kuang) + +set -ex + +if [ ! -f ./speaker1_a_cn_16k.wav ]; then + wget https://www.modelscope.cn/models/iic/speech_eres2netv2_sv_zh-cn_16k-common/resolve/master/examples/speaker1_a_cn_16k.wav +fi + +if [ ! -f ./speaker1_b_cn_16k.wav ]; then + wget https://www.modelscope.cn/models/iic/speech_eres2netv2_sv_zh-cn_16k-common/resolve/master/examples/speaker1_b_cn_16k.wav +fi + +if [ ! -f ./speaker2_a_cn_16k.wav ]; then + wget https://www.modelscope.cn/models/iic/speech_eres2netv2_sv_zh-cn_16k-common/resolve/master/examples/speaker2_a_cn_16k.wav +fi + +./export.py + +ls -lh + +for m in *.pt; do + ./test.py --model $m +done diff --git a/scripts/3d-speaker/test.py b/scripts/3d-speaker/test.py new file mode 100755 index 000000000..0a4bf9361 --- /dev/null +++ b/scripts/3d-speaker/test.py @@ -0,0 +1,100 @@ +#!/usr/bin/env python3 +# Copyright 2025 Xiaomi Corp. (authors: Fangjun Kuang) + + +import argparse + +import soundfile as sf +import librosa +import torch +import kaldi_native_fbank as knf +import numpy as np +from typing import Tuple + + +def get_args(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--model", + type=str, + required=True, + ) + return parser.parse_args() + + +def load_audio(filename: str) -> Tuple[np.ndarray, int]: + data, sample_rate = sf.read( + filename, + always_2d=True, + dtype="float32", + ) + data = data[:, 0] # use only the first channel + samples = np.ascontiguousarray(data) + return samples, sample_rate + + +def compute_features(filename: str) -> torch.Tensor: + """ + Args: + filename: + Path to an audio file. 
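[Editor's note] The exported checkpoints carry their metadata via TorchScript _extra_files, so it can be read back at load time; a minimal sketch of that pattern (the same one the pyannote vad.py script later in this patch uses). The model filename is one of the exported files listed in the README above.

import torch

# keys must be pre-filled; torch.jit.load() replaces the values with the stored bytes
extra_files = {"model_type": "", "version": "", "model_id": ""}
model = torch.jit.load(
    "3d_speaker-speech_eres2netv2_sv_zh-cn_16k-common.pt", _extra_files=extra_files
)
print({k: v.decode() for k, v in extra_files.items()})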
+ Returns: + Return a 2-D float32 tensor of shape (T, 80) containing the features. + """ + wave, sample_rate = load_audio(filename) + if sample_rate != 16000: + wave = librosa.resample(wave, orig_sr=sample_rate, target_sr=16000) + sample_rate = 16000 + + features = [] + opts = knf.FbankOptions() + + opts.frame_opts.dither = 0 + opts.frame_opts.samp_freq = 16000 + opts.mel_opts.num_bins = 80 + opts.frame_opts.snip_edges = True + + fbank = knf.OnlineFbank(opts) + fbank.accept_waveform(16000, wave) + fbank.input_finished() + for i in range(fbank.num_frames_ready): + f = fbank.get_frame(i) + f = torch.from_numpy(f) + features.append(f) + + features = torch.stack(features) + # mel (T, 80) + + features = features - features.mean(dim=0, keepdim=True) + + return features + + +@torch.inference_mode() +def main(): + args = get_args() + + print(f"----------testing {args.model}----------") + m = torch.jit.load(args.model) + m.eval() + + x1 = compute_features(filename="./speaker1_a_cn_16k.wav") + x2 = compute_features(filename="./speaker1_b_cn_16k.wav") + x3 = compute_features(filename="./speaker2_a_cn_16k.wav") + + y1 = m(x1.unsqueeze(0)).squeeze(0) + y2 = m(x2.unsqueeze(0)).squeeze(0) + y3 = m(x3.unsqueeze(0)).squeeze(0) + + print("embedding shape", y1.shape) + + x12 = torch.nn.functional.cosine_similarity(y1, y2, dim=0) + x13 = torch.nn.functional.cosine_similarity(y1, y3, dim=0) + x23 = torch.nn.functional.cosine_similarity(y2, y3, dim=0) + + print(x12, x13, x23) + print(f"----------testing {args.model} done----------") + + +if __name__ == "__main__": + main() diff --git a/scripts/conda-cpu/sherpa/meta.yaml b/scripts/conda-cpu/sherpa/meta.yaml index 9115bf0f1..b8dbe7b4a 100644 --- a/scripts/conda-cpu/sherpa/meta.yaml +++ b/scripts/conda-cpu/sherpa/meta.yaml @@ -1,6 +1,6 @@ package: name: sherpa - version: "0.8" + version: "1.3" source: path: "{{ environ.get('SHERPA_ROOT_DIR') }}" @@ -42,7 +42,6 @@ requirements: - python - pytorch={{ environ.get('SHERPA_TORCH_VERSION') }} - kaldifeat - - kaldi_native_io - k2 - cpuonly - gcc_linux-64=7 # [linux] @@ -51,7 +50,6 @@ requirements: - python - pytorch={{ environ.get('SHERPA_TORCH_VERSION') }} - kaldifeat - - kaldi_native_io - k2 about: diff --git a/scripts/github_actions/build-ubuntu-cpu.sh b/scripts/github_actions/build-ubuntu-cpu.sh new file mode 100755 index 000000000..152c9c8e1 --- /dev/null +++ b/scripts/github_actions/build-ubuntu-cpu.sh @@ -0,0 +1,148 @@ +#!/usr/bin/env bash +# +set -ex + +if [ -z $PYTHON_VERSION ]; then + echo "Please set the environment variable PYTHON_VERSION" + echo "Example: export PYTHON_VERSION=3.8" + # Valid values: 3.8, 3.9, 3.10, 3.11 + exit 1 +fi + +if [ -z $TORCH_VERSION ]; then + echo "Please set the environment variable TORCH_VERSION" + echo "Example: export TORCH_VERSION=1.10.0" + exit 1 +fi + +echo "Installing ${PYTHON_VERSION}.3" + +yum -y install openssl-devel bzip2-devel libffi-devel xz-devel wget redhat-lsb-core + +if true; then + echo "Installing ${PYTHON_VERSION}.2" + curl -O https://www.python.org/ftp/python/${PYTHON_VERSION}.2/Python-${PYTHON_VERSION}.2.tgz + tar xf Python-${PYTHON_VERSION}.2.tgz + pushd Python-${PYTHON_VERSION}.2 + + PYTHON_INSTALL_DIR=$PWD/py-${PYTHON_VERSION} + + if [[ $PYTHON_VERSION =~ 3.1. 
]]; then + yum install -y openssl11-devel + sed -i 's/PKG_CONFIG openssl /PKG_CONFIG openssl11 /g' configure + fi + + ./configure --enable-shared --prefix=$PYTHON_INSTALL_DIR >/dev/null 2>&1 + make install >/dev/null 2>&1 + + popd + + export PATH=$PYTHON_INSTALL_DIR/bin:$PATH + export LD_LIBRARY_PATH=$PYTHON_INSTALL_DIR/lib:$LD_LIBRARY_PATH + ls -lh $PYTHON_INSTALL_DIR/lib/ + + python3 --version + which python3 +else + case ${PYTHON_VERSION} in + 3.7) + export PATH=/opt/python/cp37-cp37m/bin:$PATH + ;; + 3.8) + export PATH=/opt/python/cp38-cp38/bin:$PATH + ;; + 3.9) + export PATH=/opt/python/cp39-cp39/bin:$PATH + ;; + 3.10) + export PATH=/opt/python/cp310-cp310/bin:$PATH + ;; + 3.11) + export PATH=/opt/python/cp311-cp311/bin:$PATH + ;; + esac +fi + + +nvcc --version || true +rm -rf /usr/local/cuda* +nvcc --version || true + +python3 --version +which python3 + +if [[ $PYTHON_VERSION != 3.6 ]]; then + curl -O https://bootstrap.pypa.io/get-pip.py + python3 get-pip.py +fi + +python3 -m pip install scikit-build +python3 -m pip install -U pip cmake +python3 -m pip install wheel twine typing_extensions +python3 -m pip install bs4 requests tqdm auditwheel + +echo "Installing torch $TORCH_VERSION" +python3 -m pip install -qq torch==$TORCH_VERSION+cpu -f https://download.pytorch.org/whl/torch_stable.html + +echo "Install k2 1.24.4.dev20240223+cpu.torch${TORCH_VERSION}" +pip install k2==1.24.4.dev20240223+cpu.torch${TORCH_VERSION} -f https://k2-fsa.github.io/k2/cpu.html + +echo "Installing kaldifeat 1.25.4.dev20240223+cpu.torch${TORCH_VERSION}" +pip install kaldifeat==1.25.4.dev20240223+cpu.torch${TORCH_VERSION} -f https://csukuangfj.github.io/kaldifeat/cpu.html + +python3 -m k2.version +python3 -c "import k2; print(k2.__file__)" +python3 -c "import kaldifeat; print(kaldifeat.__file__)" + +rm -rf ~/.cache/pip +yum clean all + +cd /var/www + +export CMAKE_CUDA_COMPILER_LAUNCHER= +export SHERPA_ARGS=" -DPYTHON_EXECUTABLE=$PYTHON_INSTALL_DIR/bin/python3 " +export SHERPA_MAKE_ARGS=" -j2 " + +python3 setup.py bdist_wheel + +pushd dist +unzip *.whl +export LD_LIBRARY_PATH=$PWD/sherpa/lib:$LD_LIBRARY_PATH +export LD_LIBRARY_PATH=$PWD/sherpa/lib64:$LD_LIBRARY_PATH +popd + +echo $LD_LIBRARY_PATH +ls -lh $PWD/dist/sherpa/lib + +auditwheel --verbose repair \ + --exclude libc10.so \ + --exclude libc10_cuda.so \ + --exclude libcuda.so.1 \ + --exclude libcudart.so.${CUDA_VERSION} \ + --exclude libnvToolsExt.so.1 \ + --exclude libnvrtc.so.${CUDA_VERSION} \ + --exclude libtorch.so \ + --exclude libtorch_cpu.so \ + --exclude libtorch_cuda.so \ + --exclude libtorch_python.so \ + \ + --exclude libcudnn.so.8 \ + --exclude libcublas.so.11 \ + --exclude libcublasLt.so.11 \ + --exclude libcudart.so.11.0 \ + --exclude libnvrtc.so.11.2 \ + --exclude libtorch_cuda_cu.so \ + --exclude libtorch_cuda_cpp.so \ + \ + --exclude libkaldifeat_core.so \ + --exclude libk2_log.so \ + --exclude libk2_torch_api.so \ + --exclude libk2context.so \ + --exclude libk2fsa.so \ + --exclude libk2_torch.so \ + \ + --plat manylinux_2_17_x86_64 \ + -w /var/www/wheels \ + dist/*.whl + +ls -lh /var/www/wheels diff --git a/scripts/github_actions/build-ubuntu-cuda.sh b/scripts/github_actions/build-ubuntu-cuda.sh new file mode 100755 index 000000000..882c4efed --- /dev/null +++ b/scripts/github_actions/build-ubuntu-cuda.sh @@ -0,0 +1,155 @@ +#!/usr/bin/env bash +# +set -ex + +if [ -z $PYTHON_VERSION ]; then + echo "Please set the environment variable PYTHON_VERSION" + echo "Example: export PYTHON_VERSION=3.8" + # Valid values: 3.6, 3.7, 3.8, 3.9, 3.10, 
3.11 + exit 1 +fi + +if [ -z $TORCH_VERSION ]; then + echo "Please set the environment variable TORCH_VERSION" + echo "Example: export TORCH_VERSION=1.10.0" + exit 1 +fi + +if [ -z $CUDA_VERSION ]; then + echo "Please set the environment variable CUDA_VERSION" + echo "Example: export CUDA_VERSION=10.2" + # valid values: 10.2, 11.1, 11.3, 11.6, 11.7, 11.8, 12.1 + exit 1 +fi + +if [[ $TORCH_VERSION =~ 2.2.* && $CUDA_VERSION =~ 12.* ]]; then + # see https://github.com/pytorch/pytorch/issues/113948 + export TORCH_CUDA_ARCH_LIST="8.0 8.6 8.9 9.0" +fi + +yum -y install openssl-devel bzip2-devel libffi-devel xz-devel wget redhat-lsb-core +echo "Installing ${PYTHON_VERSION}.2" +curl -O https://www.python.org/ftp/python/${PYTHON_VERSION}.2/Python-${PYTHON_VERSION}.2.tgz +tar xf Python-${PYTHON_VERSION}.2.tgz +pushd Python-${PYTHON_VERSION}.2 + +PYTHON_INSTALL_DIR=$PWD/py-${PYTHON_VERSION} + +if [[ $PYTHON_VERSION =~ 3.1. ]]; then + yum install -y openssl11-devel + sed -i 's/PKG_CONFIG openssl /PKG_CONFIG openssl11 /g' configure +fi + +./configure --enable-shared --prefix=$PYTHON_INSTALL_DIR >/dev/null 2>&1 +make install >/dev/null 2>&1 + +popd + +export PATH=$PYTHON_INSTALL_DIR/bin:$PATH +export LD_LIBRARY_PATH=$PYTHON_INSTALL_DIR/lib:$LD_LIBRARY_PATH +ls -lh $PYTHON_INSTALL_DIR/lib/ + +python3 --version +which python3 + +if [[ $PYTHON_VERSION != 3.6 ]]; then + curl -O https://bootstrap.pypa.io/get-pip.py + python3 get-pip.py +fi + +python3 -m pip install scikit-build +python3 -m pip install -U pip cmake +python3 -m pip install wheel twine typing_extensions +python3 -m pip install bs4 requests tqdm auditwheel + +echo "Installing torch ${TORCH_VERSION} ${CUDA_VERSION}" +./install_torch.sh + +echo "Install k2" +pip install k2==1.24.4.dev20240223+cuda${CUDA_VERSION}.torch${TORCH_VERSION} -f https://k2-fsa.github.io/k2/cuda.html + +echo "Install kaldifeat" +pip install kaldifeat==1.25.4.dev20240223+cuda${CUDA_VERSION}.torch${TORCH_VERSION} -f https://csukuangfj.github.io/kaldifeat/cuda.html + +find /usr/local -name libcuda.so.1 + +python3 -m torch.utils.collect_env +python3 -m k2.version + +# rm -rf ~/.cache/pip >/dev/null 2>&1 +# yum clean all + +cd /var/www + +export CMAKE_CUDA_COMPILER_LAUNCHER= +export SHERPA_CMAKE_ARGS=" -DPYTHON_EXECUTABLE=$PYTHON_INSTALL_DIR/bin/python3 " +export SHERPA_MAKE_ARGS=" -j2 " + + +python3 setup.py bdist_wheel + +pushd dist +unzip *.whl +export LD_LIBRARY_PATH=$PWD/sherpa/lib:$LD_LIBRARY_PATH +popd + +echo $LD_LIBRARY_PATH +ls -lh $PWD/dist/sherpa/lib + +auditwheel --verbose repair \ + --exclude libc10.so \ + --exclude libc10_cuda.so \ + --exclude libcuda.so.1 \ + --exclude libcudart.so.${CUDA_VERSION} \ + --exclude libnvToolsExt.so.1 \ + --exclude libnvrtc.so.${CUDA_VERSION} \ + --exclude libtorch.so \ + --exclude libtorch_cpu.so \ + --exclude libtorch_cuda.so \ + --exclude libtorch_python.so \ + --exclude libtorch_cuda_cpp.so \ + --exclude libtorch_cuda_cu.so \ + \ + --exclude libcublas.so \ + --exclude libcublas.so.11 \ + --exclude libcublas.so.12 \ + --exclude libcublasLt.so \ + --exclude libcublasLt.so.11 \ + --exclude libcublasLt.so.12 \ + --exclude libcudart.so.10.2.89 \ + --exclude libcudart.so.11.0 \ + --exclude libcudnn.so.8 \ + --exclude libcudart.so.12 \ + --exclude libnvToolsExt.so.1.0.0 \ + --exclude libnvrtc.so.10.2.89 \ + --exclude libnvrtc.so.11.2 \ + \ + --exclude libcufft.so \ + --exclude libcufft.so.11 \ + --exclude libcupti.so \ + --exclude libcupti.so.12 \ + --exclude libcurand.so \ + --exclude libcurand.so.10 \ + --exclude libcusparse.so \ + 
--exclude libcusparse.so.12 \ + --exclude libnccl.so \ + --exclude libnccl.so.2 \ + --exclude libnvJitLink.so \ + --exclude libnvJitLink.so.12 \ + --exclude libnvrtc.so \ + --exclude libnvrtc.so.11.2 \ + --exclude libnvrtc.so.12 \ + --exclude libshm.so \ + \ + --exclude libkaldifeat_core.so \ + --exclude libk2_log.so \ + --exclude libk2_torch_api.so \ + --exclude libk2context.so \ + --exclude libk2fsa.so \ + --exclude libk2_torch.so \ + \ + --plat manylinux_2_17_x86_64 \ + -w /var/www/wheels \ + dist/*.whl + +ls -lh /var/www/wheels diff --git a/scripts/github_actions/generate_build_matrix.py b/scripts/github_actions/generate_build_matrix.py index 52f41cc2b..2d1774395 100755 --- a/scripts/github_actions/generate_build_matrix.py +++ b/scripts/github_actions/generate_build_matrix.py @@ -5,6 +5,18 @@ import json +def version_ge(a, b): + a_major, a_minor = list(map(int, a.split(".")))[:2] + b_major, b_minor = list(map(int, b.split(".")))[:2] + if a_major > b_major: + return True + + if a_major == b_major and a_minor >= b_minor: + return True + + return False + + def get_args(): parser = argparse.ArgumentParser() parser.add_argument( @@ -14,6 +26,20 @@ def get_args(): help="True to enable CUDA", ) + parser.add_argument( + "--for-windows", + action="store_true", + default=False, + help="True for windows", + ) + + parser.add_argument( + "--for-macos", + action="store_true", + default=False, + help="True for macOS", + ) + parser.add_argument( "--test-only-latest-torch", action="store_true", @@ -24,9 +50,13 @@ def get_args(): return parser.parse_args() -def generate_build_matrix(enable_cuda, test_only_latest_torch): +def generate_build_matrix(enable_cuda, for_windows, for_macos, test_only_latest_torch): matrix = { - # there are issues in serializing ragged tensors in 1.5.0 and 1.5.1 + # 1.5.x is removed because there are compilation errors. 
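[Editor's note] A few quick checks (illustrative only) for the version_ge() helper added above, which compares only the major and minor parts; later in this file it is used to pick the pytorch/manylinux-builder:cpu-2.2 image for torch >= 2.2.0.

assert version_ge("2.2.0", "2.2")
assert version_ge("2.3.1", "2.2.0")
assert not version_ge("1.13.1", "2.0.0")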
+ # See + # https://github.com/csukuangfj/k2/runs/2533830771?check_suite_focus=true + # and + # https://github.com/NVIDIA/apex/issues/805 # "1.5.0": { # "python-version": ["3.6", "3.7", "3.8"], # "cuda": ["10.1", "10.2"], @@ -37,75 +67,147 @@ def generate_build_matrix(enable_cuda, test_only_latest_torch): # }, "1.6.0": { "python-version": ["3.6", "3.7", "3.8"], - "cuda": ["10.1", "10.2"], + "cuda": ["10.1", "10.2"] if not for_windows else ["10.1.243", "10.2.89"], }, "1.7.0": { "python-version": ["3.6", "3.7", "3.8"], - "cuda": ["10.1", "10.2", "11.0"], + "cuda": ["10.1", "10.2", "11.0"] + if not for_windows + else ["10.1.243", "10.2.89", "11.0.3"], }, "1.7.1": { "python-version": ["3.6", "3.7", "3.8", "3.9"], - "cuda": ["10.1", "10.2", "11.0"], + "cuda": ["10.1", "10.2", "11.0"] + if not for_windows + else ["10.1.243", "10.2.89", "11.0.3"], }, "1.8.0": { "python-version": ["3.6", "3.7", "3.8", "3.9"], - "cuda": ["10.1", "10.2", "11.1"], + "cuda": ["10.1", "10.2", "11.1"] + if not for_windows + else ["10.1.243", "10.2.89", "11.1.1"], }, "1.8.1": { "python-version": ["3.6", "3.7", "3.8", "3.9"], - "cuda": ["10.1", "10.2", "11.1"], + "cuda": ["10.1", "10.2", "11.1"] + if not for_windows + else ["10.1.243", "10.2.89", "11.1.1"], }, "1.9.0": { "python-version": ["3.6", "3.7", "3.8", "3.9"], - "cuda": ["10.2", "11.1"], + "cuda": ["10.2", "11.1"] if not for_windows else ["10.2.89", "11.1.1"], }, "1.9.1": { "python-version": ["3.6", "3.7", "3.8", "3.9"], - "cuda": ["10.2", "11.1"], + "cuda": ["10.2", "11.1"] if not for_windows else ["10.2.89", "11.1.1"], }, "1.10.0": { "python-version": ["3.6", "3.7", "3.8", "3.9"], - "cuda": ["10.2", "11.1", "11.3"], + "cuda": ["10.2", "11.1", "11.3"] + if not for_windows + else ["10.2.89", "11.1.1", "11.3.1"], }, "1.10.1": { "python-version": ["3.6", "3.7", "3.8", "3.9"], - "cuda": ["10.2", "11.1", "11.3"], + "cuda": ["10.2", "11.1", "11.3"] + if not for_windows + else ["10.2.89", "11.1.1", "11.3.1"], }, "1.10.2": { "python-version": ["3.6", "3.7", "3.8", "3.9"], - "cuda": ["10.2", "11.1", "11.3"], + "cuda": ["10.2", "11.1", "11.3"] + if not for_windows + else ["10.2.89", "11.1.1", "11.3.1"], }, "1.11.0": { "python-version": ["3.7", "3.8", "3.9", "3.10"], - "cuda": ["10.2", "11.3", "11.5"], + "cuda": ["10.2", "11.3", "11.5"] + if not for_windows + else ["11.3.1", "11.5.2"], }, "1.12.0": { "python-version": ["3.7", "3.8", "3.9", "3.10"], - "cuda": ["10.2", "11.3", "11.6"], + "cuda": ["10.2", "11.3", "11.6"] + if not for_windows + else ["11.3.1", "11.6.2"], }, "1.12.1": { "python-version": ["3.7", "3.8", "3.9", "3.10"], - "cuda": ["10.2", "11.3", "11.6"], + "cuda": ["10.2", "11.3", "11.6"] + if not for_windows + else ["11.3.1", "11.6.2"], + }, + "1.13.0": { + "python-version": ["3.7", "3.8", "3.9", "3.10", "3.11"], + "cuda": ["11.6", "11.7"], # default 11.7 + }, + "1.13.1": { + "python-version": ["3.7", "3.8", "3.9", "3.10", "3.11"], + "cuda": ["11.6", "11.7"] # default 11.7 + if not for_windows + else ["11.6.2", "11.7.1"], + }, + "2.0.0": { + "python-version": ["3.8", "3.9", "3.10", "3.11"], + "cuda": ["11.7", "11.8"] # default 11.7 + if not for_windows + else ["11.7.1", "11.8.0"], + }, + "2.0.1": { + "python-version": ["3.8", "3.9", "3.10", "3.11"], + "cuda": ["11.7", "11.8"] # default 11.7 + if not for_windows + else ["11.7.1", "11.8.0"], + }, + "2.1.0": { + "python-version": ["3.8", "3.9", "3.10", "3.11"], + "cuda": ["11.8", "12.1"] # default 12.1 + if not for_windows + else ["11.8.0", "12.1.0"], + }, + "2.1.1": { + "python-version": ["3.8", "3.9", "3.10", 
"3.11"], + "cuda": ["11.8", "12.1"] # default 12.1 + if not for_windows + else ["11.8.0", "12.1.0"], + }, + "2.1.2": { + "python-version": ["3.8", "3.9", "3.10", "3.11"], + "cuda": ["11.8", "12.1"] # default 12.1 + if not for_windows + else ["11.8.0", "12.1.0"], + }, + "2.2.0": { + "python-version": ["3.8", "3.9", "3.10", "3.11", "3.12"], + "cuda": ["11.8", "12.1"] # default 12.1 + if not for_windows + else ["11.8.0", "12.1.0"], + }, + "2.2.1": { + "python-version": ["3.8", "3.9", "3.10", "3.11", "3.12"], + "cuda": ["11.8", "12.1"] # default 12.1 + if not for_windows + else ["11.8.0", "12.1.0"], }, + # https://github.com/Jimver/cuda-toolkit/blob/master/src/links/windows-links.ts } if test_only_latest_torch: - latest = "1.12.1" + latest = "2.0.1" matrix = {latest: matrix[latest]} - # We only have limited spaces in anaconda, so we exclude some - # versions of PyTorch here. If you need them, please consider - # installing sherpa from source - # Only CUDA build are excluded since it occupies more disk space - excluded_torch_versions = ["1.6.0", "1.7.0"] + if for_windows or for_macos: + if "1.13.0" in matrix: + matrix["1.13.0"]["python-version"].remove("3.11") - excluded_python_versions = ["3.6"] + if "1.13.1" in matrix: + matrix["1.13.1"]["python-version"].remove("3.11") - # os = ["ubuntu-18.04", "macos-10.15", "windows-2019"] - os = ["ubuntu-18.04", "windows-2019"] + excluded_python_versions = ["3.6", "3.7"] + enabled_torch_versions = [] ans = [] for torch, python_cuda in matrix.items(): - if torch in excluded_torch_versions and enable_cuda: + if enabled_torch_versions and torch not in enabled_torch_versions: continue python_versions = python_cuda["python-version"] @@ -116,14 +218,43 @@ def generate_build_matrix(enable_cuda, test_only_latest_torch): continue for c in cuda_versions: - ans.append({"torch": torch, "python-version": p, "cuda": c}) + if c in ["10.1", "11.0"]: + # no docker image for cuda 10.1 and 11.0 + continue + ans.append( + { + "torch": torch, + "python-version": p, + "cuda": c, + "image": f"pytorch/manylinux-builder:cuda{c}", + } + ) else: for p in python_versions: if p in excluded_python_versions: continue - for o in os: - ans.append({"torch": torch, "python-version": p, "os": o}) + if for_windows: + p = "cp" + "".join(p.split(".")) + ans.append({"torch": torch, "python-version": p}) + elif for_macos: + ans.append({"torch": torch, "python-version": p}) + elif version_ge(torch, "2.2.0"): + ans.append( + { + "torch": torch, + "python-version": p, + "image": "pytorch/manylinux-builder:cpu-2.2", + } + ) + else: + ans.append( + { + "torch": torch, + "python-version": p, + "image": "pytorch/manylinux-builder:cuda10.2", + } + ) print(json.dumps({"include": ans})) @@ -132,6 +263,8 @@ def main(): args = get_args() generate_build_matrix( enable_cuda=args.enable_cuda, + for_windows=args.for_windows, + for_macos=args.for_macos, test_only_latest_torch=args.test_only_latest_torch, ) diff --git a/scripts/github_actions/install_torch.sh b/scripts/github_actions/install_torch.sh new file mode 100755 index 000000000..5f8d1c661 --- /dev/null +++ b/scripts/github_actions/install_torch.sh @@ -0,0 +1,214 @@ +#!/bin/bash +# +# Copyright 2020 Mobvoi Inc. (authors: Fangjun Kuang) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +torch=$TORCH_VERSION +cuda=$CUDA_VERSION +case ${torch} in + 1.5.*) + case ${cuda} in + 10.1) + package="torch==${torch}+cu101" + url=https://download.pytorch.org/whl/torch_stable.html + ;; + 10.2) + package="torch==${torch}" + # Leave url empty to use PyPI. + # torch_stable provides cu92 but we want cu102 + url= + ;; + esac + ;; + 1.6.0) + case ${cuda} in + 10.1) + package="torch==1.6.0+cu101" + url=https://download.pytorch.org/whl/torch_stable.html + ;; + 10.2) + package="torch==1.6.0" + # Leave it empty to use PyPI. + # torch_stable provides cu92 but we want cu102 + url= + ;; + esac + ;; + 1.7.*) + case ${cuda} in + 10.1) + package="torch==${torch}+cu101" + url=https://download.pytorch.org/whl/torch_stable.html + ;; + 10.2) + package="torch==${torch}" + # Leave it empty to use PyPI. + # torch_stable provides cu92 but we want cu102 + url= + ;; + 11.0) + package="torch==${torch}+cu110" + url=https://download.pytorch.org/whl/torch_stable.html + ;; + esac + ;; + 1.8.*) + case ${cuda} in + 10.1) + package="torch==${torch}+cu101" + url=https://download.pytorch.org/whl/torch_stable.html + ;; + 10.2) + package="torch==${torch}" + # Leave it empty to use PyPI. + url= + ;; + 11.1) + package="torch==${torch}+cu111" + url=https://download.pytorch.org/whl/torch_stable.html + ;; + esac + ;; + 1.9.*) + case ${cuda} in + 10.2) + package="torch==${torch}" + # Leave it empty to use PyPI. + url= + ;; + 11.1) + package="torch==${torch}+cu111" + url=https://download.pytorch.org/whl/torch_stable.html + ;; + esac + ;; + 1.10.*) + case ${cuda} in + 10.2) + package="torch==${torch}" + # Leave it empty to use PyPI. + url= + ;; + 11.1) + package="torch==${torch}+cu111" + url=https://download.pytorch.org/whl/torch_stable.html + ;; + 11.3) + package="torch==${torch}+cu113" + url=https://download.pytorch.org/whl/torch_stable.html + ;; + esac + ;; + 1.11.*) + case ${cuda} in + 10.2) + package="torch==${torch}" + # Leave it empty to use PyPI. + url= + ;; + 11.3) + package="torch==${torch}+cu113" + url=https://download.pytorch.org/whl/torch_stable.html + ;; + 11.5) + package="torch==${torch}+cu115" + url=https://download.pytorch.org/whl/torch_stable.html + ;; + esac + ;; + 1.12.*) + case ${cuda} in + 10.2) + package="torch==${torch}" + # Leave it empty to use PyPI. + url= + ;; + 11.3) + package="torch==${torch}+cu113" + url=https://download.pytorch.org/whl/torch_stable.html + ;; + 11.6) + package="torch==${torch}+cu116" + url=https://download.pytorch.org/whl/torch_stable.html + ;; + esac + ;; + 1.13.*) + case ${cuda} in + 11.6) + package="torch==${torch}+cu116" + url=https://download.pytorch.org/whl/torch_stable.html + ;; + 11.7) + package="torch==${torch}" + # Leave it empty to use PyPI. 
+ url= + ;; + esac + ;; + 2.0.*) + case ${cuda} in + 11.7) + package="torch==${torch}+cu117" + url=https://download.pytorch.org/whl/torch_stable.html + ;; + 11.8) + package="torch==${torch}+cu118" + url=https://download.pytorch.org/whl/torch_stable.html + ;; + esac + ;; + 2.1.*) + case ${cuda} in + 11.8) + package="torch==${torch}+cu118" + url=https://download.pytorch.org/whl/torch_stable.html + ;; + 12.1) + package="torch==${torch}" + # Leave it empty to use PyPI. + url= + ;; + esac + ;; + 2.2.*) + case ${cuda} in + 11.8) + package="torch==${torch}+cu118" + url=https://download.pytorch.org/whl/torch_stable.html + ;; + 12.1) + package="torch==${torch}" + # Leave it empty to use PyPI. + url= + ;; + esac + ;; + *) + echo "Unsupported PyTorch version: ${torch}" + exit 1 + ;; +esac + +function retry() { + $* || (sleep 1 && $*) || (sleep 2 && $*) || (sleep 4 && $*) || (sleep 8 && $*) +} + +if [ x"${url}" == "x" ]; then + retry python3 -m pip install -q $package +else + retry python3 -m pip install -q $package -f $url +fi + +rm -rfv ~/.cache/pip diff --git a/scripts/github_actions/patch_wheel.py b/scripts/github_actions/patch_wheel.py new file mode 100755 index 000000000..6597a61af --- /dev/null +++ b/scripts/github_actions/patch_wheel.py @@ -0,0 +1,83 @@ +#!/usr/bin/env python3 +# Copyright 2023 Xiaomi Corp. (authors: Fangjun Kuang) + +import argparse +import glob +import shutil +import subprocess +import sys +from pathlib import Path + + +def get_args(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--in-dir", + type=Path, + required=True, + help="Input directory.", + ) + + parser.add_argument( + "--out-dir", + type=Path, + required=True, + help="Output directory.", + ) + + return parser.parse_args() + + +def process(out_dir: Path, whl: Path): + tmp_dir = out_dir / "tmp" + subprocess.check_call(f"unzip {whl} -d {tmp_dir}", shell=True) + py_version = ".".join(sys.version.split(".")[:2]) + rpath_list = [ + f"$ORIGIN/../lib/python{py_version}/site-packages/k2_sherpa.libs", + f"$ORIGIN/../lib/python{py_version}/site-packages/torch/lib", + f"$ORIGIN/../lib/python{py_version}/site-packages/torch/lib64", + f"$ORIGIN/../lib/python{py_version}/site-packages/k2/lib", + f"$ORIGIN/../lib/python{py_version}/site-packages/k2/lib64", + f"$ORIGIN/../lib/python{py_version}/site-packages/kaldifeat/lib", + f"$ORIGIN/../lib/python{py_version}/site-packages/kaldifeat/lib64", + # + f"$ORIGIN/../lib/python{py_version}/dist-packages/k2_sherpa.libs", + f"$ORIGIN/../lib/python{py_version}/dist-packages/torch/lib", + f"$ORIGIN/../lib/python{py_version}/dist-packages/torch/lib64", + f"$ORIGIN/../lib/python{py_version}/dist-packages/k2/lib", + f"$ORIGIN/../lib/python{py_version}/dist-packages/k2/lib64", + f"$ORIGIN/../lib/python{py_version}/dist-packages/kaldifeat/lib", + f"$ORIGIN/../lib/python{py_version}/dist-packages/kaldifeat/lib64", + ] + rpaths = ":".join(rpath_list) + + for filename in glob.glob(f"{tmp_dir}/k2_sherpa-*data/data/bin/*", recursive=True): + print(filename) + existing_rpath = ( + subprocess.check_output(["patchelf", "--print-rpath", filename]) + .decode() + .strip() + ) + target_rpaths = rpaths + ":" + existing_rpath + subprocess.check_call( + f"patchelf --force-rpath --set-rpath '{target_rpaths}' {filename}", + shell=True, + ) + + outwheel = Path(shutil.make_archive(whl, "zip", tmp_dir)) + Path(outwheel).rename(out_dir / whl.name) + + shutil.rmtree(tmp_dir) + + +def main(): + in_dir = get_args().in_dir + out_dir = get_args().out_dir + out_dir.mkdir(exist_ok=True, parents=True) + + 
for whl in in_dir.glob("*.whl"): + process(out_dir, whl) + + +if __name__ == "__main__": + main() diff --git a/scripts/pyannote/segmentation/LICENSE b/scripts/pyannote/segmentation/LICENSE new file mode 100644 index 000000000..e5e0c2dad --- /dev/null +++ b/scripts/pyannote/segmentation/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2022 CNRS + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/scripts/pyannote/segmentation/README.md b/scripts/pyannote/segmentation/README.md new file mode 100644 index 000000000..9a2ffcba2 --- /dev/null +++ b/scripts/pyannote/segmentation/README.md @@ -0,0 +1,4 @@ +# Introduction + +Models in this file are converted from +https://huggingface.co/pyannote/segmentation-3.0/tree/main diff --git a/scripts/pyannote/segmentation/export.py b/scripts/pyannote/segmentation/export.py new file mode 100755 index 000000000..76de2ddcd --- /dev/null +++ b/scripts/pyannote/segmentation/export.py @@ -0,0 +1,137 @@ +#!/usr/bin/env python3 +# Copyright 2025 Xiaomi Corp. 
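[Editor's note] To double-check the result of patch_wheel.py above, one can print the RPATH of a patched binary with the same patchelf invocation the script itself uses; the example path below is hypothetical.

import subprocess

def print_rpath(binary: str) -> str:
    """Return the RPATH of an ELF binary, as reported by patchelf."""
    return subprocess.check_output(["patchelf", "--print-rpath", binary]).decode().strip()

# e.g. print_rpath("unpacked/k2_sherpa-1.3.data/data/bin/sherpa-offline")  # hypothetical path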
(authors: Fangjun Kuang) + +import torch +import torch.nn.functional as F +from pyannote.audio import Model +from pyannote.audio.models.blocks.sincnet import SincNet +from pyannote.core.utils.generators import pairwise +from torch import nn + +""" +"linear": {'hidden_size': 128, 'num_layers': 2} +"lstm": {'hidden_size': 256, 'num_layers': 2, 'bidirectional': True, 'monolithic': True, 'dropout': 0.0, 'batch_first': True} +"num_channels": 1 +"sample_rate": 16000 +"sincnet": {'stride': 10, 'sample_rate': 16000} +""" + + +class PyanNet(torch.nn.Module): + def __init__(self, m): + super().__init__() + self.sincnet = SincNet(**m.hparams.sincnet) + + multi_layer_lstm = dict(m.hparams.lstm) + del multi_layer_lstm["monolithic"] + self.lstm = nn.LSTM(60, **multi_layer_lstm) + + lstm_out_features: int = m.hparams.lstm["hidden_size"] * ( + 2 if m.hparams.lstm["bidirectional"] else 1 + ) + self.linear = nn.ModuleList( + [ + nn.Linear(in_features, out_features) + for in_features, out_features in pairwise( + [ + lstm_out_features, + ] + + [m.hparams.linear["hidden_size"]] * m.hparams.linear["num_layers"] + ) + ] + ) + + if m.hparams.linear["num_layers"] > 0: + in_features = m.hparams.linear["hidden_size"] + else: + in_features = m.hparams.lstm["hidden_size"] * ( + 2 if m.hparams.lstm["bidirectional"] else 1 + ) + + self.classifier = nn.Linear(in_features, m.dimension) + self.activation = m.default_activation() + + def forward(self, waveforms): + """Pass forward + + Parameters + ---------- + waveforms : (batch, channel, sample) + + Returns + ------- + scores : (batch, frame, classes) + """ + + outputs = self.sincnet(waveforms) + + outputs, _ = self.lstm(torch.permute(outputs, (0, 2, 1))) + + for linear in self.linear: + outputs = F.leaky_relu(linear(outputs)) + + return self.activation(self.classifier(outputs)) + + +@torch.inference_mode() +def main(): + # You can download ./pytorch_model.bin from + # https://hf-mirror.com/csukuangfj/pyannote-models/tree/main/segmentation-3.0 + # or from + # https://huggingface.co/Revai/reverb-diarization-v1/tree/main + pt_filename = "./pytorch_model.bin" + model = Model.from_pretrained(pt_filename) + wrapper = PyanNet(model) + + num_param1 = sum([p.numel() for p in model.parameters()]) + num_param2 = sum([p.numel() for p in wrapper.parameters()]) + + assert num_param1 == num_param2, (num_param1, num_param2, model.hparams) + print(f"Number of model parameters1: {num_param1}") + print(f"Number of model parameters2: {num_param2}") + + model.eval() + + # model.to_torchscript() # won't work + + wrapper.eval() + + wrapper.load_state_dict(model.state_dict()) + + x = torch.rand(1, 1, 10 * 16000) + + y1 = model(x) + y2 = wrapper(x) + + assert y1.shape == y2.shape, (y1.shape, y2.shape) + assert torch.allclose(y1, y2), (y1.sum(), y2.sum()) + + m = torch.jit.script(wrapper) + + sample_rate = model.audio.sample_rate + assert sample_rate == 16000, sample_rate + + window_size = int(model.specifications.duration) * 16000 + receptive_field_size = int(model.receptive_field.duration * 16000) + receptive_field_shift = int(model.receptive_field.step * 16000) + + meta_data = { + "num_speakers": str(len(model.specifications.classes)), + "powerset_max_classes": str(model.specifications.powerset_max_classes), + "num_classes": str(model.dimension), + "sample_rate": str(sample_rate), + "window_size": str(window_size), + "receptive_field_size": str(receptive_field_size), + "receptive_field_shift": str(receptive_field_shift), + "model_type": "pyannote-segmentation-3.0", + "version": "1", + 
"maintainer": "k2-fsa", + } + + m.save("model.pt", _extra_files=meta_data) + print(meta_data) + + +if __name__ == "__main__": + torch.manual_seed(20240108) + main() diff --git a/scripts/pyannote/segmentation/vad.py b/scripts/pyannote/segmentation/vad.py new file mode 100755 index 000000000..6497c2fae --- /dev/null +++ b/scripts/pyannote/segmentation/vad.py @@ -0,0 +1,259 @@ +#!/usr/bin/env python3 + +""" +./export.py + +wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/lei-jun-test.wav +./vad.py --model ./model.pt --wav ./lei-jun-test.wav +""" + +import argparse +from pathlib import Path + +import librosa +import numpy as np +import soundfile as sf +from numpy.lib.stride_tricks import as_strided +import torch + + +def get_args(): + parser = argparse.ArgumentParser() + parser.add_argument("--model", type=str, required=True, help="Path to model.pt") + parser.add_argument("--wav", type=str, required=True, help="Path to test.wav") + + return parser.parse_args() + + +class TorchModel: + def __init__(self, filename): + + meta_data = { + "num_speakers": "", + "powerset_max_classes": "", + "num_classes": "", + "sample_rate": "", + "window_size": "", + "receptive_field_size": "", + "receptive_field_shift": "", + "model_type": "", + "version": "", + "maintainer": "", + } + + self.model = torch.jit.load("model.pt", _extra_files=meta_data) + self.model.eval() + + for k in [ + "num_speakers", + "powerset_max_classes", + "sample_rate", + "window_size", + "receptive_field_size", + "receptive_field_shift", + "version", + ]: + meta_data[k] = int(meta_data[k].decode()) + + meta_data["model_type"] = meta_data["model_type"].decode() + meta_data["maintainer"] = meta_data["maintainer"].decode() + self.meta_data = meta_data + + meta = meta_data + + self.window_size = int(meta["window_size"]) + self.sample_rate = int(meta["sample_rate"]) + self.window_shift = int(0.1 * self.window_size) + self.receptive_field_size = int(meta["receptive_field_size"]) + self.receptive_field_shift = int(meta["receptive_field_shift"]) + self.num_speakers = int(meta["num_speakers"]) + self.powerset_max_classes = int(meta["powerset_max_classes"]) + self.num_classes = int(meta["num_classes"]) + + def __call__(self, x): + """ + Args: + x: (N, num_samples) + Returns: + A tensor of shape (N, num_frames, num_classes) + """ + x = np.expand_dims(x, axis=1) + x = torch.from_numpy(x) + return self.model(x).numpy() + + +def load_wav(filename, expected_sample_rate) -> np.ndarray: + audio, sample_rate = sf.read(filename, dtype="float32", always_2d=True) + audio = audio[:, 0] # only use the first channel + if sample_rate != expected_sample_rate: + audio = librosa.resample( + audio, + orig_sr=sample_rate, + target_sr=expected_sample_rate, + ) + return audio + + +def get_powerset_mapping(num_classes, num_speakers, powerset_max_classes): + mapping = np.zeros((num_classes, num_speakers)) + + k = 1 + for i in range(1, powerset_max_classes + 1): + if i == 1: + for j in range(0, num_speakers): + mapping[k, j] = 1 + k += 1 + elif i == 2: + for j in range(0, num_speakers): + for m in range(j + 1, num_speakers): + mapping[k, j] = 1 + mapping[k, m] = 1 + k += 1 + elif i == 3: + raise RuntimeError("Unsupported") + + return mapping + + +def to_multi_label(y, mapping): + """ + Args: + y: (num_chunks, num_frames, num_classes) + Returns: + A tensor of shape (num_chunks, num_frames, num_speakers) + """ + y = np.argmax(y, axis=-1) + labels = mapping[y.reshape(-1)].reshape(y.shape[0], y.shape[1], -1) + return labels + + +@torch.inference_mode() 
+def main(): + args = get_args() + assert Path(args.model).is_file(), args.model + assert Path(args.wav).is_file(), args.wav + + m = TorchModel(args.model) + audio = load_wav(args.wav, m.sample_rate) + # audio: (num_samples,) + print("audio", audio.shape, audio.min(), audio.max(), audio.sum()) + + num = (audio.shape[0] - m.window_size) // m.window_shift + 1 + + samples = as_strided( + audio, + shape=(num, m.window_size), + strides=(m.window_shift * audio.strides[0], audio.strides[0]), + ) + + # or use torch.Tensor.unfold + # samples = torch.from_numpy(audio).unfold(0, m.window_size, m.window_shift).numpy() + + print( + "samples", + samples.shape, + samples.mean(), + samples.sum(), + samples[:3, :3].sum(axis=-1), + ) + + if ( + audio.shape[0] < m.window_size + or (audio.shape[0] - m.window_size) % m.window_shift > 0 + ): + has_last_chunk = True + else: + has_last_chunk = False + + num_chunks = samples.shape[0] + batch_size = 32 + output = [] + for i in range(0, num_chunks, batch_size): + start = i + end = i + batch_size + # it's perfectly ok to use end > num_chunks + y = m(samples[start:end]) + output.append(y) + + if has_last_chunk: + last_chunk = audio[num_chunks * m.window_shift :] # noqa + pad_size = m.window_size - last_chunk.shape[0] + last_chunk = np.pad(last_chunk, (0, pad_size)) + last_chunk = np.expand_dims(last_chunk, axis=0) + y = m(last_chunk) + output.append(y) + + y = np.vstack(output) + # y: (num_chunks, num_frames, num_classes) + + mapping = get_powerset_mapping( + num_classes=m.num_classes, + num_speakers=m.num_speakers, + powerset_max_classes=m.powerset_max_classes, + ) + labels = to_multi_label(y, mapping=mapping) + # labels: (num_chunks, num_frames, num_speakers) + + # binary classification + labels = np.max(labels, axis=-1) + # labels: (num_chunk, num_frames) + + num_frames = ( + int( + (m.window_size + (labels.shape[0] - 1) * m.window_shift) + / m.receptive_field_shift + ) + + 1 + ) + + count = np.zeros((num_frames,)) + classification = np.zeros((num_frames,)) + weight = np.hamming(labels.shape[1]) + + for i in range(labels.shape[0]): + this_chunk = labels[i] + start = int(i * m.window_shift / m.receptive_field_shift + 0.5) + end = start + this_chunk.shape[0] + + classification[start:end] += this_chunk * weight + count[start:end] += weight + + classification /= np.maximum(count, 1e-12) + + if has_last_chunk: + stop_frame = int(audio.shape[0] / m.receptive_field_shift) + classification = classification[:stop_frame] + + classification = classification.tolist() + + onset = 0.5 + offset = 0.5 + + is_active = classification[0] > onset + start = None + if is_active: + start = 0 + + scale = m.receptive_field_shift / m.sample_rate + scale_offset = m.receptive_field_size / m.sample_rate * 0.5 + + for i in range(len(classification)): + if is_active: + if classification[i] < offset: + print( + f"{start*scale + scale_offset:.3f} -- {i*scale + scale_offset:.3f}" + ) + is_active = False + else: + if classification[i] > onset: + start = i + is_active = True + + if is_active: + print( + f"{start*scale + scale_offset:.3f} -- {(len(classification)-1)*scale + scale_offset:.3f}" + ) + + +if __name__ == "__main__": + main() diff --git a/scripts/sense-voice/export.py b/scripts/sense-voice/export.py new file mode 100755 index 000000000..82f31f6cd --- /dev/null +++ b/scripts/sense-voice/export.py @@ -0,0 +1,54 @@ +#!/usr/bin/env python3 +# Copyright 2025 Xiaomi Corp. 
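The commented-out torch.Tensor.unfold alternative in main() above produces exactly the same chunking as numpy's as_strided; a small self-check with arbitrary illustrative sizes (not the values stored in the model metadata):

```
import numpy as np
import torch
from numpy.lib.stride_tricks import as_strided

audio = np.arange(20, dtype=np.float32)
window_size, window_shift = 8, 4  # illustrative only
num = (audio.shape[0] - window_size) // window_shift + 1

a = as_strided(
    audio,
    shape=(num, window_size),
    strides=(window_shift * audio.strides[0], audio.strides[0]),
)
b = torch.from_numpy(audio).unfold(0, window_size, window_shift).numpy()

assert np.array_equal(a, b)
print(a.shape)  # (4, 8): windows starting at samples 0, 4, 8, 12
```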
(authors: Fangjun Kuang) + +import numpy as np +from funasr_torch import SenseVoiceSmall + + +def generate_tokens(m): + sp = m.tokenizer.sp + with open("tokens.txt", "w", encoding="utf-8") as f: + for i in range(sp.vocab_size()): + f.write(f"{sp.id_to_piece(i)} {i}\n") + + +def generate_bpe_model(m): + with open("bpe.model", "wb") as f: + f.write(m.tokenizer.sp.serialized_model_proto()) + + +def main(): + model_dir = "iic/SenseVoiceSmall" + model = SenseVoiceSmall(model_dir, batch_size=1, device="cpu") + + generate_tokens(model) + generate_bpe_model(model) + + meta_data = { + "model_type": "SenseVoiceSmall", + "lfr_window_size": str(model.frontend.lfr_m), + "lfr_window_shift": str(model.frontend.lfr_n), + "neg_mean": model.frontend.cmvn[0].astype(np.float32).tobytes(), + "inv_stddev": model.frontend.cmvn[1].astype(np.float32).tobytes(), + "vocab_size": str(model.tokenizer.get_vocab_size()), + "normalize_samples": "0", # input should be in the range [-32768, 32767] + "version": "1", + "model_author": "iic", + "maintainer": "k2-fsa", + "lang_auto": str(model.lid_dict["auto"]), + "lang_zh": str(model.lid_dict["zh"]), + "lang_en": str(model.lid_dict["en"]), + "lang_yue": str(model.lid_dict["yue"]), # cantonese + "lang_ja": str(model.lid_dict["ja"]), + "lang_ko": str(model.lid_dict["ko"]), + "lang_nospeech": str(model.lid_dict["nospeech"]), + "with_itn": str(model.textnorm_dict["withitn"]), + "without_itn": str(model.textnorm_dict["woitn"]), + "url": "https://huggingface.co/FunAudioLLM/SenseVoiceSmall", + } + print(meta_data) + model.ort_infer.save("model.pt", _extra_files=meta_data) + + +if __name__ == "__main__": + main() diff --git a/scripts/sense-voice/run.sh b/scripts/sense-voice/run.sh new file mode 100755 index 000000000..289d352dd --- /dev/null +++ b/scripts/sense-voice/run.sh @@ -0,0 +1,15 @@ +#!/usr/bin/env bash + +python3 ./export.py + +ls -lh tokens.txt model.pt bpe.model + +cat >README.md << EOF +# Introduction + +Models in this file are converted from +https://www.modelscope.cn/models/iic/SenseVoiceSmall/summary +using the following script +https://github.com/k2-fsa/sherpa/blob/master/scripts/sense-voice/run.sh + +EOF diff --git a/scripts/silero-vad/.gitignore b/scripts/silero-vad/.gitignore new file mode 100644 index 000000000..40fe26353 --- /dev/null +++ b/scripts/silero-vad/.gitignore @@ -0,0 +1 @@ +*.jit diff --git a/scripts/silero-vad/export-v4.py b/scripts/silero-vad/export-v4.py new file mode 100755 index 000000000..2b178586f --- /dev/null +++ b/scripts/silero-vad/export-v4.py @@ -0,0 +1,17 @@ +#!/usr/bin/env python3 +# Copyright 2025 Xiaomi Corp. (authors: Fangjun Kuang) + +import torch + + +def main(): + m = torch.jit.load("./silero_vad_v4.jit") + meta_data = { + "model_type": "silero_vad", + "version": "4", + } + m.save("silero-vad-v4.pt", _extra_files=meta_data) + + +if __name__ == "__main__": + main() diff --git a/scripts/silero-vad/export-v5.py b/scripts/silero-vad/export-v5.py new file mode 100755 index 000000000..c4b41447d --- /dev/null +++ b/scripts/silero-vad/export-v5.py @@ -0,0 +1,32 @@ +#!/usr/bin/env python3 +# Copyright 2025 Xiaomi Corp. 
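A note on the `_extra_files` mechanism used by these export scripts: every metadata value is saved as a string and comes back as `bytes` at load time, which is why vad.py above calls `.decode()`. A minimal sketch, assuming a `model.pt` produced by one of the exports in this directory:

```
# Sketch: reading back _extra_files metadata. The dict must already contain the
# keys you want; torch.jit.load() overwrites the values with bytes.
import torch

meta = {"model_type": "", "version": ""}
m = torch.jit.load("model.pt", _extra_files=meta)
print(meta["model_type"].decode())  # e.g. "silero_vad" or "SenseVoiceSmall"
print(int(meta["version"].decode()))
```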
(authors: Fangjun Kuang) + +import torch + + +class Wrapper(torch.nn.Module): + def __init__(self, m): + super().__init__() + self.sample_rates = m.sample_rates + self.m = m + + @torch.jit.export + def audio_forward(self, x: torch.Tensor, sr: int, window_size: int = 512): + # window_size is ignored + # we wrap v5 so that it has the same interface as v4 for audio_forward + return self.m.audio_forward(x, sr) + + +def main(): + m = torch.jit.load("./silero_vad_v5.jit") + wrapper = Wrapper(m) + + meta_data = { + "version": "5", + } + m = torch.jit.script(wrapper) + m.save("silero-vad-v5.pt", _extra_files=meta_data) + + +if __name__ == "__main__": + main() diff --git a/scripts/silero-vad/run-v4.sh b/scripts/silero-vad/run-v4.sh new file mode 100755 index 000000000..8b10e0635 --- /dev/null +++ b/scripts/silero-vad/run-v4.sh @@ -0,0 +1,21 @@ +#!/usr/bin/env bash +# Copyright 2025 Xiaomi Corp. (authors: Fangjun Kuang) + +set -ex + +if [ ! -f ./silero_vad_v4.jit ]; then + # It is silero_vad v4. You can also download it from + # https://github.com/snakers4/silero-vad/blob/v4.0/files/silero_vad.jit + # + # Note that we have renamed silero_vad.jit to silero_vad_v4.jit + # + wget https://huggingface.co/csukuangfj/tmp-files/resolve/main/silero_vad_v4.jit +fi + +if [ ! -f ./lei-jun-test.wav ]; then + wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/lei-jun-test.wav +fi + +if [ ! -f ./Obama.wav ]; then + wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/Obama.wav +fi diff --git a/scripts/silero-vad/run-v5.sh b/scripts/silero-vad/run-v5.sh new file mode 100755 index 000000000..835ca275a --- /dev/null +++ b/scripts/silero-vad/run-v5.sh @@ -0,0 +1,21 @@ +#!/usr/bin/env bash +# Copyright 2025 Xiaomi Corp. (authors: Fangjun Kuang) + +set -ex + +if [ ! -f ./silero_vad_v5.jit ]; then + # It is silero_vad v5. You can also download it from + # https://github.com/snakers4/silero-vad/blob/v5.1.2/src/silero_vad/data/silero_vad.jit + # + # Note that we have renamed silero_vad.jit to silero_vad_v5.jit + # + wget https://huggingface.co/csukuangfj/tmp-files/resolve/main/silero_vad_v5.jit +fi + +if [ ! -f ./lei-jun-test.wav ]; then + wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/lei-jun-test.wav +fi + +if [ ! 
-f ./Obama.wav ]; then + wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/Obama.wav +fi diff --git a/scripts/silero-vad/test-v4-batch.py b/scripts/silero-vad/test-v4-batch.py new file mode 100755 index 000000000..5e50986c9 --- /dev/null +++ b/scripts/silero-vad/test-v4-batch.py @@ -0,0 +1,100 @@ +#!/usr/bin/env python3 + +import torch +import numpy as np +import soundfile as sf +import librosa + + +def load_audio(filename: str) -> np.ndarray: + data, sample_rate = sf.read( + filename, + always_2d=True, + dtype="float32", + ) + data = data[:, 0] # use only the first channel + samples = np.ascontiguousarray(data) + + if sample_rate != 16000: + samples = librosa.resample( + samples, + orig_sr=sample_rate, + target_sr=16000, + ) + + return samples + + +@torch.inference_mode() +def main(): + m = torch.jit.load("./silero-vad-v4.pt") + m.eval() + + filenames = ["./lei-jun-test.wav", "./Obama.wav"] + + samples1 = load_audio(filenames[0]) + samples2 = load_audio(filenames[1]) + print(samples1.shape) + print(samples2.shape) + + samples = torch.nn.utils.rnn.pad_sequence( + [torch.from_numpy(samples1), torch.from_numpy(samples2)], + batch_first=True, + padding_value=0, + ) + print(samples.shape) + + sample_rate = 16000 + + start = 0 + window_size = 512 + out = m.audio_forward(samples, torch.tensor([sample_rate]), window_size) + # out: (batch_size, num_frames) + assert out.shape[0] == samples.shape[0], out.shape + print(out.shape) + threshold = 0.5 + out = out > threshold + min_speech_duration = 0.25 * sample_rate / window_size + min_silence_duration = 0.25 * sample_rate / window_size + + indexes = torch.nonzero(out, as_tuple=False) + duration = [samples1.shape[0] / sample_rate, samples2.shape[0] / sample_rate] + + for i in range(samples.shape[0]): + w = indexes[indexes[:, 0] == i, 1].tolist() + + result = [] + start = last = w[0] + for k in w[1:]: + if k - last < min_speech_duration: + last = k + continue + else: + if last - start > min_speech_duration: + result.append((start, last)) + start = last = k + + if last - start > min_speech_duration: + result.append((start, last)) + + final = [result[0]] + for r in result[1:]: + f = final[-1] + if r[0] - f[1] < min_silence_duration: + final[-1] = (f[0], r[1]) + else: + final.append(r) + + final = filter(lambda f: f[1] - f[0] > min_speech_duration, final) + + print(f"----------{filenames[i]}----------") + for f in final: + start = f[0] * window_size / sample_rate + end = f[1] * window_size / sample_rate + if start > duration[i] or end > duration[i]: + break + print("{:.3f} -- {:.3f}".format(start, end)) + + +if __name__ == "__main__": + main() diff --git a/scripts/silero-vad/test-v4.py b/scripts/silero-vad/test-v4.py new file mode 100755 index 000000000..4df982152 --- /dev/null +++ b/scripts/silero-vad/test-v4.py @@ -0,0 +1,93 @@ +#!/usr/bin/env python3 +# Copyright 2025 Xiaomi Corp. 
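Unit bookkeeping for the thresholds above: silero-vad emits one probability per 512-sample window, so at 16 kHz the 0.25 s minimum speech/silence durations correspond to about 7.8 frames, and a frame index converts back to seconds via window_size / sample_rate. A short check of that arithmetic:

```
sample_rate = 16000
window_size = 512  # samples per output frame

frames_per_second = sample_rate / window_size          # 31.25
min_speech_frames = 0.25 * sample_rate / window_size   # 7.8125
seconds_per_frame = window_size / sample_rate          # 0.032

print(frames_per_second, min_speech_frames, seconds_per_frame)
```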
(authors: Fangjun Kuang) + +import torch +import numpy as np +import soundfile as sf +import librosa + + +def load_audio(filename: str) -> np.ndarray: + data, sample_rate = sf.read( + filename, + always_2d=True, + dtype="float32", + ) + data = data[:, 0] # use only the first channel + samples = np.ascontiguousarray(data) + + if sample_rate != 16000: + samples = librosa.resample( + samples, + orig_sr=sample_rate, + target_sr=16000, + ) + + return samples + + +@torch.inference_mode() +def main(): + m = torch.jit.load("./silero-vad-v4.pt") + m.eval() + + samples = load_audio("./lei-jun-test.wav") + # samples = load_audio("./Obama.wav") + print(samples.shape) + + batch_size = 1 + h = torch.zeros(2, batch_size, 64, dtype=torch.float32) + c = torch.zeros(2, batch_size, 64, dtype=torch.float32) + print(h.shape, c.shape) + + sample_rate = 16000 + + start = 0 + window_size = 512 + out = m.audio_forward( + torch.from_numpy(samples), torch.tensor([sample_rate]), window_size + ) + # out: (batch_size, num_frames) + assert out.shape[0] == batch_size, out.shape + threshold = 0.5 + out = out > threshold + min_speech_duration = 0.25 * sample_rate / window_size + min_silence_duration = 0.25 * sample_rate / window_size + print("min_speech_duration", min_speech_duration) + for i in range(batch_size): + w = out[i].tolist() + + result = [] + last = -1 + for k, f in enumerate(w): + if f >= threshold: + if last == -1: + last = k + elif last != -1: + if k - last > min_speech_duration: + result.append((last, k)) + last = -1 + + if last != -1 and k - last > min_speech_duration: + result.append((last, k)) + + if not result: + continue + print(result) + + final = [result[0]] + for r in result[1:]: + f = final[-1] + if r[0] - f[1] < min_silence_duration: + final[-1] = (f[0], r[1]) + else: + final.append(r) + + for f in final: + start = f[0] * window_size / sample_rate + end = f[1] * window_size / sample_rate + print("{:.3f} -- {:.3f}".format(start, end)) + + +if __name__ == "__main__": + main() diff --git a/scripts/silero-vad/test-v5.py b/scripts/silero-vad/test-v5.py new file mode 100755 index 000000000..da37951d4 --- /dev/null +++ b/scripts/silero-vad/test-v5.py @@ -0,0 +1,93 @@ +#!/usr/bin/env python3 +# Copyright 2025 Xiaomi Corp. 
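The merging step in the test scripts fuses segments whose gap is shorter than min_silence_duration (measured in frames). A worked example with made-up frame indices:

```
# Illustrative frame indices only.
min_silence_duration = 7.8125  # 0.25 s at 16 kHz with 512-sample windows

result = [(10, 40), (45, 80), (200, 260)]  # (start_frame, end_frame)
final = [result[0]]
for r in result[1:]:
    f = final[-1]
    if r[0] - f[1] < min_silence_duration:
        final[-1] = (f[0], r[1])  # gap of 5 frames -> merge
    else:
        final.append(r)

print(final)  # [(10, 80), (200, 260)]
```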
(authors: Fangjun Kuang) + +import torch +import numpy as np +import soundfile as sf +import librosa + + +def load_audio(filename: str) -> np.ndarray: + data, sample_rate = sf.read( + filename, + always_2d=True, + dtype="float32", + ) + data = data[:, 0] # use only the first channel + samples = np.ascontiguousarray(data) + + if sample_rate != 16000: + samples = librosa.resample( + samples, + orig_sr=sample_rate, + target_sr=16000, + ) + + return samples + + +@torch.inference_mode() +def main(): + m = torch.jit.load("./silero-vad-v5.pt") + m.eval() + + samples = load_audio("./lei-jun-test.wav") + # samples = load_audio("./Obama.wav") + print(samples.shape) + + batch_size = 1 + h = torch.zeros(2, batch_size, 64, dtype=torch.float32) + c = torch.zeros(2, batch_size, 64, dtype=torch.float32) + print(h.shape, c.shape) + + sample_rate = 16000 + + start = 0 + window_size = 512 + out = m.audio_forward( + torch.from_numpy(samples), torch.tensor([sample_rate]), window_size + ) + # out: (batch_size, num_frames) + assert out.shape[0] == batch_size, out.shape + threshold = 0.5 + out = out > threshold + min_speech_duration = 0.25 * sample_rate / window_size + min_silence_duration = 0.25 * sample_rate / window_size + print("min_speech_duration", min_speech_duration) + for i in range(batch_size): + w = out[i].tolist() + + result = [] + last = -1 + for k, f in enumerate(w): + if f >= threshold: + if last == -1: + last = k + elif last != -1: + if k - last > min_speech_duration: + result.append((last, k)) + last = -1 + + if last != -1 and k - last > min_speech_duration: + result.append((last, k)) + + if not result: + continue + print(result) + + final = [result[0]] + for r in result[1:]: + f = final[-1] + if r[0] - f[1] < min_silence_duration: + final[-1] = (f[0], r[1]) + else: + final.append(r) + + for f in final: + start = f[0] * window_size / sample_rate + end = f[1] * window_size / sample_rate + print("{:.3f} -- {:.3f}".format(start, end)) + + +if __name__ == "__main__": + main() diff --git a/scripts/whisper/README.md b/scripts/whisper/README.md new file mode 100644 index 000000000..7e7c2518b --- /dev/null +++ b/scripts/whisper/README.md @@ -0,0 +1,7 @@ +# Introduction + +Models in this file are converted from +https://github.com/openai/whisper +using the following script +https://github.com/k2-fsa/sherpa/blob/master/scripts/whisper/run.sh + diff --git a/scripts/whisper/export.py b/scripts/whisper/export.py new file mode 100755 index 000000000..df28a759b --- /dev/null +++ b/scripts/whisper/export.py @@ -0,0 +1,622 @@ +#!/usr/bin/env python3 +# Copyright 2025 Xiaomi Corp. (authors: Fangjun Kuang) +# flake8: noqa + +""" +Note: Code in this file is modified from +https://github.com/TadaoYamaoka/whisper/blob/main/to_onnx.py + +Thanks to https://github.com/TadaoYamaoka +for making the onnx export script public. 
+""" + +import argparse +from pathlib import Path +from typing import Optional, Tuple + +import torch +import torch.nn.functional as F +from torch import Tensor, nn + +import whisper +from whisper.model import ( + AudioEncoder, + Conv1d, + LayerNorm, + MultiHeadAttention, + ResidualAttentionBlock, + TextDecoder, +) + +torch.set_num_threads(1) +torch.set_num_interop_threads(1) + + +def get_args(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--model", + type=str, + required=True, + # fmt: off + choices=[ + "tiny", "tiny.en", "base", "base.en", + "small", "small.en", "medium", "medium.en", + "large-v1", "large-v2", + "large", "large-v3", "turbo", # these three have feature dim 128 + "distil-medium.en", "distil-small.en", "distil-large-v2", + # "distil-large-v3", # distil-large-v3 is not supported! + # for fine-tuned models from icefall + "medium-aishell", + ], + # fmt: on + ) + return parser.parse_args() + + +# Copied from https://pytorch.org/docs/1.9.0/_modules/torch/nn/modules/module.html#Module.get_submodule # noqa +# get_submodule was added to nn.Module at v1.9.0 +def get_submodule(model, target): + if target == "": + return model + atoms: List[str] = target.split(".") + mod: torch.nn.Module = model + for item in atoms: + if not hasattr(mod, item): + raise AttributeError( + mod._get_name() + " has no " "attribute `" + item + "`" + ) + mod = getattr(mod, item) + if not isinstance(mod, torch.nn.Module): + raise AttributeError("`" + item + "` is not " "an nn.Module") + return mod + + +class ModifiedConv1d(nn.Module): + """ + This class is to fix the following error: + + RuntimeError: + 'Tensor' object has no attribute or method '_conv_forward'.: + File "/Users/fangjun/py38/lib/python3.8/site-packages/whisper/model.py", line 48 + self, x: Tensor, weight: Tensor, bias: Optional[Tensor] + ) -> Tensor: + return super()._conv_forward( + ~~~~~~~~~~~~~~~~~~~ <--- HERE + x, weight.to(x.dtype), None if bias is None else bias.to(x.dtype) + ) + 'Conv1d._conv_forward' is being compiled since it was called from 'Conv1d.forward' + File "/Users/fangjun/py38/lib/python3.8/site-packages/torch/nn/modules/conv.py", line 310 + def forward(self, input: Tensor) -> Tensor: + return self._conv_forward(input, self.weight, self.bias) + ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ <--- HERE + """ + + def __init__(self, m): + super().__init__() + self.conv = nn.Conv1d( + m.in_channels, + m.out_channels, + kernel_size=m.kernel_size, + padding=m.padding, + stride=m.stride, + ) + self.conv.weight = m.weight + self.conv.bias = m.bias + + def forward(self, x: torch.Tensor): + return self.conv(x) + + +class ModifiedLayerNorm(torch.nn.Module): + """ + This class is to fix the following error: + + RuntimeError: + 'Tensor' object has no attribute or method 'forward'.: + File "/Users/fangjun/py38/lib/python3.8/site-packages/whisper/model.py", line 32 + def forward(self, x: Tensor) -> Tensor: + return super().forward(x.float()).type(x.dtype) + ~~~~~~~~~~~~~ <--- HERE + """ + + def __init__(self, m): + super().__init__() + self.layer = nn.LayerNorm(m.normalized_shape) + self.layer.weight = m.weight + self.layer.bias = m.bias + + def forward(self, x: torch.Tensor) -> torch.Tensor: + return self.layer(x) + + +class AudioEncoderTensorCache(nn.Module): + """ + It wraps the whisper encoder model. + + The output from whisper encoder is used to pre-compute the cross_attn_key + and cross_attn_value. 
+ """ + + def __init__(self, inAudioEncoder: AudioEncoder, inTextDecoder: TextDecoder): + super().__init__() + self.audioEncoder = inAudioEncoder + self.textDecoder = inTextDecoder + + def forward(self, x: Tensor): + audio_features = self.audioEncoder(x) + + n_layer_cross_k_list: List[torch.Tensor] = [] + n_layer_cross_v_list: List[torch.Tensor] = [] + for block in self.textDecoder.blocks: + n_layer_cross_k_list.append(block.cross_attn.key(audio_features)) + n_layer_cross_v_list.append(block.cross_attn.value(audio_features)) + + return torch.stack(n_layer_cross_k_list), torch.stack(n_layer_cross_v_list) + + +class MultiHeadAttentionCross(nn.Module): + def __init__(self, inMultiHeadAttention: MultiHeadAttention): + super().__init__() + self.multiHeadAttention = inMultiHeadAttention + + def forward( + self, + x: Tensor, + k: Tensor, + v: Tensor, + mask: Optional[Tensor] = None, + ): + q = self.multiHeadAttention.query(x) + # Note that k and v are from self.multiHeadAttention.key(x) + # and self.multiHeadAttention.value(x), so there is no need + # to compute them here + + n_batch, n_ctx, n_state = q.shape + scale = (n_state // self.multiHeadAttention.n_head) ** -0.25 + q = ( + q.view(q.shape[0], q.shape[1], self.multiHeadAttention.n_head, -1).permute( + 0, 2, 1, 3 + ) + * scale + ) + k = ( + k.view(k.shape[0], k.shape[1], self.multiHeadAttention.n_head, -1).permute( + 0, 2, 3, 1 + ) + * scale + ) + v = v.view(v.shape[0], v.shape[1], self.multiHeadAttention.n_head, -1).permute( + 0, 2, 1, 3 + ) + + qk = q @ k + if mask is not None: + qk = qk + mask[:n_ctx, :n_ctx] + + w = F.softmax(qk, dim=-1).to(q.dtype) + wv = (w @ v).permute(0, 2, 1, 3).flatten(start_dim=2) + + return self.multiHeadAttention.out(wv) + + +class MultiHeadAttentionSelf(nn.Module): + def __init__(self, inMultiHeadAttention: MultiHeadAttention): + super().__init__() + self.multiHeadAttention = inMultiHeadAttention + + def forward( + self, + x: Tensor, # (b, n_ctx , n_state) + k_cache: Tensor, # (b, n_ctx_cache, n_state) + v_cache: Tensor, # (b, n_ctx_cache, n_state) + mask: Tensor, + ): + q = self.multiHeadAttention.query(x) # (b, n_ctx, n_state) + k = self.multiHeadAttention.key(x) # (b, n_ctx, n_state) + v = self.multiHeadAttention.value(x) # (b, n_ctx, n_state) + + k_cache[:, -k.shape[1] :, :] = k # (b, n_ctx_cache + n_ctx, n_state) + v_cache[:, -v.shape[1] :, :] = v # (b, n_ctx_cache + n_ctx, n_state) + + k = k_cache + v = v_cache + + n_batch, n_ctx, n_state = q.shape + scale = (n_state // self.multiHeadAttention.n_head) ** -0.25 + q = ( + q.view(q.shape[0], q.shape[1], self.multiHeadAttention.n_head, -1).permute( + 0, 2, 1, 3 + ) + * scale + ) + k = ( + k_cache.view( + k.shape[0], k.shape[1], self.multiHeadAttention.n_head, -1 + ).permute(0, 2, 3, 1) + * scale + ) + v = v.view(v.shape[0], v.shape[1], self.multiHeadAttention.n_head, -1).permute( + 0, 2, 1, 3 + ) + + qk = q @ k + if mask is not None: + qk = qk + mask[:n_ctx, :n_ctx] + + w = F.softmax(qk, dim=-1).to(q.dtype) + wv = (w @ v).permute(0, 2, 1, 3).flatten(start_dim=2) + + return self.multiHeadAttention.out(wv), k_cache, v_cache + + +class ResidualAttentionBlockTensorCache(nn.Module): + def __init__(self, inResidualAttentionBlock: ResidualAttentionBlock): + super().__init__() + self.originalBlock = inResidualAttentionBlock + self.attn = MultiHeadAttentionSelf(inResidualAttentionBlock.attn) + self.cross_attn = ( + MultiHeadAttentionCross(inResidualAttentionBlock.cross_attn) + if inResidualAttentionBlock.cross_attn + else None + ) + + def forward( + self, + x: Tensor, 
+ self_k_cache: Tensor, + self_v_cache: Tensor, + cross_k: Tensor, + cross_v: Tensor, + mask: Tensor, + ): + self_attn_x, self_k_cache_updated, self_v_cache_updated = self.attn( + self.originalBlock.attn_ln(x), self_k_cache, self_v_cache, mask=mask + ) + x = x + self_attn_x + + if self.cross_attn is not None: + x = x + self.cross_attn( + self.originalBlock.cross_attn_ln(x), cross_k, cross_v + ) + + x = x + self.originalBlock.mlp(self.originalBlock.mlp_ln(x)) + return x, self_k_cache_updated, self_v_cache_updated + + +class TextDecoderTensorCache(nn.Module): + def __init__(self, inTextDecoder: TextDecoder, in_n_ctx: int): + super().__init__() + self.textDecoder = inTextDecoder + self.n_ctx = in_n_ctx + + self.blocks = nn.ModuleList() + for orginal_block in self.textDecoder.blocks: + self.blocks.append(ResidualAttentionBlockTensorCache(orginal_block)) + + def forward( + self, + tokens: Tensor, + n_layer_self_k_cache: Tensor, + n_layer_self_v_cache: Tensor, + n_layer_cross_k: Tensor, + n_layer_cross_v: Tensor, + offset: Tensor, + ): + offset = offset.int() + x = ( + self.textDecoder.token_embedding(tokens) + + self.textDecoder.positional_embedding[ + offset[0] : offset[0] + tokens.shape[-1] + ] + ) + x = x.to(n_layer_cross_k[0].dtype) + + i = 0 + for block in self.blocks: + self_k_cache = n_layer_self_k_cache[i, :, : offset[0] + tokens.shape[-1], :] + self_v_cache = n_layer_self_v_cache[i, :, : offset[0] + tokens.shape[-1], :] + x, self_k_cache, self_v_cache = block( + x, + self_k_cache=self_k_cache, + self_v_cache=self_v_cache, + cross_k=n_layer_cross_k[i], + cross_v=n_layer_cross_v[i], + mask=self.textDecoder.mask, + ) + n_layer_self_k_cache[i, :, : offset[0] + tokens.shape[-1], :] = self_k_cache + n_layer_self_v_cache[i, :, : offset[0] + tokens.shape[-1], :] = self_v_cache + i += 1 + + x = self.textDecoder.ln(x) + + # x.shape (1, 3, 384) + # weight.shape (51684, 384) + + logits = x @ torch.transpose( + self.textDecoder.token_embedding.weight.to(x.dtype), 0, 1 + ) + + return logits, n_layer_self_k_cache, n_layer_self_v_cache + + +@torch.jit.export +def MultiHeadAttentionForwardEncoder(self, x: torch.Tensor) -> torch.Tensor: + q = self.query(x) + + k = self.key(x) + v = self.value(x) + + n_batch, n_ctx, n_state = q.shape + scale = (n_state // self.n_head) ** -0.25 + q = q.view(q.shape[0], q.shape[1], self.n_head, -1).permute(0, 2, 1, 3) * scale + k = k.view(k.shape[0], k.shape[1], self.n_head, -1).permute(0, 2, 3, 1) * scale + v = v.view(v.shape[0], v.shape[1], self.n_head, -1).permute(0, 2, 1, 3) + + qk = q @ k + + w = F.softmax(qk, dim=-1).to(q.dtype) + wv = (w @ v).permute(0, 2, 1, 3).flatten(start_dim=2) + + return self.out(wv) + + +@torch.jit.export +def ResidualAttentionBlockForwardEncoder(self, x: torch.Tensor) -> torch.Tensor: + x = x + self.attn.forward_encoder(self.attn_ln(x))[0] + x = x + self.mlp(self.mlp_ln(x)) + return x + + +def AudioEncoderForward(self, x: torch.Tensor) -> torch.Tensor: + """ + x : torch.Tensor, shape = (batch_size, n_mels, n_ctx) + the mel spectrogram of the audio + """ + x = F.gelu(self.conv1(x)) + x = F.gelu(self.conv2(x)) + x = x.permute(0, 2, 1) + + assert x.shape[1:] == self.positional_embedding.shape, "incorrect audio shape" + x = (x + self.positional_embedding).to(x.dtype) + + for block in self.blocks: + x = block.forward_encoder(x) + + x = self.ln_post(x) + return x + + +class Whisper(torch.nn.Module): + def __init__(self, whisper): + super().__init__() + self.encoder = AudioEncoderTensorCache(whisper.encoder, whisper.decoder) + self.decoder = 
TextDecoderTensorCache(whisper.decoder, whisper.dims.n_text_ctx) + + @torch.jit.ignore() + def forward(): + pass + + @torch.jit.export + def run_encoder(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Args: + x: A 3-D torch tensor of shape (batch_size, dim, T) + Returns: + Return a tuple of two tensors: + - n_layer_cross_k: A 4-D tensor of shape (num_layers, batch_size, T, dim) + - n_layer_cross_v: A 4-D tensor of shape (num_layers, batch_size, T, dim) + """ + return self.encoder(x) + + @torch.jit.export + def run_decoder( + self, + tokens: torch.Tensor, + n_layer_self_k_cache: torch.Tensor, + n_layer_self_v_cache: torch.Tensor, + n_layer_cross_k: torch.Tensor, + n_layer_cross_v: torch.Tensor, + offset: torch.Tensor, + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + """ + Args: + tokens: A 2-D tensor of shape (batch_size, num_tokens) + n_layer_self_k_cache: A 4-D tensor of shape (num_layers, batch_size, T, dim) + n_layer_self_v_cache: A 4-D tensor of shape (num_layers, batch_size, T, dim) + n_layer_cross_k: A 4-D tensor of shape (num_layers, batch_size, T, dim) + n_layer_cross_v: A 4-D tensor of shape (num_layers, batch_size, T, dim) + offset: A 1-D tensor of shape (batch_size,) + Returns: + Return a tuple of 3 tensors: + - logits: A 3-D tensor of shape (batch_size, num_tokens, dim) + - next_n_layer_self_k_cache, same shape as n_layer_self_k_cache + - next_n_layer_self_v_cache, same shape as n_layer_self_v_cache + """ + return self.decoder( + tokens, + n_layer_self_k_cache, + n_layer_self_v_cache, + n_layer_cross_k, + n_layer_cross_v, + offset, + ) + + +# ref: https://github.com/ggerganov/whisper.cpp/blob/master/models/convert-pt-to-ggml.py#L232 +def generate_tokens(model): + whisper_dir = Path(whisper.__file__).parent + multilingual = model.is_multilingual + tokenizer = ( + whisper_dir + / "assets" + / (multilingual and "multilingual.tiktoken" or "gpt2.tiktoken") + ) + if not tokenizer.is_file(): + raise ValueError(f"Cannot find {tokenizer}") + + with open(tokenizer, "r") as f: + contents = f.read() + tokens = { + token: int(rank) + for token, rank in (line.split() for line in contents.splitlines() if line) + } + + with open(f"tokens.txt", "w") as f: + for t, i in tokens.items(): + f.write(f"{t} {i}\n") + + +def main(): + args = get_args() + name = args.model + print(args) + print(name) + + if name == "distil-medium.en": + filename = "./distil-medium-en-original-model.bin" + if not Path(filename).is_file(): + raise ValueError( + """ + Please go to https://huggingface.co/distil-whisper/distil-medium.en + to download original-model.bin + You can use the following command to do that: + + wget -O distil-medium-en-original-model.bin https://huggingface.co/distil-whisper/distil-medium.en/resolve/main/original-model.bin + """ + ) + model = whisper.load_model(filename) + elif name == "distil-large-v2": + filename = "./distil-large-v2-original-model.bin" + if not Path(filename).is_file(): + raise ValueError( + """ + Please go to https://huggingface.co/distil-whisper/distil-large-v2 + to download original-model.bin + You can use the following command to do that: + + wget -O distil-large-v2-original-model.bin https://huggingface.co/distil-whisper/distil-large-v2/resolve/main/original-model.bin + """ + ) + model = whisper.load_model(filename) + elif name == "distil-small.en": + filename = "./distil-small-en-original-model.bin" + if not Path(filename).is_file(): + raise ValueError( + """ + Please go to https://huggingface.co/distil-whisper/distil-small.en + to download 
original-model.bin + You can use the following command to do that: + + wget -O distil-small-en-original-model.bin https://huggingface.co/distil-whisper/distil-small.en/resolve/main/original-model.bin + """ + ) + model = whisper.load_model(filename) + elif name == "medium-aishell": + filename = "./medium-aishell.pt" + if not Path(filename).is_file(): + raise ValueError( + """ + Please go to https://huggingface.co/yuekai/icefall_asr_aishell_whisper/tree/main/exp_medium + to download whisper-medium-aishell1-epoch-10-avg-4.pt + You can use the following command to do that: + + wget -O medium-aishell.pt https://huggingface.co/yuekai/icefall_asr_aishell_whisper/resolve/main/exp_medium/whisper-medium-aishell1-epoch-10-avg-4.pt + """ + ) + model = whisper.load_model(filename) + else: + model = whisper.load_model(name) + + print(model.dims) + + generate_tokens(model) + + model.decoder.blocks[0].attn.__class__.forward = torch.jit.ignore( + model.decoder.blocks[0].attn.__class__.forward + ) + + model.decoder.blocks[0].cross_attn.__class__.forward = torch.jit.ignore( + model.decoder.blocks[0].cross_attn.__class__.forward + ) + + model.encoder.blocks[0].attn.__class__.forward = torch.jit.ignore( + model.encoder.blocks[0].attn.__class__.forward + ) + + model.encoder.blocks[0].__class__.forward = torch.jit.ignore( + model.encoder.blocks[0].__class__.forward + ) + + model.decoder.__class__.forward = torch.jit.ignore(model.decoder.__class__.forward) + + d = {} + for name, m in model.named_modules(): + if isinstance(m, LayerNorm): + d[name] = ModifiedLayerNorm(m) + elif isinstance(m, Conv1d): + d[name] = ModifiedConv1d(m) + + for k, v in d.items(): + if "." in k: + parent, child = k.rsplit(".", maxsplit=1) + setattr(get_submodule(model, parent), child, v) + else: + setattr(model, k, v) + + w = Whisper(model) + + tokenizer = whisper.tokenizer.get_tokenizer( + model.is_multilingual, num_languages=model.num_languages + ) + + meta_data = { + "model_type": "whisper", + "comment": f"whisper-{args.model}", + "version": "1", + "maintainer": "k2-fsa", + "n_mels": str(model.dims.n_mels), + "n_audio_ctx": str(model.dims.n_audio_ctx), + "n_audio_state": str(model.dims.n_audio_state), + "n_audio_head": str(model.dims.n_audio_head), + "n_audio_layer": str(model.dims.n_audio_layer), + "n_vocab": str(model.dims.n_vocab), + "n_text_ctx": str(model.dims.n_text_ctx), + "n_text_state": str(model.dims.n_text_state), + "n_text_head": str(model.dims.n_text_head), + "n_text_layer": str(model.dims.n_text_layer), + "sot_sequence": ",".join(list(map(str, tokenizer.sot_sequence))), + "all_language_tokens": ",".join( + list(map(str, tokenizer.all_language_tokens)) + ), # a list of ids + "all_language_codes": ",".join( + tokenizer.all_language_codes + ), # e.g., en, de, zh, fr + "sot": str(tokenizer.sot), + "sot_index": str(tokenizer.sot_sequence.index(tokenizer.sot)), + "eot": str(tokenizer.eot), + "blank_id": str(tokenizer.encode(" ")[0]), + "is_multilingual": str(int(model.is_multilingual)), + "no_speech": str(tokenizer.no_speech), + "non_speech_tokens": ",".join(list(map(str, tokenizer.non_speech_tokens))), + "transcribe": str(tokenizer.transcribe), + "translate": str(tokenizer.translate), + "sot_prev": str(tokenizer.sot_prev), + "sot_lm": str(tokenizer.sot_lm), + "no_timestamps": str(tokenizer.no_timestamps), + } + + m = torch.jit.script(w) + m.save("model.pt", _extra_files=meta_data) + + print(meta_data) + + num_param = sum([p.numel() for p in w.parameters()]) + print(f"Number of model parameters: {num_param}") + + +if __name__ == 
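For context, a rough sketch (not sherpa's actual runtime code) of how the exported run_encoder/run_decoder pair can be driven for greedy decoding. The mel features are a placeholder, and the dimensions and token ids below are the multilingual "tiny" values listed in notes.md; real code should read them from the _extra_files metadata instead of hard-coding them:

```
import torch

model = torch.jit.load("model.pt")
model.eval()

# Illustrative values for the multilingual "tiny" model (see notes.md).
n_text_layer = 4
n_text_ctx = 448
n_text_state = 384
sot_sequence = [50258, 50259, 50359]
eot = 50257

mel = torch.zeros(1, 80, 3000)  # placeholder log-mel features, (batch, n_mels, T)

n_layer_cross_k, n_layer_cross_v = model.run_encoder(mel)

n_layer_self_k_cache = torch.zeros(n_text_layer, 1, n_text_ctx, n_text_state)
n_layer_self_v_cache = torch.zeros(n_text_layer, 1, n_text_ctx, n_text_state)

tokens = torch.tensor([sot_sequence], dtype=torch.int64)
offset = torch.zeros(1, dtype=torch.int64)
results = []

for _ in range(n_text_ctx - len(sot_sequence)):
    logits, n_layer_self_k_cache, n_layer_self_v_cache = model.run_decoder(
        tokens,
        n_layer_self_k_cache,
        n_layer_self_v_cache,
        n_layer_cross_k,
        n_layer_cross_v,
        offset,
    )
    offset += tokens.shape[1]
    next_token = int(logits[0, -1].argmax())
    if next_token == eot:
        break
    results.append(next_token)
    tokens = torch.tensor([[next_token]], dtype=torch.int64)

print(results)  # token ids; map them to text with tokens.txt
```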
"__main__": + ResidualAttentionBlock.forward_encoder = ResidualAttentionBlockForwardEncoder + MultiHeadAttention.forward_encoder = MultiHeadAttentionForwardEncoder + AudioEncoder.forward = AudioEncoderForward + main() diff --git a/scripts/whisper/notes.md b/scripts/whisper/notes.md new file mode 100644 index 000000000..949c573b1 --- /dev/null +++ b/scripts/whisper/notes.md @@ -0,0 +1,169 @@ +# Info about whisper models + +## turbo + +``` +ModelDimensions(n_mels=128, n_audio_ctx=1500, n_audio_state=1280, n_audio_head=20, n_audio_layer=32, n_vocab=51866, n_text_ctx=448, n_text_state=1280, n_text_head=20, n_text_layer=4) +{'model_type': 'whisper', 'comment': 'whisper-turbo', 'version': '1', 'maintainer': 'k2-fsa', 'n_mels': '128', 'n_audio_ctx': '1500', 'n_audio_state': '1280', 'n_audio_head': '20', 'n_audio_layer': '32', 'n_vocab': '51866', 'n_text_ctx': '448', 'n_text_state': '1280', 'n_text_head': '20', 'n_text_layer': '4', 'sot_sequence': '50258,50259,50360', 'all_language_tokens': '50293,50269,50310,50265,50259,50317,50286,50342,50339,50282,50295,50297,50323,50344,50267,50289,50340,50273,50345,50268,50266,50299,50296,50311,50351,50330,50316,50276,50279,50327,50305,50285,50302,50348,50341,50332,50274,50288,50322,50328,50260,50309,50338,50324,50278,50301,50303,50290,50334,50326,50313,50319,50314,50318,50300,50346,50281,50325,50298,50306,50264,50358,50321,50271,50277,50287,50347,50272,50304,50352,50329,50350,50357,50356,50335,50283,50333,50337,50294,50353,50308,50320,50270,50284,50336,50275,50343,50292,50307,50355,50312,50349,50315,50291,50261,50263,50262,50280,50331,50354', 'all_language_codes': 'lt,pl,eu,fr,en,sq,hu,nn,ht, +Number of model parameters: 806958080 +-rw-r--r-- 1 runner staff 3.0G Jan 8 04:24 model.pt +-rw-r--r-- 1 runner staff 798K Jan 8 04:24 tokens.txt +``` + +## distil-medium.en + +``` +ModelDimensions(n_mels=80, n_audio_ctx=1500, n_audio_state=1024, n_audio_head=16, n_audio_layer=24, n_vocab=51864, n_text_ctx=448, n_text_state=1024, n_text_head=16, n_text_layer=2) +{'model_type': 'whisper', 'comment': 'whisper-distil-medium.en', 'version': '1', 'maintainer': 'k2-fsa', 'n_mels': '80', 'n_audio_ctx': '1500', 'n_audio_state': '1024', 'n_audio_head': '16', 'n_audio_layer': '24', 'n_vocab': '51864', 'n_text_ctx': '448', 'n_text_state': '1024', 'n_text_head': '16', 'n_text_layer': '2', 'sot_sequence': '50257', 'all_language_tokens': '50266,50295,50285,50347,50327,50316,50280,50284,50332,50355,50303,50282,50277,50281,50315,50270,50348,50283,50269,50345,50323,50331,50292,50334,50289,50341,50337,50290,50299,50343,50330,50339,50319,50350,50298,50259,50265,50321,50340,50296,50314,50293,50260,50310,50267,50273,50301,50338,50297,50333,50300,50352,50287,50329,50294,50346,50279,50304,50342,50344,50305,50349,50351,50274,50353,50313,50354,50356,50320,50264,50324,50288,50275,50258,50322,50308,50271,50325,50326,50318,50276,50291,50311,50309,50262,50268,50328,50272,50335,50261,50286,50307,50336,50278,50317,50312,50306,50263,50302', 'all_language_codes': 'pt,ml,hu,tl,oc,sq,el,da,gu,jw,az,cs,vi,ms,kk,nl,mg,ro,ca,my,sn,sd,lt,yi,ur,nn,fo,hr,fa,sa,tg,ps,mr,tt,te,zh,ja,si,tk,cy,bs,la,de,is,tr,it,bn,ht,sk,am,lv,ln,no,be,mi,bo,uk,sl,mt,lb,kn,as,haw,id,ha,mn,ba,su,pa,fr,yo,th,hi,en,km,br,ar,so,af,gl,fi,bg,hy,eu,ru,pl,ka,sv,lo,es,ta,mk,uz,he,sw,ne,et,ko,sr', 'sot': '50257', 'sot_index': '0', 'eot': '50256', 'blank_id': '220', 'is_multilingual': '0', 'no_speech': '50361', 'non_speech_tokens': 
'1,2,7,8,9,10,14,25,26,27,28,29,31,58,59,60,61,62,63,90,91,92,93,357,366,438,532,685,705,796,930,1058,1220,1267,1279,1303,1343,1377,1391,1635,1782,1875,2162,2361,2488,3467,4008,4211,4600,4808,5299,5855,6329,7203,9609,9959,10563,10786,11420,11709,11907,13163,13697,13700,14808,15306,16410,16791,17992,19203,19510,20724,22305,22935,27007,30109,30420,33409,34949,40283,40493,40549,47282,49146', 'transcribe': '50358', 'translate': '50357', 'sot_prev': '50360', 'sot_lm': '50359', 'no_timestamps': '50362'} +Number of model parameters: 392839168 +-rw-r--r-- 1 runner staff 1.5G Jan 8 03:54 model.pt +-rw-r--r-- 1 runner staff 816K Jan 8 03:53 tokens.txt +``` + +## distil-small.en + +``` +ModelDimensions(n_mels=80, n_audio_ctx=1500, n_audio_state=768, n_audio_head=12, n_audio_layer=12, n_vocab=51864, n_text_ctx=448, n_text_state=768, n_text_head=12, n_text_layer=4) +{'model_type': 'whisper', 'comment': 'whisper-distil-small.en', 'version': '1', 'maintainer': 'k2-fsa', 'n_mels': '80', 'n_audio_ctx': '1500', 'n_audio_state': '768', 'n_audio_head': '12', 'n_audio_layer': '12', 'n_vocab': '51864', 'n_text_ctx': '448', 'n_text_state': '768', 'n_text_head': '12', 'n_text_layer': '4', 'sot_sequence': '50257', 'all_language_tokens': '50307,50354,50273,50308,50296,50293,50282,50312,50343,50261,50295,50320,50267,50311,50338,50270,50330,50278,50279,50344,50297,50280,50342,50314,50299,50347,50331,50292,50355,50266,50336,50271,50316,50356,50263,50274,50310,50304,50332,50339,50285,50313,50345,50291,50315,50300,50341,50352,50288,50318,50260,50324,50351,50323,50353,50275,50286,50333,50264,50349,50302,50328,50340,50298,50277,50265,50268,50294,50334,50326,50276,50321,50325,50305,50259,50283,50337,50262,50301,50329,50284,50287,50289,50303,50258,50346,50319,50272,50309,50348,50281,50269,50335,50317,50322,50306,50327,50350,50290', 'all_language_codes': 'mk,ba,it,br,cy,la,cs,ne,sa,es,ml,pa,tr,hy,ht,nl,tg,he,uk,lb,sk,el,mt,bs,fa,tl,sd,lt,jw,pt,uz,ar,sq,su,ko,id,is,sl,gu,ps,hu,mn,my,bg,kk,lv,nn,ln,th,gl,de,yo,haw,sn,ha,hi,ta,am,fr,as,sr,ka,tk,te,vi,ja,pl,mi,yi,af,fi,si,so,kn,zh,ro,fo,ru,bn,be,da,no,ur,az,en,bo,mr,sv,eu,mg,ms,ca,lo,sw,km,et,oc,tt,hr', 'sot': '50257', 'sot_index': '0', 'eot': '50256', 'blank_id': '220', 'is_multilingual': '0', 'no_speech': '50361', 'non_speech_tokens': '1,2,7,8,9,10,14,25,26,27,28,29,31,58,59,60,61,62,63,90,91,92,93,357,366,438,532,685,705,796,930,1058,1220,1267,1279,1303,1343,1377,1391,1635,1782,1875,2162,2361,2488,3467,4008,4211,4600,4808,5299,5855,6329,7203,9609,9959,10563,10786,11420,11709,11907,13163,13697,13700,14808,15306,16410,16791,17992,19203,19510,20724,22305,22935,27007,30109,30420,33409,34949,40283,40493,40549,47282,49146', 'transcribe': '50358', 'translate': '50357', 'sot_prev': '50360', 'sot_lm': '50359', 'no_timestamps': '50362'} +Number of model parameters: 164980224 +-rw-r--r-- 1 runner staff 635M Jan 8 03:53 model.pt +-rw-r--r-- 1 runner staff 816K Jan 8 03:53 tokens.txt +``` + +## tiny.en + +``` +ModelDimensions(n_mels=80, n_audio_ctx=1500, n_audio_state=384, n_audio_head=6, n_audio_layer=4, n_vocab=51864, n_text_ctx=448, n_text_state=384, n_text_head=6, n_text_layer=4) +{'model_type': 'whisper', 'comment': 'whisper-tiny.en', 'version': '1', 'maintainer': 'k2-fsa', 'n_mels': '80', 'n_audio_ctx': '1500', 'n_audio_state': '384', 'n_audio_head': '6', 'n_audio_layer': '4', 'n_vocab': '51864', 'n_text_ctx': '448', 'n_text_state': '384', 'n_text_head': '6', 'n_text_layer': '4', 'sot_sequence': '50257', 'all_language_tokens': 
'50306,50303,50300,50349,50287,50284,50343,50291,50311,50326,50346,50301,50350,50336,50352,50273,50328,50285,50354,50295,50327,50283,50340,50334,50276,50293,50355,50344,50356,50270,50299,50296,50286,50277,50297,50312,50329,50324,50310,50263,50315,50318,50268,50259,50264,50351,50308,50339,50292,50275,50265,50309,50262,50278,50316,50279,50267,50347,50348,50331,50321,50260,50353,50330,50325,50261,50305,50322,50281,50332,50341,50342,50320,50313,50266,50314,50298,50258,50319,50304,50338,50337,50289,50269,50271,50290,50317,50294,50288,50333,50345,50302,50323,50307,50272,50274,50280,50282,50335', 'all_language_codes': 'et,az,lv,as,no,da,sa,bg,hy,af,bo,bn,tt,uz,ln,it,ka,hu,ba,ml,oc,ro,tk,yi,fi,la,jw,lb,su,nl,fa,cy,ta,vi,sk,ne,be,yo,is,ko,kk,gl,pl,zh,fr,haw,br,ps,lt,hi,ja,eu,ru,he,sq,uk,tr,tl,mg,sd,si,de,ha,tg,so,es,kn,km,ms,gu,nn,mt,pa,mn,pt,bs,te,en,mr,sl,ht,fo,ur,ca,ar,hr,sw,mi,th,am,my,sr,sn,mk,sv,id,el,cs,lo', 'sot': '50257', 'sot_index': '0', 'eot': '50256', 'blank_id': '220', 'is_multilingual': '0', 'no_speech': '50361', 'non_speech_tokens': '1,2,7,8,9,10,14,25,26,27,28,29,31,58,59,60,61,62,63,90,91,92,93,357,366,438,532,685,705,796,930,1058,1220,1267,1279,1303,1343,1377,1391,1635,1782,1875,2162,2361,2488,3467,4008,4211,4600,4808,5299,5855,6329,7203,9609,9959,10563,10786,11420,11709,11907,13163,13697,13700,14808,15306,16410,16791,17992,19203,19510,20724,22305,22935,27007,30109,30420,33409,34949,40283,40493,40549,47282,49146', 'transcribe': '50358', 'translate': '50357', 'sot_prev': '50360', 'sot_lm': '50359', 'no_timestamps': '50362'} +Number of model parameters: 37184256 +-rw-r--r-- 1 runner staff 145M Jan 8 04:23 model.pt +-rw-r--r-- 1 runner staff 816K Jan 8 04:23 tokens.txt +``` +## base.en + +``` +ModelDimensions(n_mels=80, n_audio_ctx=1500, n_audio_state=512, n_audio_head=8, n_audio_layer=6, n_vocab=51864, n_text_ctx=448, n_text_state=512, n_text_head=8, n_text_layer=6) +{'model_type': 'whisper', 'comment': 'whisper-base.en', 'version': '1', 'maintainer': 'k2-fsa', 'n_mels': '80', 'n_audio_ctx': '1500', 'n_audio_state': '512', 'n_audio_head': '8', 'n_audio_layer': '6', 'n_vocab': '51864', 'n_text_ctx': '448', 'n_text_state': '512', 'n_text_head': '8', 'n_text_layer': '6', 'sot_sequence': '50257', 'all_language_tokens': '50348,50258,50297,50273,50326,50271,50343,50293,50292,50351,50298,50309,50289,50283,50318,50321,50280,50284,50332,50316,50266,50335,50337,50286,50333,50327,50291,50331,50306,50352,50296,50269,50344,50259,50338,50270,50345,50325,50285,50275,50300,50324,50312,50307,50310,50261,50329,50279,50347,50260,50299,50277,50356,50295,50288,50276,50287,50262,50290,50353,50323,50281,50263,50336,50330,50350,50319,50355,50311,50341,50282,50320,50342,50303,50302,50349,50274,50267,50294,50305,50278,50301,50339,50314,50317,50315,50340,50304,50268,50313,50264,50308,50265,50334,50354,50328,50272,50322,50346', 'all_language_codes': 'mg,en,sk,it,af,ar,sa,la,lt,haw,te,eu,ur,ro,gl,si,el,da,gu,sq,pt,lo,fo,ta,am,oc,bg,sd,et,ln,cy,ca,lb,zh,ht,nl,my,so,hu,hi,lv,yo,ne,mk,is,es,be,uk,tl,de,fa,vi,su,ml,th,fi,no,ru,hr,ha,sn,ms,ko,uz,tg,tt,mr,jw,hy,nn,cs,pa,mt,az,sr,as,id,tr,mi,kn,he,bn,ps,bs,sw,kk,tk,sl,pl,mn,fr,br,ja,yi,ba,ka,sv,km,bo', 'sot': '50257', 'sot_index': '0', 'eot': '50256', 'blank_id': '220', 'is_multilingual': '0', 'no_speech': '50361', 'non_speech_tokens': 
'1,2,7,8,9,10,14,25,26,27,28,29,31,58,59,60,61,62,63,90,91,92,93,357,366,438,532,685,705,796,930,1058,1220,1267,1279,1303,1343,1377,1391,1635,1782,1875,2162,2361,2488,3467,4008,4211,4600,4808,5299,5855,6329,7203,9609,9959,10563,10786,11420,11709,11907,13163,13697,13700,14808,15306,16410,16791,17992,19203,19510,20724,22305,22935,27007,30109,30420,33409,34949,40283,40493,40549,47282,49146', 'transcribe': '50358', 'translate': '50357', 'sot_prev': '50360', 'sot_lm': '50359', 'no_timestamps': '50362'} +Number of model parameters: 71825408 +-rw-r--r-- 1 runner staff 278M Jan 8 03:53 model.pt +-rw-r--r-- 1 runner staff 816K Jan 8 03:53 tokens.txt +``` + +## small.en + +``` +ModelDimensions(n_mels=80, n_audio_ctx=1500, n_audio_state=768, n_audio_head=12, n_audio_layer=12, n_vocab=51864, n_text_ctx=448, n_text_state=768, n_text_head=12, n_text_layer=12) +{'model_type': 'whisper', 'comment': 'whisper-small.en', 'version': '1', 'maintainer': 'k2-fsa', 'n_mels': '80', 'n_audio_ctx': '1500', 'n_audio_state': '768', 'n_audio_head': '12', 'n_audio_layer': '12', 'n_vocab': '51864', 'n_text_ctx': '448', 'n_text_state': '768', 'n_text_head': '12', 'n_text_layer': '12', 'sot_sequence': '50257', 'all_language_tokens': '50289,50340,50349,50353,50296,50322,50291,50278,50330,50316,50348,50267,50263,50297,50260,50318,50300,50327,50310,50308,50355,50334,50277,50311,50336,50276,50350,50281,50293,50295,50284,50341,50309,50269,50328,50258,50261,50352,50326,50323,50301,50307,50285,50345,50292,50268,50331,50270,50294,50265,50346,50271,50279,50313,50315,50266,50283,50333,50329,50321,50274,50312,50304,50325,50343,50342,50262,50264,50275,50259,50344,50317,50290,50299,50356,50332,50337,50272,50347,50286,50288,50298,50351,50338,50306,50287,50303,50354,50280,50302,50320,50335,50319,50339,50273,50314,50305,50324,50282', 'all_language_codes': 'ur,tk,as,ha,cy,km,bg,he,tg,sq,mg,tr,ko,sk,de,gl,lv,oc,is,br,jw,yi,vi,hy,uz,fi,tt,ms,la,ml,da,nn,eu,ca,ka,en,es,ln,af,sn,bn,mk,hu,my,lt,pl,sd,nl,mi,ja,bo,ar,uk,mn,kk,pt,ro,am,be,si,id,ne,sl,so,sa,mt,ru,fr,hi,zh,lb,sw,hr,fa,su,gu,fo,sv,tl,ta,th,te,haw,ht,et,no,az,ba,el,sr,pa,lo,mr,ps,it,bs,kn,yo,cs', 'sot': '50257', 'sot_index': '0', 'eot': '50256', 'blank_id': '220', 'is_multilingual': '0', 'no_speech': '50361', 'non_speech_tokens': '1,2,7,8,9,10,14,25,26,27,28,29,31,58,59,60,61,62,63,90,91,92,93,357,366,438,532,685,705,796,930,1058,1220,1267,1279,1303,1343,1377,1391,1635,1782,1875,2162,2361,2488,3467,4008,4211,4600,4808,5299,5855,6329,7203,9609,9959,10563,10786,11420,11709,11907,13163,13697,13700,14808,15306,16410,16791,17992,19203,19510,20724,22305,22935,27007,30109,30420,33409,34949,40283,40493,40549,47282,49146', 'transcribe': '50358', 'translate': '50357', 'sot_prev': '50360', 'sot_lm': '50359', 'no_timestamps': '50362'} +Number of model parameters: 240582144 +-rw-r--r-- 1 runner staff 923M Jan 8 04:19 model.pt +-rw-r--r-- 1 runner staff 816K Jan 8 04:19 tokens.txt +``` +## medium.en + +``` +ModelDimensions(n_mels=80, n_audio_ctx=1500, n_audio_state=1024, n_audio_head=16, n_audio_layer=24, n_vocab=51864, n_text_ctx=448, n_text_state=1024, n_text_head=16, n_text_layer=24) +{'model_type': 'whisper', 'comment': 'whisper-medium.en', 'version': '1', 'maintainer': 'k2-fsa', 'n_mels': '80', 'n_audio_ctx': '1500', 'n_audio_state': '1024', 'n_audio_head': '16', 'n_audio_layer': '24', 'n_vocab': '51864', 'n_text_ctx': '448', 'n_text_state': '1024', 'n_text_head': '16', 'n_text_layer': '24', 'sot_sequence': '50257', 'all_language_tokens': 
'50287,50303,50335,50285,50347,50291,50263,50322,50338,50329,50308,50278,50271,50288,50354,50332,50266,50326,50261,50281,50348,50337,50272,50283,50269,50334,50320,50307,50324,50331,50270,50342,50356,50336,50317,50301,50302,50312,50277,50340,50316,50273,50333,50297,50289,50355,50262,50268,50323,50280,50330,50306,50295,50292,50286,50299,50259,50282,50343,50300,50310,50350,50339,50345,50314,50296,50290,50341,50351,50275,50321,50344,50313,50325,50284,50276,50318,50279,50304,50328,50346,50309,50319,50293,50298,50353,50260,50327,50267,50311,50349,50305,50352,50315,50294,50264,50274,50265,50258', 'all_language_codes': 'no,az,lo,hu,tl,bg,ko,km,ht,be,br,he,ar,th,ba,gu,pt,af,es,ms,mg,fo,sv,ro,ca,yi,pa,mk,yo,sd,nl,mt,su,uz,sw,bn,sr,ne,vi,tk,sq,it,am,sk,ur,jw,ru,pl,sn,el,tg,et,ml,lt,ta,fa,zh,cs,sa,lv,is,tt,ps,my,bs,cy,hr,nn,haw,hi,si,lb,mn,so,da,fi,gl,uk,sl,ka,bo,eu,mr,la,te,ha,de,oc,tr,hy,as,kn,ln,kk,mi,fr,id,ja,en', 'sot': '50257', 'sot_index': '0', 'eot': '50256', 'blank_id': '220', 'is_multilingual': '0', 'no_speech': '50361', 'non_speech_tokens': '1,2,7,8,9,10,14,25,26,27,28,29,31,58,59,60,61,62,63,90,91,92,93,357,366,438,532,685,705,796,930,1058,1220,1267,1279,1303,1343,1377,1391,1635,1782,1875,2162,2361,2488,3467,4008,4211,4600,4808,5299,5855,6329,7203,9609,9959,10563,10786,11420,11709,11907,13163,13697,13700,14808,15306,16410,16791,17992,19203,19510,20724,22305,22935,27007,30109,30420,33409,34949,40283,40493,40549,47282,49146', 'transcribe': '50358', 'translate': '50357', 'sot_prev': '50360', 'sot_lm': '50359', 'no_timestamps': '50362'} +Number of model parameters: 762320896 +-rw-r--r-- 1 runner staff 2.8G Jan 8 04:14 model.pt +-rw-r--r-- 1 runner staff 816K Jan 8 04:14 tokens.txt +``` + +## tiny + +``` +ModelDimensions(n_mels=80, n_audio_ctx=1500, n_audio_state=384, n_audio_head=6, n_audio_layer=4, n_vocab=51865, n_text_ctx=448, n_text_state=384, n_text_head=6, n_text_layer=4) +{'model_type': 'whisper', 'comment': 'whisper-tiny', 'version': '1', 'maintainer': 'k2-fsa', 'n_mels': '80', 'n_audio_ctx': '1500', 'n_audio_state': '384', 'n_audio_head': '6', 'n_audio_layer': '4', 'n_vocab': '51865', 'n_text_ctx': '448', 'n_text_state': '384', 'n_text_head': '6', 'n_text_layer': '4', 'sot_sequence': '50258,50259,50359', 'all_language_tokens': '50286,50310,50311,50281,50324,50275,50308,50323,50289,50319,50284,50316,50262,50317,50357,50295,50322,50279,50354,50307,50347,50344,50327,50272,50355,50298,50277,50297,50282,50293,50351,50283,50271,50353,50331,50321,50303,50287,50300,50339,50315,50336,50333,50345,50305,50301,50302,50343,50264,50356,50337,50330,50335,50314,50261,50266,50349,50288,50292,50309,50291,50326,50276,50346,50332,50294,50260,50334,50285,50348,50318,50265,50328,50268,50267,50278,50338,50263,50350,50306,50290,50304,50273,50341,50352,50312,50342,50269,50259,50329,50280,50270,50299,50313,50320,50296,50274,50340,50325', 'all_language_codes': 'hu,eu,is,el,sn,id,mk,km,th,gl,ro,kk,es,sq,su,mi,si,he,ha,et,bo,sa,af,ar,ba,sk,fi,cy,ms,lt,tt,cs,nl,ln,tg,pa,sr,ta,fa,ht,bs,lo,gu,lb,sl,lv,bn,mt,ko,jw,uz,be,yi,mn,de,ja,mg,no,bg,br,hr,so,hi,my,sd,la,zh,am,da,tl,sw,fr,oc,tr,pt,vi,fo,ru,as,kn,ur,az,sv,tk,haw,hy,nn,pl,en,ka,uk,ca,te,ne,mr,ml,it,ps,yo', 'sot': '50258', 'sot_index': '0', 'eot': '50257', 'blank_id': '220', 'is_multilingual': '1', 'no_speech': '50362', 'non_speech_tokens': 
'1,2,7,8,9,10,14,25,26,27,28,29,31,58,59,60,61,62,63,90,91,92,93,359,503,522,542,873,893,902,918,922,931,1350,1853,1982,2460,2627,3246,3253,3268,3536,3846,3961,4183,4667,6585,6647,7273,9061,9383,10428,10929,11938,12033,12331,12562,13793,14157,14635,15265,15618,16553,16604,18362,18956,20075,21675,22520,26130,26161,26435,28279,29464,31650,32302,32470,36865,42863,47425,49870,50254', 'transcribe': '50359', 'translate': '50358', 'sot_prev': '50361', 'sot_lm': '50360', 'no_timestamps': '50363'} +Number of model parameters: 37184640 +-rw-r--r-- 1 runner staff 145M Jan 8 04:22 model.pt +-rw-r--r-- 1 runner staff 798K Jan 8 04:22 tokens.txt +``` +## base + +``` +ModelDimensions(n_mels=80, n_audio_ctx=1500, n_audio_state=512, n_audio_head=8, n_audio_layer=6, n_vocab=51865, n_text_ctx=448, n_text_state=512, n_text_head=8, n_text_layer=6) +{'model_type': 'whisper', 'comment': 'whisper-base', 'version': '1', 'maintainer': 'k2-fsa', 'n_mels': '80', 'n_audio_ctx': '1500', 'n_audio_state': '512', 'n_audio_head': '8', 'n_audio_layer': '6', 'n_vocab': '51865', 'n_text_ctx': '448', 'n_text_state': '512', 'n_text_head': '8', 'n_text_layer': '6', 'sot_sequence': '50258,50259,50359', 'all_language_tokens': '50344,50326,50348,50317,50268,50324,50357,50342,50261,50314,50307,50315,50282,50311,50276,50338,50320,50284,50321,50335,50316,50313,50327,50328,50312,50269,50278,50329,50319,50274,50301,50264,50349,50288,50331,50302,50343,50310,50345,50336,50265,50354,50289,50273,50340,50259,50303,50309,50286,50308,50294,50277,50262,50283,50332,50351,50322,50266,50300,50298,50299,50353,50334,50318,50350,50279,50271,50325,50267,50305,50337,50297,50292,50290,50346,50260,50270,50339,50285,50330,50263,50280,50272,50306,50355,50341,50356,50287,50293,50281,50291,50347,50296,50304,50275,50333,50323,50295,50352', 'all_language_codes': 'sa,so,tl,sq,tr,sn,su,nn,de,mn,et,bs,ms,is,hi,fo,mr,ro,pa,yi,kk,ne,af,oc,hy,pl,vi,ka,gl,it,lv,ko,mg,no,tg,bn,mt,eu,lb,lo,fr,ha,th,sv,ps,en,sr,br,hu,mk,la,fi,es,cs,sd,tt,si,ja,fa,sk,te,ln,am,sw,as,he,nl,yo,pt,sl,uz,cy,bg,ur,my,zh,ca,ht,da,be,ru,uk,ar,kn,ba,tk,jw,ta,lt,el,hr,bo,ml,az,id,gu,km,mi,haw', 'sot': '50258', 'sot_index': '0', 'eot': '50257', 'blank_id': '220', 'is_multilingual': '1', 'no_speech': '50362', 'non_speech_tokens': '1,2,7,8,9,10,14,25,26,27,28,29,31,58,59,60,61,62,63,90,91,92,93,359,503,522,542,873,893,902,918,922,931,1350,1853,1982,2460,2627,3246,3253,3268,3536,3846,3961,4183,4667,6585,6647,7273,9061,9383,10428,10929,11938,12033,12331,12562,13793,14157,14635,15265,15618,16553,16604,18362,18956,20075,21675,22520,26130,26161,26435,28279,29464,31650,32302,32470,36865,42863,47425,49870,50254', 'transcribe': '50359', 'translate': '50358', 'sot_prev': '50361', 'sot_lm': '50360', 'no_timestamps': '50363'} +Number of model parameters: 71825920 ++ ls -lh model.pt tokens.txt +-rw-r--r-- 1 runner staff 278M Jan 8 03:53 model.pt +-rw-r--r-- 1 runner staff 798K Jan 8 03:53 tokens.txt +``` + +## small + +``` +ModelDimensions(n_mels=80, n_audio_ctx=1500, n_audio_state=768, n_audio_head=12, n_audio_layer=12, n_vocab=51865, n_text_ctx=448, n_text_state=768, n_text_head=12, n_text_layer=12) +{'model_type': 'whisper', 'comment': 'whisper-small', 'version': '1', 'maintainer': 'k2-fsa', 'n_mels': '80', 'n_audio_ctx': '1500', 'n_audio_state': '768', 'n_audio_head': '12', 'n_audio_layer': '12', 'n_vocab': '51865', 'n_text_ctx': '448', 'n_text_state': '768', 'n_text_head': '12', 'n_text_layer': '12', 'sot_sequence': '50258,50259,50359', 'all_language_tokens': 
'50350,50280,50279,50315,50271,50259,50328,50321,50341,50283,50276,50306,50289,50338,50275,50286,50320,50303,50351,50333,50336,50273,50297,50287,50261,50329,50311,50347,50294,50284,50335,50312,50267,50263,50339,50265,50269,50309,50314,50290,50353,50322,50345,50260,50356,50349,50293,50305,50316,50291,50323,50266,50295,50330,50307,50342,50281,50318,50354,50278,50270,50272,50304,50334,50302,50355,50340,50277,50332,50296,50298,50346,50357,50331,50326,50262,50268,50288,50317,50352,50301,50282,50292,50344,50313,50324,50308,50319,50348,50325,50343,50285,50327,50264,50310,50299,50337,50274,50300', 'all_language_codes': 'as,uk,he,bs,nl,en,oc,pa,tk,cs,hi,kn,th,fo,id,hu,mr,sr,tt,gu,lo,sv,cy,ta,de,ka,is,bo,la,ro,yi,hy,pt,ru,ht,fr,pl,br,mn,ur,ln,si,lb,zh,jw,mg,lt,sl,kk,hr,km,ja,mi,be,et,nn,el,sw,ha,vi,ca,ar,az,am,bn,ba,ps,fi,sd,ml,sk,my,su,tg,so,es,tr,no,sq,haw,lv,ms,bg,sa,ne,sn,mk,gl,tl,yo,mt,da,af,ko,eu,te,uz,it,fa', 'sot': '50258', 'sot_index': '0', 'eot': '50257', 'blank_id': '220', 'is_multilingual': '1', 'no_speech': '50362', 'non_speech_tokens': '1,2,7,8,9,10,14,25,26,27,28,29,31,58,59,60,61,62,63,90,91,92,93,359,503,522,542,873,893,902,918,922,931,1350,1853,1982,2460,2627,3246,3253,3268,3536,3846,3961,4183,4667,6585,6647,7273,9061,9383,10428,10929,11938,12033,12331,12562,13793,14157,14635,15265,15618,16553,16604,18362,18956,20075,21675,22520,26130,26161,26435,28279,29464,31650,32302,32470,36865,42863,47425,49870,50254', 'transcribe': '50359', 'translate': '50358', 'sot_prev': '50361', 'sot_lm': '50360', 'no_timestamps': '50363'} +Number of model parameters: 240582912 +-rw-r--r-- 1 runner staff 923M Jan 8 04:19 model.pt +-rw-r--r-- 1 runner staff 798K Jan 8 04:19 tokens.txt +``` +## medium + +``` +ModelDimensions(n_mels=80, n_audio_ctx=1500, n_audio_state=1024, n_audio_head=16, n_audio_layer=24, n_vocab=51865, n_text_ctx=448, n_text_state=1024, n_text_head=16, n_text_layer=24) +{'model_type': 'whisper', 'comment': 'whisper-medium', 'version': '1', 'maintainer': 'k2-fsa', 'n_mels': '80', 'n_audio_ctx': '1500', 'n_audio_state': '1024', 'n_audio_head': '16', 'n_audio_layer': '24', 'n_vocab': '51865', 'n_text_ctx': '448', 'n_text_state': '1024', 'n_text_head': '16', 'n_text_layer': '24', 'sot_sequence': '50258,50259,50359', 'all_language_tokens': '50285,50324,50340,50259,50263,50288,50282,50265,50296,50262,50332,50325,50287,50264,50314,50341,50335,50283,50292,50306,50318,50321,50278,50356,50267,50279,50350,50315,50353,50337,50347,50348,50303,50260,50310,50302,50301,50316,50266,50299,50346,50354,50355,50351,50304,50313,50305,50342,50334,50330,50343,50336,50291,50349,50309,50294,50328,50289,50286,50293,50261,50338,50276,50295,50281,50319,50329,50331,50300,50298,50320,50333,50352,50345,50272,50308,50339,50284,50273,50307,50326,50312,50323,50275,50270,50277,50344,50290,50269,50322,50280,50317,50271,50297,50311,50327,50274,50268,50357', 'all_language_codes': 'da,sn,ps,en,ru,no,ms,fr,ml,es,sd,yo,ta,ko,mn,tk,yi,cs,bg,kn,sw,pa,vi,jw,pt,he,as,bs,ln,uz,bo,tl,sr,zh,eu,bn,lv,kk,ja,te,my,ha,ba,tt,az,ne,sl,nn,am,be,mt,lo,hr,mg,br,la,oc,th,hu,lt,de,fo,hi,mi,el,gl,ka,tg,fa,sk,mr,gu,haw,lb,ar,mk,ht,ro,sv,et,so,hy,km,id,ca,fi,sa,ur,pl,si,uk,sq,nl,cy,is,af,it,tr,su', 'sot': '50258', 'sot_index': '0', 'eot': '50257', 'blank_id': '220', 'is_multilingual': '1', 'no_speech': '50362', 'non_speech_tokens': 
'1,2,7,8,9,10,14,25,26,27,28,29,31,58,59,60,61,62,63,90,91,92,93,359,503,522,542,873,893,902,918,922,931,1350,1853,1982,2460,2627,3246,3253,3268,3536,3846,3961,4183,4667,6585,6647,7273,9061,9383,10428,10929,11938,12033,12331,12562,13793,14157,14635,15265,15618,16553,16604,18362,18956,20075,21675,22520,26130,26161,26435,28279,29464,31650,32302,32470,36865,42863,47425,49870,50254', 'transcribe': '50359', 'translate': '50358', 'sot_prev': '50361', 'sot_lm': '50360', 'no_timestamps': '50363'} +Number of model parameters: 762321920 +-rw-r--r-- 1 runner staff 2.8G Jan 8 04:05 model.pt +-rw-r--r-- 1 runner staff 798K Jan 8 04:05 tokens.txt +``` + +## medium-aishell + +``` +ModelDimensions(n_mels=80, n_audio_ctx=1500, n_audio_state=1024, n_audio_head=16, n_audio_layer=24, n_vocab=51865, n_text_ctx=448, n_text_state=1024, n_text_head=16, n_text_layer=24) +{'model_type': 'whisper', 'comment': 'whisper-medium-aishell', 'version': '1', 'maintainer': 'k2-fsa', 'n_mels': '80', 'n_audio_ctx': '1500', 'n_audio_state': '1024', 'n_audio_head': '16', 'n_audio_layer': '24', 'n_vocab': '51865', 'n_text_ctx': '448', 'n_text_state': '1024', 'n_text_head': '16', 'n_text_layer': '24', 'sot_sequence': '50258,50259,50359', 'all_language_tokens': '50352,50261,50324,50312,50315,50338,50331,50284,50353,50337,50275,50286,50305,50263,50306,50269,50277,50339,50299,50345,50295,50282,50314,50350,50310,50262,50348,50260,50330,50326,50294,50351,50332,50308,50289,50349,50279,50303,50343,50321,50265,50342,50281,50334,50297,50296,50283,50311,50300,50357,50264,50291,50323,50318,50290,50259,50270,50340,50301,50313,50304,50292,50320,50273,50287,50272,50285,50316,50354,50276,50266,50347,50309,50356,50346,50327,50329,50278,50335,50355,50344,50328,50298,50333,50274,50271,50288,50267,50336,50322,50280,50317,50325,50319,50302,50307,50341,50293,50268', 'all_language_codes': 'haw,de,sn,hy,bs,fo,tg,ro,ln,uz,id,hu,sl,ru,kn,pl,fi,ht,te,lb,mi,ms,mn,as,eu,es,tl,zh,be,so,la,tt,sd,mk,th,mg,he,sr,mt,pa,fr,nn,el,am,cy,ml,cs,is,fa,su,ko,hr,km,sw,ur,en,ca,ps,lv,ne,az,bg,mr,sv,ta,ar,da,kk,ha,hi,ja,bo,br,jw,my,af,ka,vi,yi,ba,sa,oc,sk,gu,it,nl,no,pt,lo,si,uk,sq,yo,gl,bn,et,tk,lt,tr', 'sot': '50258', 'sot_index': '0', 'eot': '50257', 'blank_id': '220', 'is_multilingual': '1', 'no_speech': '50362', 'non_speech_tokens': '1,2,7,8,9,10,14,25,26,27,28,29,31,58,59,60,61,62,63,90,91,92,93,359,503,522,542,873,893,902,918,922,931,1350,1853,1982,2460,2627,3246,3253,3268,3536,3846,3961,4183,4667,6585,6647,7273,9061,9383,10428,10929,11938,12033,12331,12562,13793,14157,14635,15265,15618,16553,16604,18362,18956,20075,21675,22520,26130,26161,26435,28279,29464,31650,32302,32470,36865,42863,47425,49870,50254', 'transcribe': '50359', 'translate': '50358', 'sot_prev': '50361', 'sot_lm': '50360', 'no_timestamps': '50363'} +Number of model parameters: 762321920 +-rw-r--r-- 1 runner staff 2.8G Jan 8 04:11 model.pt +-rw-r--r-- 1 runner staff 798K Jan 8 04:11 tokens.txt +``` + +## large + +``` +ModelDimensions(n_mels=128, n_audio_ctx=1500, n_audio_state=1280, n_audio_head=20, n_audio_layer=32, n_vocab=51866, n_text_ctx=448, n_text_state=1280, n_text_head=20, n_text_layer=32) +{'model_type': 'whisper', 'comment': 'whisper-large', 'version': '1', 'maintainer': 'k2-fsa', 'n_mels': '128', 'n_audio_ctx': '1500', 'n_audio_state': '1280', 'n_audio_head': '20', 'n_audio_layer': '32', 'n_vocab': '51866', 'n_text_ctx': '448', 'n_text_state': '1280', 'n_text_head': '20', 'n_text_layer': '32', 'sot_sequence': '50258,50259,50360', 'all_language_tokens': 
'50357,50353,50315,50356,50259,50280,50349,50278,50324,50330,50354,50351,50299,50319,50311,50268,50276,50323,50275,50352,50344,50297,50335,50293,50263,50301,50302,50332,50273,50346,50292,50331,50355,50274,50303,50283,50316,50287,50264,50327,50266,50333,50329,50321,50345,50322,50265,50347,50350,50284,50339,50281,50343,50288,50261,50270,50334,50279,50308,50272,50271,50325,50291,50269,50313,50304,50305,50298,50336,50306,50300,50282,50317,50342,50267,50358,50309,50260,50328,50326,50348,50277,50320,50295,50294,50285,50286,50262,50307,50337,50290,50314,50318,50340,50312,50289,50341,50338,50310,50296', 'all_language_codes': 'su,ln,bs,jw,en,uk,mg,vi,sn,be,ha,tt,te,gl,is,tr,hi,km,id,haw,sa,cy,yi,lt,ru,lv,bn,sd,sv,my,bg,tg,ba,it,sr,cs,kk,ta,ko,af,ja,gu,ka,pa,lb,si,fr,bo,as,ro,ht,el,mt,no,de,ca,am,he,mk,ar,nl,yo,hr,pl,ne,az,sl,sk,lo,kn,fa,ms,sq,nn,pt,yue,br,zh,oc,so,tl,fi,mr,mi,la,da,hu,es,et,uz,ur,mn,sw,ps,hy,th,tk,fo,eu,ml', 'sot': '50258', 'sot_index': '0', 'eot': '50257', 'blank_id': '220', 'is_multilingual': '1', 'no_speech': '50363', 'non_speech_tokens': '1,2,7,8,9,10,14,25,26,27,28,29,31,58,59,60,61,62,63,90,91,92,93,359,503,522,542,873,893,902,918,922,931,1350,1853,1982,2460,2627,3246,3253,3268,3536,3846,3961,4183,4667,6585,6647,7273,9061,9383,10428,10929,11938,12033,12331,12562,13793,14157,14635,15265,15618,16553,16604,18362,18956,20075,21675,22520,26130,26161,26435,28279,29464,31650,32302,32470,36865,42863,47425,49870,50254', 'transcribe': '50360', 'translate': '50359', 'sot_prev': '50362', 'sot_lm': '50361', 'no_timestamps': '50364'} +Number of model parameters: 1541570560 +-rw-r--r-- 1 runner staff 5.8G Jan 8 04:00 model.pt +-rw-r--r-- 1 runner staff 798K Jan 8 03:59 tokens.txt +``` + +## large-v1 + +``` +ModelDimensions(n_mels=80, n_audio_ctx=1500, n_audio_state=1280, n_audio_head=20, n_audio_layer=32, n_vocab=51865, n_text_ctx=448, n_text_state=1280, n_text_head=20, n_text_layer=32) +{'model_type': 'whisper', 'comment': 'whisper-large-v1', 'version': '1', 'maintainer': 'k2-fsa', 'n_mels': '80', 'n_audio_ctx': '1500', 'n_audio_state': '1280', 'n_audio_head': '20', 'n_audio_layer': '32', 'n_vocab': '51865', 'n_text_ctx': '448', 'n_text_state': '1280', 'n_text_head': '20', 'n_text_layer': '32', 'sot_sequence': '50258,50259,50359', 'all_language_tokens': '50349,50329,50332,50260,50327,50287,50326,50296,50291,50314,50273,50300,50281,50338,50288,50335,50342,50308,50319,50271,50282,50283,50270,50299,50279,50318,50265,50310,50285,50313,50322,50320,50354,50331,50340,50280,50311,50307,50337,50346,50267,50339,50290,50350,50304,50328,50336,50275,50278,50344,50272,50305,50276,50309,50284,50323,50303,50334,50289,50357,50268,50312,50352,50259,50351,50355,50292,50286,50348,50298,50264,50353,50295,50293,50274,50343,50325,50347,50333,50262,50302,50297,50317,50315,50321,50345,50277,50330,50356,50324,50263,50301,50306,50294,50261,50269,50316,50341,50266', 'all_language_codes': 'mg,ka,sd,zh,af,ta,so,ml,hr,mn,sv,fa,el,fo,no,yi,nn,mk,gl,nl,ms,cs,ca,te,he,sw,fr,eu,da,ne,si,mr,ha,tg,ps,uk,is,et,uz,my,pt,ht,ur,as,az,oc,lo,id,vi,sa,ar,sl,hi,br,ro,km,sr,am,th,su,tr,hy,haw,en,tt,ba,bg,hu,tl,sk,ko,ln,mi,lt,it,mt,yo,bo,gu,es,bn,cy,sq,bs,pa,lb,fi,be,jw,sn,ru,lv,kn,la,de,pl,kk,tk,ja', 'sot': '50258', 'sot_index': '0', 'eot': '50257', 'blank_id': '220', 'is_multilingual': '1', 'no_speech': '50362', 'non_speech_tokens': 
'1,2,7,8,9,10,14,25,26,27,28,29,31,58,59,60,61,62,63,90,91,92,93,359,503,522,542,873,893,902,918,922,931,1350,1853,1982,2460,2627,3246,3253,3268,3536,3846,3961,4183,4667,6585,6647,7273,9061,9383,10428,10929,11938,12033,12331,12562,13793,14157,14635,15265,15618,16553,16604,18362,18956,20075,21675,22520,26130,26161,26435,28279,29464,31650,32302,32470,36865,42863,47425,49870,50254', 'transcribe': '50359', 'translate': '50358', 'sot_prev': '50361', 'sot_lm': '50360', 'no_timestamps': '50363'} +Number of model parameters: 1541384960 +-rw-r--r-- 1 runner staff 5.8G Jan 8 04:00 model.pt +-rw-r--r-- 1 runner staff 798K Jan 8 03:59 tokens.txt +``` + +## large-v2 + +``` +ModelDimensions(n_mels=80, n_audio_ctx=1500, n_audio_state=1280, n_audio_head=20, n_audio_layer=32, n_vocab=51865, n_text_ctx=448, n_text_state=1280, n_text_head=20, n_text_layer=32) +{'model_type': 'whisper', 'comment': 'whisper-large-v2', 'version': '1', 'maintainer': 'k2-fsa', 'n_mels': '80', 'n_audio_ctx': '1500', 'n_audio_state': '1280', 'n_audio_head': '20', 'n_audio_layer': '32', 'n_vocab': '51865', 'n_text_ctx': '448', 'n_text_state': '1280', 'n_text_head': '20', 'n_text_layer': '32', 'sot_sequence': '50258,50259,50359', 'all_language_tokens': '50328,50333,50280,50330,50355,50270,50354,50327,50294,50349,50264,50289,50357,50287,50291,50356,50337,50315,50304,50274,50308,50275,50267,50259,50281,50331,50336,50310,50329,50340,50302,50309,50353,50332,50335,50285,50297,50276,50316,50263,50261,50347,50290,50277,50260,50283,50269,50325,50323,50352,50301,50312,50300,50343,50293,50271,50342,50345,50282,50298,50324,50307,50286,50314,50320,50288,50284,50303,50346,50344,50338,50351,50292,50273,50313,50268,50318,50272,50321,50306,50350,50295,50319,50279,50322,50348,50296,50299,50317,50326,50266,50311,50334,50341,50278,50265,50339,50305,50262', 'all_language_codes': 'oc,gu,uk,be,ba,ca,ha,af,la,mg,ko,th,su,ta,hr,jw,uz,bs,az,it,mk,id,pt,en,el,tg,lo,eu,ka,ps,bn,br,ln,sd,yi,da,cy,hi,kk,ru,de,bo,ur,fi,zh,cs,pl,yo,km,haw,lv,hy,fa,mt,lt,nl,nn,lb,ms,sk,sn,et,hu,mn,mr,no,ro,sr,my,sa,fo,tt,bg,sv,ne,tr,sw,ar,pa,kn,as,mi,gl,he,si,tl,ml,te,sq,so,ja,is,am,tk,vi,fr,ht,sl,es', 'sot': '50258', 'sot_index': '0', 'eot': '50257', 'blank_id': '220', 'is_multilingual': '1', 'no_speech': '50362', 'non_speech_tokens': '1,2,7,8,9,10,14,25,26,27,28,29,31,58,59,60,61,62,63,90,91,92,93,359,503,522,542,873,893,902,918,922,931,1350,1853,1982,2460,2627,3246,3253,3268,3536,3846,3961,4183,4667,6585,6647,7273,9061,9383,10428,10929,11938,12033,12331,12562,13793,14157,14635,15265,15618,16553,16604,18362,18956,20075,21675,22520,26130,26161,26435,28279,29464,31650,32302,32470,36865,42863,47425,49870,50254', 'transcribe': '50359', 'translate': '50358', 'sot_prev': '50361', 'sot_lm': '50360', 'no_timestamps': '50363'} +Number of model parameters: 1541384960 +-rw-r--r-- 1 runner staff 5.8G Jan 8 04:14 model.pt +-rw-r--r-- 1 runner staff 798K Jan 8 04:13 tokens.txt +``` + +## large-v3 + +``` +ModelDimensions(n_mels=128, n_audio_ctx=1500, n_audio_state=1280, n_audio_head=20, n_audio_layer=32, n_vocab=51866, n_text_ctx=448, n_text_state=1280, n_text_head=20, n_text_layer=32) +{'model_type': 'whisper', 'comment': 'whisper-large-v3', 'version': '1', 'maintainer': 'k2-fsa', 'n_mels': '128', 'n_audio_ctx': '1500', 'n_audio_state': '1280', 'n_audio_head': '20', 'n_audio_layer': '32', 'n_vocab': '51866', 'n_text_ctx': '448', 'n_text_state': '1280', 'n_text_head': '20', 'n_text_layer': '32', 'sot_sequence': '50258,50259,50360', 'all_language_tokens': 
'50259,50325,50281,50305,50285,50282,50261,50263,50341,50333,50302,50273,50326,50354,50279,50336,50271,50340,50265,50348,50346,50358,50299,50283,50352,50294,50298,50317,50295,50345,50321,50330,50262,50280,50312,50349,50331,50269,50287,50290,50319,50353,50332,50350,50311,50339,50286,50291,50313,50351,50315,50301,50276,50266,50334,50338,50309,50356,50292,50347,50342,50344,50306,50335,50303,50264,50314,50323,50288,50297,50310,50293,50343,50296,50318,50320,50289,50277,50355,50275,50304,50267,50260,50316,50270,50274,50337,50278,50284,50357,50268,50324,50322,50329,50272,50300,50328,50307,50308,50327', 'all_language_codes': 'en,yo,el,sl,da,ms,de,ru,tk,gu,bn,sv,so,ha,he,lo,nl,ps,fr,tl,my,yue,te,cs,haw,la,sk,sq,mi,lb,pa,be,es,uk,hy,mg,tg,pl,ta,ur,gl,ln,sd,as,is,ht,hu,hr,ne,tt,bs,lv,hi,ja,am,fo,br,jw,bg,bo,nn,sa,kn,yi,sr,ko,mn,km,no,cy,eu,lt,mt,ml,sw,mr,th,fi,ba,id,az,pt,zh,kk,ca,it,uz,vi,ro,su,tr,sn,si,ka,ar,fa,oc,et,mk,af', 'sot': '50258', 'sot_index': '0', 'eot': '50257', 'blank_id': '220', 'is_multilingual': '1', 'no_speech': '50363', 'non_speech_tokens': '1,2,7,8,9,10,14,25,26,27,28,29,31,58,59,60,61,62,63,90,91,92,93,359,503,522,542,873,893,902,918,922,931,1350,1853,1982,2460,2627,3246,3253,3268,3536,3846,3961,4183,4667,6585,6647,7273,9061,9383,10428,10929,11938,12033,12331,12562,13793,14157,14635,15265,15618,16553,16604,18362,18956,20075,21675,22520,26130,26161,26435,28279,29464,31650,32302,32470,36865,42863,47425,49870,50254', 'transcribe': '50360', 'translate': '50359', 'sot_prev': '50362', 'sot_lm': '50361', 'no_timestamps': '50364'} +Number of model parameters: 1541570560 +-rw-r--r-- 1 runner staff 5.8G Jan 8 04:08 model.pt +-rw-r--r-- 1 runner staff 798K Jan 8 04:07 tokens.txt +``` + +## distil-large-v2 + +``` +ModelDimensions(n_mels=80, n_audio_ctx=1500, n_audio_state=1280, n_audio_head=20, n_audio_layer=32, n_vocab=51865, n_text_ctx=448, n_text_state=1280, n_text_head=20, n_text_layer=2) +{'model_type': 'whisper', 'comment': 'whisper-distil-large-v2', 'version': '1', 'maintainer': 'k2-fsa', 'n_mels': '80', 'n_audio_ctx': '1500', 'n_audio_state': '1280', 'n_audio_head': '20', 'n_audio_layer': '32', 'n_vocab': '51865', 'n_text_ctx': '448', 'n_text_state': '1280', 'n_text_head': '20', 'n_text_layer': '2', 'sot_sequence': '50258,50259,50359', 'all_language_tokens': '50315,50271,50272,50330,50339,50344,50355,50291,50322,50303,50309,50296,50286,50262,50314,50336,50280,50276,50321,50320,50326,50353,50313,50305,50348,50274,50304,50290,50284,50269,50289,50337,50318,50264,50300,50293,50350,50335,50282,50316,50342,50357,50328,50345,50298,50347,50332,50263,50275,50267,50338,50292,50325,50310,50265,50349,50299,50356,50273,50343,50307,50302,50323,50270,50329,50352,50281,50266,50277,50319,50334,50260,50283,50354,50327,50311,50346,50268,50297,50259,50285,50301,50295,50317,50279,50324,50312,50351,50287,50306,50261,50278,50340,50294,50333,50341,50308,50288,50331', 'all_language_codes': 'bs,nl,ar,be,ht,sa,ba,hr,si,sr,br,ml,hu,es,mn,lo,uk,hi,pa,mr,so,ln,ne,sl,tl,it,az,ur,ro,pl,th,uz,sw,ko,fa,lt,as,yi,ms,kk,nn,su,oc,lb,sk,bo,sd,ru,id,pt,fo,bg,yo,eu,fr,mg,te,jw,sv,mt,et,bn,km,ca,ka,haw,el,ja,fi,gl,am,zh,cs,ha,af,is,my,tr,cy,en,da,lv,mi,sq,he,sn,hy,tt,ta,kn,de,vi,ps,la,gu,tk,mk,no,tg', 'sot': '50258', 'sot_index': '0', 'eot': '50257', 'blank_id': '220', 'is_multilingual': '1', 'no_speech': '50362', 'non_speech_tokens': 
'1,2,7,8,9,10,14,25,26,27,28,29,31,58,59,60,61,62,63,90,91,92,93,359,503,522,542,873,893,902,918,922,931,1350,1853,1982,2460,2627,3246,3253,3268,3536,3846,3961,4183,4667,6585,6647,7273,9061,9383,10428,10929,11938,12033,12331,12562,13793,14157,14635,15265,15618,16553,16604,18362,18956,20075,21675,22520,26130,26161,26435,28279,29464,31650,32302,32470,36865,42863,47425,49870,50254', 'transcribe': '50359', 'translate': '50358', 'sot_prev': '50361', 'sot_lm': '50360', 'no_timestamps': '50363'} +Number of model parameters: 754300160 +-rw-r--r-- 1 runner staff 2.8G Jan 8 03:54 model.pt +-rw-r--r-- 1 runner staff 798K Jan 8 03:54 tokens.txt +``` + diff --git a/scripts/whisper/run.sh b/scripts/whisper/run.sh new file mode 100755 index 000000000..6e0b3b907 --- /dev/null +++ b/scripts/whisper/run.sh @@ -0,0 +1,21 @@ +#!/usr/bin/env bash + +set -ex + +if [ -z $name ]; then + name=tiny.en +fi + +python3 ./export.py --model $name +ls -lh model.pt tokens.txt + + +cat >README.md << EOF +# Introduction + +Models in this file are converted from +https://github.com/openai/whisper +using the following script +https://github.com/k2-fsa/sherpa/blob/master/scripts/whisper/run.sh + +EOF diff --git a/scripts/whisper/test.py b/scripts/whisper/test.py new file mode 100755 index 000000000..84aa5369a --- /dev/null +++ b/scripts/whisper/test.py @@ -0,0 +1,251 @@ +#!/usr/bin/env python3 +# Copyright 2025 Xiaomi Corp. (authors: Fangjun Kuang) + +import base64 +from typing import Tuple + +import kaldi_native_fbank as knf +import numpy as np +import soundfile as sf +import torch + +import whisper + + +def load_audio(filename: str) -> Tuple[np.ndarray, int]: + data, sample_rate = sf.read( + filename, + always_2d=True, + dtype="float32", + ) + data = data[:, 0] # use only the first channel + samples = np.ascontiguousarray(data) + return samples, sample_rate + + +def compute_features(filename: str, dim: int = 80) -> torch.Tensor: + """ + Args: + filename: + Path to an audio file. + Returns: + Return a 3-D float32 tensor of shape (1, 80, 3000) containing the features. + """ + wave, sample_rate = load_audio(filename) + if sample_rate != 16000: + import librosa + + wave = librosa.resample(wave, orig_sr=sample_rate, target_sr=16000) + sample_rate = 16000 + + features = [] + opts = knf.WhisperFeatureOptions() + opts.dim = dim + online_whisper_fbank = knf.OnlineWhisperFbank(opts) + online_whisper_fbank.accept_waveform(16000, wave) + online_whisper_fbank.input_finished() + for i in range(online_whisper_fbank.num_frames_ready): + f = online_whisper_fbank.get_frame(i) + f = torch.from_numpy(f) + features.append(f) + + features = torch.stack(features) + + log_spec = torch.clamp(features, min=1e-10).log10() + log_spec = torch.maximum(log_spec, log_spec.max() - 8.0) + mel = (log_spec + 4.0) / 4.0 + # mel (T, 80) + + target = 3000 + if mel.shape[0] > target: + # -50 so that there are some zero tail paddings. 
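+        # Descriptive note on the step below: keep only the first (target - 50)
+        # frames and then append 50 all-zero frames, so the 3000-frame input
+        # always ends with zero padding (zeros in the normalized mel domain)
+        # rather than real speech.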
+ mel = mel[: target - 50] + mel = torch.nn.functional.pad(mel, (0, 0, 0, 50), "constant", 0) + + if mel.shape[0] < target: + mel = torch.nn.functional.pad( + mel, (0, 0, 0, target - mel.shape[0]), "constant", 0 + ) + else: + mel = mel[:target] + + mel = mel.t().unsqueeze(0) + + # mel: (1, 80, 3000) + + return mel + + +def load_tokens(filename): + tokens = dict() + with open(filename, "r") as f: + for line in f: + t, i = line.split() + tokens[int(i)] = t + return tokens + + +@torch.inference_mode() +def main(): + meta_data = { + "model_type": "", + "comment": "", + "version": "", + "maintainer": "", + "n_mels": "", + "n_audio_ctx": "", + "n_audio_state": "", + "n_audio_head": "", + "n_audio_layer": "", + "n_vocab": "", + "n_text_ctx": "", + "n_text_state": "", + "n_text_head": "", + "n_text_layer": "", + "sot_sequence": "", + "all_language_tokens": "", + "all_language_codes": "", + "sot": "", + "sot_index": "", + "eot": "", + "blank_id": "", + "is_multilingual": "", + "no_speech": "", + "non_speech_tokens": "", + "transcribe": "", + "translate": "", + "sot_prev": "", + "sot_lm": "", + "no_timestamps": "", + } + + m = torch.jit.load("model.pt", _extra_files=meta_data) + m.eval() + + for k in ["model_type", "comment", "maintainer"]: + meta_data[k] = meta_data[k].decode() + + for k in [ + "version", + "n_mels", + "n_audio_ctx", + "n_audio_state", + "n_audio_head", + "n_audio_layer", + "n_vocab", + "n_text_ctx", + "n_text_state", + "n_text_head", + "n_text_layer", + "sot", + "sot_index", + "eot", + "blank_id", + "is_multilingual", + "no_speech", + "transcribe", + "translate", + "sot_prev", + "sot_lm", + "no_timestamps", + ]: + meta_data[k] = int(meta_data[k].decode()) + + for k in ["sot_sequence", "all_language_tokens", "non_speech_tokens"]: + meta_data[k] = list(map(int, meta_data[k].decode().split(","))) + + for k in ["all_language_codes"]: + meta_data[k] = meta_data[k].decode().split(",") + print(meta_data) + + mel = compute_features("./0.wav", dim=meta_data["n_mels"]) + print(mel.shape) + + n_layer_cross_k, n_layer_cross_v = m.run_encoder(mel) + sot_sequence = meta_data["sot_sequence"] + lang2id = dict() + for i, n in zip(meta_data["all_language_tokens"], meta_data["all_language_codes"]): + lang2id[n] = i + + sot_sequence.append(meta_data["no_timestamps"]) + + tokens = torch.tensor(sot_sequence).unsqueeze(0) + + n_audio = 1 + n_layer_self_k_cache = torch.zeros( + ( + meta_data["n_text_layer"], + n_audio, + meta_data["n_text_ctx"], + meta_data["n_text_state"], + ), + device=mel.device, + ) + n_layer_self_v_cache = torch.zeros( + ( + meta_data["n_text_layer"], + n_audio, + meta_data["n_text_ctx"], + meta_data["n_text_state"], + ), + device=mel.device, + ) + offset = torch.zeros(1, dtype=torch.int32).to(mel.device) + + n_layer_cross_k, n_layer_cross_v = m.run_encoder(mel) + print("n_layer_cross_k.shape", n_layer_cross_k.shape, n_layer_cross_v.shape) + + logits, n_layer_self_k_cache, n_layer_self_v_cache = m.run_decoder( + tokens, + n_layer_self_k_cache=n_layer_self_k_cache, + n_layer_self_v_cache=n_layer_self_v_cache, + n_layer_cross_k=n_layer_cross_k, + n_layer_cross_v=n_layer_cross_v, + offset=offset, + ) + print( + "logits.shape", + logits.shape, + n_layer_self_v_cache.shape, + n_layer_self_v_cache.shape, + ) + + offset += tokens.shape[1] + # logits.shape (batch_size, tokens.shape[1], vocab_size) + logits = logits[0, -1] + # logits = logits.softmax(dim=-1) + # for greedy search, we don't need to compute softmax or log_softmax + max_token_id = logits.argmax(dim=-1) + results = [] + + for i 
in range(meta_data["n_text_ctx"]): + if max_token_id == meta_data["eot"]: + break + results.append(max_token_id.item()) + tokens = torch.tensor([[results[-1]]]) + + logits, n_layer_self_k_cache, n_layer_self_v_cache = m.run_decoder( + tokens=tokens, + n_layer_self_k_cache=n_layer_self_k_cache, + n_layer_self_v_cache=n_layer_self_v_cache, + n_layer_cross_k=n_layer_cross_k, + n_layer_cross_v=n_layer_cross_v, + offset=offset, + ) + offset += 1 + logits = logits[0, -1] + max_token_id = logits.argmax(dim=-1) + print(results) + + token2id = load_tokens("./tokens.txt") + + s = b"" + for i in results: + if i in token2id: + s += base64.b64decode(token2id[i]) + + print(s.decode().strip()) + + +if __name__ == "__main__": + main() diff --git a/setup.py b/setup.py index a9c443d7c..c9340e30d 100644 --- a/setup.py +++ b/setup.py @@ -1,12 +1,47 @@ #!/usr/bin/env python3 -import re +import os import sys from pathlib import Path import setuptools -from cmake.cmake_extension import BuildExtension, bdist_wheel, cmake_extension +from cmake.cmake_extension import ( + BuildExtension, + bdist_wheel, + cmake_extension, + is_windows, +) + +import get_version + +get_package_version = get_version.get_package_version + +if sys.argv[1] != "sdist": + if "K2_INSTALL_PREFIX" not in os.environ: + try: + import k2 # noqa + except ImportError: + sys.exit( + """Please install k2 first. See + https://k2-fsa.github.io/sherpa/python/installation/index.html + for details.""" + ) + + del k2 + + if "KALDIFEAT_INSTALL_PREFIX" not in os.environ: + try: + import kaldifeat # noqa + except ImportError: + sys.exit( + """Please install kaldifeat first. See + https://k2-fsa.github.io/sherpa/python/installation/index.html + for details.""" + ) + + del kaldifeat + if sys.version_info < (3,): # fmt: off @@ -30,21 +65,23 @@ def read_long_description(): return readme -def get_package_version(): - with open("CMakeLists.txt") as f: - content = f.read() - - match = re.search(r"set\(SHERPA_VERSION (.*)\)", content) - latest_version = match.group(1).strip('"') - return latest_version - - def get_binaries_to_install(): bin_dir = Path("build") / "sherpa" / "bin" bin_dir.mkdir(parents=True, exist_ok=True) + suffix = ".exe" if is_windows() else "" + # Remember to also change cmake/cmake_extension.py + binaries = ["sherpa-offline"] + binaries += ["sherpa-online", "sherpa-version"] + binaries += ["sherpa-online-microphone"] + binaries += ["sherpa-offline-microphone"] + binaries += ["sherpa-offline-websocket-server"] + binaries += ["sherpa-offline-websocket-client"] + binaries += ["sherpa-online-websocket-server"] + binaries += ["sherpa-online-websocket-client"] + binaries += ["sherpa-online-websocket-client-microphone"] exe = [] - for f in ["sherpa", "sherpa-version"]: - t = bin_dir / f + for f in binaries: + t = bin_dir / (f + suffix) exe.append(str(t)) return exe @@ -64,6 +101,7 @@ def get_binaries_to_install(): }, data_files=[("bin", get_binaries_to_install())], packages=["sherpa"], + package_data={"sherpa": ["py.typed", "*.pyi"]}, url="https://github.com/k2-fsa/sherpa", long_description=read_long_description(), long_description_content_type="text/markdown", @@ -77,6 +115,8 @@ def get_binaries_to_install(): "Programming Language :: Python :: 3.7", "Programming Language :: Python :: 3.8", "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", "Topic :: Scientific/Engineering :: Artificial Intelligence", ], python_requires=">=3.7.0", diff --git a/sherpa/CMakeLists.txt 
b/sherpa/CMakeLists.txt index 121a9a7da..d3e5b52eb 100644 --- a/sherpa/CMakeLists.txt +++ b/sherpa/CMakeLists.txt @@ -2,3 +2,5 @@ add_subdirectory(csrc) add_subdirectory(python) add_subdirectory(cpp_api) + +install(DIRECTORY bin/ DESTINATION bin/) diff --git a/sherpa/bin/README.md b/sherpa/bin/README.md deleted file mode 100644 index 29dd68c06..000000000 --- a/sherpa/bin/README.md +++ /dev/null @@ -1,51 +0,0 @@ -# File descriptions - -## pruned_transducer_statelessX - -Files in this part assume the model is from `pruned_transducer_statelessX` in -the folder -where `X>=2`. - -| Filename | Description | -|----------|-------------| -| [pruned_transducer_statelessX/offline_server.py](./pruned_transducer_statelessX/offline_server.py) | The server for offline ASR | -| [pruned_transducer_statelessX/offline_client.py](./pruned_transducer_statelessX/offline_client.py) | The client for offline ASR | -| [pruned_transducer_statelessX/decode_manifest.py](./pruned_transducer_statelessX/decode_manifest.py) | Demo for computing RTF and WER| - -If you want to test the offline server without training your own model, you -can download pretrained models on the LibriSpeech corpus by visiting -. -There you can find links to various pretrained models. - -For instance, you can use - -## pruned_stateless_emformer_rnnt2 - -Files in this part assume the model is from `pruned_stateless_emformer_rnnt2` in -the folder . - -| Filename | Description | -|----------|-------------| -| [pruned_stateless_emformer_rnnt2/streaming_server.py](./pruned_stateless_emformer_rnnt2/streaming_server.py) | The server for streaming ASR | -| [pruned_stateless_emformer_rnnt2/streaming_client.py](./pruned_stateless_emformer_rnnt2/streaming_client.py) | The client for streaming ASR | -| [pruned_stateless_emformer_rnnt2/decode.py](./pruned_stateless_emformer_rnnt2/decode.py) | Utilities for streaming ASR| - -You can use the pretrained model from - -to test it. - -## Streaming pruned_transducer_statelessX - -Files in this part assume the model is from `pruned_transducer_statelessX` in -the folder -where `X>=2`. And the model is trained for streaming recognition. - -| Filename | Description | -|----------|-------------| -| [streaming_pruned_transducer_statelessX/streaming_server.py](./streaming_pruned_transducer_statelessX/streaming_server.py) | The server for streaming ASR | -| [streaming_pruned_transducer_statelessX/streaming_client.py](./streaming_pruned_transducer_statelessX/streaming_client.py) | The client for streaming ASR | -| [streaming_pruned_transducer_statelessX/decode.py](./streaming_pruned_transducer_statelessX/decode.py) | Utilities for streaming ASR| - -You can use the pretrained model from - -to test it. 
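Note (editorial, not part of the patch): the import guard added to setup.py above can be satisfied either by having k2 and kaldifeat importable, or by setting the corresponding install-prefix environment variables, which the CMake-based build is expected to pick up. A minimal sketch with placeholder paths:

```
# Hypothetical paths -- replace with your actual k2/kaldifeat install prefixes.
export K2_INSTALL_PREFIX=/path/to/k2/install
export KALDIFEAT_INSTALL_PREFIX=/path/to/kaldifeat/install

# With the prefixes set, setup.py skips the "import k2" / "import kaldifeat"
# checks and proceeds directly to the CMake-based build.
python3 setup.py bdist_wheel
```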
diff --git a/triton/model_repo/decoder/1/.gitkeep b/sherpa/bin/__init__.py similarity index 100% rename from triton/model_repo/decoder/1/.gitkeep rename to sherpa/bin/__init__.py diff --git a/sherpa/bin/conv_emformer_transducer_stateless2/beam_search.py b/sherpa/bin/conv_emformer_transducer_stateless2/beam_search.py deleted file mode 100644 index 0b6d904ab..000000000 --- a/sherpa/bin/conv_emformer_transducer_stateless2/beam_search.py +++ /dev/null @@ -1,341 +0,0 @@ -from typing import List - -import k2 -import torch -from stream import Stream, stack_states, unstack_states - -from sherpa import ( - VALID_FAST_BEAM_SEARCH_METHOD, - Lexicon, - fast_beam_search_nbest, - fast_beam_search_nbest_LG, - fast_beam_search_one_best, - streaming_greedy_search, -) - - -class FastBeamSearch: - def __init__( - self, - beam_search_params: dict, - device: torch.device, - ): - """ - Args: - beam_search_params - Dictionary containing all the parameters for beam search. - device: - Device on which the computation will occur - """ - - decoding_method = beam_search_params["decoding_method"] - assert ( - decoding_method in VALID_FAST_BEAM_SEARCH_METHOD - ), f"{decoding_method} is not a valid search method" - - self.decoding_method = decoding_method - self.rnnt_decoding_config = k2.RnntDecodingConfig( - vocab_size=beam_search_params["vocab_size"], - decoder_history_len=beam_search_params["context_size"], - beam=beam_search_params["beam"], - max_states=beam_search_params["max_states"], - max_contexts=beam_search_params["max_contexts"], - ) - if decoding_method == "fast_beam_search_nbest_LG": - lexicon = Lexicon(beam_search_params["lang_dir"]) - self.word_table = lexicon.word_table - lg_filename = beam_search_params["lang_dir"] / "LG.pt" - self.decoding_graph = k2.Fsa.from_dict( - torch.load(lg_filename, map_location=device) - ) - self.decoding_graph.scores *= beam_search_params["ngram_lm_scale"] - else: - self.decoding_graph = k2.trivial_graph( - beam_search_params["vocab_size"] - 1, device - ) - self.device = device - self.context_size = beam_search_params["context_size"] - self.beam_search_params = beam_search_params - - def init_stream(self, stream: Stream): - """ - Attributes to add to each stream - """ - stream.rnnt_decoding_stream = k2.RnntDecodingStream(self.decoding_graph) - stream.hyp = [] - - @torch.no_grad() - def process( - self, - server: "StreamingServer", - stream_list: List[Stream], - ) -> None: - """Run the model on the given stream list and do search with fast_beam_search - method. - Args: - server: - An instance of `StreamingServer`. - stream_list: - A list of streams to be processed. It is changed in-place. - That is, the attribute `states` and `hyp` are - updated in-place. 
- """ - model = server.model - device = model.device - # Note: chunk_length is in frames before subsampling - chunk_length = server.chunk_length - batch_size = len(stream_list) - chunk_length_pad = server.chunk_length_pad - state_list, feature_list = [], [] - processed_frames_list, rnnt_decoding_streams_list = [], [] - - rnnt_decoding_config = self.rnnt_decoding_config - for s in stream_list: - rnnt_decoding_streams_list.append(s.rnnt_decoding_stream) - state_list.append(s.states) - processed_frames_list.append(s.processed_frames) - f = s.features[:chunk_length_pad] - s.features = s.features[chunk_length:] - s.processed_frames += chunk_length - - b = torch.cat(f, dim=0) - feature_list.append(b) - - features = torch.stack(feature_list, dim=0).to(device) - - states = stack_states(state_list) - - features_length = torch.full( - (batch_size,), - fill_value=features.size(1), - device=device, - dtype=torch.int64, - ) - - num_processed_frames = torch.tensor( - processed_frames_list, - device=device, - ) - - ( - encoder_out, - encoder_out_lens, - next_states, - ) = model.encoder_streaming_forward( - features=features, - features_length=features_length, - num_processed_frames=num_processed_frames, - states=states, - ) - - processed_lens = (num_processed_frames >> 2) + encoder_out_lens - if self.decoding_method == "fast_beam_search_nbest": - next_hyp_list, next_trailing_blank_frames = fast_beam_search_nbest( - model=model, - encoder_out=encoder_out, - processed_lens=processed_lens, - rnnt_decoding_config=rnnt_decoding_config, - rnnt_decoding_streams_list=rnnt_decoding_streams_list, - num_paths=self.beam_search_params["num_paths"], - nbest_scale=self.beam_search_params["nbest_scale"], - use_double_scores=True, - temperature=self.beam_search_params["temperature"], - ) - elif self.decoding_method == "fast_beam_search_nbest_LG": - ( - next_hyp_list, - next_trailing_blank_frames, - ) = fast_beam_search_nbest_LG( - model=model, - encoder_out=encoder_out, - processed_lens=processed_lens, - rnnt_decoding_config=rnnt_decoding_config, - rnnt_decoding_streams_list=rnnt_decoding_streams_list, - num_paths=self.beam_search_params["num_paths"], - nbest_scale=self.beam_search_params["nbest_scale"], - use_double_scores=True, - temperature=self.beam_search_params["temperature"], - ) - elif self.decoding_method == "fast_beam_search": - ( - next_hyp_list, - next_trailing_blank_frames, - ) = fast_beam_search_one_best( - model=model, - encoder_out=encoder_out, - processed_lens=processed_lens, - rnnt_decoding_config=rnnt_decoding_config, - rnnt_decoding_streams_list=rnnt_decoding_streams_list, - ) - else: - raise NotImplementedError( - f"{self.decoding_method} is not implemented" - ) - - next_state_list = unstack_states(next_states) - for i, s in enumerate(stream_list): - s.states = next_state_list[i] - s.hyp = next_hyp_list[i] - s.num_trailing_blank_frames = next_trailing_blank_frames[i] - - def get_texts(self, stream: Stream) -> str: - """ - Return text after decoding - Args: - stream: - Stream to be processed. - """ - if self.decoding_method == "fast_beam_search_nbest_LG": - result = [self.word_table[i] for i in stream.hyp] - result = " ".join(result) - else: - result = self.sp.decode(stream.hyp) - - return result - - -class GreedySearch: - def __init__( - self, - model: "RnntConvEmformerModel", - beam_search_params: dict, - device: torch.device, - ): - """ - Args: - model: - RNN-T model decoder model - beam_search_params: - Dictionary containing all the parameters for beam search. 
- device: - Device on which the computation will occur - """ - self.device = device - self.beam_search_params = beam_search_params - self.device = device - - decoder_input = torch.tensor( - [ - [self.beam_search_params["blank_id"]] - * self.beam_search_params["context_size"] - ], - device=self.device, - dtype=torch.int64, - ) - - initial_decoder_out = model.decoder_forward(decoder_input) - self.initial_decoder_out = model.forward_decoder_proj( - initial_decoder_out.squeeze(1) - ) - - def init_stream(self, stream: Stream): - """ - Attributes to add to each stream - """ - stream.decoder_out = self.initial_decoder_out - stream.hyp = [ - self.beam_search_params["blank_id"] - ] * self.beam_search_params["context_size"] - - @torch.no_grad() - def process( - self, - server: "StreamingServer", - stream_list: List[Stream], - ) -> None: - """Run the model on the given stream list and do search with greedy_search - method. - Args: - server: - An instance of `StreamingServer`. - stream_list: - A list of streams to be processed. It is changed in-place. - That is, the attribute `states` and `hyp` are - updated in-place. - """ - model = server.model - device = model.device - # Note: chunk_length is in frames before subsampling - chunk_length = server.chunk_length - batch_size = len(stream_list) - chunk_length_pad = server.chunk_length_pad - state_list, feature_list = [], [] - decoder_out_list, hyp_list = [], [] - processed_frames_list = [] - num_trailing_blank_frames_list = [] - - for s in stream_list: - decoder_out_list.append(s.decoder_out) - hyp_list.append(s.hyp) - state_list.append(s.states) - processed_frames_list.append(s.processed_frames) - f = s.features[:chunk_length_pad] - s.features = s.features[chunk_length:] - s.processed_frames += chunk_length - - b = torch.cat(f, dim=0) - feature_list.append(b) - - num_trailing_blank_frames_list.append(s.num_trailing_blank_frames) - - features = torch.stack(feature_list, dim=0).to(device) - states = stack_states(state_list) - decoder_out = torch.cat(decoder_out_list, dim=0) - - features_length = torch.full( - (batch_size,), - fill_value=features.size(1), - device=device, - dtype=torch.int64, - ) - - num_processed_frames = torch.tensor( - processed_frames_list, - device=device, - ) - - ( - encoder_out, - encoder_out_lens, - next_states, - ) = model.encoder_streaming_forward( - features=features, - features_length=features_length, - num_processed_frames=num_processed_frames, - states=states, - ) - - # Note: It does not return the next_encoder_out_len since - # there are no paddings for streaming ASR. Each stream - # has the same input number of frames, i.e., server.chunk_length. - ( - next_decoder_out, - next_hyp_list, - next_trailing_blank_frames, - ) = streaming_greedy_search( - model=model, - encoder_out=encoder_out, - decoder_out=decoder_out, - hyps=hyp_list, - num_trailing_blank_frames=num_trailing_blank_frames_list, - ) - - next_decoder_out_list = next_decoder_out.split(1) - - next_state_list = unstack_states(next_states) - for i, s in enumerate(stream_list): - s.states = next_state_list[i] - s.decoder_out = next_decoder_out_list[i] - s.hyp = next_hyp_list[i] - s.num_trailing_blank_frames = next_trailing_blank_frames[i] - - def get_texts(self, stream: Stream) -> str: - """ - Return text after decoding - Args: - stream: - Stream to be processed. 
- """ - return self.sp.decode( - stream.hyp[self.beam_search_params["context_size"] :] - ) diff --git a/sherpa/bin/conv_emformer_transducer_stateless2/stream.py b/sherpa/bin/conv_emformer_transducer_stateless2/stream.py deleted file mode 100644 index ff9ef315d..000000000 --- a/sherpa/bin/conv_emformer_transducer_stateless2/stream.py +++ /dev/null @@ -1,278 +0,0 @@ -# Copyright 2022 Xiaomi Corp. (authors: Fangjun Kuang -# Zengwei Yao) -# See LICENSE for clarification regarding multiple authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import math -from typing import List, Tuple - -import torch -from kaldifeat import FbankOptions, OnlineFbank, OnlineFeature - -import sherpa - - -def unstack_states( - states: Tuple[List[List[torch.Tensor]], List[torch.Tensor]] -) -> List[Tuple[List[List[torch.Tensor]], List[torch.Tensor]]]: - """Unstack the emformer state corresponding to a batch of utterances - into a list of states, where the i-th entry is the state from the i-th - utterance in the batch. - - Args: - states: - A tuple of 2 elements. - ``states[0]`` is the attention caches of a batch of utterance. - ``states[1]`` is the convolution caches of a batch of utterance. - ``len(states[0])`` and ``len(states[1])`` both eqaul to number of layers. # noqa - - Returns: - A list of states. - ``states[i]`` is a tuple of 2 elements of i-th utterance. - ``states[i][0]`` is the attention caches of i-th utterance. - ``states[i][1]`` is the convolution caches of i-th utterance. - ``len(states[i][0])`` and ``len(states[i][1])`` both eqaul to number of layers. # noqa - """ - - attn_caches, conv_caches = states - batch_size = conv_caches[0].size(0) - num_layers = len(attn_caches) - - list_attn_caches = [None] * batch_size - for i in range(batch_size): - list_attn_caches[i] = [[] for _ in range(num_layers)] - for li, layer in enumerate(attn_caches): - for s in layer: - s_list = s.unbind(dim=1) - for bi, b in enumerate(list_attn_caches): - b[li].append(s_list[bi]) - - list_conv_caches = [None] * batch_size - for i in range(batch_size): - list_conv_caches[i] = [None] * num_layers - for li, layer in enumerate(conv_caches): - c_list = layer.unbind(dim=0) - for bi, b in enumerate(list_conv_caches): - b[li] = c_list[bi] - - ans = [None] * batch_size - for i in range(batch_size): - ans[i] = [list_attn_caches[i], list_conv_caches[i]] - - return ans - - -def stack_states( - state_list: List[Tuple[List[List[torch.Tensor]], List[torch.Tensor]]] -) -> Tuple[List[List[torch.Tensor]], List[torch.Tensor]]: - """Stack list of emformer states that correspond to separate utterances - into a single emformer state so that it can be used as an input for - emformer when those utterances are formed into a batch. - - Note: - It is the inverse of :func:`unstack_states`. - - Args: - state_list: - Each element in state_list corresponding to the internal state - of the emformer model for a single utterance. - ``states[i]`` is a tuple of 2 elements of i-th utterance. - ``states[i][0]`` is the attention caches of i-th utterance. 
- ``states[i][1]`` is the convolution caches of i-th utterance. - ``len(states[i][0])`` and ``len(states[i][1])`` both eqaul to number of layers. # noqa - - Returns: - A new state corresponding to a batch of utterances. - See the input argument of :func:`unstack_states` for the meaning - of the returned tensor. - """ - batch_size = len(state_list) - - attn_caches = [] - for layer in state_list[0][0]: - if batch_size > 1: - # Note: We will stack attn_caches[layer][s][] later to get attn_caches[layer][s] # noqa - attn_caches.append([[s] for s in layer]) - else: - attn_caches.append([s.unsqueeze(1) for s in layer]) - for b, states in enumerate(state_list[1:], 1): - for li, layer in enumerate(states[0]): - for si, s in enumerate(layer): - attn_caches[li][si].append(s) - if b == batch_size - 1: - attn_caches[li][si] = torch.stack( - attn_caches[li][si], - dim=1, - ) - - conv_caches = [] - for layer in state_list[0][1]: - if batch_size > 1: - # Note: We will stack conv_caches[layer][] later to get conv_caches[layer] # noqa - conv_caches.append([layer]) - else: - conv_caches.append(layer.unsqueeze(0)) - for b, states in enumerate(state_list[1:], 1): - for li, layer in enumerate(states[1]): - conv_caches[li].append(layer) - if b == batch_size - 1: - conv_caches[li] = torch.stack(conv_caches[li], dim=0) - - return [attn_caches, conv_caches] - - -def _create_streaming_feature_extractor() -> OnlineFeature: - """Create a CPU streaming feature extractor. - - At present, we assume it returns a fbank feature extractor with - fixed options. In the future, we will support passing in the options - from outside. - - Returns: - Return a CPU streaming feature extractor. - """ - opts = FbankOptions() - opts.device = "cpu" - opts.frame_opts.dither = 0 - opts.frame_opts.snip_edges = False - opts.frame_opts.samp_freq = 16000 - opts.mel_opts.num_bins = 80 - return OnlineFbank(opts) - - -class Stream(object): - def __init__( - self, - context_size: int, - subsampling_factor: int, - initial_states: List[List[torch.Tensor]], - ) -> None: - """ - Args: - context_size: - Context size of the RNN-T decoder model. - subsampling_factor: - Subsampling factor of the RNN-T encoder model. - initial_states: - The initial states of the Emformer model. Note that the state - does not contain the batch dimension. - """ - self.feature_extractor = _create_streaming_feature_extractor() - # It contains a list of 2-D tensors representing the feature frames. - # Each entry is of shape (1, feature_dim) - self.features: List[torch.Tensor] = [] - self.num_fetched_frames = 0 # before subsampling - self.num_trailing_blank_frames = 0 # after subsampling - - self.states = initial_states - self.processed_frames = 0 # before subsampling - self.context_size = context_size - self.subsampling_factor = subsampling_factor - self.log_eps = math.log(1e-10) - - # whenever an endpoint is detected, it is incremented - self.segment = 0 - - def accept_waveform( - self, - sampling_rate: float, - waveform: torch.Tensor, - ) -> None: - """Feed audio samples to the feature extractor and compute features - if there are enough samples available. - - Caution: - The range of the audio samples should match the one used in the - training. That is, if you use the range [-1, 1] in the training, then - the input audio samples should also be normalized to [-1, 1]. - - Args - sampling_rate: - The sampling rate of the input audio samples. It is used for sanity - check to ensure that the input sampling rate equals to the one - used in the extractor. 
If they are not equal, then no resampling - will be performed; instead an error will be thrown. - waveform: - A 1-D torch tensor of dtype torch.float32 containing audio samples. - It should be on CPU. - """ - self.feature_extractor.accept_waveform( - sampling_rate=sampling_rate, - waveform=waveform, - ) - self._fetch_frames() - - def input_finished(self) -> None: - """Signal that no more audio samples available and the feature - extractor should flush the buffered samples to compute frames. - """ - self.feature_extractor.input_finished() - self._fetch_frames() - - def _fetch_frames(self) -> None: - """Fetch frames from the feature extractor""" - while self.num_fetched_frames < self.feature_extractor.num_frames_ready: - frame = self.feature_extractor.get_frame(self.num_fetched_frames) - self.features.append(frame) - self.num_fetched_frames += 1 - - def add_tail_paddings(self, n: int = 20) -> None: - """Add some tail paddings so that we have enough context to process - frames at the very end of an utterance. - - Args: - n: - Number of tail padding frames to be added. You can increase it if - it happens that there are many missing tokens for the last word of - an utterance. - """ - tail_padding = torch.full( - (1, self.feature_extractor.opts.mel_opts.num_bins), - fill_value=self.log_eps, - dtype=torch.float32, - ) - - self.features += [tail_padding] * n - - def endpoint_detected( - self, - config: sherpa.OnlineEndpointConfig, - ) -> bool: - """ - Args: - config: - Config for endpointing. - Returns: - Return True if endpoint is detected; return False otherwise. - """ - frame_shift_in_seconds = ( - self.feature_extractor.opts.frame_opts.frame_shift_ms / 1000 - ) - - trailing_silence_frames = ( - self.num_trailing_blank_frames * self.subsampling_factor - ) - - detected = sherpa.endpoint_detected( - config=config, - num_frames_decoded=self.processed_frames, - trailing_silence_frames=trailing_silence_frames, - frame_shift_in_seconds=frame_shift_in_seconds, - ) - if detected: - self.num_trailing_blank_frames = 0 - self.processed_frames = 0 - self.segment += 1 - - return detected diff --git a/sherpa/bin/conv_emformer_transducer_stateless2/streaming_client.py b/sherpa/bin/conv_emformer_transducer_stateless2/streaming_client.py deleted file mode 120000 index 7bb0611a8..000000000 --- a/sherpa/bin/conv_emformer_transducer_stateless2/streaming_client.py +++ /dev/null @@ -1 +0,0 @@ -../pruned_stateless_emformer_rnnt2/streaming_client.py \ No newline at end of file diff --git a/sherpa/bin/conv_emformer_transducer_stateless2/streaming_server.py b/sherpa/bin/conv_emformer_transducer_stateless2/streaming_server.py deleted file mode 100755 index ae6d08b99..000000000 --- a/sherpa/bin/conv_emformer_transducer_stateless2/streaming_server.py +++ /dev/null @@ -1,595 +0,0 @@ -#!/usr/bin/env python3 -# Copyright 2022 Xiaomi Corp. (authors: Fangjun Kuang) -# -# See LICENSE for clarification regarding multiple authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -A server for streaming ASR recognition. 
By streaming it means the audio samples -are coming in real-time. You don't need to wait until all audio samples are -captured before sending them for recognition. - -It supports multiple clients sending at the same time. - -Usage: - ./streaming_server.py --help - - ./streaming_server.py - -Please refer to -https://k2-fsa.github.io/sherpa/python/streaming_asr/conv_emformer/index.html -for details. -""" - -import argparse -import asyncio -import http -import json -import logging -import math -import warnings -from concurrent.futures import ThreadPoolExecutor -from typing import Optional, Tuple - -import numpy as np -import sentencepiece as spm -import torch -import websockets -from beam_search import FastBeamSearch, GreedySearch -from stream import Stream - -from sherpa import ( - OnlineEndpointConfig, - RnntConvEmformerModel, - add_beam_search_arguments, - add_online_endpoint_arguments, -) - - -def get_args(): - beam_search_parser = add_beam_search_arguments() - online_endpoint_parser = add_online_endpoint_arguments() - parser = argparse.ArgumentParser( - parents=[beam_search_parser, online_endpoint_parser], - formatter_class=argparse.ArgumentDefaultsHelpFormatter, - ) - - parser.add_argument( - "--port", - type=int, - default=6006, - help="The server will listen on this port", - ) - - parser.add_argument( - "--nn-model-filename", - type=str, - required=True, - help="""The torchscript model. You can use - icefall/egs/librispeech/ASR/pruned_transducer_statelessX/export.py \ - --jit=1 - to generate this model. - """, - ) - - parser.add_argument( - "--bpe-model-filename", - type=str, - help="""The BPE model - You can find it in the directory egs/librispeech/ASR/data/lang_bpe_xxx - where xxx is the number of BPE tokens you used to train the model. - Note: You don't need to provide it if you provide `--token-filename`. - """, - ) - - parser.add_argument( - "--token-filename", - type=str, - help="""Filename for tokens.txt - For instance, you can find it in the directory - egs/aishell/ASR/data/lang_char/tokens.txt - or - egs/wenetspeech/ASR/data/lang_char/tokens.txt - from icefall - Note: You don't need to provide it if you provide `--bpe-model` - """, - ) - - parser.add_argument( - "--decode-chunk-size", - type=int, - default=8, - help="The chunk size for decoding (in frames after subsampling)", - ) - - parser.add_argument( - "--decode-left-context", - type=int, - default=32, - help="""left context can be seen during decoding - (in frames after subsampling)""", - ) - - parser.add_argument( - "--decode-right-context", - type=int, - default=2, - help="""right context can be seen during decoding - (in frames after subsampling)""", - ) - - parser.add_argument( - "--nn-pool-size", - type=int, - default=1, - help="Number of threads for NN computation and decoding.", - ) - - parser.add_argument( - "--max-batch-size", - type=int, - default=50, - help="""Max batch size for computation. Note if there are not enough - requests in the queue, it will wait for max_wait_ms time. After that, - even if there are not enough requests, it still sends the - available requests in the queue for computation. - """, - ) - - parser.add_argument( - "--max-wait-ms", - type=float, - default=10, - help="""Max time in millisecond to wait to build batches for inference. - If there are not enough requests in the stream queue to build a batch - of max_batch_size, it waits up to this time before fetching available - requests for computation. 
- """, - ) - - parser.add_argument( - "--max-message-size", - type=int, - default=(1 << 20), - help="""Max message size in bytes. - The max size per message cannot exceed this limit. - """, - ) - - parser.add_argument( - "--max-queue-size", - type=int, - default=32, - help="Max number of messages in the queue for each connection.", - ) - - parser.add_argument( - "--max-active-connections", - type=int, - default=500, - help="""Maximum number of active connections. The server will refuse - to accept new connections once the current number of active connections - equals to this limit. - """, - ) - - return ( - parser.parse_args(), - beam_search_parser.parse_known_args()[0], - online_endpoint_parser.parse_known_args()[0], - ) - - -class StreamingServer(object): - def __init__( - self, - nn_model_filename: str, - bpe_model_filename: str, - nn_pool_size: int, - max_wait_ms: float, - max_batch_size: int, - max_message_size: int, - max_queue_size: int, - max_active_connections: int, - beam_search_params: dict, - online_endpoint_config: OnlineEndpointConfig, - ): - """ - Args: - nn_model_filename: - Path to the torchscript model - bpe_model_filename: - Path to the BPE model - nn_pool_size: - Number of threads for the thread pool that is responsible for - neural network computation and decoding. - max_wait_ms: - Max wait time in milliseconds in order to build a batch of - `batch_size`. - max_batch_size: - Max batch size for inference. - max_message_size: - Max size in bytes per message. - max_queue_size: - Max number of messages in the queue for each connection. - max_active_connections: - Max number of active connections. Once number of active client - equals to this limit, the server refuses to accept new connections. - beam_search_params: - Dictionary containing all the parameters for beam search. - online_endpoint_config: - Config for endpointing. - """ - if torch.cuda.is_available(): - device = torch.device("cuda", 0) - else: - device = torch.device("cpu") - logging.info(f"Using device: {device}") - - self.model = RnntConvEmformerModel(nn_model_filename, device=device) - - # number of frames before subsampling - self.chunk_length = self.model.chunk_length - - self.right_context_length = self.model.right_context_length - - # We add 3 here since the subsampling method is using - # ((len - 1) // 2 - 1) // 2) - self.chunk_length_pad = self.chunk_length + self.model.pad_length - - self.sp = spm.SentencePieceProcessor() - self.sp.load(bpe_model_filename) - - self.context_size = self.model.context_size - self.subsampling_factor = self.model.subsampling_factor - self.blank_id = self.model.blank_id - self.vocab_size = self.model.vocab_size - self.log_eps = math.log(1e-10) - - self.initial_states = self.model.get_encoder_init_states() - - # Add these params after loading the Conv-Emformer model - beam_search_params["vocab_size"] = self.vocab_size - beam_search_params["context_size"] = self.context_size - beam_search_params["blank_id"] = self.blank_id - - decoding_method = beam_search_params["decoding_method"] - if decoding_method.startswith("fast_beam_search"): - self.beam_search = FastBeamSearch( - beam_search_params=beam_search_params, - device=device, - ) - elif decoding_method == "greedy_search": - self.beam_search = GreedySearch( - self.model, - beam_search_params, - device, - ) - else: - raise ValueError( - f"Decoding method {decoding_method} is not supported." 
- ) - - self.beam_search.sp = self.sp - - self.online_endpoint_config = online_endpoint_config - - self.nn_pool = ThreadPoolExecutor( - max_workers=nn_pool_size, - thread_name_prefix="nn", - ) - - self.stream_queue = asyncio.Queue() - self.max_wait_ms = max_wait_ms - self.max_batch_size = max_batch_size - self.max_message_size = max_message_size - self.max_queue_size = max_queue_size - self.max_active_connections = max_active_connections - - self.current_active_connections = 0 - - async def warmup(self) -> None: - """Do warmup to the torchscript model to decrease the waiting time - of the first request. - - See https://github.com/k2-fsa/sherpa/pull/100 for details - """ - logging.info("Warmup start") - stream = Stream( - context_size=self.context_size, - subsampling_factor=self.subsampling_factor, - initial_states=self.initial_states, - ) - self.beam_search.init_stream(stream) - - samples = torch.rand(16000 * 1, dtype=torch.float32) # 1 second - stream.accept_waveform(sampling_rate=16000, waveform=samples) - - while len(stream.features) > self.chunk_length_pad: - await self.compute_and_decode(stream) - - logging.info("Warmup done") - - async def stream_consumer_task(self): - """This function extracts streams from the queue, batches them up, sends - them to the RNN-T model for computation and decoding. - """ - while True: - if self.stream_queue.empty(): - await asyncio.sleep(self.max_wait_ms / 1000) - continue - - batch = [] - try: - while len(batch) < self.max_batch_size: - item = self.stream_queue.get_nowait() - - assert len(item[0].features) >= self.chunk_length_pad, len( - item[0].features - ) - - batch.append(item) - except asyncio.QueueEmpty: - pass - stream_list = [b[0] for b in batch] - future_list = [b[1] for b in batch] - - loop = asyncio.get_running_loop() - await loop.run_in_executor( - self.nn_pool, - self.beam_search.process, - self, - stream_list, - ) - - for f in future_list: - self.stream_queue.task_done() - f.set_result(None) - - async def compute_and_decode( - self, - stream: Stream, - ) -> None: - """Put the stream into the queue and wait it to be processed by the - consumer task. - - Args: - stream: - The stream to be processed. Note: It is changed in-place. - """ - loop = asyncio.get_running_loop() - future = loop.create_future() - await self.stream_queue.put((stream, future)) - await future - - async def process_request( - self, - unused_path: str, - unused_request_headers: websockets.Headers, - ) -> Optional[Tuple[http.HTTPStatus, websockets.Headers, bytes]]: - if self.current_active_connections < self.max_active_connections: - self.current_active_connections += 1 - return None - - # Refuse new connections - status = http.HTTPStatus.SERVICE_UNAVAILABLE # 503 - header = {"Hint": "The server is overloaded. Please retry later."} - response = b"The server is busy. Please retry later." - - return status, header, response - - async def run(self, port: int): - task = asyncio.create_task(self.stream_consumer_task()) - await self.warmup() - - async with websockets.serve( - self.handle_connection, - host="", - port=port, - max_size=self.max_message_size, - max_queue=self.max_queue_size, - process_request=self.process_request, - ): - await asyncio.Future() # run forever - - await task # not reachable - - async def handle_connection( - self, - socket: websockets.WebSocketServerProtocol, - ): - """Receive audio samples from the client, process it, and send - deocoding result back to the client. - - Args: - socket: - The socket for communicating with the client. 
- """ - try: - await self.handle_connection_impl(socket) - finally: - # Decrement so that it can accept new connections - self.current_active_connections -= 1 - - logging.info( - f"Disconnected: {socket.remote_address}. " - f"Number of connections: {self.current_active_connections}/{self.max_active_connections}" # noqa - ) - - async def handle_connection_impl( - self, - socket: websockets.WebSocketServerProtocol, - ): - """Receive audio samples from the client, process it, and send - deocoding result back to the client. - - Args: - socket: - The socket for communicating with the client. - """ - logging.info( - f"Connected: {socket.remote_address}. " - f"Number of connections: {self.current_active_connections}/{self.max_active_connections}" # noqa - ) - - stream = Stream( - context_size=self.context_size, - initial_states=self.initial_states, - subsampling_factor=self.subsampling_factor, - ) - - self.beam_search.init_stream(stream) - - while True: - samples = await self.recv_audio_samples(socket) - if samples is None: - break - - # TODO(fangjun): At present, we assume the sampling rate - # of the received audio samples is always 16000. - stream.accept_waveform(sampling_rate=16000, waveform=samples) - - while len(stream.features) > self.chunk_length_pad: - await self.compute_and_decode(stream) - hyp = self.beam_search.get_texts(stream) - - segment = stream.segment - is_final = stream.endpoint_detected(self.online_endpoint_config) - if is_final: - self.beam_search.init_stream(stream) - - message = { - "segment": segment, - "text": hyp, - "final": is_final, - } - - await socket.send(json.dumps(message)) - - stream.input_finished() - while len(stream.features) > self.chunk_length_pad: - await self.compute_and_decode(stream) - - if len(stream.features) > 0: - n = self.chunk_length_pad - len(stream.features) - stream.add_tail_paddings(n) - await self.compute_and_decode(stream) - stream.features = [] - - hyp = self.beam_search.get_texts(stream) - - message = { - "segment": stream.segment, - "text": hyp, - "final": True, # end of connection, always set final to True - } - - await socket.send(json.dumps(message)) - - async def recv_audio_samples( - self, - socket: websockets.WebSocketServerProtocol, - ) -> Optional[torch.Tensor]: - """Receives a tensor from the client. - - Each message contains either a bytes buffer containing audio samples - in 16 kHz or contains b"Done" meaning the end of utterance. - - Args: - socket: - The socket for communicating with the client. - Returns: - Return a 1-D torch.float32 tensor containing the audio samples or - return None. - """ - message = await socket.recv() - if message == b"Done": - return None - - with warnings.catch_warnings(): - warnings.simplefilter("ignore") - # PyTorch warns that the underlying buffer is not writable. - # We ignore it here as we are not going to write it anyway. 
- if hasattr(torch, "frombuffer"): - # Note: torch.frombuffer is available only in torch>= 1.10 - return torch.frombuffer(message, dtype=torch.float32) - else: - array = np.frombuffer(message, dtype=np.float32) - return torch.from_numpy(array) - - -@torch.no_grad() -def main(): - args, beam_search_parser, online_endpoint_parser = get_args() - - beam_search_params = vars(beam_search_parser) - logging.info(beam_search_params) - - online_endpoint_params = vars(online_endpoint_parser) - logging.info(online_endpoint_params) - - online_endpoint_config = OnlineEndpointConfig.from_args( - online_endpoint_params - ) - - logging.info(vars(args)) - - port = args.port - nn_model_filename = args.nn_model_filename - bpe_model_filename = args.bpe_model_filename - nn_pool_size = args.nn_pool_size - max_batch_size = args.max_batch_size - max_wait_ms = args.max_wait_ms - max_message_size = args.max_message_size - max_queue_size = args.max_queue_size - max_active_connections = args.max_active_connections - - server = StreamingServer( - nn_model_filename=nn_model_filename, - bpe_model_filename=bpe_model_filename, - nn_pool_size=nn_pool_size, - max_batch_size=max_batch_size, - max_wait_ms=max_wait_ms, - max_message_size=max_message_size, - max_queue_size=max_queue_size, - max_active_connections=max_active_connections, - beam_search_params=beam_search_params, - online_endpoint_config=online_endpoint_config, - ) - asyncio.run(server.run(port)) - - -torch.set_num_threads(1) -torch.set_num_interop_threads(1) - -# See https://github.com/pytorch/pytorch/issues/38342 -# and https://github.com/pytorch/pytorch/issues/33354 -# -# If we don't do this, the delay increases whenever there is -# a new request that changes the actual batch size. -# If you use `py-spy dump --pid --native`, you will -# see a lot of time is spent in re-compiling the torch script model. -torch._C._jit_set_profiling_executor(False) -torch._C._jit_set_profiling_mode(False) -torch._C._set_graph_executor_optimize(False) -""" -// Use the following in C++ -torch::jit::getExecutorMode() = false; -torch::jit::getProfilingMode() = false; -torch::jit::setGraphExecutorOptimize(false); -""" - -if __name__ == "__main__": - formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s" # noqa - logging.basicConfig(format=formatter, level=logging.INFO) - main() diff --git a/sherpa/bin/pruned_transducer_statelessX/decode_manifest.py b/sherpa/bin/decode_manifest.py similarity index 78% rename from sherpa/bin/pruned_transducer_statelessX/decode_manifest.py rename to sherpa/bin/decode_manifest.py index 6fc0840f8..4c8499a90 100755 --- a/sherpa/bin/pruned_transducer_statelessX/decode_manifest.py +++ b/sherpa/bin/decode_manifest.py @@ -1,19 +1,6 @@ #!/usr/bin/env python3 -# Copyright 2022 Xiaomi Corp. (authors: Fangjun Kuang) -# -# See LICENSE for clarification regarding multiple authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# Copyright 2022-2023 Xiaomi Corp. 
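The hunks below for decode_manifest.py and offline_client.py change the client-side wire protocol: the length header shrinks from 8 to 4 bytes (little endian), raw float32 slices are converted with .tobytes() before sending, and the end-of-input marker becomes the text message "Done" instead of b"Done". A minimal sketch of the send side under those conventions; send_one_wave and ws are illustrative names (ws is an already-open websockets client connection, wave a 1-D float32 tensor in [-1, 1]) and are not part of the patch itself:

import torch

async def send_one_wave(ws, wave: torch.Tensor) -> str:
    # 4-byte little-endian header carrying the number of payload bytes
    num_bytes = wave.numel() * wave.element_size()
    await ws.send(num_bytes.to_bytes(4, "little", signed=True))

    frame_size = (2 ** 20) // 4  # keep each message below the 1 MB payload limit
    buf = wave.numpy().data  # memoryview over the float32 samples
    start = 0
    while start < wave.numel():
        # reinterpret the float32 slice as raw bytes before sending
        await ws.send(buf[start:start + frame_size].tobytes())
        start += frame_size

    # the server replies with the decoding result for this utterance
    return await ws.recv()

# After the last utterance, the client signals the end of the session with
#     await ws.send("Done")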
+ """ This script loads a manifest in lhotse format and sends it to the server for decoding, in parallel. @@ -110,13 +97,13 @@ async def send( samples = c.load_audio().reshape(-1).astype(np.float32) num_bytes = samples.nbytes - await websocket.send((num_bytes).to_bytes(8, "little", signed=True)) + await websocket.send((num_bytes).to_bytes(4, "little", signed=True)) frame_size = (2 ** 20) // 4 # max payload is 1MB start = 0 while start < samples.size: end = start + frame_size - await websocket.send(samples.data[start:end]) + await websocket.send(samples.data[start:end].tobytes()) start = end decoding_results = await websocket.recv() @@ -127,12 +114,16 @@ async def send( hyp = decoding_results.split() ref = list("".join(ref)) hyp = list("".join(hyp)) - results.append((ref, hyp)) + results.append((c.id, ref, hyp)) else: results.append( - (c.supervisions[0].text.split(), decoding_results.split()) + ( + c.id, + c.supervisions[0].text.split(), + decoding_results.split(), + ) ) # noqa - await websocket.send(b"Done") + await websocket.send("Done") return total_duration, results @@ -177,16 +168,20 @@ async def main(): rtf = elapsed / total_duration - print(f"RTF: {rtf:.4f}") - print( - f"total_duration: {total_duration:.3f} seconds " - f"({total_duration/3600:.2f} hours)" + s = f"RTF: {rtf:.4f}\n" + s += f"total_duration: {total_duration:.3f} seconds\n" + s += f"({total_duration/3600:.2f} hours)\n" + s += ( + f"processing time: {elapsed:.3f} seconds " + f"({elapsed/3600:.2f} hours)\n" ) - print( - f"processing time: {elapsed:.3f} seconds " f"({elapsed/3600:.2f} hours)" - ) # noqa + print(s) + + with open("rtf.txt", "w") as f: + f.write(s) name = Path(filename).stem.split(".")[0] + results = sorted(results) store_transcripts(filename=f"recogs-{name}.txt", texts=results) with open(f"errs-{name}.txt", "w") as f: diff --git a/sherpa/bin/pruned_transducer_statelessX/offline_client.py b/sherpa/bin/offline_client.py similarity index 65% rename from sherpa/bin/pruned_transducer_statelessX/offline_client.py rename to sherpa/bin/offline_client.py index 4e8e960c3..fe2e52073 100755 --- a/sherpa/bin/pruned_transducer_statelessX/offline_client.py +++ b/sherpa/bin/offline_client.py @@ -1,22 +1,7 @@ #!/usr/bin/env python3 -# Copyright 2022 Xiaomi Corp. (authors: Fangjun Kuang) -# -# See LICENSE for clarification regarding multiple authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - +# Copyright 2022-2023 Xiaomi Corp. """ -A client for offline ASR recognition. +A client for offline ASR. Usage: ./offline_client.py \ @@ -25,7 +10,8 @@ /path/to/foo.wav \ /path/to/bar.wav -(Note: You have to first start the server before starting the client) +Note: You have to first start the server before starting the client. +You can use either ./offline_transducer_server.py or ./offline_ctc_server.py """ import argparse import asyncio @@ -62,8 +48,10 @@ def get_args(): nargs="+", help="The input sound file(s) to transcribe. " "Supported formats are those supported by torchaudio.load(). " - "For example, wav and flac are supported. 
" - "The sample rate has to be 16kHz.", + "For example, wav and flac are supported. All models from icefall " + "uses 16 kHz training data. If the input sound file has a sample rate " + "different from 16 kHz, it is resampled to 16 kHz. " + "Only the first channel is used.", ) return parser.parse_args() @@ -76,18 +64,36 @@ async def run(server_addr: str, server_port: int, test_wavs: List[str]): for test_wav in test_wavs: logging.info(f"Sending {test_wav}") wave, sample_rate = torchaudio.load(test_wav) - assert sample_rate == 16000, sample_rate - wave = wave.squeeze(0) + if sample_rate != 16000: + wave = torchaudio.functional.resample( + wave, + orig_freq=sample_rate, + new_freq=16000, + ) + sample_rate = 16000 + + wave = wave.squeeze(0).contiguous() + + # wave is a 1-D float32 tensor normalized to the range [-1, 1] + # The format of the message sent to the server for each wave is + # + # - 4-byte in little endian specifying number of subsequent bytes + # to send + # - one or more messages containing the data + # - The last message is "Done" + num_bytes = wave.numel() * wave.element_size() - await websocket.send((num_bytes).to_bytes(8, "little", signed=True)) + await websocket.send((num_bytes).to_bytes(4, "little", signed=True)) frame_size = (2 ** 20) // 4 # max payload is 1MB sleep_time = 0.25 start = 0 while start < wave.numel(): end = start + frame_size - d = wave.numpy().data[start:end] + + # reinterpret floats to bytes + d = wave.numpy().data[start:end].tobytes() await websocket.send(d) await asyncio.sleep(sleep_time) # in seconds @@ -95,8 +101,10 @@ async def run(server_addr: str, server_port: int, test_wavs: List[str]): start = end decoding_results = await websocket.recv() + if decoding_results == "": + decoding_results = "" logging.info(f"{test_wav}\n{decoding_results}") - await websocket.send(b"Done") + await websocket.send("Done") async def main(): diff --git a/sherpa/bin/offline_ctc_asr.py b/sherpa/bin/offline_ctc_asr.py new file mode 100755 index 000000000..d2ba53265 --- /dev/null +++ b/sherpa/bin/offline_ctc_asr.py @@ -0,0 +1,386 @@ +#!/usr/bin/env python3 +# Copyright (c) 2023 Xiaomi Corporation + +""" +A standalone script for offline (i.e., non-streaming) speech recognition. + +This file decodes files without the need to start a server and a client. + +Please refer to +https://k2-fsa.github.io/sherpa/cpp/pretrained_models/offline_ctc.html +for pre-trained models to download. 
+ +Usage: +(1) Use icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09 + +cd /path/to/sherpa + +GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/csukuangfj/icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09 +cd icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09 + +git lfs pull --include "exp/cpu_jit.pt" +git lfs pull --include "data/lang_bpe_500/tokens.txt" +git lfs pull --include "data/lang_bpe_500/HLG.pt" + +cd /path/to/sherpa + +(a) Decoding with H + +./sherpa/bin/offline_ctc_asr.py \ + --nn-model ./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/exp/cpu_jit.pt \ + --tokens ./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/data/lang_bpe_500/tokens.txt \ + --use-gpu false \ + ./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/test_wavs/1089-134686-0001.wav \ + ./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/test_wavs/1221-135766-0001.wav \ + ./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/test_wavs/1221-135766-0002.wav + +(b) Decoding with HLG + +./sherpa/bin/offline_ctc_asr.py \ + --nn-model ./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/exp/cpu_jit.pt \ + --tokens ./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/data/lang_bpe_500/tokens.txt \ + --HLG ./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/data/lang_bpe_500/HLG.pt \ + --lm-scale 0.9 \ + --use-gpu false \ + ./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/test_wavs/1089-134686-0001.wav \ + ./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/test_wavs/1221-135766-0001.wav \ + ./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/test_wavs/1221-135766-0002.wav + +(2) Use wenet-english-model + +cd /path/to/sherpa + +GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/csukuangfj/wenet-english-model +cd wenet-english-model +git lfs pull --include "final.zip" + +cd /path/to/sherpa + +./sherpa/bin/offline_ctc_asr.py \ + --nn-model ./wenet-english-model/final.zip \ + --tokens ./wenet-english-model/units.txt \ + --use-gpu false \ + --normalize-samples false \ + ./wenet-english-model/test_wavs/1089-134686-0001.wav \ + ./wenet-english-model/test_wavs/1221-135766-0001.wav \ + ./wenet-english-model/test_wavs/1221-135766-0002.wav + +(3) Use wav2vec2.0-torchaudio + +cd /path/to/sherpa + +GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/csukuangfj/wav2vec2.0-torchaudio +cd wav2vec2.0-torchaudio +git lfs pull --include "wav2vec2_asr_base_10m.pt" + +cd /path/to/sherpa + +./sherpa/bin/offline_ctc_asr.py \ + --nn-model ./wav2vec2.0-torchaudio/wav2vec2_asr_base_10m.pt \ + --tokens ./wav2vec2.0-torchaudio/tokens.txt \ + --use-gpu false \ + ./wav2vec2.0-torchaudio/test_wavs/1089-134686-0001.wav \ + ./wav2vec2.0-torchaudio/test_wavs/1221-135766-0001.wav \ + ./wav2vec2.0-torchaudio/test_wavs/1221-135766-0002.wav + +(4) Use NeMo CTC models + +GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/csukuangfj/sherpa-nemo-ctc-en-citrinet-512 +cd sherpa-nemo-ctc-en-citrinet-512 +git lfs pull --include "model.pt" + +cd /path/to/sherpa + +./sherpa/bin/offline_ctc_asr.py \ + --nn-model ./sherpa-nemo-ctc-en-citrinet-512/model.pt + --tokens ./sherpa-nemo-ctc-en-citrinet-512/tokens.txt \ + --use-gpu false \ + --nemo-normalize per_feature \ + ./sherpa-nemo-ctc-en-citrinet-512/test_wavs/0.wav \ + ./sherpa-nemo-ctc-en-citrinet-512/test_wavs/1.wav \ + ./sherpa-nemo-ctc-en-citrinet-512/test_wavs/2.wav +""" # noqa +import argparse +import logging +from pathlib import Path +from 
typing import List + +import torch +import torchaudio + +import sherpa +from sherpa import str2bool + + +def get_parser(): + parser = argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter + ) + + add_model_args(parser) + add_decoding_args(parser) + + parser.add_argument( + "--use-gpu", + type=str2bool, + default=False, + help="""True to use GPU. It always selects GPU 0. You can use the + environement variable CUDA_VISIBLE_DEVICES to control which GPU + is mapped to GPU 0. + """, + ) + + parser.add_argument( + "--normalize-samples", + type=str2bool, + default=True, + help="""If your model was trained using features computed + from samples in the range `[-32768, 32767]`, then please set + this flag to False. For instance, if you use models from WeNet, + please set it to False. + """, + ) + + parser.add_argument( + "--nemo-normalize", + type=str, + default="", + help="""Used only for models from NeMo. + Leave it to empty if the preprocessor of the model does not use + normalization. Current supported value is "per_feature". + """, + ) + + parser.add_argument( + "sound_files", + type=str, + nargs="+", + help="The input sound file(s) to transcribe. " + "Supported formats are those supported by torchaudio.load(). " + "For example, wav and flac are supported. ", + ) + + return parser + + +def add_model_args(parser: argparse.ArgumentParser): + parser.add_argument( + "--nn-model", + type=str, + help="""The torchscript model. Please refer to + https://k2-fsa.github.io/sherpa/cpp/pretrained_models/offline_ctc/index.html + for a list of pre-trained models to download. + """, + ) + + parser.add_argument( + "--tokens", + type=str, + help="Path to tokens.txt", + ) + + +def add_decoding_args(parser: argparse.ArgumentParser): + parser.add_argument( + "--HLG", + type=str, + help="""Optional. If empty, we use an H graph for decoding. + If not empty, it is the filename of HLG.pt and we will + use it for decoding""", + ) + + parser.add_argument( + "--lm-scale", + type=float, + default=1.0, + help=""" + Used only when --HLG is not empty. It specifies the scale + for HLG.scores + """, + ) + + parser.add_argument( + "--modified", + type=bool, + default=True, + help="""Used only when --HLG is empty. True to use a modified + CTC topology. False to use a standard CTC topology. + Please refer to https://k2-fsa.github.io/k2/python_api/api.html#ctc-topo + for the differences between standard and modified CTC topology. + """, + ) + + parser.add_argument( + "--search-beam", + type=float, + default=20.0, + help="""Decoding beam, e.g. 20. Smaller is faster, larger is + more exact (less pruning). This is the default value; + it may be modified by `min_active_states` and + `max_active_states`. + """, + ) + + parser.add_argument( + "--output-beam", + type=float, + default=8.0, + help="""Beam to prune output, similar to lattice-beam in Kaldi. + Relative to the best path of output. + """, + ) + + parser.add_argument( + "--min-active-states", + type=int, + default=30, + help="""Minimum number of FSA states that are allowed to + be active on any given frame for any given + intersection/composition task. This is advisory, + in that it will try not to have fewer than this + number active. Set it to zero if there is no + constraint.""", + ) + + parser.add_argument( + "--max-active-states", + type=int, + default=10000, + help="""Maximum number of FSA states that are allowed to + be active on any given frame for any given + intersection/composition task. 
This is advisory, + in that it will try not to exceed that but may + not always succeed. You can use a very large + number if no constraint is needed.""", + ) + + +def check_args(args): + if not Path(args.nn_model).is_file(): + raise ValueError(f"{args.nn_model} does not exist") + + if not Path(args.tokens).is_file(): + raise ValueError(f"{args.tokens} does not exist") + + if args.HLG: + assert Path(args.HLG).is_file(), f"{args.HLG} does not exist" + + assert len(args.sound_files) > 0, args.sound_files + for f in args.sound_files: + if not Path(f).is_file(): + raise ValueError(f"{f} does not exist") + + +def read_sound_files( + filenames: List[str], expected_sample_rate: float +) -> List[torch.Tensor]: + """Read a list of sound files into a list 1-D float32 torch tensors. + Args: + filenames: + A list of sound filenames. + expected_sample_rate: + The expected sample rate of the sound files. + Returns: + Return a list of 1-D float32 torch tensors. + """ + ans = [] + for f in filenames: + wave, sample_rate = torchaudio.load(f) + if sample_rate != expected_sample_rate: + wave = torchaudio.functional.resample( + wave, + orig_freq=sample_rate, + new_freq=expected_sample_rate, + ) + + # We use only the first channel + ans.append(wave[0].contiguous()) + return ans + + +def create_recognizer(args): + feat_config = sherpa.FeatureConfig() + + feat_config.fbank_opts.frame_opts.samp_freq = 16000 + feat_config.fbank_opts.mel_opts.num_bins = 80 + feat_config.fbank_opts.mel_opts.high_freq = -400 + feat_config.fbank_opts.frame_opts.dither = 0 + + feat_config.normalize_samples = args.normalize_samples + feat_config.nemo_normalize = args.nemo_normalize + + ctc_decoder_config = sherpa.OfflineCtcDecoderConfig( + hlg=args.HLG if args.HLG else "", + lm_scale=args.lm_scale, + modified=args.modified, + search_beam=args.search_beam, + output_beam=args.output_beam, + min_active_states=args.min_active_states, + max_active_states=args.max_active_states, + ) + + config = sherpa.OfflineRecognizerConfig( + nn_model=args.nn_model, + tokens=args.tokens, + use_gpu=args.use_gpu, + feat_config=feat_config, + ctc_decoder_config=ctc_decoder_config, + ) + + recognizer = sherpa.OfflineRecognizer(config) + + return recognizer + + +def main(): + args = get_parser().parse_args() + logging.info(vars(args)) + check_args(args) + + recognizer = create_recognizer(args) + sample_rate = 16000 + + samples: List[torch.Tensor] = read_sound_files( + args.sound_files, + sample_rate, + ) + + streams: List[sherpa.OfflineStream] = [] + for s in samples: + stream = recognizer.create_stream() + stream.accept_samples(s) + streams.append(stream) + + recognizer.decode_streams(streams) + for filename, stream in zip(args.sound_files, streams): + print(f"{filename}\n{stream.result}") + + +torch.set_num_threads(1) +torch.set_num_interop_threads(1) + +# See https://github.com/pytorch/pytorch/issues/38342 +# and https://github.com/pytorch/pytorch/issues/33354 +# +# If we don't do this, the delay increases whenever there is +# a new request that changes the actual batch size. +# If you use `py-spy dump --pid --native`, you will +# see a lot of time is spent in re-compiling the torch script model. 
+torch._C._jit_set_profiling_executor(False) +torch._C._jit_set_profiling_mode(False) +torch._C._set_graph_executor_optimize(False) +""" +// Use the following in C++ +torch::jit::getExecutorMode() = false; +torch::jit::getProfilingMode() = false; +torch::jit::setGraphExecutorOptimize(false); +""" + +if __name__ == "__main__": + torch.manual_seed(20230104) + formatter = ( + "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s" # noqa + ) + logging.basicConfig(format=formatter, level=logging.INFO) + + main() diff --git a/sherpa/bin/offline_ctc_server.py b/sherpa/bin/offline_ctc_server.py new file mode 100755 index 000000000..a714304ff --- /dev/null +++ b/sherpa/bin/offline_ctc_server.py @@ -0,0 +1,396 @@ +#!/usr/bin/env python3 +# Copyright 2022-2023 Xiaomi Corp. +""" +A server for CTC-based offline ASR. Offline means you send all the content +of the audio for recognition. + +It supports multiple clients sending at the same time. + +Usage: + ./offline_ctc_server.py --help + + ./offline_ctc_server.py + +Please refer to +https://k2-fsa.github.io/sherpa/cpp/pretrained_models/offline_ctc/index.html +for pre-trained models to download. + +We use a Conformer CTC pre-trained model from NeMo below to demonstrate how to use +this file. You can use other non-streaming CTC models with this file +if you want. + +(1) Download pre-trained models + +cd /path/to/sherpa + +GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/csukuangfj/sherpa-nemo-ctc-en-conformer-medium +cd sherpa-nemo-ctc-en-conformer-medium +git lfs pull --include "model.pt" + +(2) Start the server + +cd /path/to/sherpa + +./sherpa/bin/offline_ctc_server.py \ + --port 6006 \ + --nemo-normalize=per_feature \ + --nn-model ./sherpa-nemo-ctc-en-conformer-medium/model.pt \ + --tokens ./sherpa-nemo-ctc-en-conformer-medium/tokens.txt + +(3) Start the client + +python3 ./sherpa/bin/offline_client.py ./sherpa-nemo-ctc-en-conformer-medium/test_wavs/0.wav +""" # noqa + +import argparse +import asyncio +import logging +import sys +from pathlib import Path + +import torch +from offline_transducer_server import OfflineServer, add_resources_args + +import sherpa + + +def add_model_args(parser: argparse.ArgumentParser): + parser.add_argument( + "--nn-model", + type=str, + help="""The torchscript model. Please refer to + https://k2-fsa.github.io/sherpa/cpp/pretrained_models/offline_ctc/index.html + for a list of pre-trained models to download. + """, + ) + + parser.add_argument( + "--tokens", + type=str, + help="Path to tokens.txt", + ) + + parser.add_argument( + "--sample-rate", + type=int, + default=16000, + help="Sample rate of the data used to train the model. " + "The client is expected to send audio samples with this sample rate.", + ) + + parser.add_argument( + "--feat-dim", + type=int, + default=80, + help="Feature dimension of the model", + ) + + parser.add_argument( + "--normalize-samples", + type=sherpa.str2bool, + default=True, + help="""If your model was trained using features computed + from samples in the range `[-32768, 32767]`, then please set + this flag to False. For instance, if you use models from WeNet, + please set it to False. + """, + ) + + parser.add_argument( + "--nemo-normalize", + type=str, + default="", + help="""Used only for models from NeMo. + Leave it to empty if the preprocessor of the model does not use + normalization. Current supported value is "per_feature". + """, + ) + + +def add_decoding_args(parser: argparse.ArgumentParser): + parser.add_argument( + "--HLG", + type=str, + help="""Optional. 
If empty, we use an H graph for decoding. + If not empty, it is the filename of HLG.pt and we will + use it for decoding""", + ) + + parser.add_argument( + "--lm-scale", + type=float, + default=1.0, + help=""" + Used only when --HLG is not empty. It specifies the scale + for HLG.scores + """, + ) + + parser.add_argument( + "--modified", + type=bool, + default=True, + help="""Used only when --HLG is empty. True to use a modified + CTC topology. False to use a standard CTC topology. + Please refer to https://k2-fsa.github.io/k2/python_api/api.html#ctc-topo + for the differences between standard and modified CTC topology. + If you encounter CUDA OOM, then please set this flag to True. + """, + ) + + parser.add_argument( + "--search-beam", + type=float, + default=20.0, + help="""Decoding beam, e.g. 20. Smaller is faster, larger is + more exact (less pruning). This is the default value; + it may be modified by `min_active_states` and + `max_active_states`. + """, + ) + + parser.add_argument( + "--output-beam", + type=float, + default=8.0, + help="""Beam to prune output, similar to lattice-beam in Kaldi. + Relative to the best path of output. + """, + ) + + parser.add_argument( + "--min-active-states", + type=int, + default=30, + help="""Minimum number of FSA states that are allowed to + be active on any given frame for any given + intersection/composition task. This is advisory, + in that it will try not to have fewer than this + number active. Set it to zero if there is no + constraint.""", + ) + + parser.add_argument( + "--max-active-states", + type=int, + default=10000, + help="""Maximum number of FSA states that are allowed to + be active on any given frame for any given + intersection/composition task. This is advisory, + in that it will try not to exceed that but may + not always succeed. You can use a very large + number if no constraint is needed.""", + ) + + +def check_args(args): + if args.use_gpu and not torch.cuda.is_available(): + sys.exit("no CUDA devices available but you set --use-gpu=true") + + if not Path(args.nn_model).is_file(): + raise ValueError(f"{args.nn_model} does not exist") + + if not Path(args.tokens).is_file(): + raise ValueError(f"{args.tokens} does not exist") + + if args.HLG: + assert Path(args.HLG).is_file(), f"{args.HLG} does not exist" + + +def get_args(): + parser = argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter + ) + + add_model_args(parser) + add_decoding_args(parser) + add_resources_args(parser) + + parser.add_argument( + "--port", + type=int, + default=6006, + help="The server will listen on this port", + ) + + parser.add_argument( + "--max-batch-size", + type=int, + default=25, + help="""Max batch size for computation. Note if there are not enough + requests in the queue, it will wait for max_wait_ms time. After that, + even if there are not enough requests, it still sends the + available requests in the queue for computation. + """, + ) + + parser.add_argument( + "--max-wait-ms", + type=float, + default=5, + help="""Max time in millisecond to wait to build batches for inference. + If there are not enough requests in the feature queue to build a batch + of max_batch_size, it waits up to this time before fetching available + requests for computation. + """, + ) + + parser.add_argument( + "--feature-extractor-pool-size", + type=int, + default=5, + help="""Number of threads for feature extraction. By default, feature + extraction runs on CPU. 
+ """, + ) + + parser.add_argument( + "--nn-pool-size", + type=int, + default=1, + help="Number of threads for NN computation and decoding.", + ) + + parser.add_argument( + "--max-message-size", + type=int, + default=(1 << 20), + help="""Max message size in bytes. + The max size per message cannot exceed this limit. + """, + ) + + parser.add_argument( + "--max-queue-size", + type=int, + default=32, + help="Max number of messages in the queue for each connection.", + ) + + parser.add_argument( + "--max-active-connections", + type=int, + default=500, + help="""Maximum number of active connections. The server will refuse + to accept new connections once the current number of active connections + equals to this limit. + """, + ) + + parser.add_argument( + "--certificate", + type=str, + help="""Path to the X.509 certificate. You need it only if you want to + use a secure websocket connection, i.e., use wss:// instead of ws://. + You can use sherpa/bin/web/generate-certificate.py + to generate the certificate `cert.pem`. + """, + ) + + parser.add_argument( + "--doc-root", + type=str, + default="./sherpa/bin/web", + help="""Path to the web root""", + ) + + return parser.parse_args() + + +def create_recognizer(args) -> sherpa.OfflineRecognizer: + feat_config = sherpa.FeatureConfig() + + feat_config.fbank_opts.frame_opts.samp_freq = args.sample_rate + feat_config.fbank_opts.mel_opts.num_bins = args.feat_dim + feat_config.fbank_opts.mel_opts.high_freq = -400 + feat_config.fbank_opts.frame_opts.dither = 0 + + feat_config.normalize_samples = args.normalize_samples + feat_config.nemo_normalize = args.nemo_normalize + + ctc_decoder_config = sherpa.OfflineCtcDecoderConfig( + hlg=args.HLG if args.HLG else "", + lm_scale=args.lm_scale, + modified=args.modified, + search_beam=args.search_beam, + output_beam=args.output_beam, + min_active_states=args.min_active_states, + max_active_states=args.max_active_states, + ) + + config = sherpa.OfflineRecognizerConfig( + nn_model=args.nn_model, + tokens=args.tokens, + use_gpu=args.use_gpu, + feat_config=feat_config, + ctc_decoder_config=ctc_decoder_config, + ) + + recognizer = sherpa.OfflineRecognizer(config) + + return recognizer + + +@torch.no_grad() +def main(): + args = get_args() + logging.info(vars(args)) + check_args(args) + + torch.set_num_threads(args.num_threads) + torch.set_num_interop_threads(args.num_threads) + recognizer = create_recognizer(args) + + port = args.port + max_wait_ms = args.max_wait_ms + max_batch_size = args.max_batch_size + feature_extractor_pool_size = args.feature_extractor_pool_size + nn_pool_size = args.nn_pool_size + max_message_size = args.max_message_size + max_queue_size = args.max_queue_size + max_active_connections = args.max_active_connections + certificate = args.certificate + doc_root = args.doc_root + + if certificate and not Path(certificate).is_file(): + raise ValueError(f"{certificate} does not exist") + + if not Path(doc_root).is_dir(): + raise ValueError(f"Directory {doc_root} does not exist") + + offline_server = OfflineServer( + recognizer=recognizer, + max_wait_ms=max_wait_ms, + max_batch_size=max_batch_size, + feature_extractor_pool_size=feature_extractor_pool_size, + nn_pool_size=nn_pool_size, + max_message_size=max_message_size, + max_queue_size=max_queue_size, + max_active_connections=max_active_connections, + certificate=certificate, + doc_root=doc_root, + ) + asyncio.run(offline_server.run(port)) + + +# See https://github.com/pytorch/pytorch/issues/38342 +# and https://github.com/pytorch/pytorch/issues/33354 
+# +# If we don't do this, the delay increases whenever there is +# a new request that changes the actual batch size. +# If you use `py-spy dump --pid --native`, you will +# see a lot of time is spent in re-compiling the torch script model. +torch._C._jit_set_profiling_executor(False) +torch._C._jit_set_profiling_mode(False) +torch._C._set_graph_executor_optimize(False) +""" +// Use the following in C++ +torch::jit::getExecutorMode() = false; +torch::jit::getProfilingMode() = false; +torch::jit::setGraphExecutorOptimize(false); +""" + +if __name__ == "__main__": + log_filename = "log/log-offline-ctc-server" + sherpa.setup_logger(log_filename) + main() diff --git a/sherpa/bin/offline_transducer_asr.py b/sherpa/bin/offline_transducer_asr.py new file mode 100755 index 000000000..21bdaa6b2 --- /dev/null +++ b/sherpa/bin/offline_transducer_asr.py @@ -0,0 +1,499 @@ +#!/usr/bin/env python3 +# Copyright (c) 2023 Xiaomi Corporation + +""" +A standalone script for offline (i.e., non-streaming) speech recognition. + +This file decodes files without the need to start a server and a client. + +Please refer to +https://k2-fsa.github.io/sherpa/cpp/pretrained_models/offline_transducer.html# +for pre-trained models to download. + +See +https://k2-fsa.github.io/sherpa/python/offline_asr/standalone/transducer.html +for detailed usages and also you can find a colab notebook there. + +We use the Zipformer pre-trained model below to demonstrate how to use +this file: + +(1) Download pre-trained models + +cd /path/to/sherpa + +GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/WeijiZhuang/icefall-asr-librispeech-pruned-transducer-stateless8-2022-12-02 +cd icefall-asr-librispeech-pruned-transducer-stateless8-2022-12-02 +git lfs pull --include "exp/cpu_jit-torch-1.10.pt" +git lfs pull --include "data/lang_bpe_500/LG.pt" + +(2) greedy_search + +cd /path/to/sherpa + +./sherpa/bin/offline_transducer_asr.py \ + --nn-model ./icefall-asr-librispeech-pruned-transducer-stateless8-2022-12-02/exp/cpu_jit-torch-1.10.pt \ + --tokens ./icefall-asr-librispeech-pruned-transducer-stateless8-2022-12-02/data/lang_bpe_500/tokens.txt \ + --decoding-method greedy_search \ + --use-gpu false \ + ./icefall-asr-librispeech-pruned-transducer-stateless8-2022-12-02/test_wavs/1089-134686-0001.wav \ + ./icefall-asr-librispeech-pruned-transducer-stateless8-2022-12-02/test_wavs/1221-135766-0001.wav \ + ./icefall-asr-librispeech-pruned-transducer-stateless8-2022-12-02/test_wavs/1221-135766-0002.wav + +(3) modified_beam_search + +cd /path/to/sherpa + +./sherpa/bin/offline_transducer_asr.py \ + --nn-model ./icefall-asr-librispeech-pruned-transducer-stateless8-2022-12-02/exp/cpu_jit-torch-1.10.pt \ + --tokens ./icefall-asr-librispeech-pruned-transducer-stateless8-2022-12-02/data/lang_bpe_500/tokens.txt \ + --decoding-method modified_beam_search \ + --num-active-paths 4 \ + --use-gpu false \ + ./icefall-asr-librispeech-pruned-transducer-stateless8-2022-12-02/test_wavs/1089-134686-0001.wav \ + ./icefall-asr-librispeech-pruned-transducer-stateless8-2022-12-02/test_wavs/1221-135766-0001.wav \ + ./icefall-asr-librispeech-pruned-transducer-stateless8-2022-12-02/test_wavs/1221-135766-0002.wav + +(4) fast_beam_search (without LG) + +cd /path/to/sherpa + +./sherpa/bin/offline_transducer_asr.py \ + --nn-model ./icefall-asr-librispeech-pruned-transducer-stateless8-2022-12-02/exp/cpu_jit-torch-1.10.pt \ + --tokens ./icefall-asr-librispeech-pruned-transducer-stateless8-2022-12-02/data/lang_bpe_500/tokens.txt \ + --decoding-method fast_beam_search \ + 
--max-contexts 8 \ + --max-states 64 \ + --allow-partial true \ + --beam 4 \ + --use-gpu false \ + ./icefall-asr-librispeech-pruned-transducer-stateless8-2022-12-02/test_wavs/1089-134686-0001.wav \ + ./icefall-asr-librispeech-pruned-transducer-stateless8-2022-12-02/test_wavs/1221-135766-0001.wav \ + ./icefall-asr-librispeech-pruned-transducer-stateless8-2022-12-02/test_wavs/1221-135766-0002.wav + +(5) fast_beam_search (with LG) + +cd /path/to/sherpa + +./sherpa/bin/offline_transducer_asr.py \ + --nn-model ./icefall-asr-librispeech-pruned-transducer-stateless8-2022-12-02/exp/cpu_jit-torch-1.10.pt \ + --tokens ./icefall-asr-librispeech-pruned-transducer-stateless8-2022-12-02/data/lang_bpe_500/tokens.txt \ + --decoding-method fast_beam_search \ + --max-contexts 8 \ + --max-states 64 \ + --allow-partial true \ + --beam 4 \ + --LG ./icefall-asr-librispeech-pruned-transducer-stateless8-2022-12-02/data/lang_bpe_500/LG.pt \ + --ngram-lm-scale 0.01 \ + --use-gpu false \ + ./icefall-asr-librispeech-pruned-transducer-stateless8-2022-12-02/test_wavs/1089-134686-0001.wav \ + ./icefall-asr-librispeech-pruned-transducer-stateless8-2022-12-02/test_wavs/1221-135766-0001.wav \ + ./icefall-asr-librispeech-pruned-transducer-stateless8-2022-12-02/test_wavs/1221-135766-0002.wav +""" # noqa +import argparse +import logging +from pathlib import Path +from typing import List + +import torch +import torchaudio +import sentencepiece as spm + +import sherpa +from sherpa import str2bool + + +def get_parser(): + parser = argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter + ) + + add_model_args(parser) + add_decoding_args(parser) + add_resources_args(parser) + + parser.add_argument( + "sound_files", + type=str, + nargs="+", + help="The input sound file(s) to transcribe. " + "Supported formats are those supported by torchaudio.load(). " + "For example, wav and flac are supported. ", + ) + + return parser + + +def add_model_args(parser: argparse.ArgumentParser): + parser.add_argument( + "--nn-model", + type=str, + help="""The torchscript model. Please refer to + https://k2-fsa.github.io/sherpa/cpp/pretrained_models/offline_transducer.html + for a list of pre-trained models to download. + """, + ) + + parser.add_argument( + "--tokens", + type=str, + help="Path to tokens.txt", + ) + + parser.add_argument( + "--sample-rate", + type=int, + default=16000, + help="Sample rate of the data used to train the model. " + "Caution: If your input sound files have a different sampling rate, " + "we will do resampling inside", + ) + + parser.add_argument( + "--feat-dim", + type=int, + default=80, + help="Feature dimension of the model", + ) + + parser.add_argument( + "--use-bbpe", + type=str2bool, + default=False, + help="Whether the model to be used is trained with bbpe", + ) + + +def add_decoding_args(parser: argparse.ArgumentParser): + parser.add_argument( + "--decoding-method", + type=str, + help="""Decoding method to use. Current supported methods are: + - greedy_search + - modified_beam_search + - fast_beam_search + """, + ) + + add_modified_beam_search_args(parser) + add_fast_beam_search_args(parser) + + +def add_modified_beam_search_args(parser: argparse.ArgumentParser): + parser.add_argument( + "--num-active-paths", + type=int, + default=4, + help="""Used only when --decoding-method is modified_beam_search. + It specifies number of active paths to keep during decoding. 
+ """, + ) + + parser.add_argument( + "--bpe-model", + type=str, + default="", + help=""" + Path to bpe.model, it will be used to tokenize contexts biasing phrases. + Used only when --decoding-method=modified_beam_search + """, + ) + + parser.add_argument( + "--modeling-unit", + type=str, + default="char", + help=""" + The type of modeling unit, it will be used to tokenize contexts biasing + phrases. Valid values are bpe, bpe+char, char. + Note: the char here means characters in CJK languages. + Used only when --decoding-method=modified_beam_search + """, + ) + + parser.add_argument( + "--contexts", + type=str, + default="", + help=""" + The context list, it is a string containing some words/phrases separated + with /, for example, 'HELLO WORLD/I LOVE YOU/GO AWAY". + Used only when --decoding-method=modified_beam_search + """, + ) + + parser.add_argument( + "--context-score", + type=float, + default=1.5, + help=""" + The context score of each token for biasing word/phrase. Used only if + --contexts is given. + Used only when --decoding-method=modified_beam_search + """, + ) + + parser.add_argument( + "--temperature", + type=float, + default=1.0, + help="""Used only when --decoding-method is modified_beam_search. + It specifies the softmax temperature. + """, + ) + + +def add_fast_beam_search_args(parser: argparse.ArgumentParser): + parser.add_argument( + "--max-contexts", + type=int, + default=8, + help="Used only when --decoding-method is fast_beam_search", + ) + + parser.add_argument( + "--max-states", + type=int, + default=64, + help="Used only when --decoding-method is fast_beam_search", + ) + + parser.add_argument( + "--allow-partial", + type=str2bool, + default=True, + help="Used only when --decoding-method is fast_beam_search", + ) + + parser.add_argument( + "--LG", + type=str, + default="", + help="""Used only when --decoding-method is fast_beam_search. + If not empty, it points to LG.pt. + """, + ) + + parser.add_argument( + "--ngram-lm-scale", + type=float, + default=0.01, + help=""" + Used only when --decoding_method is fast_beam_search and + --LG is not empty. + """, + ) + + parser.add_argument( + "--beam", + type=float, + default=4, + help="""A floating point value to calculate the cutoff score during beam + search (i.e., `cutoff = max-score - beam`), which is the same as the + `beam` in Kaldi. + Used only when --method is fast_beam_search""", + ) + + +def add_resources_args(parser: argparse.ArgumentParser): + parser.add_argument( + "--use-gpu", + type=str2bool, + default=False, + help="""True to use GPU. It always selects GPU 0. You can use the + environement variable CUDA_VISIBLE_DEVICES to control which GPU + is mapped to GPU 0. + """, + ) + + parser.add_argument( + "--num-threads", + type=int, + default=1, + help="Sets the number of threads used for interop parallelism " + "(e.g. in JIT interpreter) on CPU.", + ) + + +def check_args(args): + if not Path(args.nn_model).is_file(): + raise ValueError(f"{args.nn_model} does not exist") + + if not Path(args.tokens).is_file(): + raise ValueError(f"{args.tokens} does not exist") + + if args.decoding_method not in ( + "greedy_search", + "modified_beam_search", + "fast_beam_search", + ): + raise ValueError(f"Unsupported decoding method {args.decoding_method}") + + if args.contexts.strip() != "": + assert ( + args.decoding_method == "modified_beam_search" + ), "Contextual-biasing only supported in modified_beam_search." 
+ if "bpe" in args.modeling_unit: + assert Path( + args.bpe_model + ).is_file(), f"{args.bpe_model} does not exist" + + if args.decoding_method == "modified_beam_search": + assert args.num_active_paths > 0, args.num_active_paths + assert args.temperature > 0, args.temperature + + if args.decoding_method == "fast_beam_search" and args.LG: + if not Path(args.LG).is_file(): + raise ValueError(f"{args.LG} does not exist") + + assert len(args.sound_files) > 0, args.sound_files + for f in args.sound_files: + if not Path(f).is_file(): + raise ValueError(f"{f} does not exist") + + +def read_sound_files( + filenames: List[str], expected_sample_rate: float +) -> List[torch.Tensor]: + """Read a list of sound files into a list 1-D float32 torch tensors. + Args: + filenames: + A list of sound filenames. + expected_sample_rate: + The expected sample rate of the sound files. + Returns: + Return a list of 1-D float32 torch tensors. + """ + ans = [] + for f in filenames: + wave, sample_rate = torchaudio.load(f) + if sample_rate != expected_sample_rate: + wave = torchaudio.functional.resample( + wave, + orig_freq=sample_rate, + new_freq=expected_sample_rate, + ) + + # We use only the first channel + ans.append(wave[0].contiguous()) + return ans + + +def encode_contexts(args, contexts: List[str]) -> List[List[int]]: + sp = None + if "bpe" in args.modeling_unit: + sp = spm.SentencePieceProcessor() + sp.load(args.bpe_model) + tokens = {} + with open(args.tokens, "r", encoding="utf-8") as f: + for line in f: + toks = line.strip().split() + assert len(toks) == 2, len(toks) + assert toks[0] not in tokens, f"Duplicate token: {toks} " + tokens[toks[0]] = int(toks[1]) + return sherpa.encode_contexts( + modeling_unit=args.modeling_unit, + contexts=contexts, + sp=sp, + tokens_table=tokens, + ) + + +def create_recognizer(args) -> sherpa.OfflineRecognizer: + feat_config = sherpa.FeatureConfig() + + feat_config.fbank_opts.frame_opts.samp_freq = args.sample_rate + feat_config.fbank_opts.mel_opts.num_bins = args.feat_dim + feat_config.fbank_opts.mel_opts.high_freq = -400 + feat_config.fbank_opts.frame_opts.dither = 0 + + fast_beam_search_config = sherpa.FastBeamSearchConfig( + lg=args.LG if args.LG else "", + ngram_lm_scale=args.ngram_lm_scale, + beam=args.beam, + max_states=args.max_states, + max_contexts=args.max_contexts, + allow_partial=args.allow_partial, + ) + + config = sherpa.OfflineRecognizerConfig( + nn_model=args.nn_model, + tokens=args.tokens, + use_gpu=args.use_gpu, + num_active_paths=args.num_active_paths, + context_score=args.context_score, + use_bbpe=args.use_bbpe, + feat_config=feat_config, + decoding_method=args.decoding_method, + fast_beam_search_config=fast_beam_search_config, + temperature=args.temperature, + ) + + recognizer = sherpa.OfflineRecognizer(config) + + return recognizer + + +def main(): + args = get_parser().parse_args() + logging.info(vars(args)) + check_args(args) + + torch.set_num_threads(args.num_threads) + torch.set_num_interop_threads(args.num_threads) + + recognizer = create_recognizer(args) + sample_rate = args.sample_rate + + samples: List[torch.Tensor] = read_sound_files( + args.sound_files, + sample_rate, + ) + + contexts_list = [] + contexts = [ + x.strip().upper() for x in args.contexts.split("/") if x.strip() + ] + if contexts: + print(f"Contexts list: {contexts}") + contexts_list = encode_contexts(args, contexts) + + streams: List[sherpa.OfflineStream] = [] + for s in samples: + if contexts_list: + stream = recognizer.create_stream(contexts_list=contexts_list) + else: + 
stream = recognizer.create_stream() + stream.accept_samples(s) + streams.append(stream) + + recognizer.decode_streams(streams) + for filename, stream in zip(args.sound_files, streams): + print(f"{filename}\n{stream.result}") + + +# See https://github.com/pytorch/pytorch/issues/38342 +# and https://github.com/pytorch/pytorch/issues/33354 +# +# If we don't do this, the delay increases whenever there is +# a new request that changes the actual batch size. +# If you use `py-spy dump --pid --native`, you will +# see a lot of time is spent in re-compiling the torch script model. +torch._C._jit_set_profiling_executor(False) +torch._C._jit_set_profiling_mode(False) +torch._C._set_graph_executor_optimize(False) +""" +// Use the following in C++ +torch::jit::getExecutorMode() = false; +torch::jit::getProfilingMode() = false; +torch::jit::setGraphExecutorOptimize(false); +""" + +if __name__ == "__main__": + torch.manual_seed(20230104) + formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s" # noqa + logging.basicConfig(format=formatter, level=logging.INFO) + + main() +else: + torch.set_num_threads(1) + torch.set_num_interop_threads(1) diff --git a/sherpa/bin/offline_transducer_server.py b/sherpa/bin/offline_transducer_server.py new file mode 100755 index 000000000..a1f0a6b72 --- /dev/null +++ b/sherpa/bin/offline_transducer_server.py @@ -0,0 +1,730 @@ +#!/usr/bin/env python3 +# Copyright 2022-2023 Xiaomi Corp. +""" +A server for transducer-based offline ASR. Offline means you send all +the content of the audio for recognition. + +It supports multiple clients sending at the same time. + +Usage: + ./offline_transducer_server.py --help + + ./offline_transducer_server.py + +Please refer to +https://k2-fsa.github.io/sherpa/cpp/pretrained_models/offline_transducer.html +for pre-trained models to download. + +We use the Zipformer pre-trained model below to demonstrate how to use +this file. You can use other non-streaming transducer models with this file +if you want. + +(1) Download pre-trained models + +cd /path/to/sherpa + +GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/Zengwei/icefall-asr-librispeech-zipformer-2023-05-15 +cd icefall-asr-librispeech-zipformer-2023-05-15 + +git lfs pull --include "exp/jit_script.pt" + +(2) Start the server + +cd /path/to/sherpa + +./sherpa/bin/offline_transducer_server.py \ + --port 6006 \ + --nn-model ./icefall-asr-librispeech-zipformer-2023-05-15/exp/jit_script.pt \ + --tokens ./icefall-asr-librispeech-zipformer-2023-05-15/data/lang_bpe_500/tokens.txt + +(3) Start the client + +python3 ./sherpa/bin/offline_client.py ./icefall-asr-librispeech-zipformer-2023-05-15/test_wavs/1089-134686-0001.wav +""" # noqa + +import argparse +import asyncio +import http +import logging +import socket +import ssl +import sys +import warnings +from concurrent.futures import ThreadPoolExecutor +from pathlib import Path +from typing import Optional, Tuple + +import numpy as np +import torch +import websockets + +import sherpa + + +def add_model_args(parser: argparse.ArgumentParser): + parser.add_argument( + "--nn-model", + type=str, + help="""The torchscript model. Please refer to + https://k2-fsa.github.io/sherpa/cpp/pretrained_models/offline_transducer.html + for a list of pre-trained models to download. + """, + ) + + parser.add_argument( + "--tokens", + type=str, + help="Path to tokens.txt", + ) + + parser.add_argument( + "--sample-rate", + type=int, + default=16000, + help="Sample rate of the data used to train the model. 
", + ) + + parser.add_argument( + "--feat-dim", + type=int, + default=80, + help="Feature dimension of the model", + ) + + parser.add_argument( + "--use-bbpe", + type=sherpa.str2bool, + default=False, + help="Whether the model to be used is trained with bbpe", + ) + + +def add_decoding_args(parser: argparse.ArgumentParser): + parser.add_argument( + "--decoding-method", + type=str, + default="greedy_search", + help="""Decoding method to use. Current supported methods are: + - greedy_search + - modified_beam_search + - fast_beam_search + """, + ) + + add_modified_beam_search_args(parser) + add_fast_beam_search_args(parser) + + +def add_modified_beam_search_args(parser: argparse.ArgumentParser): + parser.add_argument( + "--num-active-paths", + type=int, + default=4, + help="""Used only when --decoding-method is modified_beam_search. + It specifies number of active paths to keep during decoding. + """, + ) + parser.add_argument( + "--temperature", + type=float, + default=1.0, + help="""Used only when --decoding-method is modified_beam_search. + It specifies the softmax temperature. + """, + ) + + +def add_fast_beam_search_args(parser: argparse.ArgumentParser): + parser.add_argument( + "--max-contexts", + type=int, + default=8, + help="Used only when --decoding-method is fast_beam_search", + ) + + parser.add_argument( + "--max-states", + type=int, + default=64, + help="Used only when --decoding-method is fast_beam_search", + ) + + parser.add_argument( + "--allow-partial", + type=sherpa.str2bool, + default=True, + help="Used only when --decoding-method is fast_beam_search", + ) + + parser.add_argument( + "--LG", + type=str, + default="", + help="""Used only when --decoding-method is fast_beam_search. + If not empty, it points to LG.pt. + """, + ) + + parser.add_argument( + "--ngram-lm-scale", + type=float, + default=0.01, + help=""" + Used only when --decoding_method is fast_beam_search and + --LG is not empty. + """, + ) + + parser.add_argument( + "--beam", + type=float, + default=4, + help="""A floating point value to calculate the cutoff score during beam + search (i.e., `cutoff = max-score - beam`), which is the same as the + `beam` in Kaldi. + Used only when --method is fast_beam_search""", + ) + + +def add_resources_args(parser: argparse.ArgumentParser): + parser.add_argument( + "--use-gpu", + type=sherpa.str2bool, + default=False, + help="""True to use GPU. It always selects GPU 0. You can use the + environement variable CUDA_VISIBLE_DEVICES to control which GPU + is mapped to GPU 0. + """, + ) + + parser.add_argument( + "--num-threads", + type=int, + default=1, + help="Sets the number of threads used for interop parallelism " + "(e.g. 
in JIT interpreter) on CPU.", + ) + + +def check_args(args): + if args.use_gpu and not torch.cuda.is_available(): + sys.exit("no CUDA devices available but you set --use-gpu=true") + + if not Path(args.nn_model).is_file(): + raise ValueError(f"{args.nn_model} does not exist") + + if not Path(args.tokens).is_file(): + raise ValueError(f"{args.tokens} does not exist") + + if args.decoding_method not in ( + "greedy_search", + "modified_beam_search", + "fast_beam_search", + ): + raise ValueError(f"Unsupported decoding method {args.decoding_method}") + + if args.decoding_method == "modified_beam_search": + assert args.num_active_paths > 0, args.num_active_paths + assert args.temperature > 0, args.temperature + + if args.decoding_method == "fast_beam_search" and args.LG: + if not Path(args.LG).is_file(): + raise ValueError(f"{args.LG} does not exist") + + +def get_args(): + parser = argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter + ) + + add_model_args(parser) + add_decoding_args(parser) + add_resources_args(parser) + + parser.add_argument( + "--port", + type=int, + default=6006, + help="The server will listen on this port", + ) + + parser.add_argument( + "--max-batch-size", + type=int, + default=25, + help="""Max batch size for computation. Note if there are not enough + requests in the queue, it will wait for max_wait_ms time. After that, + even if there are not enough requests, it still sends the + available requests in the queue for computation. + """, + ) + + parser.add_argument( + "--max-wait-ms", + type=float, + default=5, + help="""Max time in millisecond to wait to build batches for inference. + If there are not enough requests in the feature queue to build a batch + of max_batch_size, it waits up to this time before fetching available + requests for computation. + """, + ) + + parser.add_argument( + "--feature-extractor-pool-size", + type=int, + default=5, + help="""Number of threads for feature extraction. By default, feature + extraction runs on CPU. + """, + ) + + parser.add_argument( + "--nn-pool-size", + type=int, + default=1, + help="Number of threads for NN computation and decoding.", + ) + + parser.add_argument( + "--max-message-size", + type=int, + default=(1 << 20), + help="""Max message size in bytes. + The max size per message cannot exceed this limit. + """, + ) + + parser.add_argument( + "--max-queue-size", + type=int, + default=32, + help="Max number of messages in the queue for each connection.", + ) + + parser.add_argument( + "--max-active-connections", + type=int, + default=500, + help="""Maximum number of active connections. The server will refuse + to accept new connections once the current number of active connections + equals to this limit. + """, + ) + + parser.add_argument( + "--certificate", + type=str, + help="""Path to the X.509 certificate. You need it only if you want to + use a secure websocket connection, i.e., use wss:// instead of ws://. + You can use sherpa/bin/web/generate-certificate.py + to generate the certificate `cert.pem`. 
+ """, + ) + + parser.add_argument( + "--doc-root", + type=str, + default="./sherpa/bin/web", + help="""Path to the web root""", + ) + + return parser.parse_args() + + +class OfflineServer: + def __init__( + self, + recognizer: sherpa.OfflineRecognizer, + max_batch_size: int, + max_wait_ms: float, + feature_extractor_pool_size: int, + nn_pool_size: int, + max_message_size: int, + max_queue_size: int, + max_active_connections: int, + doc_root: str, + certificate: Optional[str] = None, + ): + """ + Args: + recognizer: + An instance of the sherpa.OfflineRecognizer. + max_batch_size: + Max batch size for inference. + max_wait_ms: + Max wait time in milliseconds in order to build a batch of + `max_batch_size`. + feature_extractor_pool_size: + Number of threads to create for the feature extractor thread pool. + nn_pool_size: + Number of threads for the thread pool that is used for NN + computation and decoding. + max_message_size: + Max size in bytes per message. + max_queue_size: + Max number of messages in the queue for each connection. + max_active_connections: + Max number of active connections. Once number of active client + equals to this limit, the server refuses to accept new connections. + doc_root: + Path to the directory where files like index.html for the HTTP + server locate. + certificate: + Optional. If not None, it will use secure websocket. + You can use ./sherpa/bin/web/generate-certificate.py to generate + it (the default generated filename is `cert.pem`). + """ + self.recognizer = recognizer + + self.certificate = certificate + self.http_server = sherpa.HttpServer(doc_root) + + self.nn_pool = ThreadPoolExecutor( + max_workers=nn_pool_size, + thread_name_prefix="nn", + ) + + self.stream_queue = asyncio.Queue() + + self.max_wait_ms = max_wait_ms + self.max_batch_size = max_batch_size + self.max_message_size = max_message_size + self.max_queue_size = max_queue_size + self.max_active_connections = max_active_connections + + self.current_active_connections = 0 + + async def process_request( + self, + path: str, + request_headers: websockets.Headers, + ) -> Optional[Tuple[http.HTTPStatus, websockets.Headers, bytes]]: + if "sec-websocket-key" not in request_headers: + # This is a normal HTTP request + if path == "/": + path = "/index.html" + found, response, mime_type = self.http_server.process_request(path) + if isinstance(response, str): + response = response.encode("utf-8") + + if not found: + status = http.HTTPStatus.NOT_FOUND + else: + status = http.HTTPStatus.OK + header = {"Content-Type": mime_type} + return status, header, response + + if self.current_active_connections < self.max_active_connections: + self.current_active_connections += 1 + return None + + # Refuse new connections + status = http.HTTPStatus.SERVICE_UNAVAILABLE # 503 + header = {"Hint": "The server is overloaded. Please retry later."} + response = b"The server is busy. Please retry later." 
+ + return status, header, response + + async def run(self, port: int): + logging.info("started") + + task = asyncio.create_task(self.stream_consumer_task()) + + if self.certificate: + logging.info(f"Using certificate: {self.certificate}") + ssl_context = ssl.SSLContext(ssl.PROTOCOL_TLS_SERVER) + ssl_context.load_cert_chain(self.certificate) + else: + ssl_context = None + logging.info("No certificate provided") + + async with websockets.serve( + self.handle_connection, + host="", + port=port, + max_size=self.max_message_size, + max_queue=self.max_queue_size, + process_request=self.process_request, + ssl=ssl_context, + ): + ip_list = ["0.0.0.0", "localhost", "127.0.0.1"] + ip_list.append(socket.gethostbyname(socket.gethostname())) + proto = "http://" if ssl_context is None else "https://" + s = "Please visit one of the following addresses:\n\n" + for p in ip_list: + s += " " + proto + p + f":{port}" "\n" + logging.info(s) + + await asyncio.Future() # run forever + await task + + async def recv_audio_samples( + self, + socket: websockets.WebSocketServerProtocol, + ) -> Optional[torch.Tensor]: + """Receives a tensor from the client. + + As the websocket protocol is a message based protocol, not a stream + protocol, we can receive the whole message sent by the client at once. + + The message from the client is a **bytes** buffer. + + The first message can be either "Done" meaning the client won't send + anything in the future or it can be a buffer containing 4 bytes + in **little** endian format, specifying the number of bytes in the audio + file, which will be sent by the client in the subsequent messages. + Since there is a limit in the message size posed by the websocket + protocol, the client may send the audio file in multiple messages if the + audio file is very large. + + The second and remaining messages contain audio samples. + + Args: + socket: + The socket for communicating with the client. + Returns: + Return a 1-D torch.float32 tensor containing the audio samples or + return None indicating the end of utterance. + """ + header = await socket.recv() + if header == "Done": + return None + + assert len(header) == 4, "The first message should contain 4 bytes" + + expected_num_bytes = int.from_bytes(header, "little", signed=True) + + received = [] + num_received_bytes = 0 + async for message in socket: + received.append(message) + num_received_bytes += len(message) + + if num_received_bytes >= expected_num_bytes: + break + + assert num_received_bytes == expected_num_bytes, ( + num_received_bytes, + expected_num_bytes, + ) + + samples = b"".join(received) + + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + # PyTorch warns that the underlying buffer is not writable. + # We ignore it here as we are not going to write it anyway. + if hasattr(torch, "frombuffer"): + # Note: torch.frombuffer is available only in torch>= 1.10 + return torch.frombuffer(samples, dtype=torch.float32) + else: + array = np.frombuffer(samples, dtype=np.float32) + return torch.from_numpy(array) + + async def stream_consumer_task(self): + """This function extracts streams from the queue, batches them up, sends + them to the RNN-T model for computation and decoding. 
+ """ + while True: + if self.stream_queue.empty(): + await asyncio.sleep(self.max_wait_ms / 1000) + continue + + batch = [] + try: + while len(batch) < self.max_batch_size: + item = self.stream_queue.get_nowait() + + batch.append(item) + except asyncio.QueueEmpty: + pass + stream_list = [b[0] for b in batch] + future_list = [b[1] for b in batch] + + loop = asyncio.get_running_loop() + await loop.run_in_executor( + self.nn_pool, + self.recognizer.decode_streams, + stream_list, + ) + + for f in future_list: + self.stream_queue.task_done() + f.set_result(None) + + async def compute_and_decode( + self, + stream: sherpa.OfflineStream, + ) -> None: + """Put the stream into the queue and wait it to be processed by the + consumer task. + + Args: + stream: + The stream to be processed. Note: It is changed in-place. + """ + loop = asyncio.get_running_loop() + future = loop.create_future() + await self.stream_queue.put((stream, future)) + await future + + async def handle_connection( + self, + socket: websockets.WebSocketServerProtocol, + ): + """Receive audio samples from the client, process it, and sends + deocoding result back to the client. + + Args: + socket: + The socket for communicating with the client. + """ + try: + await self.handle_connection_impl(socket) + except websockets.exceptions.ConnectionClosedError: + logging.info(f"{socket.remote_address} disconnected") + finally: + # Decrement so that it can accept new connections + self.current_active_connections -= 1 + + logging.info( + f"Disconnected: {socket.remote_address}. " + f"Number of connections: {self.current_active_connections}/{self.max_active_connections}" # noqa + ) + + async def handle_connection_impl( + self, + socket: websockets.WebSocketServerProtocol, + ): + """Receive audio samples from the client, process it, and send + deocoding result back to the client. + + Args: + socket: + The socket for communicating with the client. + """ + logging.info( + f"Connected: {socket.remote_address}. " + f"Number of connections: {self.current_active_connections}/{self.max_active_connections}" # noqa + ) + + while True: + stream = self.recognizer.create_stream() + samples = await self.recv_audio_samples(socket) + if samples is None: + break + # stream.accept_samples() runs in the main thread + # TODO(fangjun): Use a separate thread/process pool for it + stream.accept_samples(samples) + + await self.compute_and_decode(stream) + result = stream.result.text + logging.info(f"result: {result}") + + if result: + await socket.send(result) + else: + # If result is an empty string, send something to the client. + # Otherwise, socket.send() is a no-op and the client will + # wait for a reply indefinitely. 
+ await socket.send("") + + +def create_recognizer(args) -> sherpa.OfflineRecognizer: + feat_config = sherpa.FeatureConfig() + + feat_config.fbank_opts.frame_opts.samp_freq = args.sample_rate + feat_config.fbank_opts.mel_opts.num_bins = args.feat_dim + feat_config.fbank_opts.mel_opts.high_freq = -400 + feat_config.fbank_opts.frame_opts.dither = 0 + + fast_beam_search_config = sherpa.FastBeamSearchConfig( + lg=args.LG if args.LG else "", + ngram_lm_scale=args.ngram_lm_scale, + beam=args.beam, + max_states=args.max_states, + max_contexts=args.max_contexts, + allow_partial=args.allow_partial, + ) + + config = sherpa.OfflineRecognizerConfig( + nn_model=args.nn_model, + tokens=args.tokens, + use_gpu=args.use_gpu, + num_active_paths=args.num_active_paths, + use_bbpe=args.use_bbpe, + feat_config=feat_config, + decoding_method=args.decoding_method, + fast_beam_search_config=fast_beam_search_config, + temperature=args.temperature + ) + + recognizer = sherpa.OfflineRecognizer(config) + + return recognizer + + +@torch.no_grad() +def main(): + args = get_args() + logging.info(vars(args)) + check_args(args) + + torch.set_num_threads(args.num_threads) + torch.set_num_interop_threads(args.num_threads) + recognizer = create_recognizer(args) + + port = args.port + max_wait_ms = args.max_wait_ms + max_batch_size = args.max_batch_size + feature_extractor_pool_size = args.feature_extractor_pool_size + nn_pool_size = args.nn_pool_size + max_message_size = args.max_message_size + max_queue_size = args.max_queue_size + max_active_connections = args.max_active_connections + certificate = args.certificate + doc_root = args.doc_root + + if certificate and not Path(certificate).is_file(): + raise ValueError(f"{certificate} does not exist") + + if not Path(doc_root).is_dir(): + raise ValueError(f"Directory {doc_root} does not exist") + + offline_server = OfflineServer( + recognizer=recognizer, + max_wait_ms=max_wait_ms, + max_batch_size=max_batch_size, + feature_extractor_pool_size=feature_extractor_pool_size, + nn_pool_size=nn_pool_size, + max_message_size=max_message_size, + max_queue_size=max_queue_size, + max_active_connections=max_active_connections, + certificate=certificate, + doc_root=doc_root, + ) + asyncio.run(offline_server.run(port)) + + +# See https://github.com/pytorch/pytorch/issues/38342 +# and https://github.com/pytorch/pytorch/issues/33354 +# +# If we don't do this, the delay increases whenever there is +# a new request that changes the actual batch size. +# If you use `py-spy dump --pid --native`, you will +# see a lot of time is spent in re-compiling the torch script model. +torch._C._jit_set_profiling_executor(False) +torch._C._jit_set_profiling_mode(False) +torch._C._set_graph_executor_optimize(False) +""" +// Use the following in C++ +torch::jit::getExecutorMode() = false; +torch::jit::getProfilingMode() = false; +torch::jit::setGraphExecutorOptimize(false); +""" + +if __name__ == "__main__": + log_filename = "log/log-offline-transducer-server" + sherpa.setup_logger(log_filename) + main() diff --git a/sherpa/bin/online_transducer_asr.py b/sherpa/bin/online_transducer_asr.py new file mode 100755 index 000000000..36d5d28c6 --- /dev/null +++ b/sherpa/bin/online_transducer_asr.py @@ -0,0 +1,504 @@ +#!/usr/bin/env python3 +# Copyright (c) 2023 Xiaomi Corporation + +""" +A standalone script for online (i.e., streaming) speech recognition. + +This file decodes files without the need to start a server and a client. 
+ +Please refer to +https://k2-fsa.github.io/sherpa/cpp/pretrained_models/online_transducer.html# +for pre-trained models to download. + +See +https://k2-fsa.github.io/sherpa/python/streaming_asr/standalone/transducer.html +for detailed usages. + +The following example demonstrates the usage of this file with a pre-trained +streaming zipformer model for English. + +(1) Download the pre-trained model + +cd /path/to/sherpa + +GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/Zengwei/icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29 + +cd icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29 + +git lfs pull --include "exp/cpu_jit.pt" +git lfs pull --include "data/lang_bpe_500/LG.pt" + +(2) greedy_search + +cd /path/to/sherpa + +python3 ./sherpa/bin/online_transducer_asr.py \ + --decoding-method="greedy_search" \ + --nn-model=./icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/exp/cpu_jit.pt \ + --tokens=./icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/data/lang_bpe_500/tokens.txt \ + ./icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/test_wavs/1089-134686-0001.wav \ + ./icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/test_wavs/1221-135766-0001.wav \ + ./icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/test_wavs/1221-135766-0002.wav + +(3) modified_beam_search + +cd /path/to/sherpa + +python3 ./sherpa/bin/online_transducer_asr.py \ + --decoding-method="modified_beam_search" \ + --nn-model=./icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/exp/cpu_jit.pt \ + --tokens=./icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/data/lang_bpe_500/tokens.txt \ + ./icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/test_wavs/1089-134686-0001.wav \ + ./icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/test_wavs/1221-135766-0001.wav \ + ./icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/test_wavs/1221-135766-0002.wav + +(4) fast_beam_search + +cd /path/to/sherpa + +python3 ./sherpa/bin/online_transducer_asr.py \ + --decoding-method="fast_beam_search" \ + --nn-model=./icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/exp/cpu_jit.pt \ + --tokens=./icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/data/lang_bpe_500/tokens.txt \ + ./icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/test_wavs/1089-134686-0001.wav \ + ./icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/test_wavs/1221-135766-0001.wav \ + ./icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/test_wavs/1221-135766-0002.wav + +(5) fast_beam_search with LG + +cd /path/to/sherpa + +python3 ./sherpa/bin/online_transducer_asr.py \ + --decoding-method="fast_beam_search" \ + --LG=./icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/data/lang_bpe_500/LG.pt \ + --nn-model=./icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/exp/cpu_jit.pt \ + --tokens=./icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/data/lang_bpe_500/tokens.txt \ + ./icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/test_wavs/1089-134686-0001.wav \ + ./icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/test_wavs/1221-135766-0001.wav \ + 
./icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/test_wavs/1221-135766-0002.wav +""" # noqa + +import argparse +import logging +from pathlib import Path +from typing import List + +import torch +import torchaudio +import sentencepiece as spm + +import sherpa +from sherpa import str2bool + + +def get_parser(): + parser = argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter + ) + + add_model_args(parser) + add_decoding_args(parser) + add_resources_args(parser) + + parser.add_argument( + "sound_files", + type=str, + nargs="+", + help="The input sound file(s) to transcribe. " + "Supported formats are those supported by torchaudio.load(). " + "For example, wav and flac are supported. ", + ) + + return parser + + +def add_model_args(parser: argparse.ArgumentParser): + parser.add_argument( + "--nn-model", + type=str, + help="""The torchscript model. Please refer to + https://k2-fsa.github.io/sherpa/cpp/pretrained_models/online_transducer.html + for a list of pre-trained models to download. + """, + ) + + parser.add_argument( + "--tokens", + type=str, + help="Path to tokens.txt", + ) + + parser.add_argument( + "--sample-rate", + type=int, + default=16000, + help="Sample rate of the data used to train the model. " + "Caution: If your input sound files have a different sampling rate, " + "we will do resampling inside", + ) + + parser.add_argument( + "--feat-dim", + type=int, + default=80, + help="Feature dimension of the model", + ) + + parser.add_argument( + "--use-bbpe", + type=str2bool, + default=False, + help="Whether the model to be used is trained with bbpe", + ) + + +def add_decoding_args(parser: argparse.ArgumentParser): + parser.add_argument( + "--decoding-method", + type=str, + default="greedy_search", + help="""Decoding method to use. Current supported methods are: + - greedy_search + - modified_beam_search + - fast_beam_search + """, + ) + + add_modified_beam_search_args(parser) + add_fast_beam_search_args(parser) + + +def add_modified_beam_search_args(parser: argparse.ArgumentParser): + parser.add_argument( + "--num-active-paths", + type=int, + default=4, + help="""Used only when --decoding-method is modified_beam_search. + It specifies number of active paths to keep during decoding. + """, + ) + + parser.add_argument( + "--bpe-model", + type=str, + default="", + help=""" + Path to bpe.model, it will be used to tokenize contexts biasing phrases. + Used only when --decoding-method=modified_beam_search + """, + ) + + parser.add_argument( + "--modeling-unit", + type=str, + default="char", + help=""" + The type of modeling unit, it will be used to tokenize contexts biasing + phrases. Valid values are bpe, bpe+char, char. + Note: the char here means characters in CJK languages. + Used only when --decoding-method=modified_beam_search + """, + ) + + parser.add_argument( + "--contexts", + type=str, + default="", + help=""" + The context list, it is a string containing some words/phrases separated + with /, for example, 'HELLO WORLD/I LOVE YOU/GO AWAY". + Used only when --decoding-method=modified_beam_search + """, + ) + + parser.add_argument( + "--context-score", + type=float, + default=1.5, + help=""" + The context score of each token for biasing word/phrase. Used only if + --contexts is given. + Used only when --decoding-method=modified_beam_search + """, + ) + parser.add_argument( + "--temperature", + type=float, + default=1.0, + help="""Used only when --decoding-method is modified_beam_search. + It specifies the softmax temperature. 
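+ Values greater than 1.0 flatten the output distribution (more diverse hypotheses), while values smaller than 1.0 sharpen it; 1.0 leaves the scores unchanged.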
+ """, + ) + + +def add_fast_beam_search_args(parser: argparse.ArgumentParser): + parser.add_argument( + "--max-contexts", + type=int, + default=8, + help="Used only when --decoding-method is fast_beam_search", + ) + + parser.add_argument( + "--max-states", + type=int, + default=64, + help="Used only when --decoding-method is fast_beam_search", + ) + + parser.add_argument( + "--allow-partial", + type=str2bool, + default=True, + help="Used only when --decoding-method is fast_beam_search", + ) + + parser.add_argument( + "--LG", + type=str, + default="", + help="""Used only when --decoding-method is fast_beam_search. + If not empty, it points to LG.pt. + """, + ) + + parser.add_argument( + "--ngram-lm-scale", + type=float, + default=0.01, + help=""" + Used only when --decoding-method is fast_beam_search and + --LG is not empty. + """, + ) + + parser.add_argument( + "--beam", + type=float, + default=4, + help="""A floating point value to calculate the cutoff score during beam + search (i.e., `cutoff = max-score - beam`), which is the same as the + `beam` in Kaldi. + Used only when --decoding-method is fast_beam_search""", + ) + + +def add_resources_args(parser: argparse.ArgumentParser): + parser.add_argument( + "--use-gpu", + type=str2bool, + default=False, + help="""True to use GPU. It always selects GPU 0. You can use the + environment variable CUDA_VISIBLE_DEVICES to control which GPU + is mapped to GPU 0. + """, + ) + + parser.add_argument( + "--num-threads", + type=int, + default=1, + help="Sets the number of threads used for interop parallelism " + "(e.g. in JIT interpreter) on CPU.", + ) + + +def check_args(args): + if not Path(args.nn_model).is_file(): + raise ValueError(f"{args.nn_model} does not exist") + + if not Path(args.tokens).is_file(): + raise ValueError(f"{args.tokens} does not exist") + + if args.decoding_method not in ( + "greedy_search", + "modified_beam_search", + "fast_beam_search", + ): + raise ValueError(f"Unsupported decoding method {args.decoding_method}") + + if args.contexts.strip() != "": + assert ( + args.decoding_method == "modified_beam_search" + ), "Contextual biasing is only supported with modified_beam_search." + if "bpe" in args.modeling_unit: + assert Path( + args.bpe_model + ).is_file(), f"{args.bpe_model} does not exist" + + if args.decoding_method == "modified_beam_search": + assert args.num_active_paths > 0, args.num_active_paths + assert args.temperature > 0, args.temperature + + if args.decoding_method == "fast_beam_search" and args.LG: + if not Path(args.LG).is_file(): + raise ValueError(f"{args.LG} does not exist") + + assert len(args.sound_files) > 0, args.sound_files + for f in args.sound_files: + if not Path(f).is_file(): + raise ValueError(f"{f} does not exist") + + +def read_sound_files( + filenames: List[str], expected_sample_rate: float +) -> List[torch.Tensor]: + """Read a list of sound files into a list of 1-D float32 torch tensors. + Args: + filenames: + A list of sound filenames. + expected_sample_rate: + The expected sample rate of the sound files. + Returns: + Return a list of 1-D float32 torch tensors.
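+ Note: Only the first channel of a multi-channel file is used, and files whose sample rate differs from expected_sample_rate are resampled before being returned.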
+ """ + ans = [] + for f in filenames: + wave, sample_rate = torchaudio.load(f) + if sample_rate != expected_sample_rate: + wave = torchaudio.functional.resample( + wave, + orig_freq=sample_rate, + new_freq=expected_sample_rate, + ) + + # We use only the first channel + ans.append(wave[0].contiguous()) + return ans + + +def encode_contexts(args, contexts: List[str]) -> List[List[int]]: + sp = None + if "bpe" in args.modeling_unit: + sp = spm.SentencePieceProcessor() + sp.load(args.bpe_model) + tokens = {} + with open(args.tokens, "r", encoding="utf-8") as f: + for line in f: + toks = line.strip().split() + assert len(toks) == 2, len(toks) + assert toks[0] not in tokens, f"Duplicate token: {toks} " + tokens[toks[0]] = int(toks[1]) + return sherpa.encode_contexts( + modeling_unit=args.modeling_unit, + contexts=contexts, + sp=sp, + tokens_table=tokens, + ) + + +def create_recognizer(args) -> sherpa.OnlineRecognizer: + feat_config = sherpa.FeatureConfig() + + feat_config.fbank_opts.frame_opts.samp_freq = args.sample_rate + feat_config.fbank_opts.mel_opts.num_bins = args.feat_dim + feat_config.fbank_opts.mel_opts.high_freq = -400 + feat_config.fbank_opts.frame_opts.dither = 0 + + fast_beam_search_config = sherpa.FastBeamSearchConfig( + lg=args.LG if args.LG else "", + ngram_lm_scale=args.ngram_lm_scale, + beam=args.beam, + max_states=args.max_states, + max_contexts=args.max_contexts, + allow_partial=args.allow_partial, + ) + + config = sherpa.OnlineRecognizerConfig( + nn_model=args.nn_model, + tokens=args.tokens, + use_gpu=args.use_gpu, + num_active_paths=args.num_active_paths, + context_score=args.context_score, + use_bbpe=args.use_bbpe, + feat_config=feat_config, + decoding_method=args.decoding_method, + fast_beam_search_config=fast_beam_search_config, + temperature=args.temperature + ) + + recognizer = sherpa.OnlineRecognizer(config) + + return recognizer + + +def main(): + args = get_parser().parse_args() + logging.info(vars(args)) + check_args(args) + + torch.set_num_threads(args.num_threads) + torch.set_num_interop_threads(args.num_threads) + + recognizer = create_recognizer(args) + sample_rate = args.sample_rate + + samples: List[torch.Tensor] = read_sound_files( + args.sound_files, + sample_rate, + ) + + tail_padding = torch.zeros(int(sample_rate * 0.3), dtype=torch.float32) + + contexts_list = [] + contexts = [ + x.strip().upper() for x in args.contexts.split("/") if x.strip() + ] + if contexts: + print(f"Contexts list: {contexts}") + contexts_list = encode_contexts(args, contexts) + + streams: List[sherpa.OnlineStream] = [] + for s in samples: + if contexts_list: + stream = recognizer.create_stream(contexts_list=contexts_list) + else: + stream = recognizer.create_stream() + stream.accept_waveform(sample_rate, s) + stream.accept_waveform(sample_rate, tail_padding) + stream.input_finished() + streams.append(stream) + + while True: + ready_streams = [] + for s in streams: + if recognizer.is_ready(s): + ready_streams.append(s) + + if len(ready_streams) == 0: + break + + recognizer.decode_streams(ready_streams) + + print("-" * 10) + for filename, s in zip(args.sound_files, streams): + print(f"{filename}\n{recognizer.get_result(s).text}") + print("-" * 10) + + +# See https://github.com/pytorch/pytorch/issues/38342 +# and https://github.com/pytorch/pytorch/issues/33354 +# +# If we don't do this, the delay increases whenever there is +# a new request that changes the actual batch size. 
+# If you use `py-spy dump --pid --native`, you will +# see a lot of time is spent in re-compiling the torch script model. +torch._C._jit_set_profiling_executor(False) +torch._C._jit_set_profiling_mode(False) +torch._C._set_graph_executor_optimize(False) +""" +// Use the following in C++ +torch::jit::getExecutorMode() = false; +torch::jit::getProfilingMode() = false; +torch::jit::setGraphExecutorOptimize(false); +""" + +if __name__ == "__main__": + torch.manual_seed(20230104) + formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s" # noqa + logging.basicConfig(format=formatter, level=logging.INFO) + + main() +else: + torch.set_num_threads(1) + torch.set_num_interop_threads(1) diff --git a/sherpa/bin/pruned_stateless_emformer_rnnt2/beam_search.py b/sherpa/bin/pruned_stateless_emformer_rnnt2/beam_search.py deleted file mode 100644 index d104970e4..000000000 --- a/sherpa/bin/pruned_stateless_emformer_rnnt2/beam_search.py +++ /dev/null @@ -1,426 +0,0 @@ -from typing import List - -import k2 -import torch -from stream import Stream, stack_states, unstack_states - -from sherpa import ( - VALID_FAST_BEAM_SEARCH_METHOD, - Hypotheses, - Hypothesis, - Lexicon, - fast_beam_search_nbest, - fast_beam_search_nbest_LG, - fast_beam_search_one_best, - streaming_greedy_search, - streaming_modified_beam_search, -) - - -class FastBeamSearch: - def __init__( - self, - beam_search_params: dict, - device: torch.device, - ): - """ - Args: - beam_search_params - Dictionary containing all the parameters for beam search. - device: - Device on which the computation will occur - """ - - decoding_method = beam_search_params["decoding_method"] - assert ( - decoding_method in VALID_FAST_BEAM_SEARCH_METHOD - ), f"{decoding_method} is not a valid search method" - - self.decoding_method = decoding_method - self.rnnt_decoding_config = k2.RnntDecodingConfig( - vocab_size=beam_search_params["vocab_size"], - decoder_history_len=beam_search_params["context_size"], - beam=beam_search_params["beam"], - max_states=beam_search_params["max_states"], - max_contexts=beam_search_params["max_contexts"], - ) - if decoding_method == "fast_beam_search_nbest_LG": - lexicon = Lexicon(beam_search_params["lang_dir"]) - self.word_table = lexicon.word_table - lg_filename = beam_search_params["lang_dir"] / "LG.pt" - self.decoding_graph = k2.Fsa.from_dict( - torch.load(lg_filename, map_location=device) - ) - self.decoding_graph.scores *= beam_search_params["ngram_lm_scale"] - else: - self.decoding_graph = k2.trivial_graph( - beam_search_params["vocab_size"] - 1, device - ) - self.device = device - self.context_size = beam_search_params["context_size"] - self.beam_search_params = beam_search_params - - def init_stream(self, stream: Stream): - """ - Attributes to add to each stream - """ - stream.rnnt_decoding_stream = k2.RnntDecodingStream(self.decoding_graph) - stream.hyp = [] - - @torch.no_grad() - def process( - self, - server: "StreamingServer", - stream_list: List[Stream], - ) -> None: - """Run the model on the given stream list and do search with fast_beam_search - method. - Args: - server: - An instance of `StreamingServer`. - stream_list: - A list of streams to be processed. It is changed in-place. - That is, the attribute `states` and `hyp` are - updated in-place. 
- """ - model = server.model - device = model.device - # Note: chunk_length is in frames before subsampling - chunk_length = server.chunk_length - segment_length = server.segment_length - batch_size = len(stream_list) - - state_list, feature_list = [], [] - processed_frames_list, rnnt_decoding_streams_list = [], [] - - rnnt_decoding_config = self.rnnt_decoding_config - for s in stream_list: - rnnt_decoding_streams_list.append(s.rnnt_decoding_stream) - - state_list.append(s.states) - processed_frames_list.append(s.processed_frames) - f = s.features[:chunk_length] - s.features = s.features[segment_length:] - b = torch.cat(f, dim=0) - feature_list.append(b) - - features = torch.stack(feature_list, dim=0).to(device) - states = stack_states(state_list) - - features_length = torch.full( - (batch_size,), - fill_value=features.size(1), - device=device, - dtype=torch.int64, - ) - - processed_frames = torch.tensor(processed_frames_list, device=device) - - ( - encoder_out, - encoder_out_lens, - next_states, - ) = model.encoder_streaming_forward( - features=features, - features_length=features_length, - states=states, - ) - - processed_lens = processed_frames + encoder_out_lens - if self.decoding_method == "fast_beam_search_nbest": - next_hyp_list, next_trailing_blank_frames = fast_beam_search_nbest( - model=model, - encoder_out=encoder_out, - processed_lens=processed_lens, - rnnt_decoding_config=rnnt_decoding_config, - rnnt_decoding_streams_list=rnnt_decoding_streams_list, - num_paths=self.beam_search_params["num_paths"], - nbest_scale=self.beam_search_params["nbest_scale"], - use_double_scores=True, - temperature=self.beam_search_params["temperature"], - ) - elif self.decoding_method == "fast_beam_search_nbest_LG": - ( - next_hyp_list, - next_trailing_blank_frames, - ) = fast_beam_search_nbest_LG( - model=model, - encoder_out=encoder_out, - processed_lens=processed_lens, - rnnt_decoding_config=rnnt_decoding_config, - rnnt_decoding_streams_list=rnnt_decoding_streams_list, - num_paths=self.beam_search_params["num_paths"], - nbest_scale=self.beam_search_params["nbest_scale"], - use_double_scores=True, - temperature=self.beam_search_params["temperature"], - ) - elif self.decoding_method == "fast_beam_search": - ( - next_hyp_list, - next_trailing_blank_frames, - ) = fast_beam_search_one_best( - model=model, - encoder_out=encoder_out, - processed_lens=processed_lens, - rnnt_decoding_config=rnnt_decoding_config, - rnnt_decoding_streams_list=rnnt_decoding_streams_list, - ) - else: - raise NotImplementedError( - f"{self.decoding_method} is not implemented" - ) - - next_state_list = unstack_states(next_states) - for i, s in enumerate(stream_list): - s.states = next_state_list[i] - s.processed_frames += encoder_out_lens[i] - s.hyp = next_hyp_list[i] - s.num_trailing_blank_frames = next_trailing_blank_frames[i] - - def get_texts(self, stream: Stream) -> str: - """ - Return text after decoding - Args: - stream: - Stream to be processed. - """ - if self.decoding_method == "fast_beam_search_nbest_LG": - result = [self.word_table[i] for i in stream.hyp] - result = " ".join(result) - else: - result = self.sp.decode(stream.hyp) - - return result - - -class GreedySearch: - def __init__( - self, - model: "RnntEmformerModel", - beam_search_params: dict, - device: torch.device, - ): - """ - Args: - model: - RNN-T model decoder model - beam_search_params: - Dictionary containing all the parameters for beam search. 
- device: - Device on which the computation will occur - """ - self.device = device - self.beam_search_params = beam_search_params - self.device = device - - decoder_input = torch.tensor( - [ - [self.beam_search_params["blank_id"]] - * self.beam_search_params["context_size"] - ], - device=self.device, - dtype=torch.int64, - ) - - initial_decoder_out = model.decoder_forward(decoder_input) - self.initial_decoder_out = model.forward_decoder_proj( - initial_decoder_out.squeeze(1) - ) - - def init_stream(self, stream: Stream): - """ - Attributes to add to each stream - """ - stream.decoder_out = self.initial_decoder_out - stream.hyp = [ - self.beam_search_params["blank_id"] - ] * self.beam_search_params["context_size"] - - @torch.no_grad() - def process( - self, - server: "StreamingServer", - stream_list: List[Stream], - ) -> None: - """Run the model on the given stream list and do search with greedy_search - method. - Args: - server: - An instance of `StreamingServer`. - stream_list: - A list of streams to be processed. It is changed in-place. - That is, the attribute `states` and `hyp` are - updated in-place. - """ - model = server.model - device = model.device - # Note: chunk_length is in frames before subsampling - chunk_length = server.chunk_length - batch_size = len(stream_list) - segment_length = server.segment_length - - state_list, feature_list = [], [] - decoder_out_list, hyp_list = [], [] - - num_trailing_blank_frames_list = [] - - for s in stream_list: - decoder_out_list.append(s.decoder_out) - hyp_list.append(s.hyp) - - state_list.append(s.states) - - f = s.features[:chunk_length] - s.features = s.features[segment_length:] - b = torch.cat(f, dim=0) - feature_list.append(b) - - num_trailing_blank_frames_list.append(s.num_trailing_blank_frames) - - features = torch.stack(feature_list, dim=0).to(device) - states = stack_states(state_list) - - decoder_out = torch.cat(decoder_out_list, dim=0) - - features_length = torch.full( - (batch_size,), - fill_value=features.size(1), - device=device, - dtype=torch.int64, - ) - - ( - encoder_out, - encoder_out_lens, - next_states, - ) = model.encoder_streaming_forward( - features=features, - features_length=features_length, - states=states, - ) - - # Each stream has the same input number of frames, - # i.e., server.chunk_length. - ( - next_decoder_out, - next_hyp_list, - next_trailing_blank_frames, - ) = streaming_greedy_search( - model=model, - encoder_out=encoder_out, - decoder_out=decoder_out, - hyps=hyp_list, - num_trailing_blank_frames=num_trailing_blank_frames_list, - ) - - next_decoder_out_list = next_decoder_out.split(1) - - next_state_list = unstack_states(next_states) - for i, s in enumerate(stream_list): - s.states = next_state_list[i] - s.processed_frames += encoder_out_lens[i] - s.decoder_out = next_decoder_out_list[i] - s.hyp = next_hyp_list[i] - s.num_trailing_blank_frames = next_trailing_blank_frames[i] - - def get_texts(self, stream: Stream) -> str: - """ - Return text after decoding - Args: - stream: - Stream to be processed. 
- """ - hyp = stream.hyp[self.beam_search_params["context_size"] :] - return self.sp.decode(hyp) - - -class ModifiedBeamSearch: - def __init__(self, beam_search_params: dict): - self.beam_search_params = beam_search_params - - def init_stream(self, stream: Stream): - """ - Attributes to add to each stream - """ - hyp = [self.beam_search_params["blank_id"]] * self.beam_search_params[ - "context_size" - ] - stream.hyps = Hypotheses([Hypothesis(ys=hyp, log_prob=0.0)]) - - @torch.no_grad() - def process( - self, - server: "StreamingServer", - stream_list: List[Stream], - ) -> None: - """Run the model on the given stream list and do modified_beam_search. - Args: - server: - An instance of `StreamingServer`. - stream_list: - A list of streams to be processed. It is changed in-place. - That is, the attribute `states` and `hyps` are - updated in-place. - """ - model = server.model - device = model.device - - segment_length = server.segment_length - chunk_length = server.chunk_length - - batch_size = len(stream_list) - - state_list = [] - hyps_list = [] - feature_list = [] - for s in stream_list: - state_list.append(s.states) - hyps_list.append(s.hyps) - - f = s.features[:chunk_length] - s.features = s.features[segment_length:] - - b = torch.cat(f, dim=0) - feature_list.append(b) - - features = torch.stack(feature_list, dim=0).to(device) - states = stack_states(state_list) - - features_length = torch.full( - (batch_size,), - fill_value=features.size(1), - device=device, - dtype=torch.int64, - ) - - ( - encoder_out, - encoder_out_lens, - next_states, - ) = model.encoder_streaming_forward( - features=features, - features_length=features_length, - states=states, - ) - # Note: There are no paddings for streaming ASR. Each stream - # has the same input number of frames, i.e., server.chunk_length. - next_hyps_list = streaming_modified_beam_search( - model=model, - encoder_out=encoder_out, - hyps=hyps_list, - num_active_paths=self.beam_search_params["num_active_paths"], - ) - - next_state_list = unstack_states(next_states) - for i, s in enumerate(stream_list): - s.states = next_state_list[i] - s.processed_frames += encoder_out_lens[i] - s.hyps = next_hyps_list[i] - trailing_blanks = s.hyps.get_most_probable(True).num_trailing_blanks - s.num_trailing_blank_frames = trailing_blanks - - def get_texts(self, stream: Stream) -> str: - hyp = stream.hyps.get_most_probable(True).ys[ - self.beam_search_params["context_size"] : - ] - return self.sp.decode(hyp) diff --git a/sherpa/bin/pruned_stateless_emformer_rnnt2/stream.py b/sherpa/bin/pruned_stateless_emformer_rnnt2/stream.py deleted file mode 100644 index 5f94c6074..000000000 --- a/sherpa/bin/pruned_stateless_emformer_rnnt2/stream.py +++ /dev/null @@ -1,246 +0,0 @@ -# Copyright 2022 Xiaomi Corp. (authors: Fangjun Kuang) -# -# See LICENSE for clarification regarding multiple authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import math -from typing import List - -import torch -from kaldifeat import FbankOptions, OnlineFbank, OnlineFeature - -import sherpa - - -def unstack_states( - states: List[List[torch.Tensor]], -) -> List[List[List[torch.Tensor]]]: - """Unstack the Emformer state corresponding to a batch of utterances - into a list of states, where the i-th entry is the state for the i-th - utterance in the batch. - - Args: - states: - A list-of-list of tensors. ``len(states)`` equals to number of - layers in the Emformer. ``states[i]`` contains the states for - the i-th layer. ``states[i][k]`` is either a 3-D tensor of shape - ``(T, N, C)`` or a 2-D tensor of shape ``(C, N)`` - Returns: - Return the states for each utterance. ans[i] is the state for the i-th - utterance. Note that the returned state does not contain the batch - dimension. - """ - batch_size = states[0][0].size(1) - num_layers = len(states) - - ans = [None] * batch_size - for i in range(batch_size): - ans[i] = [[] for _ in range(num_layers)] - - for li, layer in enumerate(states): - for s in layer: - s_list = s.unbind(dim=1) - # We will use stack(dim=1) later in stack_states() - for bi, b in enumerate(ans): - b[li].append(s_list[bi]) - return ans - - -def stack_states( - state_list: List[List[List[torch.Tensor]]], -) -> List[List[torch.Tensor]]: - """Stack list of Emformer states that correspond to separate utterances - into a single Emformer state so that it can be used as an input for - Emformer when those utterances are formed into a batch. - - Note: - It is the inverse of :func:`unstack_states`. - - Args: - state_list: - Each element in state_list corresponds to the internal state - of the Emformer model for a single utterance. - Returns: - Return a new state corresponding to a batch of utterances. - See the input argument of :func:`unstack_states` for the meaning - of the returned tensor. - """ - batch_size = len(state_list) - ans = [] - for layer in state_list[0]: - # layer is a list of tensors - if batch_size > 1: - ans.append([[s] for s in layer]) - # Note: We will stack ans[layer][s][] later to get ans[layer][s] - else: - ans.append([s.unsqueeze(1) for s in layer]) - - for b, states in enumerate(state_list[1:], 1): - for li, layer in enumerate(states): - for si, s in enumerate(layer): - ans[li][si].append(s) - if b == batch_size - 1: - ans[li][si] = torch.stack(ans[li][si], dim=1) - # We will use unbind(dim=1) later in unstack_states() - return ans - - -def _create_streaming_feature_extractor() -> OnlineFeature: - """Create a CPU streaming feature extractor. - - At present, we assume it returns a fbank feature extractor with - fixed options. In the future, we will support passing in the options - from outside. - - Returns: - Return a CPU streaming feature extractor. - """ - opts = FbankOptions() - opts.device = "cpu" - opts.frame_opts.dither = 0 - opts.frame_opts.snip_edges = False - opts.frame_opts.samp_freq = 16000 - opts.mel_opts.num_bins = 80 - return OnlineFbank(opts) - - -class Stream(object): - def __init__( - self, - context_size: int, - subsampling_factor: int, - initial_states: List[List[torch.Tensor]], - ) -> None: - """ - Args: - context_size: - Context size of the RNN-T decoder model. - subsampling_factor: - Subsampling factor of the RNN-T encoder model. - initial_states: - The initial states of the Emformer model. Note that the state - does not contain the batch dimension. - """ - self.feature_extractor = _create_streaming_feature_extractor() - # It contains a list of 2-D tensors representing the feature frames. 
- # Each entry is of shape (1, feature_dim) - self.features: List[torch.Tensor] = [] - self.num_fetched_frames = 0 # before subsampling - - self.num_trailing_blank_frames = 0 # after subsampling - - self.states = initial_states - self.processed_frames = 0 # after subsampling - self.context_size = context_size - self.subsampling_factor = subsampling_factor - self.log_eps = math.log(1e-10) - - # whenever an endpoint is detected, it is incremented - self.segment = 0 - - def accept_waveform( - self, - sampling_rate: float, - waveform: torch.Tensor, - ) -> None: - """Feed audio samples to the feature extractor and compute features - if there are enough samples available. - - Caution: - The range of the audio samples should match the one used in the - training. That is, if you use the range [-1, 1] in the training, then - the input audio samples should also be normalized to [-1, 1]. - - Args - sampling_rate: - The sampling rate of the input audio samples. It is used for sanity - check to ensure that the input sampling rate equals to the one - used in the extractor. If they are not equal, then no resampling - will be performed; instead an error will be thrown. - waveform: - A 1-D torch tensor of dtype torch.float32 containing audio samples. - It should be on CPU. - """ - self.feature_extractor.accept_waveform( - sampling_rate=sampling_rate, - waveform=waveform, - ) - self._fetch_frames() - - def input_finished(self) -> None: - """Signal that no more audio samples available and the feature - extractor should flush the buffered samples to compute frames. - """ - self.feature_extractor.input_finished() - self._fetch_frames() - - def _fetch_frames(self) -> None: - """Fetch frames from the feature extractor""" - while self.num_fetched_frames < self.feature_extractor.num_frames_ready: - frame = self.feature_extractor.get_frame(self.num_fetched_frames) - self.features.append(frame) - self.num_fetched_frames += 1 - - def add_tail_paddings(self, n: int = 20) -> None: - """Add some tail paddings so that we have enough context to process - frames at the very end of an utterance. - - Args: - n: - Number of tail padding frames to be added. You can increase it if - it happens that there are many missing tokens for the last word of - an utterance. - """ - tail_padding = torch.full( - (1, self.feature_extractor.opts.mel_opts.num_bins), - fill_value=self.log_eps, - dtype=torch.float32, - ) - - self.features += [tail_padding] * n - - def endpoint_detected( - self, - config: sherpa.OnlineEndpointConfig, - ) -> bool: - """ - Args: - config: - Config for endpointing. - Returns: - Return True if endpoint is detected; return False otherwise. 
- """ - frame_shift_in_seconds = ( - self.feature_extractor.opts.frame_opts.frame_shift_ms / 1000 - ) - - trailing_silence_frames = ( - self.num_trailing_blank_frames * self.subsampling_factor - ) - - num_frames_decoded = self.processed_frames * self.subsampling_factor - - detected = sherpa.endpoint_detected( - config=config, - num_frames_decoded=num_frames_decoded, - trailing_silence_frames=trailing_silence_frames, - frame_shift_in_seconds=frame_shift_in_seconds, - ) - - if detected: - self.num_trailing_blank_frames = 0 - self.processed_frames = 0 - self.segment += 1 - - return detected diff --git a/sherpa/bin/pruned_stateless_emformer_rnnt2/streaming_server.py b/sherpa/bin/pruned_stateless_emformer_rnnt2/streaming_server.py deleted file mode 100755 index e172b70fc..000000000 --- a/sherpa/bin/pruned_stateless_emformer_rnnt2/streaming_server.py +++ /dev/null @@ -1,601 +0,0 @@ -#!/usr/bin/env python3 -# Copyright 2022 Xiaomi Corp. (authors: Fangjun Kuang) -# -# See LICENSE for clarification regarding multiple authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -A server for streaming ASR recognition. By streaming it means the audio samples -are coming in real-time. You don't need to wait until all audio samples are -captured before sending them for recognition. - -It supports multiple clients sending at the same time. - -Usage: - ./streaming_server.py --help - - ./streaming_server.py - -Please refer to -https://k2-fsa.github.io/sherpa/python/streaming_asr/emformer/index.html -for details -""" - -import argparse -import asyncio -import http -import json -import logging -import math -import warnings -from concurrent.futures import ThreadPoolExecutor -from typing import Optional, Tuple - -import numpy as np -import sentencepiece as spm -import torch -import websockets -from beam_search import FastBeamSearch, GreedySearch, ModifiedBeamSearch -from stream import Stream, unstack_states - -from sherpa import ( - OnlineEndpointConfig, - RnntEmformerModel, - add_beam_search_arguments, - add_online_endpoint_arguments, -) - - -def get_args(): - beam_search_parser = add_beam_search_arguments() - online_endpoint_parser = add_online_endpoint_arguments() - parser = argparse.ArgumentParser( - parents=[beam_search_parser, online_endpoint_parser], - formatter_class=argparse.ArgumentDefaultsHelpFormatter, - ) - - parser.add_argument( - "--port", - type=int, - default=6006, - help="The server will listen on this port", - ) - - parser.add_argument( - "--nn-model-filename", - type=str, - required=True, - help="""The torchscript model. You can use - icefall/egs/librispeech/ASR/pruned_transducer_statelessX/export.py \ - --jit=1 - to generate this model. - """, - ) - - parser.add_argument( - "--bpe-model-filename", - type=str, - help="""The BPE model - You can find it in the directory egs/librispeech/ASR/data/lang_bpe_xxx - where xxx is the number of BPE tokens you used to train the model. - Note: You don't need to provide it if you provide `--token-filename`. 
- """, - ) - - parser.add_argument( - "--token-filename", - type=str, - help="""Filename for tokens.txt - For instance, you can find it in the directory - egs/aishell/ASR/data/lang_char/tokens.txt - or - egs/wenetspeech/ASR/data/lang_char/tokens.txt - from icefall - Note: You don't need to provide it if you provide `--bpe-model` - """, - ) - - parser.add_argument( - "--decode-chunk-size", - type=int, - default=8, - help="The chunk size for decoding (in frames after subsampling)", - ) - - parser.add_argument( - "--decode-left-context", - type=int, - default=32, - help="""left context can be seen during decoding - (in frames after subsampling)""", - ) - - parser.add_argument( - "--decode-right-context", - type=int, - default=2, - help="""right context can be seen during decoding - (in frames after subsampling)""", - ) - - parser.add_argument( - "--nn-pool-size", - type=int, - default=1, - help="Number of threads for NN computation and decoding.", - ) - - parser.add_argument( - "--max-batch-size", - type=int, - default=50, - help="""Max batch size for computation. Note if there are not enough - requests in the queue, it will wait for max_wait_ms time. After that, - even if there are not enough requests, it still sends the - available requests in the queue for computation. - """, - ) - - parser.add_argument( - "--max-wait-ms", - type=float, - default=10, - help="""Max time in millisecond to wait to build batches for inference. - If there are not enough requests in the stream queue to build a batch - of max_batch_size, it waits up to this time before fetching available - requests for computation. - """, - ) - - parser.add_argument( - "--max-message-size", - type=int, - default=(1 << 20), - help="""Max message size in bytes. - The max size per message cannot exceed this limit. - """, - ) - - parser.add_argument( - "--max-queue-size", - type=int, - default=32, - help="Max number of messages in the queue for each connection.", - ) - - parser.add_argument( - "--max-active-connections", - type=int, - default=500, - help="""Maximum number of active connections. The server will refuse - to accept new connections once the current number of active connections - equals to this limit. - """, - ) - - return ( - parser.parse_args(), - beam_search_parser.parse_known_args()[0], - online_endpoint_parser.parse_known_args()[0], - ) - - -class StreamingServer(object): - def __init__( - self, - nn_model_filename: str, - bpe_model_filename: str, - nn_pool_size: int, - max_wait_ms: float, - max_batch_size: int, - max_message_size: int, - max_queue_size: int, - max_active_connections: int, - beam_search_params: dict, - online_endpoint_config: OnlineEndpointConfig, - ): - """ - Args: - nn_model_filename: - Path to the torchscript model - bpe_model_filename: - Path to the BPE model - nn_pool_size: - Number of threads for the thread pool that is responsible for - neural network computation and decoding. - max_wait_ms: - Max wait time in milliseconds in order to build a batch of - `batch_size`. - max_batch_size: - Max batch size for inference. - max_message_size: - Max size in bytes per message. - max_queue_size: - Max number of messages in the queue for each connection. - max_active_connections: - Max number of active connections. Once number of active client - equals to this limit, the server refuses to accept new connections. - beam_search_params: - Dictionary containing all the parameters for beam search. - online_endpoint_config: - Config for endpointing. 
- """ - if torch.cuda.is_available(): - device = torch.device("cuda", 0) - else: - device = torch.device("cpu") - logging.info(f"Using device: {device}") - - self.model = RnntEmformerModel(nn_model_filename, device=device) - - # number of frames before subsampling - self.segment_length = self.model.segment_length - - self.right_context_length = self.model.right_context_length - - # We add 3 here since the subsampling method is using - # ((len - 1) // 2 - 1) // 2) - self.chunk_length = self.segment_length + 3 + self.right_context_length - - self.sp = spm.SentencePieceProcessor() - self.sp.load(bpe_model_filename) - - self.context_size = self.model.context_size - self.subsampling_factor = self.model.subsampling_factor - self.blank_id = self.model.blank_id - self.vocab_size = self.model.vocab_size - self.log_eps = math.log(1e-10) - - initial_states = self.model.get_encoder_init_states() - self.initial_states = unstack_states(initial_states)[0] - - # Add these params after loading the Emformer model - beam_search_params["vocab_size"] = self.vocab_size - beam_search_params["context_size"] = self.context_size - beam_search_params["blank_id"] = self.blank_id - - decoding_method = beam_search_params["decoding_method"] - if decoding_method.startswith("fast_beam_search"): - self.beam_search = FastBeamSearch( - beam_search_params=beam_search_params, - device=device, - ) - elif decoding_method == "greedy_search": - self.beam_search = GreedySearch( - self.model, - beam_search_params, - device, - ) - elif decoding_method == "modified_beam_search": - self.beam_search = ModifiedBeamSearch(beam_search_params) - else: - raise ValueError( - f"Decoding method {decoding_method} is not supported." - ) - - self.beam_search.sp = self.sp - self.online_endpoint_config = online_endpoint_config - - self.nn_pool = ThreadPoolExecutor( - max_workers=nn_pool_size, - thread_name_prefix="nn", - ) - - self.stream_queue = asyncio.Queue() - self.max_wait_ms = max_wait_ms - self.max_batch_size = max_batch_size - self.max_message_size = max_message_size - self.max_queue_size = max_queue_size - self.max_active_connections = max_active_connections - - self.current_active_connections = 0 - - async def warmup(self) -> None: - """Do warmup to the torchscript model to decrease the waiting time - of the first request. - - See https://github.com/k2-fsa/sherpa/pull/100 for details - """ - logging.info("Warmup start") - stream = Stream( - context_size=self.context_size, - subsampling_factor=self.subsampling_factor, - initial_states=self.initial_states, - ) - self.beam_search.init_stream(stream) - - samples = torch.rand(16000 * 1, dtype=torch.float32) # 1 second - stream.accept_waveform(sampling_rate=16000, waveform=samples) - - while len(stream.features) > self.chunk_length: - await self.compute_and_decode(stream) - - logging.info("Warmup done") - - async def stream_consumer_task(self): - """This function extracts streams from the queue, batches them up, sends - them to the RNN-T model for computation and decoding. 
- """ - while True: - if self.stream_queue.empty(): - await asyncio.sleep(self.max_wait_ms / 1000) - continue - - batch = [] - try: - while len(batch) < self.max_batch_size: - item = self.stream_queue.get_nowait() - - assert len(item[0].features) >= self.chunk_length, len( - item[0].features - ) - - batch.append(item) - except asyncio.QueueEmpty: - pass - stream_list = [b[0] for b in batch] - future_list = [b[1] for b in batch] - - loop = asyncio.get_running_loop() - await loop.run_in_executor( - self.nn_pool, - self.beam_search.process, - self, - stream_list, - ) - - for f in future_list: - self.stream_queue.task_done() - f.set_result(None) - - async def compute_and_decode( - self, - stream: Stream, - ) -> None: - """Put the stream into the queue and wait it to be processed by the - consumer task. - - Args: - stream: - The stream to be processed. Note: It is changed in-place. - """ - loop = asyncio.get_running_loop() - future = loop.create_future() - await self.stream_queue.put((stream, future)) - await future - - async def process_request( - self, - unused_path: str, - unused_request_headers: websockets.Headers, - ) -> Optional[Tuple[http.HTTPStatus, websockets.Headers, bytes]]: - if self.current_active_connections < self.max_active_connections: - self.current_active_connections += 1 - return None - - # Refuse new connections - status = http.HTTPStatus.SERVICE_UNAVAILABLE # 503 - header = {"Hint": "The server is overloaded. Please retry later."} - response = b"The server is busy. Please retry later." - - return status, header, response - - async def run(self, port: int): - task = asyncio.create_task(self.stream_consumer_task()) - await self.warmup() - - async with websockets.serve( - self.handle_connection, - host="", - port=port, - max_size=self.max_message_size, - max_queue=self.max_queue_size, - process_request=self.process_request, - ): - await asyncio.Future() # run forever - - await task # not reachable - - async def handle_connection( - self, - socket: websockets.WebSocketServerProtocol, - ): - """Receive audio samples from the client, process it, and send - deocoding result back to the client. - - Args: - socket: - The socket for communicating with the client. - """ - try: - await self.handle_connection_impl(socket) - finally: - # Decrement so that it can accept new connections - self.current_active_connections -= 1 - - logging.info( - f"Disconnected: {socket.remote_address}. " - f"Number of connections: {self.current_active_connections}/{self.max_active_connections}" # noqa - ) - - async def handle_connection_impl( - self, - socket: websockets.WebSocketServerProtocol, - ): - """Receive audio samples from the client, process it, and send - deocoding result back to the client. - - Args: - socket: - The socket for communicating with the client. - """ - logging.info( - f"Connected: {socket.remote_address}. " - f"Number of connections: {self.current_active_connections}/{self.max_active_connections}" # noqa - ) - stream = Stream( - context_size=self.context_size, - subsampling_factor=self.subsampling_factor, - initial_states=self.initial_states, - ) - - self.beam_search.init_stream(stream) - - while True: - samples = await self.recv_audio_samples(socket) - if samples is None: - break - - # TODO(fangjun): At present, we assume the sampling rate - # of the received audio samples is always 16000. 
- stream.accept_waveform(sampling_rate=16000, waveform=samples) - - while len(stream.features) > self.chunk_length: - await self.compute_and_decode(stream) - hyp = self.beam_search.get_texts(stream) - - segment = stream.segment - is_final = stream.endpoint_detected(self.online_endpoint_config) - - if is_final: - self.beam_search.init_stream(stream) - - message = { - "segment": segment, - "text": hyp, - "final": is_final, - } - - await socket.send(json.dumps(message)) - - stream.input_finished() - while len(stream.features) > self.chunk_length: - await self.compute_and_decode(stream) - - if len(stream.features) > 0: - n = self.chunk_length - len(stream.features) - stream.add_tail_paddings(n) - await self.compute_and_decode(stream) - stream.features = [] - - hyp = self.beam_search.get_texts(stream) - message = { - "segment": stream.segment, - "text": hyp, - "final": True, # end of connection, always set final to True - } - - await socket.send(json.dumps(message)) - - async def recv_audio_samples( - self, - socket: websockets.WebSocketServerProtocol, - ) -> Optional[torch.Tensor]: - """Receives a tensor from the client. - - Each message contains either a bytes buffer containing audio samples - in 16 kHz or contains b"Done" meaning the end of utterance. - - Args: - socket: - The socket for communicating with the client. - Returns: - Return a 1-D torch.float32 tensor containing the audio samples or - return None. - """ - message = await socket.recv() - if message == b"Done": - return None - - with warnings.catch_warnings(): - warnings.simplefilter("ignore") - # PyTorch warns that the underlying buffer is not writable. - # We ignore it here as we are not going to write it anyway. - if hasattr(torch, "frombuffer"): - # Note: torch.frombuffer is available only in torch>= 1.10 - return torch.frombuffer(message, dtype=torch.float32) - else: - array = np.frombuffer(message, dtype=np.float32) - return torch.from_numpy(array) - - -@torch.no_grad() -def main(): - args, beam_search_parser, online_endpoint_parser = get_args() - - beam_search_params = vars(beam_search_parser) - logging.info(beam_search_params) - - online_endpoint_params = vars(online_endpoint_parser) - logging.info(online_endpoint_params) - - online_endpoint_config = OnlineEndpointConfig.from_args( - online_endpoint_params - ) - - logging.info(vars(args)) - - port = args.port - nn_model_filename = args.nn_model_filename - bpe_model_filename = args.bpe_model_filename - nn_pool_size = args.nn_pool_size - max_batch_size = args.max_batch_size - max_wait_ms = args.max_wait_ms - max_message_size = args.max_message_size - max_queue_size = args.max_queue_size - max_active_connections = args.max_active_connections - - if beam_search_params["decoding_method"] == "modified_beam_search": - assert beam_search_params["num_active_paths"] >= 1, beam_search_params[ - "num_active_paths" - ] - - server = StreamingServer( - nn_model_filename=nn_model_filename, - bpe_model_filename=bpe_model_filename, - nn_pool_size=nn_pool_size, - max_batch_size=max_batch_size, - max_wait_ms=max_wait_ms, - max_message_size=max_message_size, - max_queue_size=max_queue_size, - max_active_connections=max_active_connections, - beam_search_params=beam_search_params, - online_endpoint_config=online_endpoint_config, - ) - asyncio.run(server.run(port)) - - -torch.set_num_threads(1) -torch.set_num_interop_threads(1) - -# See https://github.com/pytorch/pytorch/issues/38342 -# and https://github.com/pytorch/pytorch/issues/33354 -# -# If we don't do this, the delay increases whenever 
there is -# a new request that changes the actual batch size. -# If you use `py-spy dump --pid --native`, you will -# see a lot of time is spent in re-compiling the torch script model. -torch._C._jit_set_profiling_executor(False) -torch._C._jit_set_profiling_mode(False) -torch._C._set_graph_executor_optimize(False) -""" -// Use the following in C++ -torch::jit::getExecutorMode() = false; -torch::jit::getProfilingMode() = false; -torch::jit::setGraphExecutorOptimize(false); -""" - -if __name__ == "__main__": - formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s" # noqa - logging.basicConfig(format=formatter, level=logging.INFO) - main() diff --git a/sherpa/bin/pruned_transducer_statelessX/beam_search.py b/sherpa/bin/pruned_transducer_statelessX/beam_search.py deleted file mode 100644 index fb4b9dbec..000000000 --- a/sherpa/bin/pruned_transducer_statelessX/beam_search.py +++ /dev/null @@ -1,130 +0,0 @@ -# Copyright 2022 Xiaomi Corp. (authors: Fangjun Kuang) -# -# See LICENSE for clarification regarding multiple authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -import math -from typing import List - -import torch -from torch.nn.utils.rnn import pad_sequence - -from sherpa import RnntConformerModel, greedy_search, modified_beam_search - -LOG_EPS = math.log(1e-10) - - -class GreedySearchOffline: - def __init__(self): - pass - - @torch.no_grad() - def process( - self, - model: "RnntConformerModel", - features: List[torch.Tensor], - ) -> List[List[int]]: - """ - Args: - model: - RNN-T model decoder model - - features: - A list of 2-D tensors. Each entry is of shape - (num_frames, feature_dim). - Returns: - Return a list-of-list containing the decoding token IDs. - """ - features_length = torch.tensor( - [f.size(0) for f in features], - dtype=torch.int64, - ) - features = pad_sequence( - features, - batch_first=True, - padding_value=LOG_EPS, - ) - - device = model.device - features = features.to(device) - features_length = features_length.to(device) - - encoder_out, encoder_out_length = model.encoder( - features=features, - features_length=features_length, - ) - - hyp_tokens = greedy_search( - model=model, - encoder_out=encoder_out, - encoder_out_length=encoder_out_length.cpu(), - ) - - return hyp_tokens - - -class ModifiedBeamSearchOffline: - def __init__(self, beam_search_params: dict): - """ - Args: - beam_search_params: - Dictionary containing all the parameters for beam search. - """ - self.beam_search_params = beam_search_params - - @torch.no_grad() - def process( - self, - model: "RnntConformerModel", - features: List[torch.Tensor], - ) -> List[List[int]]: - """Run RNN-T model with the given features and use greedy search - to decode the output of the model. - - Args: - model: - The RNN-T model. - features: - A list of 2-D tensors. Each entry is of shape - (num_frames, feature_dim). - Returns: - Return a list-of-list containing the decoding token IDs. 
- """ - features_length = torch.tensor( - [f.size(0) for f in features], - dtype=torch.int64, - ) - features = pad_sequence( - features, - batch_first=True, - padding_value=LOG_EPS, - ) - - device = model.device - features = features.to(device) - features_length = features_length.to(device) - - encoder_out, encoder_out_length = model.encoder( - features=features, - features_length=features_length, - ) - - hyp_tokens = modified_beam_search( - model=model, - encoder_out=encoder_out, - encoder_out_length=encoder_out_length.cpu(), - num_active_paths=self.beam_search_params["num_active_paths"], - ) - return hyp_tokens diff --git a/sherpa/bin/pruned_transducer_statelessX/offline_asr.py b/sherpa/bin/pruned_transducer_statelessX/offline_asr.py deleted file mode 100755 index d020d32d7..000000000 --- a/sherpa/bin/pruned_transducer_statelessX/offline_asr.py +++ /dev/null @@ -1,406 +0,0 @@ -#!/usr/bin/env python3 -# Copyright 2022 Xiaomi Corp. (authors: Fangjun Kuang) -# -# See LICENSE for clarification regarding multiple authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -A standalone script for offline ASR recognition. - -It loads a torchscript model, decodes the given wav files, and exits. - -Usage: - ./offline_asr.py --help - -For BPE based models (e.g., LibriSpeech): - - ./offline_asr.py \ - --nn-model-filename /path/to/cpu_jit.pt \ - --bpe-model-filename /path/to/bpe.model \ - --decoding-method greedy_search \ - ./foo.wav \ - ./bar.wav \ - ./foobar.wav - -For character based models (e.g., aishell): - - ./offline.py \ - --nn-model-filename /path/to/cpu_jit.pt \ - --token-filename /path/to/lang_char/tokens.txt \ - --decoding-method greedy_search \ - ./foo.wav \ - ./bar.wav \ - ./foobar.wav - -Note: We provide pre-trained models for testing. 
- -(1) Pre-trained model with the LibriSpeech dataset - - sudo apt-get install git-lfs - git lfs install - git clone https://huggingface.co/csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13 - - nn_model_filename=./icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/exp/cpu_jit-torch-1.6.0.pt - bpe_model=./icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/data/lang_bpe_500/bpe.model - - wav1=./icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/test_wavs/1089-134686-0001.wav - wav2=./icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/test_wavs/1221-135766-0001.wav - wav3=./icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/test_wavs/1221-135766-0002.wav - - sherpa/bin/pruned_transducer_statelessX/offline_asr.py \ - --nn-model-filename $nn_model_filename \ - --bpe-model $bpe_model \ - $wav1 \ - $wav2 \ - $wav3 - -(2) Pre-trained model with the aishell dataset - - sudo apt-get install git-lfs - git lfs install - git clone https://huggingface.co/csukuangfj/icefall-aishell-pruned-transducer-stateless3-2022-06-20 - - nn_model_filename=./icefall-aishell-pruned-transducer-stateless3-2022-06-20/exp/cpu_jit-epoch-29-avg-5-torch-1.6.0.pt - token_filename=./icefall-aishell-pruned-transducer-stateless3-2022-06-20/data/lang_char/tokens.txt - - wav1=./icefall-aishell-pruned-transducer-stateless3-2022-06-20/test_wavs/BAC009S0764W0121.wav - wav2=./icefall-aishell-pruned-transducer-stateless3-2022-06-20/test_wavs/BAC009S0764W0122.wav - wav3=./icefall-aishell-pruned-transducer-stateless3-2022-06-20/test_wavs/BAC009S0764W0123.wav - - sherpa/bin/pruned_transducer_statelessX/offline_asr.py \ - --nn-model-filename $nn_model_filename \ - --token-filename $token_filename \ - $wav1 \ - $wav2 \ - $wav3 -""" # noqa -import argparse -import logging -from typing import List, Optional, Union - -import k2 -import kaldifeat -import sentencepiece as spm -import torch -import torchaudio -from beam_search import GreedySearchOffline, ModifiedBeamSearchOffline - -from sherpa import RnntConformerModel, add_beam_search_arguments - - -def get_args(): - beam_search_parser = add_beam_search_arguments() - parser = argparse.ArgumentParser( - parents=[beam_search_parser], - formatter_class=argparse.ArgumentDefaultsHelpFormatter, - ) - - parser.add_argument( - "--nn-model-filename", - type=str, - required=True, - help="""The torchscript model. You can use - icefall/egs/librispeech/ASR/pruned_transducer_statelessX/export.py \ - --jit=1 - to generate this model. - """, - ) - - parser.add_argument( - "--bpe-model-filename", - type=str, - help="""The BPE model - You can find it in the directory egs/librispeech/ASR/data/lang_bpe_xxx - from icefall, - where xxx is the number of BPE tokens you used to train the model. - Note: Use it only when your model is using BPE. You don't need to - provide it if you provide `--token-filename` - """, - ) - - parser.add_argument( - "--token-filename", - type=str, - help="""Filename for tokens.txt - You can find it in the directory - egs/aishell/ASR/data/lang_char/tokens.txt from icefall. - Note: You don't need to provide it if you provide `--bpe-model` - """, - ) - - parser.add_argument( - "--sample-rate", - type=int, - default=16000, - help="The expected sample rate of the input sound files", - ) - - parser.add_argument( - "sound_files", - type=str, - nargs="+", - help="The input sound file(s) to transcribe. " - "Supported formats are those supported by torchaudio.load(). " - "For example, wav and flac are supported. 
" - "The sample rate has to equal to `--sample-rate`.", - ) - - return ( - parser.parse_args(), - beam_search_parser.parse_known_args()[0], - ) - - -def read_sound_files( - filenames: List[str], - expected_sample_rate: int, -) -> List[torch.Tensor]: - """Read a list of sound files into a list 1-D float32 torch tensors. - Args: - filenames: - A list of sound filenames. - expected_sample_rate: - The expected sample rate of the sound files. - Returns: - Return a list of 1-D float32 torch tensors. - """ - ans = [] - for f in filenames: - wave, sample_rate = torchaudio.load(f) - assert sample_rate == expected_sample_rate, ( - f"expected sample rate: {expected_sample_rate}. " - f"Given: {sample_rate}" - ) - # We use only the first channel - ans.append(wave[0]) - return ans - - -class OfflineAsr(object): - def __init__( - self, - nn_model_filename: str, - bpe_model_filename: Optional[str], - token_filename: Optional[str], - num_active_paths: int, - sample_rate: int = 16000, - device: Union[str, torch.device] = "cpu", - beam_search_params: dict = {}, - ): - """ - Args: - nn_model_filename: - Path to the torch script model. - bpe_model_filename: - Path to the BPE model. If it is None, you have to provide - `token_filename`. - token_filename: - Path to tokens.txt. If it is None, you have to provide - `bpe_model_filename`. - num_active_paths: - Used only when decoding_method is modified_beam_search. - It specifies number of active paths for each utterance. Due to - merging paths with identical token sequences, the actual number - may be less than "num_active_paths". - sample_rate: - Expected sample rate of the feature extractor. - device: - The device to use for computation. - beam_search_params: - Dictionary containing all the parameters for beam search. - """ - self.model = RnntConformerModel( - filename=nn_model_filename, - device=device, - optimize_for_inference=False, - ) - - if bpe_model_filename: - self.sp = spm.SentencePieceProcessor() - self.sp.load(bpe_model_filename) - else: - self.token_table = k2.SymbolTable.from_file(token_filename) - - self.feature_extractor = self._build_feature_extractor( - sample_rate=sample_rate, - device=device, - ) - - decoding_method = beam_search_params["decoding_method"] - if decoding_method == "greedy_search": - self.beam_search = GreedySearchOffline() - elif decoding_method == "modified_beam_search": - self.beam_search = ModifiedBeamSearchOffline(beam_search_params) - else: - raise ValueError( - f"Decoding method {decoding_method} is not supported." - ) - - self.device = device - - def _build_feature_extractor( - self, - sample_rate: int = 16000, - device: Union[str, torch.device] = "cpu", - ) -> kaldifeat.OfflineFeature: - """Build a fbank feature extractor for extracting features. - - Args: - sample_rate: - Expected sample rate of the feature extractor. - device: - The device to use for computation. - Returns: - Return a fbank feature extractor. - """ - opts = kaldifeat.FbankOptions() - opts.device = device - opts.frame_opts.dither = 0 - opts.frame_opts.snip_edges = False - opts.frame_opts.samp_freq = sample_rate - opts.mel_opts.num_bins = 80 - - fbank = kaldifeat.Fbank(opts) - - return fbank - - def decode_waves(self, waves: List[torch.Tensor]) -> List[List[str]]: - """ - Args: - waves: - A list of 1-D torch.float32 tensors containing audio samples. - wavs[i] contains audio samples for the i-th utterance. - - Note: - Whether it should be in the range [-32768, 32767] or be normalized - to [-1, 1] depends on which range you used for your training data. 
- For instance, if your training data used [-32768, 32767], - then the given waves have to contain samples in this range. - - All models trained in icefall use the normalized range [-1, 1]. - Returns: - Return a list of decoded results. `ans[i]` contains the decoded - results for `wavs[i]`. - """ - waves = [w.to(self.device) for w in waves] - features = self.feature_extractor(waves) - - tokens = self.beam_search.process(self.model, features) - - if hasattr(self, "sp"): - results = self.sp.decode(tokens) - else: - results = [[self.token_table[i] for i in hyp] for hyp in tokens] - results = ["".join(r) for r in results] - - return results - - -@torch.no_grad() -def main(): - args, beam_search_parser = get_args() - beam_search_params = vars(beam_search_parser) - logging.info(vars(args)) - - nn_model_filename = args.nn_model_filename - bpe_model_filename = args.bpe_model_filename - token_filename = args.token_filename - num_active_paths = args.num_active_paths - sample_rate = args.sample_rate - sound_files = args.sound_files - - decoding_method = beam_search_params["decoding_method"] - assert decoding_method in ( - "greedy_search", - "modified_beam_search", - ), decoding_method - - if decoding_method == "modified_beam_search": - assert num_active_paths >= 1, num_active_paths - - if bpe_model_filename: - assert token_filename is None, ( - "You need to provide either --bpe-model-filename or " - "--token-filename parameter. But not both." - ) - - if token_filename: - assert bpe_model_filename is None, ( - "You need to provide either --bpe-model-filename or " - "--token-filename parameter. But not both." - ) - - assert bpe_model_filename or token_filename, ( - "You need to provide either --bpe-model-filename or " - "--token-filename parameter. But not both." - ) - - device = torch.device("cpu") - if torch.cuda.is_available(): - device = torch.device("cuda", 0) - - logging.info(f"device: {device}") - - offline_asr = OfflineAsr( - nn_model_filename=nn_model_filename, - bpe_model_filename=bpe_model_filename, - token_filename=token_filename, - num_active_paths=num_active_paths, - sample_rate=sample_rate, - device=device, - beam_search_params=beam_search_params, - ) - - waves = read_sound_files( - filenames=sound_files, - expected_sample_rate=sample_rate, - ) - - logging.info("Decoding started.") - - hyps = offline_asr.decode_waves(waves) - - s = "\n" - for filename, hyp in zip(sound_files, hyps): - s += f"{filename}:\n{hyp}\n\n" - logging.info(s) - - logging.info("Decoding done.") - - -torch.set_num_threads(1) -torch.set_num_interop_threads(1) - -# See https://github.com/pytorch/pytorch/issues/38342 -# and https://github.com/pytorch/pytorch/issues/33354 -# -# If we don't do this, the delay increases whenever there is -# a new request that changes the actual batch size. -# If you use `py-spy dump --pid --native`, you will -# see a lot of time is spent in re-compiling the torch script model. 
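# Driving the OfflineAsr class above from Python instead of the command
# line. The file paths are illustrative, and this assumes the script is
# importable as `offline_asr` with a BPE-based model.
import torch
from offline_asr import OfflineAsr, read_sound_files

asr = OfflineAsr(
    nn_model_filename="cpu_jit.pt",
    bpe_model_filename="bpe.model",
    token_filename=None,
    num_active_paths=4,
    sample_rate=16000,
    device=torch.device("cpu"),
    beam_search_params={"decoding_method": "greedy_search"},
)

# read_sound_files() returns normalized float32 waves in [-1, 1], the range
# expected by icefall-trained models.
waves = read_sound_files(["foo.wav"], expected_sample_rate=16000)
print(asr.decode_waves(waves))  # one decoded string per input wave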
-torch._C._jit_set_profiling_executor(False) -torch._C._jit_set_profiling_mode(False) -torch._C._set_graph_executor_optimize(False) -""" -// Use the following in C++ -torch::jit::getExecutorMode() = false; -torch::jit::getProfilingMode() = false; -torch::jit::setGraphExecutorOptimize(false); -""" - -if __name__ == "__main__": - torch.manual_seed(20220609) - formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s" # noqa - logging.basicConfig(format=formatter, level=logging.INFO) - - main() diff --git a/sherpa/bin/pruned_transducer_statelessX/offline_server.py b/sherpa/bin/pruned_transducer_statelessX/offline_server.py deleted file mode 100755 index bd830b4c7..000000000 --- a/sherpa/bin/pruned_transducer_statelessX/offline_server.py +++ /dev/null @@ -1,653 +0,0 @@ -#!/usr/bin/env python3 -# Copyright 2022 Xiaomi Corp. (authors: Fangjun Kuang) -# -# See LICENSE for clarification regarding multiple authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -A server for offline ASR recognition. Offline means you send all the content -of the audio for recognition. It supports multiple clients sending at -the same time. - -Usage: - ./offline_server.py --help - - ./offline_server.py -""" - -import argparse -import asyncio -import http -import logging -import warnings -from concurrent.futures import ThreadPoolExecutor -from typing import List, Optional, Tuple - -import k2 -import kaldifeat -import numpy as np -import sentencepiece as spm -import torch -import websockets -from beam_search import GreedySearchOffline, ModifiedBeamSearchOffline - -from sherpa import RnntConformerModel, add_beam_search_arguments - - -def get_args(): - beam_search_parser = add_beam_search_arguments() - parser = argparse.ArgumentParser( - parents=[beam_search_parser], - formatter_class=argparse.ArgumentDefaultsHelpFormatter, - ) - - parser.add_argument( - "--port", - type=int, - default=6006, - help="The server will listen on this port", - ) - - parser.add_argument( - "--num-device", - type=int, - default=1, - help="""Number of GPU devices to use. Set it to 0 to use CPU - for computation. If positive, then GPUs with ID 0, 1, ..., num_device-1 - will be used for computation. You can use the environment variable - CUDA_VISIBLE_DEVICES to map available GPU devices. - """, - ) - - parser.add_argument( - "--max-batch-size", - type=int, - default=25, - help="""Max batch size for computation. Note if there are not enough - requests in the queue, it will wait for max_wait_ms time. After that, - even if there are not enough requests, it still sends the - available requests in the queue for computation. - """, - ) - - parser.add_argument( - "--max-wait-ms", - type=float, - default=5, - help="""Max time in millisecond to wait to build batches for inference. - If there are not enough requests in the feature queue to build a batch - of max_batch_size, it waits up to this time before fetching available - requests for computation. 
- """, - ) - - parser.add_argument( - "--feature-extractor-pool-size", - type=int, - default=5, - help="""Number of threads for feature extraction. By default, feature - extraction are run on CPU. - """, - ) - - parser.add_argument( - "--nn-pool-size", - type=int, - default=1, - help="""Number of threads for NN computation and decoding. - Note: It should be in general less than or equal to num_device - if num_device is positive. - """, - ) - - parser.add_argument( - "--nn-model-filename", - type=str, - required=True, - help="""The torchscript model. You can use - icefall/egs/librispeech/ASR/pruned_transducer_statelessX/export.py \ - --jit=1 - to generate this model. - """, - ) - - parser.add_argument( - "--bpe-model-filename", - type=str, - help="""The BPE model - You can find it in the directory egs/librispeech/ASR/data/lang_bpe_xxx - from icefall, - where xxx is the number of BPE tokens you used to train the model. - Note: Use it only when your model is using BPE. You don't need to - provide it if you provide `--token-filename` - """, - ) - - parser.add_argument( - "--token-filename", - type=str, - help="""Filename for tokens.txt - You can find it in the directory - egs/aishell/ASR/data/lang_char/tokens.txt from icefall. - Note: You don't need to provide it if you provide `--bpe-model` - """, - ) - - parser.add_argument( - "--max-message-size", - type=int, - default=(1 << 20), - help="""Max message size in bytes. - The max size per message cannot exceed this limit. - """, - ) - - parser.add_argument( - "--max-queue-size", - type=int, - default=32, - help="Max number of messages in the queue for each connection.", - ) - - parser.add_argument( - "--max-active-connections", - type=int, - default=500, - help="""Maximum number of active connections. The server will refuse - to accept new connections once the current number of active connections - equals to this limit. - """, - ) - - return ( - parser.parse_args(), - beam_search_parser.parse_known_args()[0], - ) - - -class OfflineServer: - def __init__( - self, - nn_model_filename: str, - bpe_model_filename: Optional[str], - token_filename: Optional[str], - num_device: int, - batch_size: int, - max_wait_ms: float, - feature_extractor_pool_size: int, - nn_pool_size: int, - max_message_size: int, - max_queue_size: int, - max_active_connections: int, - beam_search_params: dict, - ): - """ - Args: - nn_model_filename: - Path to the torch script model. - bpe_model_filename: - Path to the BPE model. If it is None, you have to provide - `token_filename`. - token_filename: - Path to tokens.txt. If it is None, you have to provide - `bpe_model_filename`. - num_device: - If 0, use CPU for neural network computation and decoding. - If positive, it means the number of GPUs to use for NN computation - and decoding. For each device, there will be a corresponding - torchscript model. We assume available device IDs are - 0, 1, ... , num_device - 1. You can use the environment variable - CUDA_VISIBLE_DEVICES to achieve this. - batch_size: - Max batch size for inference. - max_wait_ms: - Max wait time in milliseconds in order to build a batch of - `batch_size`. - feature_extractor_pool_size: - Number of threads to create for the feature extractor thread pool. - nn_pool_size: - Number of threads for the thread pool that is used for NN - computation and decoding. - max_message_size: - Max size in bytes per message. - max_queue_size: - Max number of messages in the queue for each connection. - max_active_connections: - Max number of active connections. 
Once number of active client - equals to this limit, the server refuses to accept new connections. - beam_search_params: - Dictionary containing all the parameters for beam search. - """ - self.feature_extractor = self._build_feature_extractor() - self.nn_models = self._build_nn_model(nn_model_filename, num_device) - - assert nn_pool_size > 0 - - self.feature_extractor_pool = ThreadPoolExecutor( - max_workers=feature_extractor_pool_size, - thread_name_prefix="feature", - ) - self.nn_pool = ThreadPoolExecutor( - max_workers=nn_pool_size, - thread_name_prefix="nn", - ) - - self.feature_queue = asyncio.Queue() - - if bpe_model_filename: - self.sp = spm.SentencePieceProcessor() - self.sp.load(bpe_model_filename) - else: - self.token_table = k2.SymbolTable.from_file(token_filename) - - self.counter = 0 - - self.max_wait_ms = max_wait_ms - self.batch_size = batch_size - - self.max_message_size = max_message_size - self.max_queue_size = max_queue_size - self.max_active_connections = max_active_connections - - self.current_active_connections = 0 - - decoding_method = beam_search_params["decoding_method"] - if decoding_method == "greedy_search": - self.beam_search = GreedySearchOffline() - elif decoding_method == "modified_beam_search": - self.beam_search = ModifiedBeamSearchOffline(beam_search_params) - else: - raise ValueError( - f"Decoding method {decoding_method} is not supported." - ) - - def _build_feature_extractor(self) -> kaldifeat.OfflineFeature: - """Build a fbank feature extractor for extracting features. - - TODO: - Pass the options as arguments - """ - opts = kaldifeat.FbankOptions() - opts.device = "cpu" # Note: It also supports CUDA, e.g., "cuda:0" - opts.frame_opts.dither = 0 - opts.frame_opts.snip_edges = False - opts.frame_opts.samp_freq = 16000 - opts.mel_opts.num_bins = 80 - - fbank = kaldifeat.Fbank(opts) - - return fbank - - def _build_nn_model( - self, nn_model_filename: str, num_device: int - ) -> List[RnntConformerModel]: - """Build a torch script model for each given device. - - Args: - nn_model_filename: - The path to the torch script model. - num_device: - Number of devices to use for NN computation and decoding. - If it is 0, then only use CPU and it returns a model on CPU. - If it is positive, it create a model for each device and returns - them. - Returns: - Return a list of torch script models. - """ - if num_device < 1: - model = RnntConformerModel( - filename=nn_model_filename, - device="cpu", - optimize_for_inference=False, - ) - return [model] - - ans = [] - for i in range(num_device): - device = torch.device("cuda", i) - model = RnntConformerModel( - filename=nn_model_filename, - device=device, - optimize_for_inference=False, - ) - ans.append(model) - - return ans - - async def warmup(self) -> None: - """Do warmup to the torchscript model to decrease the waiting time - of the first request. 
- - See https://github.com/k2-fsa/sherpa/pull/100 for details - """ - logging.info("Warmup start") - - samples = torch.rand(16000 * 1, dtype=torch.float32) # 1 second - features = await self.compute_features(samples) - await self.compute_and_decode(features) - logging.info("Warmup done") - - async def process_request( - self, - unused_path: str, - unused_request_headers: websockets.Headers, - ) -> Optional[Tuple[http.HTTPStatus, websockets.Headers, bytes]]: - if self.current_active_connections < self.max_active_connections: - self.current_active_connections += 1 - return None - - # Refuse new connections - status = http.HTTPStatus.SERVICE_UNAVAILABLE # 503 - header = {"Hint": "The server is overloaded. Please retry later."} - response = b"The server is busy. Please retry later." - - return status, header, response - - async def run(self, port: int): - logging.info("started") - task = asyncio.create_task(self.feature_consumer_task()) - await self.warmup() - - # If you use multiple GPUs, you can create multiple - # feature consumer tasks. - # asyncio.create_task(self.feature_consumer_task()) - # asyncio.create_task(self.feature_consumer_task()) - async with websockets.serve( - self.handle_connection, - host="", - port=port, - max_size=self.max_message_size, - max_queue=self.max_queue_size, - process_request=self.process_request, - ): - await asyncio.Future() # run forever - await task - - async def recv_audio_samples( - self, - socket: websockets.WebSocketServerProtocol, - ) -> Optional[torch.Tensor]: - """Receives a tensor from the client. - - As the websocket protocol is a message based protocol, not a stream - protocol, we can receive the whole message sent by the client at once. - - The message from the client is a **bytes** buffer. - - The first message can be either b"Done" meaning the client won't send - anything in the future or it can be a buffer containing 8 bytes - in **little** endian format, specifying the number of bytes in the audio - file, which will be sent by the client in the subsequent messages. - Since there is a limit in the message size posed by the websocket - protocol, the client may send the audio file in multiple messages if the - audio file is very large. - - The second and remaining messages contain audio samples. - - Args: - socket: - The socket for communicating with the client. - Returns: - Return a 1-D torch.float32 tensor containing the audio samples or - return None indicating the end of utterance. - """ - header = await socket.recv() - if header == b"Done": - return None - - assert len(header) == 8, "The first message should contain 8 bytes" - - expected_num_bytes = int.from_bytes(header, "little", signed=True) - - received = [] - num_received_bytes = 0 - async for message in socket: - received.append(message) - num_received_bytes += len(message) - - if num_received_bytes >= expected_num_bytes: - break - - assert num_received_bytes == expected_num_bytes, ( - num_received_bytes, - expected_num_bytes, - ) - - samples = b"".join(received) - - with warnings.catch_warnings(): - warnings.simplefilter("ignore") - # PyTorch warns that the underlying buffer is not writable. - # We ignore it here as we are not going to write it anyway. 
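# A client-side sketch of the framing recv_audio_samples() above expects:
# an 8-byte little-endian length header, then the float32 samples (possibly
# split across several messages), then b"Done" when the client is finished.
# The address, wav path, and chunk size are illustrative.
import asyncio

import torchaudio
import websockets


async def decode_file(uri: str = "ws://localhost:6006", wav: str = "foo.wav"):
    wave, _ = torchaudio.load(wav)
    data = wave[0].contiguous().numpy().tobytes()  # float32 samples as bytes

    async with websockets.connect(uri) as ws:
        await ws.send(len(data).to_bytes(8, "little", signed=True))
        chunk = 1 << 19  # stay below the server's --max-message-size
        for start in range(0, len(data), chunk):
            await ws.send(data[start : start + chunk])
        print(await ws.recv())   # decoded text for this utterance
        await ws.send(b"Done")   # no more utterances on this connection


# asyncio.run(decode_file())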
- if hasattr(torch, "frombuffer"): - # Note: torch.frombuffer is available only in torch>= 1.10 - return torch.frombuffer(samples, dtype=torch.float32) - else: - array = np.frombuffer(samples, dtype=np.float32) - return torch.from_numpy(array) - - async def feature_consumer_task(self): - """This function extracts features from the feature_queue, - batches them up, sends them to the RNN-T model for computation - and decoding. - """ - while True: - if self.feature_queue.empty(): - await asyncio.sleep(self.max_wait_ms / 1000) - continue - batch = [] - try: - while len(batch) < self.batch_size: - item = self.feature_queue.get_nowait() - batch.append(item) - except asyncio.QueueEmpty: - pass - - feature_list = [b[0] for b in batch] - - loop = asyncio.get_running_loop() - self.counter = (self.counter + 1) % len(self.nn_models) - model = self.nn_models[self.counter] - - hyp_tokens = await loop.run_in_executor( - self.nn_pool, - self.beam_search.process, - model, - feature_list, - ) - - for i, hyp in enumerate(hyp_tokens): - self.feature_queue.task_done() - future = batch[i][1] - loop.call_soon(future.set_result, hyp) - - async def compute_features(self, samples: torch.Tensor) -> torch.Tensor: - """Compute the fbank features for the given audio samples. - - Args: - samples: - A 1-D torch.float32 tensor containing the audio samples. Its - sampling rate should be the one as expected by the feature - extractor. Also, its range should match the one used in the - training. - Returns: - Return a 2-D tensor of shape (num_frames, feature_dim) containing - the features. - """ - loop = asyncio.get_running_loop() - return await loop.run_in_executor( - self.feature_extractor_pool, - self.feature_extractor, # it releases the GIL - samples, - ) - - async def compute_and_decode( - self, - features: torch.Tensor, - ) -> List[int]: - """Run the RNN-T model on the features and do greedy search. - - Args: - features: - A 2-D tensor of shape (num_frames, feature_dim). - Returns: - Return a list of token IDs containing the decoded results. - """ - loop = asyncio.get_running_loop() - future = loop.create_future() - await self.feature_queue.put((features, future)) - await future - return future.result() - - async def handle_connection( - self, - socket: websockets.WebSocketServerProtocol, - ): - """Receive audio samples from the client, process it, and sends - deocoding result back to the client. - - Args: - socket: - The socket for communicating with the client. - """ - try: - await self.handle_connection_impl(socket) - finally: - # Decrement so that it can accept new connections - self.current_active_connections -= 1 - - logging.info( - f"Disconnected: {socket.remote_address}. " - f"Number of connections: {self.current_active_connections}/{self.max_active_connections}" # noqa - ) - - async def handle_connection_impl( - self, - socket: websockets.WebSocketServerProtocol, - ): - """Receive audio samples from the client, process it, and sends - deocoding result back to the client. - - Args: - socket: - The socket for communicating with the client. - """ - logging.info( - f"Connected: {socket.remote_address}. 
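# Sketch of the multi-GPU scheduling used by feature_consumer_task(): one
# torchscript model is kept per device (see _build_nn_model above) and
# successive batches are handed to the models in round-robin order.
# torch.jit.load stands in for RnntConformerModel; the path is illustrative.
import torch


def build_models(filename: str, num_device: int):
    if num_device < 1:
        return [torch.jit.load(filename, map_location="cpu")]
    return [
        torch.jit.load(filename, map_location=torch.device("cuda", i))
        for i in range(num_device)
    ]


# models = build_models("cpu_jit.pt", num_device=2)
# counter = 0
# for each batch:
#     counter = (counter + 1) % len(models)
#     model = models[counter]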
" - f"Number of connections: {self.current_active_connections}/{self.max_active_connections}" # noqa - ) - - while True: - samples = await self.recv_audio_samples(socket) - if samples is None: - break - features = await self.compute_features(samples) - hyp = await self.compute_and_decode(features) - if hasattr(self, "sp"): - result = self.sp.decode(hyp) - else: - result = [self.token_table[i] for i in hyp] - await socket.send(result) - - -@torch.no_grad() -def main(): - args, beam_search_parser = get_args() - beam_search_params = vars(beam_search_parser) - - logging.info(vars(args)) - - nn_model_filename = args.nn_model_filename - bpe_model_filename = args.bpe_model_filename - token_filename = args.token_filename - port = args.port - num_device = args.num_device - max_wait_ms = args.max_wait_ms - batch_size = args.max_batch_size - feature_extractor_pool_size = args.feature_extractor_pool_size - nn_pool_size = args.nn_pool_size - max_message_size = args.max_message_size - max_queue_size = args.max_queue_size - max_active_connections = args.max_active_connections - - decoding_method = beam_search_params["decoding_method"] - assert decoding_method in ( - "greedy_search", - "modified_beam_search", - ), decoding_method - - if decoding_method == "modified_beam_search": - assert beam_search_params["num_active_paths"] >= 1, beam_search_params[ - "num_active_paths" - ] - - if bpe_model_filename: - assert token_filename is None, ( - "You need to provide either --bpe-model-filename or " - "--token-filename parameter. But not both." - ) - - if token_filename: - assert bpe_model_filename is None, ( - "You need to provide either --bpe-model-filename or " - "--token-filename parameter. But not both." - ) - - assert bpe_model_filename or token_filename, ( - "You need to provide either --bpe-model-filename or " - "--token-filename parameter. But not both." - ) - - offline_server = OfflineServer( - nn_model_filename=nn_model_filename, - bpe_model_filename=bpe_model_filename, - token_filename=token_filename, - num_device=num_device, - max_wait_ms=max_wait_ms, - batch_size=batch_size, - feature_extractor_pool_size=feature_extractor_pool_size, - nn_pool_size=nn_pool_size, - max_message_size=max_message_size, - max_queue_size=max_queue_size, - max_active_connections=max_active_connections, - beam_search_params=beam_search_params, - ) - asyncio.run(offline_server.run(port)) - - -torch.set_num_threads(1) -torch.set_num_interop_threads(1) - -# See https://github.com/pytorch/pytorch/issues/38342 -# and https://github.com/pytorch/pytorch/issues/33354 -# -# If we don't do this, the delay increases whenever there is -# a new request that changes the actual batch size. -# If you use `py-spy dump --pid --native`, you will -# see a lot of time is spent in re-compiling the torch script model. 
-torch._C._jit_set_profiling_executor(False) -torch._C._jit_set_profiling_mode(False) -torch._C._set_graph_executor_optimize(False) -""" -// Use the following in C++ -torch::jit::getExecutorMode() = false; -torch::jit::getProfilingMode() = false; -torch::jit::setGraphExecutorOptimize(false); -""" - -if __name__ == "__main__": - torch.manual_seed(20220519) - formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s" # noqa - logging.basicConfig(format=formatter, level=logging.INFO) - - main() diff --git a/sherpa/bin/pruned_stateless_emformer_rnnt2/streaming_client.py b/sherpa/bin/streaming_client.py similarity index 78% rename from sherpa/bin/pruned_stateless_emformer_rnnt2/streaming_client.py rename to sherpa/bin/streaming_client.py index 1ebc449ff..3f4194b2e 100755 --- a/sherpa/bin/pruned_stateless_emformer_rnnt2/streaming_client.py +++ b/sherpa/bin/streaming_client.py @@ -1,22 +1,8 @@ #!/usr/bin/env python3 -# Copyright 2022 Xiaomi Corp. (authors: Fangjun Kuang) -# -# See LICENSE for clarification regarding multiple authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# Copyright 2022-2023 Xiaomi Corp. """ -A client for streaming ASR recognition. +A client for streaming ASR. Usage: ./streaming_client.py \ @@ -26,6 +12,8 @@ /path/to/bar.wav (Note: You have to first start the server before starting the client) +See ./streaming_server.py +for how to start the server """ import argparse import asyncio @@ -75,12 +63,23 @@ async def receive_results(socket: websockets.WebSocketServerProtocol): async for message in socket: result = json.loads(message) + method = result["method"] segment = result["segment"] is_final = result["final"] text = result["text"] + tokens = result["tokens"] + timestamps = result["timestamps"] if is_final: - ans.append(dict(segment=segment, text=text)) + ans.append( + dict( + method=method, + segment=segment, + text=text, + tokens=tokens, + timestamps=timestamps, + ) + ) logging.info(f"Final result of segment {segment}: {text}") continue @@ -100,7 +99,10 @@ async def run(server_addr: str, server_port: int, test_wav: str): ) as websocket: # noqa logging.info(f"Sending {test_wav}") wave, sample_rate = torchaudio.load(test_wav) - assert sample_rate == 16000, sample_rate + # You have to ensure that sample_rate equals to + # the argument --audio-sample-rate that you used to + # start streaming_server.py + logging.info(f"sample_rate: {sample_rate}") wave = wave.squeeze(0) receive_task = asyncio.create_task(receive_results(websocket)) @@ -117,12 +119,20 @@ async def run(server_addr: str, server_port: int, test_wav: str): start += frame_size - await websocket.send(b"Done") + await websocket.send("Done") decoding_results = await receive_task s = "" for r in decoding_results: + s += f"method: {r['method']}\n" s += f"segment: {r['segment']}\n" s += f"text: {r['text']}\n" + + token_time = [] + for token, time in zip(r["tokens"], r["timestamps"]): + token_time.append((token, time)) + + s += f"timestamps: {r['timestamps']}\n" + s += f"(token, time): {token_time}\n" 
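# The updated streaming_client.py above pairs each decoded token with its
# timestamp. A tiny illustration with made-up values in the shape of the
# server's JSON message:
result = {
    "method": "greedy_search",
    "segment": 0,
    "text": "HELLO WORLD",
    "final": True,
    "tokens": ["▁HE", "LLO", "▁WORLD"],
    "timestamps": [0.32, 0.48, 0.96],
}

token_time = list(zip(result["tokens"], result["timestamps"]))
print(token_time)  # [('▁HE', 0.32), ('LLO', 0.48), ('▁WORLD', 0.96)]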
logging.info(f"{test_wav}\n{s}") diff --git a/sherpa/bin/streaming_pruned_transducer_statelessX/beam_search.py b/sherpa/bin/streaming_pruned_transducer_statelessX/beam_search.py deleted file mode 100644 index 593f8bd0a..000000000 --- a/sherpa/bin/streaming_pruned_transducer_statelessX/beam_search.py +++ /dev/null @@ -1,380 +0,0 @@ -from typing import List - -import k2 -import torch -from stream import Stream - -from sherpa import ( - VALID_FAST_BEAM_SEARCH_METHOD, - Lexicon, - fast_beam_search_nbest, - fast_beam_search_nbest_LG, - fast_beam_search_one_best, - streaming_greedy_search, -) - - -class FastBeamSearch: - def __init__( - self, - beam_search_params: dict, - device: torch.device, - ): - """ - Args: - beam_search_params - Dictionary containing all the parameters for beam search. - device: - Device on which the computation will occur - """ - - decoding_method = beam_search_params["decoding_method"] - assert ( - decoding_method in VALID_FAST_BEAM_SEARCH_METHOD - ), f"{decoding_method} is not a valid search method" - - self.decoding_method = decoding_method - self.rnnt_decoding_config = k2.RnntDecodingConfig( - vocab_size=beam_search_params["vocab_size"], - decoder_history_len=beam_search_params["context_size"], - beam=beam_search_params["beam"], - max_states=beam_search_params["max_states"], - max_contexts=beam_search_params["max_contexts"], - ) - if decoding_method == "fast_beam_search_nbest_LG": - lexicon = Lexicon(beam_search_params["lang_dir"]) - self.word_table = lexicon.word_table - lg_filename = beam_search_params["lang_dir"] / "LG.pt" - self.decoding_graph = k2.Fsa.from_dict( - torch.load(lg_filename, map_location=device) - ) - self.decoding_graph.scores *= beam_search_params["ngram_lm_scale"] - else: - self.decoding_graph = k2.trivial_graph( - beam_search_params["vocab_size"] - 1, device - ) - self.device = device - self.context_size = beam_search_params["context_size"] - self.beam_search_params = beam_search_params - - def init_stream(self, stream: Stream): - """ - Attributes to add to each stream - """ - stream.rnnt_decoding_stream = k2.RnntDecodingStream(self.decoding_graph) - stream.hyp = [] - - @torch.no_grad() - def process( - self, - server: "StreamingServer", - stream_list: List[Stream], - ) -> None: - """Run the model on the given stream list and do search with fast_beam_search - method. - Args: - server: - An instance of `StreamingServer`. - stream_list: - A list of streams to be processed. It is changed in-place. - That is, the attribute `states` and `hyp` are - updated in-place. 
- """ - model = server.model - # Note: chunk_length is in frames before subsampling - chunk_length = server.chunk_length - subsampling_factor = server.subsampling_factor - # Note: chunk_size, left_context and right_context are in frames - # after subsampling - chunk_size = server.decode_chunk_size - left_context = server.decode_left_context - right_context = server.decode_right_context - - batch_size = len(stream_list) - - state_list = [] - feature_list = [] - processed_frames_list = [] - - rnnt_decoding_streams_list = [] - rnnt_decoding_config = self.rnnt_decoding_config - for s in stream_list: - rnnt_decoding_streams_list.append(s.rnnt_decoding_stream) - state_list.append(s.states) - processed_frames_list.append(s.processed_frames) - f = s.features[:chunk_length] - s.features = s.features[chunk_size * subsampling_factor :] - b = torch.cat(f, dim=0) - feature_list.append(b) - - features = torch.stack(feature_list, dim=0).to(self.device) - - states = [ - torch.stack([x[0] for x in state_list], dim=2), - torch.stack([x[1] for x in state_list], dim=2), - ] - - features_length = torch.full( - (batch_size,), - fill_value=features.size(1), - device=self.device, - dtype=torch.int64, - ) - - processed_frames = torch.tensor( - processed_frames_list, device=self.device - ) - - ( - encoder_out, - encoder_out_lens, - next_states, - ) = model.encoder_streaming_forward( - features=features, - features_length=features_length, - states=states, - processed_frames=processed_frames, - left_context=left_context, - right_context=right_context, - ) - - processed_lens = processed_frames + encoder_out_lens - if self.decoding_method == "fast_beam_search_nbest": - next_hyp_list, next_trailing_blank_frames = fast_beam_search_nbest( - model=model, - encoder_out=encoder_out, - processed_lens=processed_lens, - rnnt_decoding_config=rnnt_decoding_config, - rnnt_decoding_streams_list=rnnt_decoding_streams_list, - num_paths=self.beam_search_params["num_paths"], - nbest_scale=self.beam_search_params["nbest_scale"], - use_double_scores=True, - temperature=self.beam_search_params["temperature"], - ) - elif self.decoding_method == "fast_beam_search_nbest_LG": - ( - next_hyp_list, - next_trailing_blank_frames, - ) = fast_beam_search_nbest_LG( - model=model, - encoder_out=encoder_out, - processed_lens=processed_lens, - rnnt_decoding_config=rnnt_decoding_config, - rnnt_decoding_streams_list=rnnt_decoding_streams_list, - num_paths=self.beam_search_params["num_paths"], - nbest_scale=self.beam_search_params["nbest_scale"], - use_double_scores=True, - temperature=self.beam_search_params["temperature"], - ) - elif self.decoding_method == "fast_beam_search": - ( - next_hyp_list, - next_trailing_blank_frames, - ) = fast_beam_search_one_best( - model=model, - encoder_out=encoder_out, - processed_lens=processed_lens, - rnnt_decoding_config=rnnt_decoding_config, - rnnt_decoding_streams_list=rnnt_decoding_streams_list, - ) - else: - raise NotImplementedError( - f"{self.decoding_method} is not implemented" - ) - - next_state_list = [ - torch.unbind(next_states[0], dim=2), - torch.unbind(next_states[1], dim=2), - ] - - for i, s in enumerate(stream_list): - s.states = [next_state_list[0][i], next_state_list[1][i]] - s.processed_frames += encoder_out_lens[i] - s.hyp = next_hyp_list[i] - s.num_trailing_blank_frames = next_trailing_blank_frames[i] - - def get_texts(self, stream: Stream) -> str: - """ - Return text after decoding - Args: - stream: - Stream to be processed. 
- """ - if self.decoding_method == "fast_beam_search_nbest_LG": - result = [self.word_table[i] for i in stream.hyp] - result = " ".join(result) - elif hasattr(self, "sp"): - result = self.sp.decode(stream.hyp) - else: - result = [self.token_table[i] for i in stream.hyp] - result = "".join(result).replace("▁", " ") - - return result - - -class GreedySearch: - def __init__( - self, - model: "RnntConformerModel", - beam_search_params: dict, - device: torch.device, - ): - """ - Args: - model: - RNN-T model decoder model - beam_search_params: - Dictionary containing all the parameters for beam search. - device: - Device on which the computation will occur - """ - - self.beam_search_params = beam_search_params - self.device = device - - decoder_input = torch.tensor( - [ - [self.beam_search_params["blank_id"]] - * self.beam_search_params["context_size"] - ], - device=self.device, - dtype=torch.int64, - ) - initial_decoder_out = model.decoder_forward(decoder_input) - self.initial_decoder_out = model.forward_decoder_proj( - initial_decoder_out.squeeze(1) - ) - - def init_stream(self, stream: Stream): - """ - Attributes to add to each stream - """ - stream.decoder_out = self.initial_decoder_out - stream.hyp = [ - self.beam_search_params["blank_id"] - ] * self.beam_search_params["context_size"] - - @torch.no_grad() - def process( - self, - server: "StreamingServer", - stream_list: List[Stream], - ) -> None: - """Run the model on the given stream list and do search with greedy_search - method. - Args: - server: - An instance of `StreamingServer`. - stream_list: - A list of streams to be processed. It is changed in-place. - That is, the attribute `states` and `hyp` are - updated in-place. - """ - model = server.model - device = model.device - # Note: chunk_length is in frames before subsampling - chunk_length = server.chunk_length - subsampling_factor = server.subsampling_factor - # Note: chunk_size, left_context and right_context are in frames - # after subsampling - chunk_size = server.decode_chunk_size - left_context = server.decode_left_context - right_context = server.decode_right_context - - batch_size = len(stream_list) - - state_list, feature_list, processed_frames_list = [], [], [] - decoder_out_list, hyp_list = [], [] - - num_trailing_blank_frames_list = [] - - for s in stream_list: - decoder_out_list.append(s.decoder_out) - hyp_list.append(s.hyp) - state_list.append(s.states) - processed_frames_list.append(s.processed_frames) - f = s.features[:chunk_length] - s.features = s.features[chunk_size * subsampling_factor :] - b = torch.cat(f, dim=0) - feature_list.append(b) - - num_trailing_blank_frames_list.append(s.num_trailing_blank_frames) - - features = torch.stack(feature_list, dim=0).to(device) - - states = [ - torch.stack([x[0] for x in state_list], dim=2), - torch.stack([x[1] for x in state_list], dim=2), - ] - - decoder_out = torch.cat(decoder_out_list, dim=0) - - features_length = torch.full( - (batch_size,), - fill_value=features.size(1), - device=device, - dtype=torch.int64, - ) - - processed_frames = torch.tensor(processed_frames_list, device=device) - - ( - encoder_out, - encoder_out_lens, - next_states, - ) = model.encoder_streaming_forward( - features=features, - features_length=features_length, - states=states, - processed_frames=processed_frames, - left_context=left_context, - right_context=right_context, - ) - - # Note: It does not return the next_encoder_out_len since - # there are no paddings for streaming ASR. 
Each stream - # has the same input number of frames, i.e., server.chunk_length. - ( - next_decoder_out, - next_hyp_list, - next_trailing_blank_frames, - ) = streaming_greedy_search( - model=model, - encoder_out=encoder_out, - decoder_out=decoder_out, - hyps=hyp_list, - num_trailing_blank_frames=num_trailing_blank_frames_list, - ) - - next_state_list = [ - torch.unbind(next_states[0], dim=2), - torch.unbind(next_states[1], dim=2), - ] - next_decoder_out_list = next_decoder_out.split(1) - - for i, s in enumerate(stream_list): - s.states = [next_state_list[0][i], next_state_list[1][i]] - s.processed_frames += encoder_out_lens[i] - s.decoder_out = next_decoder_out_list[i] - s.hyp = next_hyp_list[i] - s.num_trailing_blank_frames = next_trailing_blank_frames[i] - - def get_texts(self, stream: Stream) -> str: - """ - Return text after decoding - Args: - stream: - Stream to be processed. - """ - if hasattr(self, "sp"): - result = self.sp.decode( - stream.hyp[self.beam_search_params["context_size"] :] - ) # noqa - else: - result = [ - self.token_table[i] - for i in stream.hyp[self.beam_search_params["context_size"] :] - ] # noqa - result = "".join(result).replace("▁", " ") - - return result diff --git a/sherpa/bin/streaming_pruned_transducer_statelessX/stream.py b/sherpa/bin/streaming_pruned_transducer_statelessX/stream.py deleted file mode 100644 index 56a7d3a8c..000000000 --- a/sherpa/bin/streaming_pruned_transducer_statelessX/stream.py +++ /dev/null @@ -1,173 +0,0 @@ -# Copyright 2022 Xiaomi Corp. (authors: Fangjun Kuang, -# Wei Kang) -# See LICENSE for clarification regarding multiple authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import math -from typing import List - -import torch -from kaldifeat import FbankOptions, OnlineFbank, OnlineFeature - -import sherpa - - -def _create_streaming_feature_extractor() -> OnlineFeature: - """Create a CPU streaming feature extractor. - - At present, we assume it returns a fbank feature extractor with - fixed options. In the future, we will support passing in the options - from outside. - - Returns: - Return a CPU streaming feature extractor. - """ - opts = FbankOptions() - opts.device = "cpu" - opts.frame_opts.dither = 0 - opts.frame_opts.snip_edges = False - opts.frame_opts.samp_freq = 16000 - opts.mel_opts.num_bins = 80 - return OnlineFbank(opts) - - -class Stream(object): - def __init__( - self, - context_size: int, - subsampling_factor: int, - initial_states: List[torch.Tensor], - ) -> None: - """ - Args: - context_size: - Context size of the RNN-T decoder model. - subsampling_factor: - Subsampling factor of the RNN-T encoder model. - initial_states: - The initial states of the Conformer model. Note that the state - does not contain the batch dimension. - """ - self.feature_extractor = _create_streaming_feature_extractor() - # It contains a list of 2-D tensors representing the feature frames. 
- # Each entry is of shape (1, feature_dim) - self.features: List[torch.Tensor] = [] - self.num_fetched_frames = 0 # before subsampling - - self.states = initial_states - - self.processed_frames = 0 # after subsampling - self.num_trailing_blank_frames = 0 # after subsampling - self.context_size = context_size - self.subsampling_factor = subsampling_factor - self.log_eps = math.log(1e-10) - - # whenever an endpoint is detected, it is incremented - self.segment = 0 - - def accept_waveform( - self, - sampling_rate: float, - waveform: torch.Tensor, - ) -> None: - """Feed audio samples to the feature extractor and compute features - if there are enough samples available. - - Caution: - The range of the audio samples should match the one used in the - training. That is, if you use the range [-1, 1] in the training, then - the input audio samples should also be normalized to [-1, 1]. - - Args - sampling_rate: - The sampling rate of the input audio samples. It is used for sanity - check to ensure that the input sampling rate equals to the one - used in the extractor. If they are not equal, then no resampling - will be performed; instead an error will be thrown. - waveform: - A 1-D torch tensor of dtype torch.float32 containing audio samples. - It should be on CPU. - """ - self.feature_extractor.accept_waveform( - sampling_rate=sampling_rate, - waveform=waveform, - ) - self._fetch_frames() - - def input_finished(self) -> None: - """Signal that no more audio samples available and the feature - extractor should flush the buffered samples to compute frames. - """ - self.feature_extractor.input_finished() - self._fetch_frames() - - def _fetch_frames(self) -> None: - """Fetch frames from the feature extractor""" - while self.num_fetched_frames < self.feature_extractor.num_frames_ready: - frame = self.feature_extractor.get_frame(self.num_fetched_frames) - self.features.append(frame) - self.num_fetched_frames += 1 - - def add_tail_paddings(self, n: int = 20) -> None: - """Add some tail paddings so that we have enough context to process - frames at the very end of an utterance. - - Args: - n: - Number of tail padding frames to be added. You can increase it if - it happens that there are many missing tokens for the last word of - an utterance. - """ - tail_padding = torch.full( - (1, self.feature_extractor.opts.mel_opts.num_bins), - fill_value=self.log_eps, - dtype=torch.float32, - ) - - self.features += [tail_padding] * n - - def endpoint_detected( - self, - config: sherpa.OnlineEndpointConfig, - ) -> bool: - """ - Args: - config: - Config for endpointing. - Returns: - Return True if endpoint is detected; return False otherwise. 
- """ - frame_shift_in_seconds = ( - self.feature_extractor.opts.frame_opts.frame_shift_ms / 1000 - ) - - trailing_silence_frames = ( - self.num_trailing_blank_frames * self.subsampling_factor - ) - - num_frames_decoded = self.processed_frames * self.subsampling_factor - - detected = sherpa.endpoint_detected( - config=config, - num_frames_decoded=num_frames_decoded, - trailing_silence_frames=trailing_silence_frames, - frame_shift_in_seconds=frame_shift_in_seconds, - ) - - if detected: - self.processed_frames = 0 - self.num_trailing_blank_frames = 0 - self.segment += 1 - - return detected diff --git a/sherpa/bin/streaming_pruned_transducer_statelessX/streaming_client.py b/sherpa/bin/streaming_pruned_transducer_statelessX/streaming_client.py deleted file mode 120000 index 7bb0611a8..000000000 --- a/sherpa/bin/streaming_pruned_transducer_statelessX/streaming_client.py +++ /dev/null @@ -1 +0,0 @@ -../pruned_stateless_emformer_rnnt2/streaming_client.py \ No newline at end of file diff --git a/sherpa/bin/streaming_pruned_transducer_statelessX/streaming_server.py b/sherpa/bin/streaming_pruned_transducer_statelessX/streaming_server.py deleted file mode 100755 index 94adaa943..000000000 --- a/sherpa/bin/streaming_pruned_transducer_statelessX/streaming_server.py +++ /dev/null @@ -1,646 +0,0 @@ -#!/usr/bin/env python3 -# Copyright 2022 Xiaomi Corp. (authors: Fangjun Kuang, -# Wei Kang) -# -# See LICENSE for clarification regarding multiple authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -A server for streaming ASR recognition. By streaming it means the audio samples -are coming in real-time. You don't need to wait until all audio samples are -captured before sending them for recognition. - -It supports multiple clients sending at the same time. - -Usage: - ./streaming_server.py --help - - ./streaming_server.py - -Please refer to -https://k2-fsa.github.io/sherpa/python/streaming_asr/conformer/index.html -for details -""" - -import argparse -import asyncio -import http -import json -import logging -import math -import warnings -from concurrent.futures import ThreadPoolExecutor -from typing import Optional, Tuple - -import k2 -import numpy as np -import sentencepiece as spm -import torch -import websockets -from beam_search import FastBeamSearch, GreedySearch -from stream import Stream - -from sherpa import ( - OnlineEndpointConfig, - RnntConformerModel, - add_beam_search_arguments, - add_online_endpoint_arguments, -) - - -def get_args(): - beam_search_parser = add_beam_search_arguments() - online_endpoint_parser = add_online_endpoint_arguments() - parser = argparse.ArgumentParser( - parents=[beam_search_parser, online_endpoint_parser], - formatter_class=argparse.ArgumentDefaultsHelpFormatter, - ) - - parser.add_argument( - "--port", - type=int, - default=6006, - help="The server will listen on this port", - ) - - parser.add_argument( - "--nn-model-filename", - type=str, - required=True, - help="""The torchscript model. 
You can use - icefall/egs/librispeech/ASR/pruned_transducer_statelessX/export.py \ - --jit=1 - to generate this model. - """, - ) - - parser.add_argument( - "--bpe-model-filename", - type=str, - help="""The BPE model - You can find it in the directory egs/librispeech/ASR/data/lang_bpe_xxx - where xxx is the number of BPE tokens you used to train the model. - Note: You don't need to provide it if you provide `--token-filename`. - """, - ) - - parser.add_argument( - "--token-filename", - type=str, - help="""Filename for tokens.txt - For instance, you can find it in the directory - egs/aishell/ASR/data/lang_char/tokens.txt - or - egs/wenetspeech/ASR/data/lang_char/tokens.txt - from icefall - Note: You don't need to provide it if you provide `--bpe-model` - """, - ) - - parser.add_argument( - "--decode-chunk-size", - type=int, - default=8, - help="The chunk size for decoding (in frames after subsampling)", - ) - - parser.add_argument( - "--decode-left-context", - type=int, - default=32, - help="""left context can be seen during decoding - (in frames after subsampling)""", - ) - - parser.add_argument( - "--decode-right-context", - type=int, - default=2, - help="""right context can be seen during decoding - (in frames after subsampling)""", - ) - - parser.add_argument( - "--nn-pool-size", - type=int, - default=1, - help="Number of threads for NN computation and decoding.", - ) - - parser.add_argument( - "--max-batch-size", - type=int, - default=50, - help="""Max batch size for computation. Note if there are not enough - requests in the queue, it will wait for max_wait_ms time. After that, - even if there are not enough requests, it still sends the - available requests in the queue for computation. - """, - ) - - parser.add_argument( - "--max-wait-ms", - type=float, - default=10, - help="""Max time in millisecond to wait to build batches for inference. - If there are not enough requests in the stream queue to build a batch - of max_batch_size, it waits up to this time before fetching available - requests for computation. - """, - ) - - parser.add_argument( - "--max-message-size", - type=int, - default=(1 << 20), - help="""Max message size in bytes. - The max size per message cannot exceed this limit. - """, - ) - - parser.add_argument( - "--max-queue-size", - type=int, - default=32, - help="Max number of messages in the queue for each connection.", - ) - - parser.add_argument( - "--max-active-connections", - type=int, - default=500, - help="""Maximum number of active connections. The server will refuse - to accept new connections once the current number of active connections - equals to this limit. - """, - ) - - return ( - parser.parse_args(), - beam_search_parser.parse_known_args()[0], - online_endpoint_parser.parse_known_args()[0], - ) - - -class StreamingServer(object): - def __init__( - self, - nn_model_filename: str, - bpe_model_filename: str, - token_filename: str, - decode_chunk_size: int, - decode_left_context: int, - decode_right_context: int, - nn_pool_size: int, - max_wait_ms: float, - max_batch_size: int, - max_message_size: int, - max_queue_size: int, - max_active_connections: int, - beam_search_params: dict, - online_endpoint_config: OnlineEndpointConfig, - ): - """ - Args: - nn_model_filename: - Path to the torchscript model - bpe_model_filename: - Path to the BPE model. If it is None, you have to provide - `token_filename`. - token_filename: - Path to tokens.txt. If it is None, you have to provide - `bpe_model_filename`. 
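The --token-filename option above expects the usual icefall symbol table: one token and its integer id per line, loaded with k2.SymbolTable.from_file(). The exact contents depend on the model; an illustrative excerpt for a BPE-based English model might look like:

# illustrative tokens.txt excerpt (actual ids and pieces vary per model)
# <blk> 0
# <sos/eos> 1
# <unk> 2
# ▁THE 3
# ▁A 4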
- decode_chunk_size: - The chunk size for decoding (in frames after subsampling) - decode_left_context: - The left context for decoding (in frames after subsampling) - decode_right_context: - The right context for decoding (in frames after subsampling) - beam: - The beam for fast_beam_search decoding. - max_states: - The max_states for fast_beam_search decoding. - max_contexts: - The max_contexts for fast_beam_search decoding. - decoding_method: - The decoding method to use, can be either greedy_search - or fast_beam_search. - nn_pool_size: - Number of threads for the thread pool that is responsible for - neural network computation and decoding. - max_wait_ms: - Max wait time in milliseconds in order to build a batch of - `batch_size`. - max_batch_size: - Max batch size for inference. - max_message_size: - Max size in bytes per message. - max_queue_size: - Max number of messages in the queue for each connection. - max_active_connections: - Max number of active connections. Once number of active client - equals to this limit, the server refuses to accept new connections. - beam_search_params: - Dictionary containing all the parameters for beam search. - online_endpoint_config: - Config for endpointing. - """ - if torch.cuda.is_available(): - device = torch.device("cuda", 0) - else: - device = torch.device("cpu") - logging.info(f"Using device: {device}") - - self.model = RnntConformerModel(nn_model_filename, device=device) - - self.subsampling_factor = self.model.subsampling_factor - - # Note: The following 3 attributes are in frames after subsampling. - self.decode_chunk_size = decode_chunk_size - self.decode_left_context = decode_left_context - self.decode_right_context = decode_right_context - - # We add 3 here since the subsampling method is using - # ((len - 1) // 2 - 1) // 2) - # We plus 2 here because we will cut off one frame on each side - # of encoder_embed output (in conformer.py) to avoid a training - # and decoding mismatch by seeing padding values. - # Note: chunk_length is in frames before subsampling. - self.chunk_length = ( - self.decode_chunk_size + 2 + self.decode_right_context - ) * self.subsampling_factor + 3 - - if bpe_model_filename: - self.sp = spm.SentencePieceProcessor() - self.sp.load(bpe_model_filename) - else: - self.token_table = k2.SymbolTable.from_file(token_filename) - - self.context_size = self.model.context_size - self.subsampling_factor = self.model.subsampling_factor - self.blank_id = self.model.blank_id - self.vocab_size = self.model.vocab_size - self.log_eps = math.log(1e-10) - - self.initial_states = self.model.get_encoder_init_states( - self.decode_left_context - ) - - # Add these params after loading the RNN-T model - beam_search_params["vocab_size"] = self.vocab_size - beam_search_params["context_size"] = self.context_size - beam_search_params["blank_id"] = self.blank_id - - decoding_method = beam_search_params["decoding_method"] - if decoding_method.startswith("fast_beam_search"): - self.beam_search = FastBeamSearch( - beam_search_params=beam_search_params, - device=device, - ) - elif decoding_method == "greedy_search": - self.beam_search = GreedySearch( - self.model, - beam_search_params, - device, - ) - else: - raise ValueError( - f"Decoding method {decoding_method} is not supported." 
- ) - - if bpe_model_filename: - self.beam_search.sp = spm.SentencePieceProcessor() - self.beam_search.sp.load(bpe_model_filename) - else: - self.beam_search.token_table = k2.SymbolTable.from_file( - token_filename - ) - - self.online_endpoint_config = online_endpoint_config - - self.nn_pool = ThreadPoolExecutor( - max_workers=nn_pool_size, - thread_name_prefix="nn", - ) - - self.stream_queue = asyncio.Queue() - self.max_wait_ms = max_wait_ms - self.max_batch_size = max_batch_size - self.max_message_size = max_message_size - self.max_queue_size = max_queue_size - self.max_active_connections = max_active_connections - - self.current_active_connections = 0 - - async def warmup(self) -> None: - """Do warmup to the torchscript model to decrease the waiting time - of the first request. - - See https://github.com/k2-fsa/sherpa/pull/100 for details - """ - logging.info("Warmup start") - stream = Stream( - context_size=self.context_size, - subsampling_factor=self.subsampling_factor, - initial_states=self.initial_states, - ) - self.beam_search.init_stream(stream) - - samples = torch.rand(16000 * 1, dtype=torch.float32) # 1 second - stream.accept_waveform(sampling_rate=16000, waveform=samples) - - while len(stream.features) > self.chunk_length: - await self.compute_and_decode(stream) - - logging.info("Warmup done") - - async def stream_consumer_task(self): - """This function extracts streams from the queue, batches them up, sends - them to the RNN-T model for computation and decoding. - """ - while True: - if self.stream_queue.empty(): - await asyncio.sleep(self.max_wait_ms / 1000) - continue - - batch = [] - try: - while len(batch) < self.max_batch_size: - item = self.stream_queue.get_nowait() - - assert len(item[0].features) >= self.chunk_length, len( - item[0].features - ) - - batch.append(item) - except asyncio.QueueEmpty: - pass - stream_list = [b[0] for b in batch] - future_list = [b[1] for b in batch] - - loop = asyncio.get_running_loop() - await loop.run_in_executor( - self.nn_pool, - self.beam_search.process, - self, - stream_list, - ) - - for f in future_list: - self.stream_queue.task_done() - f.set_result(None) - - async def compute_and_decode( - self, - stream: Stream, - ) -> None: - """Put the stream into the queue and wait it to be processed by the - consumer task. - - Args: - stream: - The stream to be processed. Note: It is changed in-place. - """ - loop = asyncio.get_running_loop() - future = loop.create_future() - await self.stream_queue.put((stream, future)) - await future - - async def process_request( - self, - unused_path: str, - unused_request_headers: websockets.Headers, - ) -> Optional[Tuple[http.HTTPStatus, websockets.Headers, bytes]]: - if self.current_active_connections < self.max_active_connections: - self.current_active_connections += 1 - return None - - # Refuse new connections - status = http.HTTPStatus.SERVICE_UNAVAILABLE # 503 - header = {"Hint": "The server is overloaded. Please retry later."} - response = b"The server is busy. Please retry later." 
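The chunk_length formula in the removed __init__ above determines how many feature frames must be buffered before one decoding step. A quick worked example with the old defaults (--decode-chunk-size=8, --decode-right-context=2) and the factor-4 subsampling typical of these Conformer models:

decode_chunk_size = 8       # frames after subsampling
decode_right_context = 2    # frames after subsampling
subsampling_factor = 4      # from ((len - 1) // 2 - 1) // 2
chunk_length = (decode_chunk_size + 2 + decode_right_context) * subsampling_factor + 3
assert chunk_length == 51   # feature frames (before subsampling) per decoding step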
- - return status, header, response - - async def run(self, port: int): - task = asyncio.create_task(self.stream_consumer_task()) - await self.warmup() - - async with websockets.serve( - self.handle_connection, - host="", - port=port, - max_size=self.max_message_size, - max_queue=self.max_queue_size, - process_request=self.process_request, - ): - await asyncio.Future() # run forever - - await task # not reachable - - async def handle_connection( - self, - socket: websockets.WebSocketServerProtocol, - ): - """Receive audio samples from the client, process it, and send - decoding result back to the client. - - Args: - socket: - The socket for communicating with the client. - """ - try: - await self.handle_connection_impl(socket) - finally: - # Decrement so that it can accept new connections - self.current_active_connections -= 1 - - logging.info( - f"Disconnected: {socket.remote_address}. " - f"Number of connections: {self.current_active_connections}/{self.max_active_connections}" # noqa - ) - - async def handle_connection_impl( - self, - socket: websockets.WebSocketServerProtocol, - ): - """Receive audio samples from the client, process it, and send - deocoding result back to the client. - - Args: - socket: - The socket for communicating with the client. - """ - logging.info( - f"Connected: {socket.remote_address}. " - f"Number of connections: {self.current_active_connections}/{self.max_active_connections}" # noqa - ) - stream = Stream( - context_size=self.context_size, - subsampling_factor=self.subsampling_factor, - initial_states=self.initial_states, - ) - - self.beam_search.init_stream(stream) - - while True: - samples = await self.recv_audio_samples(socket) - if samples is None: - break - - # TODO(fangjun): At present, we assume the sampling rate - # of the received audio samples is always 16000. - stream.accept_waveform(sampling_rate=16000, waveform=samples) - - while len(stream.features) > self.chunk_length: - await self.compute_and_decode(stream) - hyp = self.beam_search.get_texts(stream) - - segment = stream.segment - is_final = stream.endpoint_detected(self.online_endpoint_config) - if is_final: - self.beam_search.init_stream(stream) - - message = { - "segment": segment, - "text": hyp, - "final": is_final, - } - - await socket.send(json.dumps(message)) - - stream.input_finished() - while len(stream.features) > self.chunk_length: - await self.compute_and_decode(stream) - - if len(stream.features) > 0: - n = self.chunk_length - len(stream.features) - stream.add_tail_paddings(n) - await self.compute_and_decode(stream) - stream.features = [] - - hyp = self.beam_search.get_texts(stream) - - message = { - "segment": stream.segment, - "text": hyp, - "final": True, # end of connection, always set final to True - } - - await socket.send(json.dumps(message)) - - async def recv_audio_samples( - self, - socket: websockets.WebSocketServerProtocol, - ) -> Optional[torch.Tensor]: - """Receives a tensor from the client. - - Each message contains either a bytes buffer containing audio samples - in 16 kHz or contains b"Done" meaning the end of utterance. - - Args: - socket: - The socket for communicating with the client. - Returns: - Return a 1-D torch.float32 tensor containing the audio samples or - return None. - """ - message = await socket.recv() - if message == b"Done": - return None - - with warnings.catch_warnings(): - warnings.simplefilter("ignore") - # PyTorch warns that the underlying buffer is not writable. - # We ignore it here as we are not going to write it anyway. 
- if hasattr(torch, "frombuffer"): - # Note: torch.frombuffer is available only in torch>= 1.10 - return torch.frombuffer(message, dtype=torch.float32) - else: - array = np.frombuffer(message, dtype=np.float32) - return torch.from_numpy(array) - - -@torch.no_grad() -def main(): - args, beam_search_parser, online_endpoint_parser = get_args() - - beam_search_params = vars(beam_search_parser) - logging.info(beam_search_params) - - online_endpoint_params = vars(online_endpoint_parser) - logging.info(online_endpoint_params) - - online_endpoint_config = OnlineEndpointConfig.from_args( - online_endpoint_params - ) - - logging.info(vars(args)) - - port = args.port - nn_model_filename = args.nn_model_filename - bpe_model_filename = args.bpe_model_filename - token_filename = args.token_filename - decode_chunk_size = args.decode_chunk_size - decode_left_context = args.decode_left_context - decode_right_context = args.decode_right_context - nn_pool_size = args.nn_pool_size - max_batch_size = args.max_batch_size - max_wait_ms = args.max_wait_ms - max_message_size = args.max_message_size - max_queue_size = args.max_queue_size - max_active_connections = args.max_active_connections - - server = StreamingServer( - nn_model_filename=nn_model_filename, - bpe_model_filename=bpe_model_filename, - token_filename=token_filename, - decode_chunk_size=decode_chunk_size, - decode_left_context=decode_left_context, - decode_right_context=decode_right_context, - nn_pool_size=nn_pool_size, - max_batch_size=max_batch_size, - max_wait_ms=max_wait_ms, - max_message_size=max_message_size, - max_queue_size=max_queue_size, - max_active_connections=max_active_connections, - beam_search_params=beam_search_params, - online_endpoint_config=online_endpoint_config, - ) - asyncio.run(server.run(port)) - - -torch.set_num_threads(1) -torch.set_num_interop_threads(1) - -# See https://github.com/pytorch/pytorch/issues/38342 -# and https://github.com/pytorch/pytorch/issues/33354 -# -# If we don't do this, the delay increases whenever there is -# a new request that changes the actual batch size. -# If you use `py-spy dump --pid --native`, you will -# see a lot of time is spent in re-compiling the torch script model. -torch._C._jit_set_profiling_executor(False) -torch._C._jit_set_profiling_mode(False) -torch._C._set_graph_executor_optimize(False) -""" -// Use the following in C++ -torch::jit::getExecutorMode() = false; -torch::jit::getProfilingMode() = false; -torch::jit::setGraphExecutorOptimize(false); -""" - -if __name__ == "__main__": - formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s" # noqa - logging.basicConfig(format=formatter, level=logging.INFO) - main() diff --git a/sherpa/bin/streaming_server.py b/sherpa/bin/streaming_server.py new file mode 100755 index 000000000..b13b8801f --- /dev/null +++ b/sherpa/bin/streaming_server.py @@ -0,0 +1,870 @@ +#!/usr/bin/env python3 +# Copyright 2022-2023 Xiaomi Corp. +# +""" +A server for streaming ASR. By streaming it means the audio samples +are coming in real-time. You don't need to wait until all audio samples are +captured before sending them for recognition. + +It supports multiple clients sending at the same time. + +Usage: + ./streaming_server.py --help + +The following example demonstrates the usage of this file with a pre-trained +streaming zipformer model for English. You can also use other streaming +models from icefall for testing. + +See below for the usage of an example client after starting the server. 
+ +(1) Download the pre-trained model + +cd /path/to/sherpa + +GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/Zengwei/icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29 + +cd icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29 + +git lfs pull --include "exp/cpu_jit.pt" +git lfs pull --include "data/lang_bpe_500/LG.pt" + +(2) greedy_search + +cd /path/to/sherpa + +python3 ./sherpa/bin/streaming_server.py \ + --port=6006 + --nn-model=./icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/exp/cpu_jit.pt \ + --tokens=./icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/data/lang_bpe_500/tokens.txt \ + +(3) modified_beam_search + +cd /path/to/sherpa + +python3 ./sherpa/bin/streaming_server.py \ + --port=6006 \ + --decoding-method=modified_beam_search \ + --nn-model=./icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/exp/cpu_jit.pt \ + --tokens=./icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/data/lang_bpe_500/tokens.txt + +(4) fast_beam_search + +cd /path/to/sherpa + +python3 ./sherpa/bin/streaming_server.py \ + --port=6006 \ + --decoding-method=fast_beam_search \ + --nn-model=./icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/exp/cpu_jit.pt \ + --tokens=./icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/data/lang_bpe_500/tokens.txt + +(5) fast_beam_search with LG + +cd /path/to/sherpa + +python3 ./sherpa/bin/streaming_server.py \ + --port=6006 \ + --decoding-method=fast_beam_search \ + --LG=./icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/data/lang_bpe_500/LG.pt \ + --nn-model=./icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/exp/cpu_jit.pt \ + --tokens=./icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/data/lang_bpe_500/tokens.txt + +------------------------------------------------------------ + +After starting the server, you can start the example client with the following commands: + +cd /path/to/sherpa + +python3 ./sherpa/bin/streaming_client.py \ + --server-port 6006 \ + ./icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/test_wavs/1089-134686-0001.wav +""" # noqa + +import argparse +import asyncio +import http +import json +import logging +import socket +import ssl +import sys +import warnings +from concurrent.futures import ThreadPoolExecutor +from pathlib import Path +from typing import List, Optional, Tuple + +import numpy as np +import torch +import websockets + +import sherpa + + +def add_model_args(parser: argparse.ArgumentParser): + parser.add_argument( + "--nn-model", + type=str, + help="""The torchscript model. Please refer to + https://k2-fsa.github.io/sherpa/cpp/pretrained_models/online_transducer.html + for a list of pre-trained models to download. + + Not needed if you provide --encoder-model, --decoder-model, and + --joiner-model + """, + ) + + parser.add_argument( + "--encoder-model", + type=str, + help="Path to the encoder model. Not used if you provide --nn-model", + ) + + parser.add_argument( + "--decoder-model", + type=str, + help="Path to the decoder model. Not used if you provide --nn-model", + ) + + parser.add_argument( + "--joiner-model", + type=str, + help="Path to the joiner model. 
Not used if you provide --nn-model", + ) + + parser.add_argument( + "--tokens", + type=str, + required=True, + help="Path to tokens.txt", + ) + + parser.add_argument( + "--sample-rate", + type=int, + default=16000, + help="Sample rate of the data used to train the model. ", + ) + + parser.add_argument( + "--feat-dim", + type=int, + default=80, + help="Feature dimension of the model", + ) + + parser.add_argument( + "--use-bbpe", + type=sherpa.str2bool, + default=False, + help="Whether the model to be used is trained with bbpe", + ) + + +def add_decoding_args(parser: argparse.ArgumentParser): + parser.add_argument( + "--decoding-method", + type=str, + default="greedy_search", + help="""Decoding method to use. Current supported methods are: + - greedy_search + - modified_beam_search + - fast_beam_search + """, + ) + + add_modified_beam_search_args(parser) + add_fast_beam_search_args(parser) + + +def add_modified_beam_search_args(parser: argparse.ArgumentParser): + parser.add_argument( + "--num-active-paths", + type=int, + default=4, + help="""Used only when --decoding-method is modified_beam_search. + It specifies number of active paths to keep during decoding. + """, + ) + parser.add_argument( + "--temperature", + type=float, + default=1.0, + help="""Used only when --decoding-method is modified_beam_search. + It specifies the softmax temperature. + """, + ) + + +def add_endpointing_args(parser: argparse.ArgumentParser): + parser.add_argument( + "--use-endpoint", + type=sherpa.str2bool, + default=True, + help="True to enable endpoiting. False to disable it", + ) + + parser.add_argument( + "--rule1-min-trailing-silence", + type=float, + default=2.4, + help="""This endpointing rule1 requires duration of trailing silence + in seconds) to be >= this value""", + ) + + parser.add_argument( + "--rule2-min-trailing-silence", + type=float, + default=1.2, + help="""This endpointing rule2 requires duration of trailing silence in + seconds) to be >= this value.""", + ) + + parser.add_argument( + "--rule3-min-utterance-length", + type=float, + default=20, + help="""This endpointing rule3 requires utterance-length (in seconds) + to be >= this value.""", + ) + + +def add_fast_beam_search_args(parser: argparse.ArgumentParser): + parser.add_argument( + "--max-contexts", + type=int, + default=8, + help="Used only when --decoding-method is fast_beam_search", + ) + + parser.add_argument( + "--max-states", + type=int, + default=64, + help="Used only when --decoding-method is fast_beam_search", + ) + + parser.add_argument( + "--allow-partial", + type=sherpa.str2bool, + default=True, + help="Used only when --decoding-method is fast_beam_search", + ) + + parser.add_argument( + "--LG", + type=str, + default="", + help="""Used only when --decoding-method is fast_beam_search. + If not empty, it points to LG.pt. + """, + ) + + parser.add_argument( + "--ngram-lm-scale", + type=float, + default=0.01, + help=""" + Used only when --decoding_method is fast_beam_search and + --LG is not empty. + """, + ) + + parser.add_argument( + "--beam", + type=float, + default=4, + help="""A floating point value to calculate the cutoff score during beam + search (i.e., `cutoff = max-score - beam`), which is the same as the + `beam` in Kaldi. 
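The cutoff formula quoted in the --beam help above, spelled out with illustrative numbers (a sketch of the pruning rule, not sherpa's code): --beam is a score margin rather than a path count.

best_score = -12.3           # best partial score at the current step (made up)
beam = 4.0                   # the --beam value
cutoff = best_score - beam   # -16.3; hypotheses scoring below this are pruned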
+ Used only when --method is fast_beam_search""", + ) + + +def get_args(): + parser = argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + ) + + add_model_args(parser) + add_decoding_args(parser) + add_endpointing_args(parser) + + parser.add_argument( + "--port", + type=int, + default=6006, + help="The server will listen on this port", + ) + + parser.add_argument( + "--nn-pool-size", + type=int, + default=1, + help="Number of threads for NN computation and decoding.", + ) + + parser.add_argument( + "--max-batch-size", + type=int, + default=50, + help="""Max batch size for computation. Note if there are not enough + requests in the queue, it will wait for max_wait_ms time. After that, + even if there are not enough requests, it still sends the + available requests in the queue for computation. + """, + ) + + parser.add_argument( + "--max-wait-ms", + type=float, + default=10, + help="""Max time in millisecond to wait to build batches for inference. + If there are not enough requests in the stream queue to build a batch + of max_batch_size, it waits up to this time before fetching available + requests for computation. + """, + ) + + parser.add_argument( + "--max-message-size", + type=int, + default=(1 << 20), + help="""Max message size in bytes. + The max size per message cannot exceed this limit. + """, + ) + + parser.add_argument( + "--max-queue-size", + type=int, + default=32, + help="Max number of messages in the queue for each connection.", + ) + + parser.add_argument( + "--max-active-connections", + type=int, + default=500, + help="""Maximum number of active connections. The server will refuse + to accept new connections once the current number of active connections + equals to this limit. + """, + ) + + parser.add_argument( + "--num-threads", + type=int, + default=1, + help="""Sets the number of threads used for interop parallelism + (e.g. in JIT interpreter) on CPU.""", + ) + + parser.add_argument( + "--use-gpu", + type=sherpa.str2bool, + default=False, + help="""True to use GPU. It always selects GPU 0. You can use the + environement variable CUDA_VISIBLE_DEVICES to control which GPU + is mapped to GPU 0. + """, + ) + + parser.add_argument( + "--certificate", + type=str, + help="""Path to the X.509 certificate. You need it only if you want to + use a secure websocket connection, i.e., use wss:// instead of ws://. + You can use sherpa/bin/web/generate-certificate.py + to generate the certificate `cert.pem`. 
+ """, + ) + + parser.add_argument( + "--doc-root", + type=str, + default="./sherpa/bin/web", + help="""Path to the web root""", + ) + + parser.add_argument( + "--tail-padding-length", + type=float, + default=1.0, + help="Length of the tail padding in seconds", + ) + + return parser.parse_args() + + +def create_recognizer(args) -> sherpa.OnlineRecognizer: + feat_config = sherpa.FeatureConfig() + + feat_config.fbank_opts.frame_opts.samp_freq = args.sample_rate + feat_config.fbank_opts.mel_opts.num_bins = args.feat_dim + feat_config.fbank_opts.mel_opts.high_freq = -400 + feat_config.fbank_opts.frame_opts.dither = 0 + + fast_beam_search_config = sherpa.FastBeamSearchConfig( + lg=args.LG if args.LG else "", + ngram_lm_scale=args.ngram_lm_scale, + beam=args.beam, + max_states=args.max_states, + max_contexts=args.max_contexts, + allow_partial=args.allow_partial, + ) + + endpoint_config = sherpa.EndpointConfig() + endpoint_config.rule1.min_trailing_silence = args.rule1_min_trailing_silence + endpoint_config.rule2.min_trailing_silence = args.rule2_min_trailing_silence + endpoint_config.rule3.min_utterance_length = args.rule3_min_utterance_length + + if args.use_gpu and not torch.cuda.is_available(): + sys.exit("no CUDA devices available but you set --use-gpu=true") + + config = sherpa.OnlineRecognizerConfig( + nn_model=args.nn_model, + encoder_model=args.encoder_model, + decoder_model=args.decoder_model, + joiner_model=args.joiner_model, + tokens=args.tokens, + use_gpu=args.use_gpu, + num_active_paths=args.num_active_paths, + use_bbpe=args.use_bbpe, + temperature=args.temperature, + feat_config=feat_config, + decoding_method=args.decoding_method, + fast_beam_search_config=fast_beam_search_config, + use_endpoint=args.use_endpoint, + endpoint_config=endpoint_config, + ) + + recognizer = sherpa.OnlineRecognizer(config) + + return recognizer + + +def format_timestamps(timestamps: List[float]) -> List[str]: + return ["{:.3f}".format(t) for t in timestamps] + + +class StreamingServer(object): + def __init__( + self, + recognizer: sherpa.OnlineRecognizer, + nn_pool_size: int, + max_wait_ms: float, + max_batch_size: int, + max_message_size: int, + max_queue_size: int, + max_active_connections: int, + doc_root: str, + tail_padding_length: float, + certificate: Optional[str] = None, + ): + """ + Args: + recognizer: + An instance of online recognizer. + nn_pool_size: + Number of threads for the thread pool that is responsible for + neural network computation and decoding. + max_wait_ms: + Max wait time in milliseconds in order to build a batch of + `batch_size`. + max_batch_size: + Max batch size for inference. + max_message_size: + Max size in bytes per message. + max_queue_size: + Max number of messages in the queue for each connection. + max_active_connections: + Max number of active connections. Once number of active client + equals to this limit, the server refuses to accept new connections. + beam_search_params: + Dictionary containing all the parameters for beam search. + online_endpoint_config: + Config for endpointing. + doc_root: + Path to the directory where files like index.html for the HTTP + server locate. + certificate: + Optional. If not None, it will use secure websocket. + You can use ./sherpa/bin/web/generate-certificate.py to generate + it (the default generated filename is `cert.pem`). 
+ """ + self.recognizer = recognizer + + self.certificate = certificate + self.http_server = sherpa.HttpServer(doc_root) + + self.nn_pool = ThreadPoolExecutor( + max_workers=nn_pool_size, + thread_name_prefix="nn", + ) + + self.stream_queue = asyncio.Queue() + self.max_wait_ms = max_wait_ms + self.max_batch_size = max_batch_size + self.max_message_size = max_message_size + self.max_queue_size = max_queue_size + self.max_active_connections = max_active_connections + + self.current_active_connections = 0 + + self.sample_rate = int( + recognizer.config.feat_config.fbank_opts.frame_opts.samp_freq + ) + self.decoding_method = recognizer.config.decoding_method + self.tail_padding_length = tail_padding_length + + async def stream_consumer_task(self): + """This function extracts streams from the queue, batches them up, sends + them to the RNN-T model for computation and decoding. + """ + while True: + if self.stream_queue.empty(): + await asyncio.sleep(self.max_wait_ms / 1000) + continue + + batch = [] + try: + while len(batch) < self.max_batch_size: + item = self.stream_queue.get_nowait() + + assert self.recognizer.is_ready(item[0]) + + batch.append(item) + except asyncio.QueueEmpty: + pass + stream_list = [b[0] for b in batch] + future_list = [b[1] for b in batch] + + loop = asyncio.get_running_loop() + await loop.run_in_executor( + self.nn_pool, + self.recognizer.decode_streams, + stream_list, + ) + + for f in future_list: + self.stream_queue.task_done() + f.set_result(None) + + async def compute_and_decode( + self, + stream: sherpa.OnlineStream, + ) -> None: + """Put the stream into the queue and wait it to be processed by the + consumer task. + + Args: + stream: + The stream to be processed. Note: It is changed in-place. + """ + loop = asyncio.get_running_loop() + future = loop.create_future() + await self.stream_queue.put((stream, future)) + await future + + async def process_request( + self, + path: str, + request_headers: websockets.Headers, + ) -> Optional[Tuple[http.HTTPStatus, websockets.Headers, bytes]]: + if "sec-websocket-key" not in request_headers: + # This is a normal HTTP request + if path == "/": + path = "/index.html" + found, response, mime_type = self.http_server.process_request(path) + if isinstance(response, str): + response = response.encode("utf-8") + + if not found: + status = http.HTTPStatus.NOT_FOUND + else: + status = http.HTTPStatus.OK + header = {"Content-Type": mime_type} + return status, header, response + + if self.current_active_connections < self.max_active_connections: + self.current_active_connections += 1 + return None + + # Refuse new connections + status = http.HTTPStatus.SERVICE_UNAVAILABLE # 503 + header = {"Hint": "The server is overloaded. Please retry later."} + response = b"The server is busy. Please retry later." 
+ + return status, header, response + + async def run(self, port: int): + task = asyncio.create_task(self.stream_consumer_task()) + + if self.certificate: + logging.info(f"Using certificate: {self.certificate}") + ssl_context = ssl.SSLContext(ssl.PROTOCOL_TLS_SERVER) + ssl_context.load_cert_chain(self.certificate) + else: + ssl_context = None + logging.info("No certificate provided") + + async with websockets.serve( + self.handle_connection, + host="", + port=port, + max_size=self.max_message_size, + max_queue=self.max_queue_size, + process_request=self.process_request, + ssl=ssl_context, + ): + ip_list = ["0.0.0.0", "localhost", "127.0.0.1"] + ip_list.append(socket.gethostbyname(socket.gethostname())) + proto = "http://" if ssl_context is None else "https://" + s = "Please visit one of the following addresses:\n\n" + for p in ip_list: + s += " " + proto + p + f":{port}" "\n" + logging.info(s) + + await asyncio.Future() # run forever + + await task # not reachable + + async def handle_connection( + self, + socket: websockets.WebSocketServerProtocol, + ): + """Receive audio samples from the client, process it, and send + decoding result back to the client. + + Args: + socket: + The socket for communicating with the client. + """ + try: + await self.handle_connection_impl(socket) + except websockets.exceptions.ConnectionClosedError: + logging.info(f"{socket.remote_address} disconnected") + finally: + # Decrement so that it can accept new connections + self.current_active_connections -= 1 + + logging.info( + f"Disconnected: {socket.remote_address}. " + f"Number of connections: {self.current_active_connections}/{self.max_active_connections}" # noqa + ) + + async def handle_connection_impl( + self, + socket: websockets.WebSocketServerProtocol, + ): + """Receive audio samples from the client, process it, and send + deocoding result back to the client. + + Args: + socket: + The socket for communicating with the client. + """ + logging.info( + f"Connected: {socket.remote_address}. 
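Because process_request() above falls back to serving files from --doc-root whenever the incoming request is not a websocket upgrade, the same port also answers plain HTTP(S) requests. A quick way to check that the server is up (address and port are placeholders; use https:// and an SSL context that accepts the self-signed certificate if --certificate was given):

import urllib.request

with urllib.request.urlopen("http://localhost:6006/") as response:
    page = response.read()
print(page[:80])  # start of sherpa/bin/web/index.html served from --doc-root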
" + f"Number of connections: {self.current_active_connections}/{self.max_active_connections}" # noqa + ) + + stream = self.recognizer.create_stream() + + while True: + samples = await self.recv_audio_samples(socket) + if samples is None: + break + + # TODO(fangjun): At present, we assume the sampling rate + # of the received audio samples equal to --sample-rate + stream.accept_waveform( + sampling_rate=self.sample_rate, waveform=samples + ) + + while self.recognizer.is_ready(stream): + await self.compute_and_decode(stream) + result = self.recognizer.get_result(stream) + + message = { + "method": self.decoding_method, + "segment": result.segment, + "text": result.text, + "tokens": result.tokens, + "timestamps": format_timestamps(result.timestamps), + "final": result.is_final, + } + print(message) + + await socket.send(json.dumps(message)) + + tail_padding = torch.rand( + int(self.sample_rate * self.tail_padding_length), dtype=torch.float32 + ) + stream.accept_waveform( + sampling_rate=self.sample_rate, waveform=tail_padding + ) + stream.input_finished() + while self.recognizer.is_ready(stream): + await self.compute_and_decode(stream) + + result = self.recognizer.get_result(stream) + + message = { + "method": self.decoding_method, + "segment": result.segment, + "text": result.text, + "tokens": result.tokens, + "timestamps": format_timestamps(result.timestamps), + "final": True, # end of connection, always set final to True + } + + await socket.send(json.dumps(message)) + + async def recv_audio_samples( + self, + socket: websockets.WebSocketServerProtocol, + ) -> Optional[torch.Tensor]: + """Receives a tensor from the client. + + Each message contains either a bytes buffer containing audio samples + in 16 kHz or contains "Done" meaning the end of utterance. + + Args: + socket: + The socket for communicating with the client. + Returns: + Return a 1-D torch.float32 tensor containing the audio samples or + return None. + """ + message = await socket.recv() + if message == "Done": + return None + + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + # PyTorch warns that the underlying buffer is not writable. + # We ignore it here as we are not going to write it anyway. 
+ if hasattr(torch, "frombuffer"): + # Note: torch.frombuffer is available only in torch>= 1.10 + return torch.frombuffer(message, dtype=torch.float32) + else: + array = np.frombuffer(message, dtype=np.float32) + return torch.from_numpy(array) + + +def check_args(args): + if args.nn_model is None and args.encoder_model is None: + raise ValueError("Please provide --nn-model or --encoder-model") + + if args.nn_model is not None and args.encoder_model is not None: + raise ValueError("--nn-model and --encoder-model are mutual exclusive") + + if args.nn_model is not None: + assert Path(args.nn_model).is_file(), f"{args.nn_model} does not exist" + args.encoder_model = "" + args.decoder_model = "" + args.joiner_model = "" + else: + assert args.encoder_model is not None + assert args.decoder_model is not None + assert args.joiner_model is not None + + args.nn_model = "" + + assert Path( + args.encoder_model + ).is_file(), f"{args.encoder_model} does not exist" + + assert Path( + args.decoder_model + ).is_file(), f"{args.decoder_model} does not exist" + + assert Path( + args.joiner_model + ).is_file(), f"{args.joiner_model} does not exist" + + if not Path(args.tokens).is_file(): + raise ValueError(f"{args.tokens} does not exist") + + if args.decoding_method not in ( + "greedy_search", + "modified_beam_search", + "fast_beam_search", + ): + raise ValueError(f"Unsupported decoding method {args.decoding_method}") + + if args.decoding_method == "modified_beam_search": + assert args.num_active_paths > 0, args.num_active_paths + assert args.temperature > 0, args.temperature + + if args.decoding_method == "fast_beam_search" and args.LG: + if not Path(args.LG).is_file(): + raise ValueError(f"{args.LG} does not exist") + + +@torch.no_grad() +def main(): + args = get_args() + logging.info(vars(args)) + check_args(args) + + torch.set_num_threads(args.num_threads) + torch.set_num_interop_threads(args.num_threads) + recognizer = create_recognizer(args) + + port = args.port + nn_pool_size = args.nn_pool_size + max_batch_size = args.max_batch_size + max_wait_ms = args.max_wait_ms + max_message_size = args.max_message_size + max_queue_size = args.max_queue_size + max_active_connections = args.max_active_connections + certificate = args.certificate + doc_root = args.doc_root + tail_padding_length = args.tail_padding_length + + if certificate and not Path(certificate).is_file(): + raise ValueError(f"{certificate} does not exist") + + if not Path(doc_root).is_dir(): + raise ValueError(f"Directory {doc_root} does not exist") + + server = StreamingServer( + recognizer=recognizer, + nn_pool_size=nn_pool_size, + max_batch_size=max_batch_size, + max_wait_ms=max_wait_ms, + max_message_size=max_message_size, + max_queue_size=max_queue_size, + max_active_connections=max_active_connections, + certificate=certificate, + doc_root=doc_root, + tail_padding_length=tail_padding_length + ) + asyncio.run(server.run(port)) + + +# See https://github.com/pytorch/pytorch/issues/38342 +# and https://github.com/pytorch/pytorch/issues/33354 +# +# If we don't do this, the delay increases whenever there is +# a new request that changes the actual batch size. +# If you use `py-spy dump --pid --native`, you will +# see a lot of time is spent in re-compiling the torch script model. 
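recv_audio_samples() above defines the wire format: binary messages carrying raw little-endian float32 samples (at the model's sample rate), followed by the text message "Done" to end the utterance; the server replies with one JSON result per audio message plus a final one. sherpa/bin/streaming_client.py is the real client; the following is only a minimal sketch of that protocol, and the file name, address, and the soundfile dependency are assumptions:

import asyncio

import soundfile as sf   # any reader that yields float32 samples in [-1, 1] will do
import websockets


async def run(uri: str = "ws://localhost:6006", wav: str = "test.wav"):
    samples, sample_rate = sf.read(wav, dtype="float32")
    assert sample_rate == 16000, "the server assumes the model's sample rate (16 kHz by default)"
    async with websockets.connect(uri) as ws:
        chunk = 3200  # 0.2 seconds at 16 kHz
        for start in range(0, len(samples), chunk):
            await ws.send(samples[start:start + chunk].tobytes())
            print(await ws.recv())   # partial JSON result
        await ws.send("Done")
        print(await ws.recv())       # final JSON result


asyncio.run(run())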
+torch._C._jit_set_profiling_executor(False) +torch._C._jit_set_profiling_mode(False) +torch._C._set_graph_executor_optimize(False) +""" +// Use the following in C++ +torch::jit::getExecutorMode() = false; +torch::jit::getProfilingMode() = false; +torch::jit::setGraphExecutorOptimize(false); +""" + +if __name__ == "__main__": + log_filename = "log/log-streaming-server" + sherpa.setup_logger(log_filename) + main() +else: + torch.set_num_threads(1) + torch.set_num_interop_threads(1) diff --git a/sherpa/bin/web/.gitignore b/sherpa/bin/web/.gitignore new file mode 100644 index 000000000..c88df6c84 --- /dev/null +++ b/sherpa/bin/web/.gitignore @@ -0,0 +1,3 @@ +*.pem +*.key +*.crt diff --git a/sherpa/bin/web/generate-certificate.py b/sherpa/bin/web/generate-certificate.py new file mode 100755 index 000000000..e1364ee75 --- /dev/null +++ b/sherpa/bin/web/generate-certificate.py @@ -0,0 +1,89 @@ +#!/usr/bin/env python3 + +""" +pip install pyopenssl +""" + +from OpenSSL import crypto + +# The code in this file is modified from +# https://stackoverflow.com/questions/27164354/create-a-self-signed-x509-certificate-in-python + +""" +This script generates 3 files: + - private.key + - selfsigned.crt + - cert.pem + +You need cert.pem when you start a https server +or a secure websocket server. + +Note: You need to change serialNumber if you want to generate +a new certificate as two different certificates cannot share +the same serial number if they are issued by the same organization. + +Otherwise, you may get the following error from within you browser: + + An error occurred during a connection to 127.0.0.1:6007. You have received an + invalid certificate. Please contact the server administrator or email + correspondent and give them the following information: Your certificate + contains the same serial number as another certificate issued by the + certificate authority. Please get a new certificate containing a unique + serial number. 
Error code: SEC_ERROR_REUSED_ISSUER_AND_SERIAL + +""" + + +def cert_gen( + emailAddress="https://github.com/k2-fsa/k2", + commonName="sherpa", + countryName="CN", + localityName="k2-fsa", + stateOrProvinceName="k2-fsa", + organizationName="k2-fsa", + organizationUnitName="k2-fsa", + serialNumber=3, + validityStartInSeconds=0, + validityEndInSeconds=10 * 365 * 24 * 60 * 60, + KEY_FILE="private.key", + CERT_FILE="selfsigned.crt", + ALL_IN_ONE_FILE="cert.pem", +): + # can look at generated file using openssl: + # openssl x509 -inform pem -in selfsigned.crt -noout -text + # create a key pair + k = crypto.PKey() + k.generate_key(crypto.TYPE_RSA, 4096) + # create a self-signed cert + cert = crypto.X509() + cert.get_subject().C = countryName + cert.get_subject().ST = stateOrProvinceName + cert.get_subject().L = localityName + cert.get_subject().O = organizationName # noqa + cert.get_subject().OU = organizationUnitName + cert.get_subject().CN = commonName + cert.get_subject().emailAddress = emailAddress + cert.set_serial_number(serialNumber) + cert.gmtime_adj_notBefore(0) + cert.gmtime_adj_notAfter(validityEndInSeconds) + cert.set_issuer(cert.get_subject()) + cert.set_pubkey(k) + cert.sign(k, "sha512") + with open(CERT_FILE, "wt") as f: + f.write( + crypto.dump_certificate(crypto.FILETYPE_PEM, cert).decode("utf-8") + ) + with open(KEY_FILE, "wt") as f: + f.write(crypto.dump_privatekey(crypto.FILETYPE_PEM, k).decode("utf-8")) + + with open(ALL_IN_ONE_FILE, "wt") as f: + f.write(crypto.dump_privatekey(crypto.FILETYPE_PEM, k).decode("utf-8")) + f.write( + crypto.dump_certificate(crypto.FILETYPE_PEM, cert).decode("utf-8") + ) + print(f"Generated {CERT_FILE}") + print(f"Generated {KEY_FILE}") + print(f"Generated {ALL_IN_ONE_FILE}") + + +cert_gen() diff --git a/sherpa/bin/web/index.html b/sherpa/bin/web/index.html index 600c8260e..4e08cc6cf 100644 --- a/sherpa/bin/web/index.html +++ b/sherpa/bin/web/index.html @@ -11,6 +11,9 @@ integrity="sha384-ggOyR0iXCbMQv3Xipma34MD+dH/1fQ784/j6cY/iJTQUOhcWr7x9JvoRxT2MZw1T" crossorigin="anonymous"> + @@ -18,7 +21,7 @@ - + - + - \ No newline at end of file + diff --git a/sherpa/bin/web/start-https-server.py b/sherpa/bin/web/start-https-server.py new file mode 100755 index 000000000..b85edd698 --- /dev/null +++ b/sherpa/bin/web/start-https-server.py @@ -0,0 +1,75 @@ +#!/usr/bin/env python3 + +# Code in this file is modified from +# https://stackoverflow.com/questions/19705785/python-3-simple-https-server + +import argparse +import http.server +import ssl +import sys +from pathlib import Path + +""" +Usage: + + ./start-https-server.py \ + --server-address 0.0.0.0 \ + --server-port 6007 \ + --cert ./cert.pem +""" + + +def get_args(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--server-address", + type=str, + default="0.0.0.0", + help="""IP address which this server will bind to""", + ) + + parser.add_argument( + "--server-port", + type=int, + default=6007, + help="""Port number on which this server will listen""", + ) + + parser.add_argument( + "--certificate", + type=str, + default="cert.pem", + help="""Path to the X.509 certificate. 
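A typical way to tie the two new web scripts to the streaming server: run ./sherpa/bin/web/generate-certificate.py once to produce cert.pem (written to the directory it is run from), start streaming_server.py with --certificate=/path/to/cert.pem so that clients connect via wss:// instead of ws://, and optionally serve the pages under sherpa/bin/web over HTTPS with ./start-https-server.py --certificate=/path/to/cert.pem. The paths here are placeholders.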
You can use + ./generate-certificate.py to generate it""", + ) + + return parser.parse_args() + + +def main(): + args = get_args() + print(f"{vars(args)}") + server_address = (args.server_address, args.server_port) + httpd = http.server.HTTPServer( + server_address, http.server.SimpleHTTPRequestHandler + ) + + if not Path(args.certificate).is_file(): + print("Please run ./generate-certificate.py to generate a certificate") + sys.exit(-1) + + httpd.socket = ssl.wrap_socket( + httpd.socket, + server_side=True, + certfile=args.certificate, + ssl_version=ssl.PROTOCOL_TLS, + ) + print( + "The server is listening at the following address:\n" + f"https://{args.server_address}:{args.server_port}" + ) + httpd.serve_forever() + + +if __name__ == "__main__": + main() diff --git a/sherpa/bin/web/streaming_record.html b/sherpa/bin/web/streaming_record.html index 003d2d036..b31fee68c 100644 --- a/sherpa/bin/web/streaming_record.html +++ b/sherpa/bin/web/streaming_record.html @@ -18,7 +18,7 @@ - + - +
(streaming_record.html hunk: the HTML markup was lost during text extraction; the surviving visible text is the existing "Recognition from a selected file" entry plus new form controls showing "ws://" and ":", i.e. inputs for the websocket server address and port.)
diff --git a/sherpa/cpp_api/CMakeLists.txt b/sherpa/cpp_api/CMakeLists.txt index a9bbf064c..bb112e966 100644 --- a/sherpa/cpp_api/CMakeLists.txt +++ b/sherpa/cpp_api/CMakeLists.txt @@ -1,37 +1,63 @@ -add_library(offline_recognizer offline_recognizer.cc) -target_link_libraries(offline_recognizer sherpa_core) -set_target_properties(offline_recognizer PROPERTIES OUTPUT_NAME "sherpa_offline_recognizer") +set(sherpa_cpp_api_srcs + endpoint.cc + fast-beam-search-config.cc + feature-config.cc + offline-recognizer.cc + online-recognizer.cc +) +add_library(sherpa_cpp_api ${sherpa_cpp_api_srcs}) +target_link_libraries(sherpa_cpp_api sherpa_core) -add_executable(test_decode_files test_decode_files.cc) -target_link_libraries(test_decode_files offline_recognizer) +if(UNIX AND NOT APPLE) + target_link_libraries(sherpa_cpp_api -pthread) +endif() +if(SHERPA_ENABLE_TESTS) + add_executable(test-feature-config test-feature-config.cc) + target_link_libraries(test-feature-config sherpa_cpp_api) -# We use kaldi_native_io to read *.wav files, so we link to ${KALDI_NATIVE_IO_LIBRARIES} below -add_executable(test_decode_samples test_decode_samples.cc) -target_link_libraries(test_decode_samples offline_recognizer ${KALDI_NATIVE_IO_LIBRARIES}) + add_executable(test-offline-stream test-offline-stream.cc) + target_link_libraries(test-offline-stream sherpa_cpp_api) +endif() -# We use kaldi_native_io to read *.wav files, so we link to ${KALDI_NATIVE_IO_LIBRARIES} below -# Also, we use kaldifeat to compute fbank features, so we link to ${KALDIFEAT_LIBRARIES} below -add_executable(test_decode_features test_decode_features.cc) -target_link_libraries(test_decode_features - offline_recognizer - ${KALDI_NATIVE_IO_LIBRARIES} - ${KALDIFEAT_LIBRARIES} +file(MAKE_DIRECTORY + ${PROJECT_BINARY_DIR}/include/sherpa/cpp_api ) -file(MAKE_DIRECTORY +set(hdrs + feature-config.h + offline-recognizer.h + offline-stream.h + online-recognizer.h + online-stream.h + parse-options.h +) + +file(COPY + ${hdrs} DESTINATION ${PROJECT_BINARY_DIR}/include/sherpa/cpp_api ) -install(FILES ./offline_recognizer.h +install(FILES ${hdrs} DESTINATION include/sherpa/cpp_api ) -install(FILES ./offline_recognizer.h +install(FILES ${hdrs} DESTINATION ${PROJECT_BINARY_DIR}/include/sherpa/cpp_api ) -install(TARGETS offline_recognizer +install( + TARGETS sherpa_cpp_api DESTINATION lib ) + +add_subdirectory(bin) + +if(SHERPA_ENABLE_WEBSOCKET) + add_subdirectory(websocket) +endif() + +if(SHERPA_ENABLE_GRPC) + add_subdirectory(grpc) +endif() diff --git a/sherpa/cpp_api/bin/CMakeLists.txt b/sherpa/cpp_api/bin/CMakeLists.txt new file mode 100644 index 000000000..aa6fbe111 --- /dev/null +++ b/sherpa/cpp_api/bin/CMakeLists.txt @@ -0,0 +1,96 @@ +add_executable(sherpa-offline offline-recognizer.cc) +target_link_libraries(sherpa-offline sherpa_cpp_api) + +add_executable(sherpa-online online-recognizer.cc) +target_link_libraries(sherpa-online sherpa_cpp_api) + +if(SHERPA_ENABLE_PORTAUDIO) + add_executable(sherpa-online-microphone online-recognizer-microphone.cc) + target_link_libraries(sherpa-online-microphone sherpa_cpp_api) + if(BUILD_SHARED_LIBS) + target_link_libraries(sherpa-online-microphone portaudio) + else() + target_link_libraries(sherpa-online-microphone portaudio_static) + endif() + + add_executable(sherpa-offline-microphone offline-recognizer-microphone.cc) + target_link_libraries(sherpa-offline-microphone sherpa_cpp_api) + if(BUILD_SHARED_LIBS) + target_link_libraries(sherpa-offline-microphone portaudio) + else() + 
target_link_libraries(sherpa-offline-microphone portaudio_static) + endif() +endif() + +set(exe_list + sherpa-offline + sherpa-online +) + +if(SHERPA_ENABLE_PORTAUDIO) + list(APPEND exe_list sherpa-online-microphone) + list(APPEND exe_list sherpa-offline-microphone) +endif() + +if(NOT WIN32) + if(NOT DEFINED ENV{VIRTUAL_ENV}) + message(STATUS "Outside a virtual environment") + execute_process( + COMMAND "${PYTHON_EXECUTABLE}" -c "import site; print(';'.join(site.getsitepackages()))" + OUTPUT_STRIP_TRAILING_WHITESPACE + OUTPUT_VARIABLE path_list + ) + else() + message(STATUS "Inside a virtual environment") + execute_process( + COMMAND "${PYTHON_EXECUTABLE}" -c "from distutils.sysconfig import get_python_lib; print(get_python_lib())" + OUTPUT_STRIP_TRAILING_WHITESPACE + OUTPUT_VARIABLE PYTHON_SITE_PACKAGE_DIR + ) + set(path_list ${PYTHON_SITE_PACKAGE_DIR}) + endif() + + message(STATUS "path list: ${path_list}") + foreach(p IN LISTS path_list) + foreach(exe IN LISTS exe_list) + target_link_libraries(${exe} "-Wl,-rpath,${p}/sherpa/lib") + target_link_libraries(${exe} "-Wl,-rpath,${p}/../lib") + endforeach() + endforeach() + + foreach(exe IN LISTS exe_list) + target_link_libraries(${exe} "-Wl,-rpath,${SHERPA_RPATH_ORIGIN}/../lib") + endforeach() + + # add additional paths + set(additional_paths + ${SHERPA_RPATH_ORIGIN}/../lib/python${PYTHON_VERSION_MAJOR}.${PYTHON_VERSION_MINOR}/site-packages/torch/lib + ${SHERPA_RPATH_ORIGIN}/../lib/python${PYTHON_VERSION_MAJOR}.${PYTHON_VERSION_MINOR}/site-packages/torch/lib64 + ${SHERPA_RPATH_ORIGIN}/../lib/python${PYTHON_VERSION_MAJOR}.${PYTHON_VERSION_MINOR}/site-packages/k2/lib + ${SHERPA_RPATH_ORIGIN}/../lib/python${PYTHON_VERSION_MAJOR}.${PYTHON_VERSION_MINOR}/site-packages/k2/lib64 + ${SHERPA_RPATH_ORIGIN}/../lib/python${PYTHON_VERSION_MAJOR}.${PYTHON_VERSION_MINOR}/site-packages/kaldifeat/lib + ${SHERPA_RPATH_ORIGIN}/../lib/python${PYTHON_VERSION_MAJOR}.${PYTHON_VERSION_MINOR}/site-packages/kaldifeat/lib64 + ${SHERPA_RPATH_ORIGIN}/../lib/python${PYTHON_VERSION_MAJOR}.${PYTHON_VERSION_MINOR}/site-packages/sherpa/lib + ${SHERPA_RPATH_ORIGIN}/../lib/python${PYTHON_VERSION_MAJOR}.${PYTHON_VERSION_MINOR}/site-packages/sherpa/lib64 + ${SHERPA_RPATH_ORIGIN}/../lib/python${PYTHON_VERSION_MAJOR}.${PYTHON_VERSION_MINOR}/dist-packages/torch/lib + ${SHERPA_RPATH_ORIGIN}/../lib/python${PYTHON_VERSION_MAJOR}.${PYTHON_VERSION_MINOR}/dist-packages/torch/lib64 + ${SHERPA_RPATH_ORIGIN}/../lib/python${PYTHON_VERSION_MAJOR}.${PYTHON_VERSION_MINOR}/dist-packages/k2/lib + ${SHERPA_RPATH_ORIGIN}/../lib/python${PYTHON_VERSION_MAJOR}.${PYTHON_VERSION_MINOR}/dist-packages/k2/lib64 + ${SHERPA_RPATH_ORIGIN}/../lib/python${PYTHON_VERSION_MAJOR}.${PYTHON_VERSION_MINOR}/dist-packages/kaldifeat/lib + ${SHERPA_RPATH_ORIGIN}/../lib/python${PYTHON_VERSION_MAJOR}.${PYTHON_VERSION_MINOR}/dist-packages/kaldifeat/lib64 + ${SHERPA_RPATH_ORIGIN}/../lib/python${PYTHON_VERSION_MAJOR}.${PYTHON_VERSION_MINOR}/dist-packages/sherpa/lib + ${SHERPA_RPATH_ORIGIN}/../lib/python${PYTHON_VERSION_MAJOR}.${PYTHON_VERSION_MINOR}/dist-packages/sherpa/lib64 + ) + message(STATUS "additional_paths: ${additional_paths}") + foreach(p IN LISTS additional_paths) + foreach(exe IN LISTS exe_list) + target_link_libraries(${exe} "-Wl,-rpath,${p}") + target_link_libraries(${exe} "-Wl,-rpath,${p}") + endforeach() + endforeach() +endif() + +install( + TARGETS ${exe_list} + DESTINATION bin +) diff --git a/sherpa/cpp_api/bin/offline-recognizer-microphone.cc b/sherpa/cpp_api/bin/offline-recognizer-microphone.cc new file 
mode 100644 index 000000000..ea7cbe5f3 --- /dev/null +++ b/sherpa/cpp_api/bin/offline-recognizer-microphone.cc @@ -0,0 +1,236 @@ +/** + * Copyright 2022-2023 Xiaomi Corporation (authors: Fangjun Kuang) + * + * See LICENSE for clarification regarding multiple authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include +#include +#include + +#include +#include // std::tolower +#include // NOLINT +#include // NOLINT + +#include "portaudio.h" // NOLINT +#include "sherpa/cpp_api/offline-recognizer.h" + +class Microphone { + public: + Microphone() { + PaError err = Pa_Initialize(); + if (err != paNoError) { + fprintf(stderr, "portaudio error: %s\n", Pa_GetErrorText(err)); + exit(-1); + } + } + ~Microphone() { + PaError err = Pa_Terminate(); + if (err != paNoError) { + fprintf(stderr, "portaudio error: %s\n", Pa_GetErrorText(err)); + exit(-1); + } + } +}; + +enum class State { + kIdle, + kRecording, + kDecoding, +}; + +State state = State::kIdle; + +// true to stop the program and exit +bool stop = false; + +std::vector samples; +std::mutex samples_mutex; + +static void DetectKeyPress() { + fprintf(stderr, "Press Enter to start"); + int32_t key; + while (!stop && (key = getchar())) { + if (key != 0x0a) { + continue; + } + + switch (state) { + case State::kIdle: + fprintf(stderr, "Start recording. Press Enter to stop recording"); + state = State::kRecording; + { + std::lock_guard lock(samples_mutex); + samples.clear(); + } + break; + case State::kRecording: + fprintf(stderr, "Stop recording. Decoding ..."); + state = State::kDecoding; + break; + case State::kDecoding: + break; + } + } +} + +static int32_t RecordCallback(const void *input_buffer, + void * /*output_buffer*/, + unsigned long frames_per_buffer, // NOLINT + const PaStreamCallbackTimeInfo * /*time_info*/, + PaStreamCallbackFlags /*status_flags*/, + void *user_data) { + std::lock_guard lock(samples_mutex); + + auto p = reinterpret_cast(input_buffer); + samples.insert(samples.end(), p, p + frames_per_buffer); + + return stop ? paComplete : paContinue; +} + +static void Handler(int32_t sig) { + stop = true; + fprintf(stderr, "\nCaught Ctrl + C. Press Enter to exit\n"); +} + +int32_t main(int32_t argc, char *argv[]) { + signal(SIGINT, Handler); + + const char *kUsageMessage = R"usage( +This program uses non-streaming models with microphone for speech recognition. +Usage: + +sherpa-offline-microphone \ + --nn-model=/path/to/cpu_jit.pt \ + --tokens=/path/to/tokens.txt \ + --use-gpu=false + +See: + + https://k2-fsa.github.io/sherpa/sherpa/pretrained_models/offline_transducer.html + https://k2-fsa.github.io/sherpa/sherpa/pretrained_models/offline_ctc/index.html + +for more details. 
+)usage"; + if (argc == 1) { + fprintf(stderr, "%s\n", kUsageMessage); + exit(0); + } + + // see + // https://pytorch.org/docs/stable/notes/cpu_threading_torchscript_inference.html + torch::set_num_threads(1); + torch::set_num_interop_threads(1); + sherpa::InferenceMode no_grad; + + torch::jit::getExecutorMode() = false; + torch::jit::getProfilingMode() = false; + torch::jit::setGraphExecutorOptimize(false); + + sherpa::ParseOptions po(kUsageMessage); + sherpa::OfflineRecognizerConfig config; + config.Register(&po); + + po.Read(argc, argv); + config.Validate(); + + fprintf(stderr, "Creating recognizer ..."); + sherpa::OfflineRecognizer recognizer(config); + fprintf(stderr, "Recognizer created!"); + + Microphone mic; + + PaDeviceIndex num_devices = Pa_GetDeviceCount(); + fprintf(stderr, "Num devices: %d\n", num_devices); + + PaStreamParameters param; + + param.device = Pa_GetDefaultInputDevice(); + if (param.device == paNoDevice) { + fprintf(stderr, "No default input device found\n"); + exit(EXIT_FAILURE); + } + fprintf(stderr, "Use default device: %d\n", param.device); + + const PaDeviceInfo *info = Pa_GetDeviceInfo(param.device); + fprintf(stderr, " Name: %s\n", info->name); + fprintf(stderr, " Max input channels: %d\n", info->maxInputChannels); + + param.channelCount = 1; + param.sampleFormat = paFloat32; + + param.suggestedLatency = info->defaultLowInputLatency; + param.hostApiSpecificStreamInfo = nullptr; + float sample_rate = 16000; + + PaStream *stream; + PaError err = + Pa_OpenStream(&stream, ¶m, nullptr, /* &outputParameters, */ + sample_rate, + 0, // frames per buffer + paClipOff, // we won't output out of range samples + // so don't bother clipping them + RecordCallback, nullptr); + if (err != paNoError) { + fprintf(stderr, "portaudio error: %s\n", Pa_GetErrorText(err)); + exit(EXIT_FAILURE); + } + + err = Pa_StartStream(stream); + fprintf(stderr, "Started\n"); + + if (err != paNoError) { + fprintf(stderr, "portaudio error: %s\n", Pa_GetErrorText(err)); + exit(EXIT_FAILURE); + } + + std::thread t(DetectKeyPress); + while (!stop) { + switch (state) { + case State::kIdle: + break; + case State::kRecording: + break; + case State::kDecoding: { + std::vector buf; + { + std::lock_guard lock(samples_mutex); + buf = std::move(samples); + } + + auto s = recognizer.CreateStream(); + s->AcceptSamples(buf.data(), buf.size()); + recognizer.DecodeStream(s.get()); + fprintf(stderr, "Decoding Done! Result is:\n"); + fprintf(stderr, "%s\n", s->GetResult().text.c_str()); + + state = State::kIdle; + fprintf(stderr, "Press Enter to start\n"); + break; + } + } + + Pa_Sleep(20); // sleep for 20ms + } + t.join(); + + err = Pa_CloseStream(stream); + if (err != paNoError) { + fprintf(stderr, "portaudio error: %s\n", Pa_GetErrorText(err)); + exit(EXIT_FAILURE); + } + + return 0; +} diff --git a/sherpa/csrc/sherpa.cc b/sherpa/cpp_api/bin/offline-recognizer.cc similarity index 65% rename from sherpa/csrc/sherpa.cc rename to sherpa/cpp_api/bin/offline-recognizer.cc index ee3c17418..1fd31969d 100644 --- a/sherpa/csrc/sherpa.cc +++ b/sherpa/cpp_api/bin/offline-recognizer.cc @@ -15,12 +15,15 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ +#include "sherpa/cpp_api/offline-recognizer.h" + +#include // NOLINT + #include "kaldi_native_io/csrc/kaldi-table.h" #include "kaldi_native_io/csrc/text-utils.h" #include "kaldi_native_io/csrc/wave-reader.h" +#include "sherpa/cpp_api/parse-options.h" #include "sherpa/csrc/log.h" -#include "sherpa/csrc/offline_asr.h" -#include "sherpa/csrc/parse_options.h" #include "torch/script.h" static constexpr const char *kUsageMessage = R"( @@ -29,42 +32,34 @@ Offline (non-streaming) automatic speech recognition with sherpa. Usage: (1) View help information. - ./bin/sherpa --help + sherpa-offline --help (2) Use a pretrained model for recognition - ./bin/sherpa \ + sherpa-offline \ --nn-model=/path/to/cpu_jit.pt \ --tokens=/path/to/tokens.txt \ --use-gpu=false \ foo.wav \ bar.wav -Note: You can get pre-trained models for testing by visiting - - Chinese: https://huggingface.co/luomingshuang/icefall_asr_wenetspeech_pruned_transducer_stateless2/tree/main/exp - - English: https://huggingface.co/wgb14/icefall-asr-gigaspeech-pruned-transducer-stateless2/tree/main/exp - -Hint: In case you only have `data/lang_bpe_500/bpe.model`, you can use -`./scripts/bpe_model_to_tokens.py /path/to/bpe.model > tokens.txt` to generate -`tokens.txt` from `bpe.model`. - (3) Decode wav.scp - ./bin/sherpa \ + sherpa-offline \ --nn-model=/path/to/cpu_jit.pt \ --tokens=/path/to/tokens.txt \ --use-gpu=false \ - --use-wav-scp=false \ + --use-wav-scp=true \ scp:wav.scp \ ark,scp,t:results.ark,results.scp (4) Decode feats.scp - ./bin/sherpa \ + sherpa-offline \ --nn-model=/path/to/cpu_jit.pt \ --tokens=/path/to/tokens.txt \ --use-gpu=false \ - --use-feats-scp=false \ + --use-feats-scp=true \ scp:feats.scp \ ark,scp,t:results.ark,results.scp @@ -73,6 +68,13 @@ the range [-1, 1), to compute features, while Kaldi uses samples in the range [-32768, 32767] to compute features. If you use `feats.scp` from Kaldi with models from icefall, you won't get expected results. + +See: + + https://k2-fsa.github.io/sherpa/sherpa/pretrained_models/offline_transducer.html + https://k2-fsa.github.io/sherpa/sherpa/pretrained_models/offline_ctc/index.html + +for more details. )"; int main(int argc, char *argv[]) { @@ -80,7 +82,7 @@ int main(int argc, char *argv[]) { // https://pytorch.org/docs/stable/notes/cpu_threading_torchscript_inference.html torch::set_num_threads(1); torch::set_num_interop_threads(1); - torch::NoGradGuard no_grad; + sherpa::InferenceMode no_grad; torch::jit::getExecutorMode() = false; torch::jit::getProfilingMode() = false; @@ -93,9 +95,8 @@ int main(int argc, char *argv[]) { int32_t batch_size = 10; sherpa::ParseOptions po(kUsageMessage); - - sherpa::OfflineAsrOptions opts; - opts.Register(&po); + sherpa::OfflineRecognizerConfig config; + config.Register(&po); po.Register("use-wav-scp", &use_wav_scp, "If true, user should provide two arguments: " @@ -116,14 +117,18 @@ int main(int argc, char *argv[]) { exit(EXIT_FAILURE); } - opts.Validate(); + config.Validate(); - SHERPA_CHECK_EQ(opts.fbank_opts.frame_opts.samp_freq, expected_sample_rate) + SHERPA_CHECK_EQ(config.feat_config.fbank_opts.frame_opts.samp_freq, + expected_sample_rate) << "The model was trained using training data with sample rate 16000. 
" << "We don't support resample yet"; - sherpa::OfflineAsr offline_asr(opts); - SHERPA_LOG(INFO) << "\n" << opts.ToString(); + SHERPA_LOG(INFO) << config.ToString(); + + SHERPA_LOG(INFO) << "Creating recognizer ..."; + sherpa::OfflineRecognizer recognizer(config); + SHERPA_LOG(INFO) << "Recognizer created."; if (use_wav_scp) { SHERPA_CHECK_EQ(po.NumArgs(), 2) @@ -152,6 +157,8 @@ int main(int argc, char *argv[]) { std::vector keys; std::vector values; + std::vector> ss; + std::vector p_ss; for (; !wav_reader.Done(); wav_reader.Next()) { keys.push_back(wav_reader.Key()); auto &wave_data = wav_reader.Value(); @@ -169,31 +176,40 @@ int main(int argc, char *argv[]) { } auto tensor = torch::from_blob(const_cast(d.RowData(0)), - {d.NumCols()}, torch::kFloat); - values.push_back(tensor / 32768); - - if (keys.size() >= batch_size) { + {d.NumCols()}, torch::kFloat) / + 32768; + auto s = recognizer.CreateStream(); + s->AcceptSamples(tensor.data_ptr(), tensor.numel()); + ss.push_back(std::move(s)); + p_ss.push_back(ss.back().get()); + + if (static_cast(keys.size()) >= batch_size) { // now for recognition - auto results = offline_asr.DecodeWaves(values); + recognizer.DecodeStreams(p_ss.data(), p_ss.size()); + for (size_t i = 0; i != keys.size(); ++i) { std::vector words; - kaldiio::SplitStringToVector(results[i].text, " ", true, &words); + kaldiio::SplitStringToVector(ss[i]->GetResult().text, " ", true, + &words); writer.Write(keys[i], words); } keys.clear(); - values.clear(); + ss.clear(); + p_ss.clear(); } } if (!keys.empty()) { - auto results = offline_asr.DecodeWaves(values); + recognizer.DecodeStreams(p_ss.data(), p_ss.size()); for (size_t i = 0; i != keys.size(); ++i) { std::vector words; - kaldiio::SplitStringToVector(results[i].text, " ", true, &words); + kaldiio::SplitStringToVector(ss[i]->GetResult().text, " ", true, + &words); writer.Write(keys[i], words); } keys.clear(); - values.clear(); + ss.clear(); + p_ss.clear(); } return 0; @@ -231,54 +247,84 @@ int main(int argc, char *argv[]) { feature_reader(po.GetArg(1)); std::vector keys; std::vector values; + std::vector> ss; + std::vector p_ss; for (; !feature_reader.Done(); feature_reader.Next()) { keys.push_back(feature_reader.Key()); auto &d = feature_reader.Value(); auto tensor = torch::from_blob(const_cast(d.Data()), {d.NumRows(), d.NumCols()}, torch::kFloat); - values.push_back(tensor.clone()); - if (keys.size() >= batch_size) { - // now for recognition - auto results = offline_asr.DecodeFeatures(values); + auto s = recognizer.CreateStream(); + s->AcceptFeatures(tensor.data_ptr(), tensor.size(0), + tensor.size(1)); + ss.push_back(std::move(s)); + p_ss.push_back(ss.back().get()); + + if (static_cast(keys.size()) >= batch_size) { + recognizer.DecodeStreams(p_ss.data(), p_ss.size()); + for (size_t i = 0; i != keys.size(); ++i) { std::vector words; - kaldiio::SplitStringToVector(results[i].text, " ", true, &words); + kaldiio::SplitStringToVector(ss[i]->GetResult().text, " ", true, + &words); writer.Write(keys[i], words); } keys.clear(); - values.clear(); + ss.clear(); + p_ss.clear(); } } if (!keys.empty()) { - auto results = offline_asr.DecodeFeatures(values); + recognizer.DecodeStreams(p_ss.data(), p_ss.size()); for (size_t i = 0; i != keys.size(); ++i) { std::vector words; - kaldiio::SplitStringToVector(results[i].text, " ", true, &words); + kaldiio::SplitStringToVector(ss[i]->GetResult().text, " ", true, + &words); writer.Write(keys[i], words); } keys.clear(); - values.clear(); + ss.clear(); + p_ss.clear(); } return 0; } if (po.NumArgs() 
== 1) { - auto result = offline_asr.DecodeWave(po.GetArg(1), expected_sample_rate); - - SHERPA_LOG(INFO) << "\nfilename: " << po.GetArg(1) - << "\nresult: " << result.text; + SHERPA_LOG(INFO) << "Started"; + const auto begin = std::chrono::steady_clock::now(); + auto s = recognizer.CreateStream(); + s->AcceptWaveFile(po.GetArg(1)); + recognizer.DecodeStream(s.get()); + + const auto &r = s->GetResult(); + const auto end = std::chrono::steady_clock::now(); + + float elapsed_seconds = + std::chrono::duration_cast(end - begin) + .count() / + 1000.; + SHERPA_LOG(INFO) << "Done in " << elapsed_seconds << " seconds"; + std::cerr << "\nfilename: " << po.GetArg(1) << "\ntext: " << r.text + << "\ntoken IDs: " << r.tokens + << "\ntimestamps (after subsampling): " << r.timestamps << "\n"; } else { - std::vector filenames(po.NumArgs()); - for (int i = 1; i <= po.NumArgs(); ++i) { - filenames[i - 1] = po.GetArg(i); + std::vector> ss; + std::vector p_ss; + for (int32_t i = 1; i <= po.NumArgs(); ++i) { + auto s = recognizer.CreateStream(); + s->AcceptWaveFile(po.GetArg(i)); + ss.push_back(std::move(s)); + p_ss.push_back(ss.back().get()); } - auto results = offline_asr.DecodeWaves(filenames, expected_sample_rate); + recognizer.DecodeStreams(p_ss.data(), p_ss.size()); std::ostringstream os; - for (size_t i = 0; i != results.size(); ++i) { - os << "filename: " << filenames[i] << "\n" - << "result: " << results[i].text << "\n\n"; + for (int32_t i = 0; i < po.NumArgs(); ++i) { + const auto &r = ss[i]->GetResult(); + os << "filename: " << po.GetArg(i + 1) << "\n" + << "result: " << r.text << "\n" + << r.AsJsonString() << "\n\n"; } SHERPA_LOG(INFO) << "\n" << os.str(); diff --git a/sherpa/cpp_api/bin/online-recognizer-microphone.cc b/sherpa/cpp_api/bin/online-recognizer-microphone.cc new file mode 100644 index 000000000..150b82efe --- /dev/null +++ b/sherpa/cpp_api/bin/online-recognizer-microphone.cc @@ -0,0 +1,220 @@ +/** + * Copyright 2022 Xiaomi Corporation (authors: Fangjun Kuang) + * + * See LICENSE for clarification regarding multiple authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include + +#include "portaudio.h" // NOLINT +#include "sherpa/cpp_api/online-recognizer.h" +#include "sherpa/cpp_api/online-stream.h" +#include "sherpa/csrc/fbank-features.h" + +static constexpr const char *kUsageMessage = R"( +Online (streaming) automatic speech recognition with sherpa. + +Usage: +(1) View help information. 
+ + sherpa-online-microphone --help + +(2) Use a pretrained model for recognition + + sherpa-online-microphone \ + --nn-model=/path/to/cpu_jit.pt \ + --tokens=/path/to/tokens.txt \ + --use-gpu=false \ + --decoding-method=greedy_search + +To use fast_beam_search with an LG, use + + sherpa-online-microphone \ + --decoding-method=fast_beam_search \ + --nn-model=/path/to/cpu_jit.pt \ + --tokens=/path/to/tokens.txt \ + --lg=/path/to/LG.pt \ + --use-gpu=false + +(3) To use an LSTM model for recognition + + sherpa-online-microphone \ + --encoder-model=/path/to/encoder_jit_trace.pt \ + --decoder-model=/path/to/decoder_jit_trace.pt \ + --joiner-model=/path/to/joiner_jit_trace.pt \ + --tokens=/path/to/tokens.txt \ + --use-gpu=false + +(4) To use a streaming Zipformer model for recognition + + sherpa-online-microphone + --encoder-model=/path/to/encoder_jit_trace.pt \ + --decoder-model=/path/to/decoder_jit_trace.pt \ + --joiner-model=/path/to/joiner_jit_trace.pt \ + --tokens=/path/to/tokens.txt \ + --use-gpu=false + +See +https://k2-fsa.github.io/sherpa/sherpa/pretrained_models/online_transducer.html +for more details. +)"; + +class Microphone { + public: + Microphone() { + PaError err = Pa_Initialize(); + if (err != paNoError) { + fprintf(stderr, "portaudio error: %s\n", Pa_GetErrorText(err)); + exit(-1); + } + } + ~Microphone() { + PaError err = Pa_Terminate(); + if (err != paNoError) { + fprintf(stderr, "portaudio error: %s\n", Pa_GetErrorText(err)); + exit(-1); + } + } +}; + +bool stop = false; + +static int RecordCallback(const void *input_buffer, void * /*output_buffer*/, + unsigned long frames_per_buffer, // NOLINT + const PaStreamCallbackTimeInfo * /*time_info*/, + PaStreamCallbackFlags /*status_flags*/, + void *user_data) { + auto s = reinterpret_cast(user_data); + auto samples = + torch::from_blob(static_cast(const_cast(input_buffer)), + {static_cast(frames_per_buffer)}, torch::kFloat); + + s->AcceptWaveform(16000, samples); + + return stop ? paComplete : paContinue; +} +static void Handler(int sig) { + stop = true; + fprintf(stderr, "\nexiting...\n"); +} + +int main(int argc, char *argv[]) { + signal(SIGINT, Handler); + + if (argc == 1) { + fprintf(stderr, "%s\n", kUsageMessage); + exit(0); + } + + Microphone mic; + + // see + // https://pytorch.org/docs/stable/notes/cpu_threading_torchscript_inference.html + torch::set_num_threads(1); + torch::set_num_interop_threads(1); + sherpa::InferenceMode no_grad; + + torch::jit::getExecutorMode() = false; + torch::jit::getProfilingMode() = false; + torch::jit::setGraphExecutorOptimize(false); + + sherpa::ParseOptions po(kUsageMessage); + sherpa::OnlineRecognizerConfig config; + config.Register(&po); + + po.Read(argc, argv); + if (argc == 0 || po.NumArgs() != 0) { + po.PrintUsage(); + exit(EXIT_FAILURE); + } + + config.Validate(); + + float sample_rate = 16000; + if (config.feat_config.fbank_opts.frame_opts.samp_freq != sample_rate) { + std::cerr + << "The model was trained using training data with sample rate 16000. 
" + << "We don't support resample yet\n"; + exit(EXIT_FAILURE); + } + + sherpa::OnlineRecognizer recognizer(config); + + auto s = recognizer.CreateStream(); + + PaDeviceIndex num_devices = Pa_GetDeviceCount(); + fprintf(stderr, "num devices: %d\n", num_devices); + + PaStreamParameters param; + + param.device = Pa_GetDefaultInputDevice(); + if (param.device == paNoDevice) { + fprintf(stderr, "No default input device found\n"); + exit(EXIT_FAILURE); + } + fprintf(stderr, "Use default device: %d\n", param.device); + + const PaDeviceInfo *info = Pa_GetDeviceInfo(param.device); + fprintf(stderr, " Name: %s\n", info->name); + fprintf(stderr, " Max input channels: %d\n", info->maxInputChannels); + + param.channelCount = 1; + param.sampleFormat = paFloat32; + + param.suggestedLatency = info->defaultLowInputLatency; + param.hostApiSpecificStreamInfo = nullptr; + + PaStream *stream; + PaError err = + Pa_OpenStream(&stream, ¶m, nullptr, /* &outputParameters, */ + sample_rate, + 0, // frames per buffer + paClipOff, // we won't output out of range samples + // so don't bother clipping them + RecordCallback, s.get()); + if (err != paNoError) { + fprintf(stderr, "portaudio error: %s\n", Pa_GetErrorText(err)); + exit(EXIT_FAILURE); + } + + err = Pa_StartStream(stream); + fprintf(stderr, "Started\n"); + + if (err != paNoError) { + fprintf(stderr, "portaudio error: %s\n", Pa_GetErrorText(err)); + exit(EXIT_FAILURE); + } + + int32_t result_len = 0; + while (!stop) { + if (recognizer.IsReady(s.get())) { + recognizer.DecodeStream(s.get()); + auto result = recognizer.GetResult(s.get()).text; + if (static_cast(result.size()) != result_len) { + result_len = result.size(); + fprintf(stderr, "%s\n", result.c_str()); + } + } + + Pa_Sleep(20); // sleep for 20ms + } + + err = Pa_CloseStream(stream); + if (err != paNoError) { + fprintf(stderr, "portaudio error: %s\n", Pa_GetErrorText(err)); + exit(EXIT_FAILURE); + } + + return 0; +} diff --git a/sherpa/cpp_api/bin/online-recognizer.cc b/sherpa/cpp_api/bin/online-recognizer.cc new file mode 100644 index 000000000..0f7a7dbd4 --- /dev/null +++ b/sherpa/cpp_api/bin/online-recognizer.cc @@ -0,0 +1,302 @@ +/** + * Copyright 2022 Xiaomi Corporation (authors: Fangjun Kuang) + * + * See LICENSE for clarification regarding multiple authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "sherpa/cpp_api/online-recognizer.h" + +#include + +#include "kaldi_native_io/csrc/kaldi-table.h" +#include "kaldi_native_io/csrc/text-utils.h" +#include "kaldi_native_io/csrc/wave-reader.h" +#include "sherpa/cpp_api/online-stream.h" +#include "sherpa/cpp_api/parse-options.h" +#include "sherpa/csrc/fbank-features.h" +#include "sherpa/csrc/log.h" + +static constexpr const char *kUsageMessage = R"( +Online (streaming) automatic speech recognition with sherpa. + +Usage: +(1) View help information. 
+ + sherpa-online --help + +(2) Use a pretrained model for recognition + + sherpa-online \ + --nn-model=/path/to/cpu_jit.pt \ + --tokens=/path/to/tokens.txt \ + --use-gpu=false \ + --decoding-method=greedy_search + foo.wav \ + bar.wav + +To use fast_beam_search with an LG, use + + sherpa-online \ + --decoding-method=fast_beam_search \ + --nn-model=/path/to/cpu_jit.pt \ + --tokens=/path/to/tokens.txt \ + --lg=/path/to/LG.pt \ + --use-gpu=false \ + foo.wav \ + bar.wav + +(3) To use an LSTM model for recognition + + sherpa-online \ + --encoder-model=/path/to/encoder_jit_trace.pt \ + --decoder-model=/path/to/decoder_jit_trace.pt \ + --joiner-model=/path/to/joiner_jit_trace.pt \ + --tokens=/path/to/tokens.txt \ + --use-gpu=false \ + foo.wav \ + bar.wav + +(4) To use a streaming Zipformer model for recognition + + sherpa-online \ + --nn-model=/path/to/cpu_jit.pt \ + --tokens=/path/to/tokens.txt \ + --use-gpu=false \ + foo.wav \ + bar.wav + +(5) To decode wav.scp + + sherpa-online \ + --nn-model=/path/to/cpu_jit.pt \ + --tokens=/path/to/tokens.txt \ + --use-gpu=false \ + --use-wav-scp=true \ + scp:wav.scp \ + ark,scp,t:result.ark,result.scp + +See +https://k2-fsa.github.io/sherpa/sherpa/pretrained_models/online_transducer.html +for more details. +)"; + +int32_t main(int32_t argc, char *argv[]) { + // see + // https://pytorch.org/docs/stable/notes/cpu_threading_torchscript_inference.html + torch::set_num_threads(1); + torch::set_num_interop_threads(1); + sherpa::InferenceMode no_grad; + + torch::jit::getExecutorMode() = false; + torch::jit::getProfilingMode() = false; + torch::jit::setGraphExecutorOptimize(false); + + // All models in icefall use training data with sample rate 16000 + float expected_sample_rate = 16000; + bool use_wav_scp = false; // true to use wav.scp as input + + // Number of seconds for tail padding + float padding_seconds = 0.8; + + sherpa::ParseOptions po(kUsageMessage); + + po.Register("use-wav-scp", &use_wav_scp, + "If true, user should provide two arguments: " + "scp:wav.scp ark,scp,t:results.ark,results.scp"); + + po.Register("padding-seconds", &padding_seconds, + "Number of seconds for tail padding."); + + sherpa::OnlineRecognizerConfig config; + config.Register(&po); + + po.Read(argc, argv); + if (po.NumArgs() < 1) { + po.PrintUsage(); + exit(EXIT_FAILURE); + } + + config.Validate(); + + SHERPA_CHECK_EQ(config.feat_config.fbank_opts.frame_opts.samp_freq, + expected_sample_rate) + << "The model was trained using training data with sample rate 16000. " + << "We don't support resample yet"; + + SHERPA_CHECK_GE(po.NumArgs(), 1); + + SHERPA_CHECK_GE(padding_seconds, 0); + + SHERPA_LOG(INFO) << "decoding method: " << config.decoding_method; + + torch::Tensor tail_padding = torch::zeros( + {static_cast(padding_seconds * expected_sample_rate)}, + torch::kFloat); + + sherpa::OnlineRecognizer recognizer(config); + if (use_wav_scp) { + SHERPA_CHECK_EQ(po.NumArgs(), 2) + << "Please use something like:\n" + << "scp:wav.scp ark,scp,t:results.scp,results.ark\n" + << "if you provide --use-wav-scp=true"; + + if (kaldiio::ClassifyRspecifier(po.GetArg(1), nullptr, nullptr) == + kaldiio::kNoRspecifier) { + SHERPA_LOG(FATAL) << "Please provide an rspecifier. Current value is: " + << po.GetArg(1); + } + + if (kaldiio::ClassifyWspecifier(po.GetArg(2), nullptr, nullptr, nullptr) == + kaldiio::kNoWspecifier) { + SHERPA_LOG(FATAL) << "Please provide a wspecifier. 
Current value is: " + << po.GetArg(2); + } + + kaldiio::TableWriter writer(po.GetArg(2)); + + kaldiio::SequentialTableReader wav_reader( + po.GetArg(1)); + + int32_t num_decoded = 0; + for (; !wav_reader.Done(); wav_reader.Next()) { + std::string key = wav_reader.Key(); + SHERPA_LOG(INFO) << "\n" << num_decoded++ << ": decoding " << key; + auto &wave_data = wav_reader.Value(); + if (wave_data.SampFreq() != expected_sample_rate) { + SHERPA_LOG(FATAL) << wav_reader.Key() + << "is expected to have sample rate " + << expected_sample_rate << ". Given " + << wave_data.SampFreq(); + } + auto &d = wave_data.Data(); + if (d.NumRows() > 1) { + SHERPA_LOG(WARNING) + << "Only the first channel from " << wav_reader.Key() << " is used"; + } + + auto tensor = torch::from_blob(const_cast(d.RowData(0)), + {d.NumCols()}, torch::kFloat) / + 32768; + auto s = recognizer.CreateStream(); + s->AcceptWaveform(expected_sample_rate, tensor); + s->AcceptWaveform(expected_sample_rate, tail_padding); + s->InputFinished(); + + while (recognizer.IsReady(s.get())) { + recognizer.DecodeStream(s.get()); + } + auto result = recognizer.GetResult(s.get()); + + SHERPA_LOG(INFO) << "\nresult: " << result.text; + + std::vector words; + kaldiio::SplitStringToVector(result.text, " ", true, &words); + writer.Write(key, words); + } + } else { + int32_t num_waves = po.NumArgs(); + if (num_waves == 1) { + // simulate streaming + torch::Tensor wave = + sherpa::ReadWave(po.GetArg(1), expected_sample_rate).first; + + auto s = recognizer.CreateStream(); + + int32_t chunk = 0.2 * expected_sample_rate; + int32_t num_samples = wave.numel(); + + std::string last; + for (int32_t start = 0; start < num_samples;) { + int32_t end = std::min(start + chunk, num_samples); + torch::Tensor samples = + wave.index({torch::indexing::Slice(start, end)}); + start = end; + + s->AcceptWaveform(expected_sample_rate, samples); + + while (recognizer.IsReady(s.get())) { + recognizer.DecodeStream(s.get()); + } + + auto r = recognizer.GetResult(s.get()); + + if (!r.text.empty() && r.text != last) { + last = r.text; + std::cout << r.AsJsonString() << "\n"; + } + } + + s->AcceptWaveform(expected_sample_rate, tail_padding); + s->InputFinished(); + while (recognizer.IsReady(s.get())) { + recognizer.DecodeStream(s.get()); + } + auto r = recognizer.GetResult(s.get()); + + if (!r.text.empty() && r.text != last) { + last = r.text; + std::cout << r.AsJsonString() << ", size: " << r.text.size() << "\n"; + } + } else { + // For multiple waves, we don't use simulate streaming since + // it would complicate the code. Please use + // sherpa-online-websocket-server and + // sherpa-online-websocket-client and for that. 
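The batch loop that follows collects whichever streams currently have enough feature frames and decodes them together, repeating until every stream is drained. In isolation the pattern looks roughly like this sketch against the OnlineRecognizer API used in this file; the helper name is invented.

#include <vector>

#include "sherpa/cpp_api/online-recognizer.h"
#include "sherpa/cpp_api/online-stream.h"

// Decode a set of already-fed streams in batches until none is ready.
static void DrainStreams(sherpa::OnlineRecognizer &recognizer,
                         const std::vector<sherpa::OnlineStream *> &streams) {
  std::vector<sherpa::OnlineStream *> ready;
  for (;;) {
    ready.clear();
    for (auto *s : streams) {
      if (recognizer.IsReady(s)) {
        ready.push_back(s);
      }
    }
    if (ready.empty()) {
      break;
    }
    // All ready streams are decoded together so the network runs on a batch.
    recognizer.DecodeStreams(ready.data(), ready.size());
  }
}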
+ std::vector> ss; + std::vector p_ss; + + for (int32_t i = 1; i <= po.NumArgs(); ++i) { + auto s = recognizer.CreateStream(); + + torch::Tensor wave = + sherpa::ReadWave(po.GetArg(i), expected_sample_rate).first; + + s->AcceptWaveform(expected_sample_rate, wave); + + s->AcceptWaveform(expected_sample_rate, tail_padding); + s->InputFinished(); + ss.push_back(std::move(s)); + p_ss.push_back(ss.back().get()); + } + + std::vector ready_streams; + for (;;) { + ready_streams.clear(); + for (auto s : p_ss) { + if (recognizer.IsReady(s)) { + ready_streams.push_back(s); + } + } + + if (ready_streams.empty()) { + break; + } + recognizer.DecodeStreams(ready_streams.data(), ready_streams.size()); + } + + std::ostringstream os; + for (int32_t i = 1; i <= po.NumArgs(); ++i) { + os << po.GetArg(i) << "\n"; + auto r = recognizer.GetResult(p_ss[i - 1]); + os << r.text << "\n"; + os << r.AsJsonString() << "\n\n"; + } + + std::cerr << os.str(); + } + } + + return 0; +} diff --git a/sherpa/cpp_api/endpoint.cc b/sherpa/cpp_api/endpoint.cc new file mode 100644 index 000000000..e90e6a62c --- /dev/null +++ b/sherpa/cpp_api/endpoint.cc @@ -0,0 +1,103 @@ +/** + * Copyright 2022 (authors: Pingfeng Luo) + * + * See LICENSE for clarification regarding multiple authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "sherpa/cpp_api/endpoint.h" + +#include + +#include "sherpa/cpp_api/parse-options.h" +#include "sherpa/csrc/log.h" + +namespace sherpa { + +static bool RuleActivated(const EndpointRule &rule, + const std::string &rule_name, float trailing_silence, + float utterance_length) { + bool contain_nonsilence = utterance_length > trailing_silence; + bool ans = (contain_nonsilence || !rule.must_contain_nonsilence) && + trailing_silence >= rule.min_trailing_silence && + utterance_length >= rule.min_utterance_length; + if (ans) { + SHERPA_LOG(DEBUG) << "Endpointing rule " << rule_name << " activated: " + << (contain_nonsilence ? "true" : "false") << ',' + << trailing_silence << ',' << utterance_length; + } + return ans; +} + +static void RegisterEndpointRule(ParseOptions *po, EndpointRule *rule, + const std::string &rule_name) { + po->Register( + rule_name + "-must-contain-nonsilence", &rule->must_contain_nonsilence, + "If True, for this endpointing " + rule_name + + " to apply there must be nonsilence in the best-path traceback. " + "For decoding, a non-blank token is considered as non-silence"); + po->Register(rule_name + "-min-trailing-silence", &rule->min_trailing_silence, + "This endpointing " + rule_name + + " requires duration of trailing silence in seconds) to " + "be >= this value."); + po->Register(rule_name + "-min-utterance-length", &rule->min_utterance_length, + "This endpointing " + rule_name + + " requires utterance-length (in seconds) to be >= this " + "value."); +} + +std::string EndpointRule::ToString() const { + std::ostringstream os; + + os << "EndpointRule("; + os << "must_contain_nonsilence=" + << (must_contain_nonsilence ? 
"True" : "False") << ", "; + os << "min_trailing_silence=" << min_trailing_silence << ", "; + os << "min_utterance_length=" << min_utterance_length << ")"; + + return os.str(); +} + +void EndpointConfig::Register(ParseOptions *po) { + RegisterEndpointRule(po, &rule1, "rule1"); + RegisterEndpointRule(po, &rule2, "rule2"); + RegisterEndpointRule(po, &rule3, "rule3"); +} + +std::string EndpointConfig::ToString() const { + std::ostringstream os; + + os << "EndpointConfig("; + os << "rule1=" << rule1.ToString() << ", "; + os << "rule2=" << rule2.ToString() << ", "; + os << "rule3=" << rule3.ToString() << ")"; + + return os.str(); +} + +bool Endpoint::IsEndpoint(int num_frames_decoded, int trailing_silence_frames, + float frame_shift_in_seconds) const { + float utterance_length = num_frames_decoded * frame_shift_in_seconds; + float trailing_silence = trailing_silence_frames * frame_shift_in_seconds; + if (RuleActivated(config_.rule1, "rule1", trailing_silence, + utterance_length) || + RuleActivated(config_.rule2, "rule2", trailing_silence, + utterance_length) || + RuleActivated(config_.rule3, "rule3", trailing_silence, + utterance_length)) { + return true; + } + return false; +} + +} // namespace sherpa diff --git a/sherpa/cpp_api/endpoint.h b/sherpa/cpp_api/endpoint.h new file mode 100644 index 000000000..8ce714ca9 --- /dev/null +++ b/sherpa/cpp_api/endpoint.h @@ -0,0 +1,83 @@ +/** + * Copyright 2022 (authors: Pingfeng Luo) + * + * See LICENSE for clarification regarding multiple authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef SHERPA_CPP_API_ENDPOINT_H_ +#define SHERPA_CPP_API_ENDPOINT_H_ + +#include +#include + +namespace sherpa { + +struct EndpointRule { + // If True, for this endpointing rule to apply there must + // be nonsilence in the best-path traceback. + // For decoding, a non-blank token is considered as non-silence + bool must_contain_nonsilence = true; + // This endpointing rule requires duration of trailing silence + // (in seconds) to be >= this value. + float min_trailing_silence = 2.0; + // This endpointing rule requires utterance-length (in seconds) + // to be >= this value. + float min_utterance_length = 0.0f; + + EndpointRule() = default; + EndpointRule(bool must_contain_nonsilence, float min_trailing_silence, + float min_utterance_length) + : must_contain_nonsilence(must_contain_nonsilence), + min_trailing_silence(min_trailing_silence), + min_utterance_length(min_utterance_length) {} + + std::string ToString() const; +}; + +class ParseOptions; + +struct EndpointConfig { + // For default setting, + // rule1 times out after 2.4 seconds of silence, even if we decoded nothing. + // rule2 times out after 1.2 seconds of silence after decoding something. + // rule3 times out after the utterance is 20 seconds long, regardless of + // anything else. 
+ EndpointRule rule1; + EndpointRule rule2; + EndpointRule rule3; + + void Register(ParseOptions *po); + + EndpointConfig() + : rule1{false, 2.4, 0}, rule2{true, 1.2, 0}, rule3{false, 0, 20} {} + + std::string ToString() const; +}; + +class Endpoint { + public: + explicit Endpoint(const EndpointConfig &config) : config_(config) {} + + /// This function returns true if this set of endpointing rules thinks we + /// should terminate decoding. + bool IsEndpoint(int num_frames_decoded, int trailing_silence_frames, + float frame_shift_in_seconds) const; + + private: + EndpointConfig config_; +}; + +} // namespace sherpa + +#endif // SHERPA_CPP_API_ENDPOINT_H_ diff --git a/sherpa/cpp_api/fast-beam-search-config.cc b/sherpa/cpp_api/fast-beam-search-config.cc new file mode 100644 index 000000000..2037de4c5 --- /dev/null +++ b/sherpa/cpp_api/fast-beam-search-config.cc @@ -0,0 +1,46 @@ +// sherpa/cpp_api/fast-beam-search-config.cc +// +// Copyright (c) 2022 Xiaomi Corporation +#include "sherpa/cpp_api/fast-beam-search-config.h" + +#include "sherpa/csrc/file-utils.h" +#include "sherpa/csrc/log.h" + +namespace sherpa { + +// TODO(fangjun): Add a prefix for it +void FastBeamSearchConfig::Register(ParseOptions *po) { + po->Register("lg", &lg, + "Path to LG.pt. Used only for fast_beam_search " + "in transducer decoding"); + + po->Register("ngram-lm-scale", &ngram_lm_scale, + "Scale the scores from LG.pt. Used only for fast_beam_search " + "in transducer decoding"); + + po->Register("beam", &beam, "Beam used in fast_beam_search"); +} + +void FastBeamSearchConfig::Validate() const { + if (!lg.empty()) { + AssertFileExists(lg); + } + SHERPA_CHECK_GE(ngram_lm_scale, 0); + SHERPA_CHECK_GT(beam, 0); +} + +std::string FastBeamSearchConfig::ToString() const { + std::ostringstream os; + + os << "FastBeamSearchConfig("; + os << "lg=\"" << lg << "\", "; + os << "ngram_lm_scale=" << ngram_lm_scale << ", "; + os << "beam=" << beam << ", "; + os << "max_states=" << max_states << ", "; + os << "max_contexts=" << max_contexts << ", "; + os << "allow_partial=" << (allow_partial ? "True" : "False") << ")"; + + return os.str(); +} + +} // namespace sherpa diff --git a/sherpa/cpp_api/fast-beam-search-config.h b/sherpa/cpp_api/fast-beam-search-config.h new file mode 100644 index 000000000..efc7192d7 --- /dev/null +++ b/sherpa/cpp_api/fast-beam-search-config.h @@ -0,0 +1,38 @@ +// sherpa/cpp_api/fast-beam-search-config.h +// +// Copyright (c) 2022 Xiaomi Corporation +#ifndef SHERPA_CPP_API_FAST_BEAM_SEARCH_CONFIG_H_ +#define SHERPA_CPP_API_FAST_BEAM_SEARCH_CONFIG_H_ + +#include + +#include "sherpa/cpp_api/parse-options.h" + +namespace sherpa { + +// For transducer decoding with a graph +struct FastBeamSearchConfig { + // If not empty, it is the filename of LG.pt + // If empty, we use a trivial graph in decoding. + std::string lg; + + // If lg is not empty, lg.scores is scaled by this value + float ngram_lm_scale = 0.01; + + // A floating point value to calculate the cutoff score during beam + // search (i.e., `cutoff = max-score - beam`), which is the same as the + // `beam` in Kaldi. 
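  // A made-up illustration: with beam = 20 and a best score of -5.0 at some
  // decoding step, the cutoff is -5.0 - 20 = -25.0, so only states scoring
  // at least -25.0 survive that step. max_states and max_contexts below
  // additionally cap how many states and context histories may be kept per
  // frame.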
+ float beam = 20.0; + int32_t max_states = 64; + int32_t max_contexts = 8; + bool allow_partial = false; + + void Register(ParseOptions *po); + + void Validate() const; + std::string ToString() const; +}; + +} // namespace sherpa + +#endif // SHERPA_CPP_API_FAST_BEAM_SEARCH_CONFIG_H_ diff --git a/sherpa/cpp_api/feature-config.cc b/sherpa/cpp_api/feature-config.cc new file mode 100644 index 000000000..2f3d0a2a8 --- /dev/null +++ b/sherpa/cpp_api/feature-config.cc @@ -0,0 +1,86 @@ +// sherpa/cpp_api/feature-config.cc +// +// Copyright (c) 2022 Xiaomi Corporation + +#include "sherpa/cpp_api/feature-config.h" + +#include + +namespace sherpa { + +static void RegisterFrameExtractionOptions( + ParseOptions *po, kaldifeat::FrameExtractionOptions *opts) { + po->Register("sample-frequency", &opts->samp_freq, + "Waveform data sample frequency (must match the waveform file, " + "if specified there)"); + + po->Register("frame-length", &opts->frame_length_ms, + "Frame length in milliseconds"); + + po->Register("frame-shift", &opts->frame_shift_ms, + "Frame shift in milliseconds"); + + po->Register( + "dither", &opts->dither, + "Dithering constant (0.0 means no dither). " + "Caution: Samples are normalized to the range [-1, 1). " + "Please select a small value for dither if you want to enable it"); + + po->Register( + "snip-edges", &opts->snip_edges, + "If true, end effects will be handled by outputting only frames that " + "completely fit in the file, and the number of frames depends on the " + "frame-length. If false, the number of frames depends only on the " + "frame-shift, and we reflect the data at the ends."); +} + +static void RegisterMelBanksOptions(ParseOptions *po, + kaldifeat::MelBanksOptions *opts) { + po->Register("num-mel-bins", &opts->num_bins, + "Number of triangular mel-frequency bins"); + po->Register( + "high-freq", &opts->high_freq, + "High cutoff frequency for mel bins (if <= 0, offset from Nyquist)"); +} + +void FeatureConfig::Register(ParseOptions *po) { + fbank_opts.frame_opts.dither = 0; + RegisterFrameExtractionOptions(po, &fbank_opts.frame_opts); + + fbank_opts.mel_opts.num_bins = 80; + RegisterMelBanksOptions(po, &fbank_opts.mel_opts); + + fbank_opts.mel_opts.high_freq = -400; + fbank_opts.frame_opts.remove_dc_offset = true; + fbank_opts.frame_opts.round_to_power_of_two = true; + fbank_opts.energy_floor = 1e-10; + fbank_opts.frame_opts.snip_edges = false; + fbank_opts.frame_opts.samp_freq = 16000; + po->Register("normalize-samples", &normalize_samples, + "true to use samples in the range [-1, 1]. " + "false to use samples in the range [-32768, 32767]. " + "Note: kaldi uses un-normalized samples."); + + po->Register( + "nemo-normalize", &nemo_normalize, + "See " + "https://github.com/NVIDIA/NeMo/blob/main/nemo/collections/asr/parts/" + "preprocessing/features.py#L59" + "Current supported value: per_feature or leave it to empty (unset)"); +} + +std::string FeatureConfig::ToString() const { + std::ostringstream os; + os << "FeatureConfig("; + os << "fbank_opts=" << fbank_opts.ToString() << ", "; + os << "normalize_samples=" << (normalize_samples ? 
"True" : "False") << ", "; + os << "nemo_normalize=\"" << nemo_normalize << "\")"; + return os.str(); +} + +std::ostream &operator<<(std::ostream &os, const FeatureConfig &config) { + os << config.ToString(); + return os; +} + +} // namespace sherpa diff --git a/sherpa/cpp_api/feature-config.h b/sherpa/cpp_api/feature-config.h new file mode 100644 index 000000000..c34383fdd --- /dev/null +++ b/sherpa/cpp_api/feature-config.h @@ -0,0 +1,56 @@ +// sherpa/cpp_api/feature-config.h +// +// Copyright (c) 2022 Xiaomi Corporation +#ifndef SHERPA_CPP_API_FEATURE_CONFIG_H_ +#define SHERPA_CPP_API_FEATURE_CONFIG_H_ + +#include + +#include "kaldifeat/csrc/feature-fbank.h" +#include "sherpa/cpp_api/parse-options.h" + +namespace sherpa { + +struct FeatureConfig { + kaldifeat::FbankOptions fbank_opts; + + // In sherpa, we always assume the input audio samples are normalized to + // the range [-1, 1]. + // ``normalize_samples`` determines how we transform the input samples + // inside sherpa. + // If true, we don't do anything to the input audio samples and use them + // as they are. + // + // If false, we scale the input samples by 32767 inside sherpa + bool normalize_samples = true; + + // For Wav2Vec 2.0, we set it to true so that it returns audio samples + // directly. + // + // The user does not need to set it. We set it internally when we + // load a Wav2Vec 2.0 model. + bool return_waveform = false; + + // For models from NeMo + // Possible values: + // - per_feature + // - all_features (not implemented yet) + // - fixed_mean (not implemented) + // - fixed_std (not implemented) + // - or just leave it to empty + // See + // https://github.com/NVIDIA/NeMo/blob/main/nemo/collections/asr/parts/preprocessing/features.py#L59 + // for details + std::string nemo_normalize; + + void Register(ParseOptions *po); + + /** A string representation for debugging purpose. 
*/ + std::string ToString() const; +}; + +std::ostream &operator<<(std::ostream &os, const FeatureConfig &config); + +} // namespace sherpa + +#endif // SHERPA_CPP_API_FEATURE_CONFIG_H_ diff --git a/sherpa/cpp_api/grpc/CMakeLists.txt b/sherpa/cpp_api/grpc/CMakeLists.txt new file mode 100644 index 000000000..2fafc9410 --- /dev/null +++ b/sherpa/cpp_api/grpc/CMakeLists.txt @@ -0,0 +1,117 @@ +add_definitions(-DASIO_STANDALONE) + +# compile sherpo.proto +set(PROTO_DIR "${CMAKE_CURRENT_BINARY_DIR}") +set(PROTO_IN "${CMAKE_CURRENT_SOURCE_DIR}") +set(grpc_BINARY_DIR ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}) +set(grpc_LIBRARY_DIR ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}) +include_directories(${CMAKE_BINARY_DIR}) +add_custom_command( + OUTPUT ${PROTO_DIR}/sherpa.pb.cc + ${PROTO_DIR}/sherpa.pb.h + ${PROTO_DIR}/sherpa.grpc.pb.cc + ${PROTO_DIR}/sherpa.grpc.pb.h + COMMAND ${grpc_BINARY_DIR}/protoc + ARGS --grpc_out "${PROTO_DIR}" + --cpp_out "${PROTO_DIR}" + -I "${PROTO_IN}" + --plugin=protoc-gen-grpc=${grpc_BINARY_DIR}/grpc_cpp_plugin + sherpa.proto) + +add_executable(sherpa-online-grpc-server + online-grpc-server.cc + online-grpc-server-impl.cc + ${PROTO_DIR}/sherpa.pb.cc + ${PROTO_DIR}/sherpa.grpc.pb.cc +) +target_link_libraries(sherpa-online-grpc-server sherpa_cpp_api grpc++ grpc++_reflection) + +if(NOT WIN32) + target_link_libraries(sherpa-online-grpc-server -pthread) + target_compile_options(sherpa-online-grpc-server PRIVATE -Wno-deprecated-declarations) +endif() + +add_executable(sherpa-online-grpc-client + online-grpc-client.cc + online-grpc-client-impl.cc + ${PROTO_DIR}/sherpa.pb.cc + ${PROTO_DIR}/sherpa.grpc.pb.cc +) + +target_link_libraries(sherpa-online-grpc-client + sherpa_core + kaldi_native_io_core + grpc++ + grpc++_reflection +) + +if(NOT WIN32) + target_link_libraries(sherpa-online-grpc-client -pthread) +endif() + +set(bins + sherpa-online-grpc-server + sherpa-online-grpc-client +) + +if(NOT WIN32) + if(NOT DEFINED ENV{VIRTUAL_ENV}) + message(STATUS "Outside a virtual environment") + execute_process( + COMMAND "${PYTHON_EXECUTABLE}" -c "import site; print(';'.join(site.getsitepackages()))" + OUTPUT_STRIP_TRAILING_WHITESPACE + OUTPUT_VARIABLE path_list + ) + else() + message(STATUS "Inside a virtual environment") + execute_process( + COMMAND "${PYTHON_EXECUTABLE}" -c "from distutils.sysconfig import get_python_lib; print(get_python_lib())" + OUTPUT_STRIP_TRAILING_WHITESPACE + OUTPUT_VARIABLE PYTHON_SITE_PACKAGE_DIR + ) + set(path_list ${PYTHON_SITE_PACKAGE_DIR}) + endif() + + message(STATUS "path list: ${path_list}") + foreach(p IN LISTS path_list) + foreach(exe IN LISTS bins) + target_link_libraries(${exe} "-Wl,-rpath,${p}/sherpa/lib") + target_link_libraries(${exe} "-Wl,-rpath,${p}/../lib") + endforeach() + endforeach() + + foreach(exe IN LISTS bins) + target_link_libraries(${exe} "-Wl,-rpath,${SHERPA_RPATH_ORIGIN}/../lib") + endforeach() + + # add additional paths + set(additional_paths + ${SHERPA_RPATH_ORIGIN}/../lib/python${PYTHON_VERSION_MAJOR}.${PYTHON_VERSION_MINOR}/site-packages/torch/lib + ${SHERPA_RPATH_ORIGIN}/../lib/python${PYTHON_VERSION_MAJOR}.${PYTHON_VERSION_MINOR}/site-packages/torch/lib64 + ${SHERPA_RPATH_ORIGIN}/../lib/python${PYTHON_VERSION_MAJOR}.${PYTHON_VERSION_MINOR}/site-packages/k2/lib + ${SHERPA_RPATH_ORIGIN}/../lib/python${PYTHON_VERSION_MAJOR}.${PYTHON_VERSION_MINOR}/site-packages/k2/lib64 + ${SHERPA_RPATH_ORIGIN}/../lib/python${PYTHON_VERSION_MAJOR}.${PYTHON_VERSION_MINOR}/site-packages/kaldifeat/lib + 
${SHERPA_RPATH_ORIGIN}/../lib/python${PYTHON_VERSION_MAJOR}.${PYTHON_VERSION_MINOR}/site-packages/kaldifeat/lib64 + ${SHERPA_RPATH_ORIGIN}/../lib/python${PYTHON_VERSION_MAJOR}.${PYTHON_VERSION_MINOR}/site-packages/sherpa/lib + ${SHERPA_RPATH_ORIGIN}/../lib/python${PYTHON_VERSION_MAJOR}.${PYTHON_VERSION_MINOR}/site-packages/sherpa/lib64 + ${SHERPA_RPATH_ORIGIN}/../lib/python${PYTHON_VERSION_MAJOR}.${PYTHON_VERSION_MINOR}/dist-packages/torch/lib + ${SHERPA_RPATH_ORIGIN}/../lib/python${PYTHON_VERSION_MAJOR}.${PYTHON_VERSION_MINOR}/dist-packages/torch/lib64 + ${SHERPA_RPATH_ORIGIN}/../lib/python${PYTHON_VERSION_MAJOR}.${PYTHON_VERSION_MINOR}/dist-packages/k2/lib + ${SHERPA_RPATH_ORIGIN}/../lib/python${PYTHON_VERSION_MAJOR}.${PYTHON_VERSION_MINOR}/dist-packages/k2/lib64 + ${SHERPA_RPATH_ORIGIN}/../lib/python${PYTHON_VERSION_MAJOR}.${PYTHON_VERSION_MINOR}/dist-packages/kaldifeat/lib + ${SHERPA_RPATH_ORIGIN}/../lib/python${PYTHON_VERSION_MAJOR}.${PYTHON_VERSION_MINOR}/dist-packages/kaldifeat/lib64 + ${SHERPA_RPATH_ORIGIN}/../lib/python${PYTHON_VERSION_MAJOR}.${PYTHON_VERSION_MINOR}/dist-packages/sherpa/lib + ${SHERPA_RPATH_ORIGIN}/../lib/python${PYTHON_VERSION_MAJOR}.${PYTHON_VERSION_MINOR}/dist-packages/sherpa/lib64 + ) + message(STATUS "additional_paths: ${additional_paths}") + foreach(p IN LISTS additional_paths) + foreach(exe IN LISTS bins) + target_link_libraries(${exe} "-Wl,-rpath,${p}") + target_link_libraries(${exe} "-Wl,-rpath,${p}") + endforeach() + endforeach() +endif() + +install(TARGETS ${bins} + DESTINATION bin +) diff --git a/sherpa/cpp_api/grpc/online-grpc-client-impl.cc b/sherpa/cpp_api/grpc/online-grpc-client-impl.cc new file mode 100644 index 000000000..032616038 --- /dev/null +++ b/sherpa/cpp_api/grpc/online-grpc-client-impl.cc @@ -0,0 +1,85 @@ +// Copyright (c) 2021 Ximalaya Speech Team (Xiang Lyu) +// 2023 y00281951 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
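The client below talks to the ASR service generated from sherpa.proto, which is compiled by the CMake rules above but is not part of this hunk. Judging only from the calls made here and in online-grpc-server-impl.cc, its shape is roughly as sketched in the comment below; the field numbers, the DecodeConfig message name, and the exact enum values are guesses, not taken from the real file.

// service ASR {
//   rpc Recognize(stream Request) returns (stream Response);
// }
// message Request {
//   DecodeConfig decode_config = ...;  // nbest_config (int32), reqid (string)
//   bytes audio_data = ...;            // 16-bit PCM samples at 16 kHz
// }
// message Response {
//   enum Status { ok = 0; /* ... */ }
//   enum Type { partial_result = 0; final_result = 1; speech_end = 2; }
//   message OneBest { string sentence = ...; }
//   Status status = ...;
//   Type type = ...;
//   repeated OneBest nbest = ...;
// }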
+ +#include "sherpa/cpp_api/grpc/online-grpc-client-impl.h" +#include "sherpa/csrc/log.h" + +namespace sherpa { +using grpc::Channel; +using grpc::ClientContext; +using grpc::ClientReaderWriter; +using grpc::Status; + +GrpcClient::GrpcClient(const std::string& host, + int32_t port, + int32_t nbest, + const std::string& reqid) + : host_(host), + port_(port), + nbest_(nbest), + reqid_(reqid) { + Connect(); + t_ = std::make_unique(&GrpcClient::ReadLoopFunc, this); +} + +void GrpcClient::Connect() { + channel_ = grpc::CreateChannel(host_ + ":" + std::to_string(port_), + grpc::InsecureChannelCredentials()); + stub_ = ASR::NewStub(channel_); + context_ = std::make_unique(); + stream_ = stub_->Recognize(context_.get()); + request_ = std::make_unique(); + response_ = std::make_unique(); + request_->mutable_decode_config()->set_nbest_config(nbest_); + request_->mutable_decode_config()->set_reqid(reqid_); + stream_->Write(*request_); +} + +void GrpcClient::SendBinaryData(const void* data, size_t size) { + const int16_t* pdata = reinterpret_cast(data); + request_->set_audio_data(pdata, size); + stream_->Write(*request_); +} + +void GrpcClient::ReadLoopFunc() { + try { + while (stream_->Read(response_.get())) { + for (int32_t i = 0; i < response_->nbest_size(); i++) { + // you can also traverse wordpieces like demonstrated above + SHERPA_LOG(INFO) << i + 1 << "best " << response_->nbest(i).sentence(); + } + if (response_->status() != Response_Status_ok) { + break; + } + if (response_->type() == Response_Type_speech_end) { + done_ = true; + break; + } + } + } catch (std::exception const& e) { + SHERPA_LOG(ERROR) << e.what(); + } +} + +void GrpcClient::Join() { + stream_->WritesDone(); + t_->join(); + Status status = stream_->Finish(); + if (!status.ok()) { + SHERPA_LOG(INFO) << "Recognize rpc failed."; + } +} +} // namespace sherpa + diff --git a/sherpa/cpp_api/grpc/online-grpc-client-impl.h b/sherpa/cpp_api/grpc/online-grpc-client-impl.h new file mode 100644 index 000000000..aa9d4ccec --- /dev/null +++ b/sherpa/cpp_api/grpc/online-grpc-client-impl.h @@ -0,0 +1,72 @@ +// Copyright (c) 2021 Ximalaya Speech Team (Xiang Lyu) +// 2023 y00281951 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
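The class declared below is typically driven the way online-grpc-client.cc drives it: construct one GrpcClient per utterance, stream small chunks of 16-bit PCM at roughly real-time speed, then call Join() to flush. A condensed sketch follows; the helper name and chunk size are illustrative.

#include <algorithm>
#include <cstdint>
#include <vector>

#include "sherpa/cpp_api/grpc/online-grpc-client-impl.h"

// Send one utterance of 16 kHz mono 16-bit PCM to the gRPC server.
static void SendOneUtterance(const std::vector<int16_t> &pcm) {
  sherpa::GrpcClient client("127.0.0.1", 6006, /*nbest=*/1, /*reqid=*/"demo");
  client.SetKey("demo-utterance");

  const size_t chunk = 320;  // 20 ms at 16 kHz, mirroring online-grpc-client.cc
  for (size_t start = 0; start < pcm.size() && !client.Done(); start += chunk) {
    size_t n = std::min(chunk, pcm.size() - start);
    client.SendBinaryData(pcm.data() + start, n * sizeof(int16_t));
  }
  client.Join();  // WritesDone() and wait for the response-reading thread
}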
+ +#ifndef SHERPA_CPP_API_GRPC_ONLINE_GRPC_CLIENT_IMPL_H_ +#define SHERPA_CPP_API_GRPC_ONLINE_GRPC_CLIENT_IMPL_H_ + +#include +#include +#include +#include // NOLINT + +#include "grpc/grpc.h" +#include "grpcpp/channel.h" +#include "grpcpp/client_context.h" +#include "grpcpp/create_channel.h" + +#include "sherpa/csrc/log.h" +#include "sherpa/cpp_api/grpc/sherpa.grpc.pb.h" + +namespace sherpa { + +using grpc::Channel; +using grpc::ClientContext; +using grpc::ClientReaderWriter; + +class GrpcClient { + public: + GrpcClient(const std::string& host, + int32_t port, + int32_t nbest, + const std::string& reqid); + + void SendBinaryData(const void* data, size_t size); + void SetKey(const std::string& key) { key_ = key; } + void Join(); + bool Done() const { return done_; } + + + private: + void ReadLoopFunc(); + void Connect(); + std::string host_; + int32_t port_; + int32_t nbest_; + std::string reqid_; + std::string key_; + bool done_ = false; + + std::shared_ptr channel_; + std::unique_ptr stub_; + std::unique_ptr context_; + std::unique_ptr> stream_; + std::unique_ptr request_; + std::unique_ptr response_; + std::unique_ptr t_; +}; + +} // namespace sherpa + +#endif // SHERPA_CPP_API_GRPC_ONLINE_GRPC_CLIENT_IMPL_H_ diff --git a/sherpa/cpp_api/grpc/online-grpc-client.cc b/sherpa/cpp_api/grpc/online-grpc-client.cc new file mode 100644 index 000000000..7f387a072 --- /dev/null +++ b/sherpa/cpp_api/grpc/online-grpc-client.cc @@ -0,0 +1,181 @@ +// sherpa/cpp_api/grpc/online-grpc-client.cc +// +// Copyright (c) 2023 y00281951 + +#include // NOLINT +#include +#include +#include +#include +#include + +#include "grpc/grpc.h" +#include "grpcpp/channel.h" +#include "grpcpp/client_context.h" +#include "grpcpp/create_channel.h" + +#include "kaldi_native_io/csrc/kaldi-table.h" +#include "kaldi_native_io/csrc/text-utils.h" +#include "kaldi_native_io/csrc/wave-reader.h" + +#include "sherpa/csrc/fbank-features.h" +#include "sherpa/csrc/log.h" +#include "sherpa/csrc/file-utils.h" +#include "sherpa/cpp_api/parse-options.h" +#include "sherpa/cpp_api/grpc/online-grpc-client-impl.h" + +#define EXPECTED_SAMPLE_RATE 16000 + +static constexpr const char *kUsageMessage = R"( +Automatic speech recognition with sherpa using grpc. 
+ +Usage: + +sherpa-online-grpc-client --help + +sherpa-online-grpc-client \ + --server-ip=127.0.0.1 \ + --server-port=6006 \ + path/to/foo.wav \ + path/to/bar.wav + +or + +sherpa-online-grpc-client \ + --server-ip=127.0.0.1 \ + --server-port=6006 \ + --use-wav-scp=true \ + scp:wav.scp \ + ark,scp,t:results.ark,results.scp +)"; + +static void RecoginzeWav(const std::string& server_ip, int32_t server_port, + const std::string& req_id, const std::string& key, + const kaldiio::Matrix &wav_data, + float interval) { + int32_t nbest = 1; + const int32_t num_samples = wav_data.NumCols(); + const int32_t sample_interval = interval * EXPECTED_SAMPLE_RATE; + + sherpa::GrpcClient client(server_ip, server_port, nbest, req_id); + client.SetKey(key); + + for (int32_t start = 0; start < num_samples; start += sample_interval) { + if (client.Done()) { + break; + } + int32_t end = std::min(start + sample_interval, num_samples); + // Convert to short + std::vector data; + data.reserve(end - start); + for (int32_t j = start; j < end; j++) { + data.push_back(static_cast(wav_data(0, j))); + } + // Send PCM data + client.SendBinaryData(data.data(), data.size() * sizeof(int16_t)); + SHERPA_LOG(INFO) << req_id << "Send " << data.size() << " samples"; + std::this_thread::sleep_for( + std::chrono::milliseconds(static_cast(interval * 1000))); + } + client.Join(); +} + +int32_t main(int32_t argc, char* argv[]) { + std::string server_ip = "127.0.0.1"; + int32_t server_port = 6006; + bool use_wav_scp = false; // true to use wav.scp as input + + sherpa::ParseOptions po(kUsageMessage); + + po.Register("server-ip", &server_ip, "IP address of the grpc server"); + po.Register("server-port", &server_port, "Port of the grpc server"); + po.Register("use-wav-scp", &use_wav_scp, + "If true, user should provide two arguments: " + "scp:wav.scp ark,scp,t:results.ark,results.scp"); + + po.Read(argc, argv); + + if (server_port <= 0 || server_port > 65535) { + SHERPA_LOG(FATAL) << "Invalid server port: " << server_port; + } + + if (po.NumArgs() < 1) { + po.PrintUsage(); + exit(EXIT_FAILURE); + } + + std::random_device rd; + std::mt19937 gen(rd()); + const float interval = 0.02; + + if (use_wav_scp) { + SHERPA_CHECK_EQ(po.NumArgs(), 2) + << "Please use something like:\n" + << "scp:wav.scp ark,scp,t:results.scp,results.ark\n" + << "if you provide --use-wav-scp=true"; + + if (kaldiio::ClassifyRspecifier(po.GetArg(1), nullptr, nullptr) == + kaldiio::kNoRspecifier) { + SHERPA_LOG(FATAL) << "Please provide an rspecifier. Current value is: " + << po.GetArg(1); + } + + if (kaldiio::ClassifyWspecifier(po.GetArg(2), nullptr, nullptr, nullptr) == + kaldiio::kNoWspecifier) { + SHERPA_LOG(FATAL) << "Please provide a wspecifier. Current value is: " + << po.GetArg(2); + } + + kaldiio::TableWriter writer(po.GetArg(2)); + + kaldiio::SequentialTableReader wav_reader( + po.GetArg(1)); + + int32_t num_decoded = 0; + for (; !wav_reader.Done(); wav_reader.Next()) { + const std::string request_id = std::to_string(gen()); + + SHERPA_LOG(INFO) << "\n" << num_decoded++ << ": decoding " + << wav_reader.Key(); + const auto &wave_data = wav_reader.Value(); + if (wave_data.SampFreq() != EXPECTED_SAMPLE_RATE) { + SHERPA_LOG(FATAL) << wav_reader.Key() + << "is expected to have sample rate " + << EXPECTED_SAMPLE_RATE << ". 
Given " + << wave_data.SampFreq(); + } + const auto &d = wave_data.Data(); + if (d.NumRows() > 1) { + SHERPA_LOG(WARNING) + << "Only the first channel from " << wav_reader.Key() << " is used"; + } + RecoginzeWav(server_ip, server_port, request_id, + wav_reader.Key(), d, interval); + } + } else { + for (int32_t i = 1; i <= po.NumArgs(); ++i) { + const std::string request_id = std::to_string(gen()); + bool binary = true; + kaldiio::Input ki(po.GetArg(i), &binary); + kaldiio::WaveHolder wh; + if (!wh.Read(ki.Stream())) { + SHERPA_LOG(FATAL) << "Failed to read " << po.GetArg(i); + } + auto &wave_data = wh.Value(); + if (wave_data.SampFreq() != EXPECTED_SAMPLE_RATE) { + SHERPA_LOG(FATAL) << po.GetArg(i) + << "is expected to have sample rate " + << EXPECTED_SAMPLE_RATE << ". Given " + << wave_data.SampFreq(); + } + const auto &d = wave_data.Data(); + if (d.NumRows() > 1) { + SHERPA_LOG(WARNING) + << "Only the first channel from " << po.GetArg(i) << " is used"; + } + RecoginzeWav(server_ip, server_port, request_id, + po.GetArg(i), d, interval); + } + } + return 0; +} diff --git a/sherpa/cpp_api/grpc/online-grpc-server-impl.cc b/sherpa/cpp_api/grpc/online-grpc-server-impl.cc new file mode 100644 index 000000000..8f4f8e69a --- /dev/null +++ b/sherpa/cpp_api/grpc/online-grpc-server-impl.cc @@ -0,0 +1,298 @@ +// sherpa/cpp_api/grpc/online-grpc-server-impl.cc +// +// Copyright (c) 2022 Xiaomi Corporation +// 2023 y00281951 + +#include "sherpa/cpp_api/grpc/online-grpc-server-impl.h" +#include "sherpa/csrc/log.h" + +#define SHERPA_SLEEP_TIME 100 +#define SHERPA_SLEEP_ROUND_MAX 3000 + +namespace sherpa { +using grpc::ServerContext; +using grpc::ServerReaderWriter; + +void OnlineGrpcDecoderConfig::Register(ParseOptions *po) { + recognizer_config.Register(po); + + po->Register("loop-interval-ms", &loop_interval_ms, + "It determines how often the decoder loop runs. 
"); + + po->Register("max-batch-size", &max_batch_size, + "Max batch size for recognition."); + + po->Register("padding-seconds", &padding_seconds, + "Num of seconds for tail padding."); +} + +void OnlineGrpcDecoderConfig::Validate() const { + recognizer_config.Validate(); + SHERPA_CHECK_GT(loop_interval_ms, 0); + SHERPA_CHECK_GT(max_batch_size, 0); + SHERPA_CHECK_GT(padding_seconds, 0); +} + +void OnlineGrpcServerConfig::Register(ParseOptions *po) { + decoder_config.Register(po); +} + +void OnlineGrpcServerConfig::Validate() const { + decoder_config.Validate(); +} + +OnlineGrpcDecoder::OnlineGrpcDecoder(OnlineGrpcServer *server) + : server_(server), + config_(server->GetConfig().decoder_config), + timer_(server->GetWorkContext()) { + recognizer_ = std::make_unique(config_.recognizer_config); +} + +void OnlineGrpcDecoder::SerializeResult(std::shared_ptr c) { + std::lock_guard lock(c->mutex); + auto result = recognizer_->GetResult(c->s.get()); + c->response->clear_nbest(); + Response_OneBest* one_best = c->response->add_nbest(); + one_best->set_sentence(result.text); +} + +void OnlineGrpcDecoder::OnPartialResult(std::shared_ptr c) { + std::lock_guard lock(c->mutex); + if (!c->finish_flag) { + c->response->set_status(Response::ok); + c->response->set_type(Response::partial_result); + c->stream->Write(*c->response); + } +} + +void OnlineGrpcDecoder::OnFinalResult(std::shared_ptr c) { + std::lock_guard lock(c->mutex); + if (!c->finish_flag) { + c->response->set_status(Response::ok); + c->response->set_type(Response::final_result); + c->stream->Write(*c->response); + } +} + +void OnlineGrpcDecoder::OnSpeechEnd(std::shared_ptr c) { + std::lock_guard lock(c->mutex); + if (!c->finish_flag) { + c->response->set_status(Response::ok); + c->response->set_type(Response::speech_end); + c->stream->Write(*c->response); + } + c->finish_flag = true; +} + +void OnlineGrpcDecoder::AcceptWaveform(std::shared_ptr c) { + std::lock_guard lock(c->mutex); + float sample_rate = + config_.recognizer_config.feat_config.fbank_opts.frame_opts.samp_freq; + while (!c->samples.empty()) { + c->s->AcceptWaveform(sample_rate, c->samples.front()); + c->samples.pop_front(); + } +} + +void OnlineGrpcDecoder::InputFinished(std::shared_ptr c) { + std::lock_guard lock(c->mutex); + + float sample_rate = + config_.recognizer_config.feat_config.fbank_opts.frame_opts.samp_freq; + + while (!c->samples.empty()) { + c->s->AcceptWaveform(sample_rate, c->samples.front()); + c->samples.pop_front(); + } + + // TODO(fangjun): Change the amount of paddings to be configurable + torch::Tensor tail_padding = + torch::zeros({static_cast + (config_.padding_seconds * sample_rate)}).to(torch::kFloat); + + c->s->AcceptWaveform(sample_rate, tail_padding); + + c->s->InputFinished(); +} + +void OnlineGrpcDecoder::Run() { + timer_.expires_after(std::chrono::milliseconds(config_.loop_interval_ms)); + + timer_.async_wait( + [this](const asio::error_code &ec) { ProcessConnections(ec); }); +} + +void OnlineGrpcDecoder::ProcessConnections(const asio::error_code &ec) { + if (ec) { + SHERPA_LOG(FATAL) << "The decoder loop is aborted!"; + } + + std::lock_guard lock(mutex_); + std::vector to_remove; + for (auto &p : connections_) { + auto reqid = p.first; + auto c = p.second; + + // The order of `if` below matters! 
+ if (!server_->Contains(reqid)) { + // If the connection is disconnected, we stop processing it + to_remove.push_back(reqid); + continue; + } + + if (active_.count(reqid)) { + // Another thread is decoding this stream, so skip it + continue; + } + + if (!recognizer_->IsReady(c->s.get())) { + // this stream has not enough frames to decode, so skip it + continue; + } + + // TODO(fangun): If the connection is timed out, we need to also + // add it to `to_remove` + + // this stream has enough frames and is currently not processed by any + // threads, so put it into the ready queue + ready_connections_.push_back(c); + + // In `Decode()`, it will remove hdl from `active_` + active_.insert(reqid); + } + + for (auto reqid_rm : to_remove) { + connections_.erase(reqid_rm); + } + + if (!ready_connections_.empty()) { + asio::post(server_->GetWorkContext(), [this]() { Decode(); }); + } + + // Schedule another call + timer_.expires_after(std::chrono::milliseconds(config_.loop_interval_ms)); + + timer_.async_wait( + [this](const asio::error_code &ec) { ProcessConnections(ec); }); +} + +void OnlineGrpcDecoder::Decode() { + std::unique_lock lock(mutex_); + if (ready_connections_.empty()) { + // There are no connections that are ready for decoding, + // so we return directly + return; + } + + std::vector> c_vec; + std::vector s_vec; + while (!ready_connections_.empty() && + static_cast(s_vec.size()) < config_.max_batch_size) { + auto c = ready_connections_.front(); + ready_connections_.pop_front(); + + c_vec.push_back(c); + s_vec.push_back(c->s.get()); + } + + if (!ready_connections_.empty()) { + // there are too many ready connections but this thread can only handle + // max_batch_size connections at a time, so we schedule another call + // to Decode() and let other threads to process the ready connections + asio::post(server_->GetWorkContext(), [this]() { Decode(); }); + } + + lock.unlock(); + recognizer_->DecodeStreams(s_vec.data(), s_vec.size()); + lock.lock(); + + for (auto c : c_vec) { + auto result = recognizer_->GetResult(c->s.get()); + SerializeResult(c); + if (!result.is_final) { + OnPartialResult(c); + } else { + OnFinalResult(c); + connections_.erase(c->reqid); + OnSpeechEnd(c); + } + SHERPA_LOG(INFO) << "Decode result:" << result.AsJsonString(); + active_.erase(c->reqid); + } +} + +OnlineGrpcServer::OnlineGrpcServer( + asio::io_context &io_work, + const OnlineGrpcServerConfig &config) + : config_(config), + io_work_(io_work), + decoder_(this) {} + +void OnlineGrpcServer::Run() { + decoder_.Run(); +} + +bool OnlineGrpcServer::Contains(const std::string& reqid) const { + std::lock_guard lock(mutex_); + return connections_.count(reqid); +} + +Status OnlineGrpcServer::Recognize(ServerContext* context, + ServerReaderWriter* stream) { + SHERPA_LOG(INFO) << "Get Recognize request"; + std::shared_ptr s = decoder_.recognizer_->CreateStream(); + auto c = std::make_shared ( + std::make_shared>(*stream), + std::make_shared(), + std::make_shared(), + s); + int32_t sleep_cnt = 0; + + float sample_rate = decoder_.config_.recognizer_config. 
+ feat_config.fbank_opts.frame_opts.samp_freq; + + while (stream->Read(c->request.get())) { + if (!c->start_flag) { + c->start_flag = true; + c->reqid = c->request->decode_config().reqid(); + + mutex_.lock(); + connections_.insert(c->reqid); + mutex_.unlock(); + + decoder_.mutex_.lock(); + decoder_.connections_.insert({c->reqid, c}); + decoder_.mutex_.unlock(); + } else { + const int16_t* pcm_data = + reinterpret_cast(c->request->audio_data().c_str()); + int32_t num_samples = + c->request->audio_data().length() / sizeof(int16_t); + SHERPA_LOG(INFO) << c->reqid << "Received " + << num_samples << " samples"; + torch::Tensor samples = torch::from_blob(const_cast(pcm_data), + {num_samples}, + torch::kShort).to(torch::kFloat) / 32768; + c->samples.push_back(samples); + decoder_.AcceptWaveform(c); + } + } + decoder_.InputFinished(c); + + while (!c->finish_flag) { + std::this_thread::sleep_for( + std::chrono::milliseconds(static_cast(SHERPA_SLEEP_TIME))); + if (sleep_cnt++ > SHERPA_SLEEP_ROUND_MAX) { + c->finish_flag = true; + break; + } + } + + mutex_.lock(); + connections_.erase(c->reqid); + mutex_.unlock(); + + SHERPA_LOG(INFO) << "reqid:" << c->reqid << " Connection close"; + return Status::OK; +} +} // namespace sherpa diff --git a/sherpa/cpp_api/grpc/online-grpc-server-impl.h b/sherpa/cpp_api/grpc/online-grpc-server-impl.h new file mode 100644 index 000000000..b0bbfe21b --- /dev/null +++ b/sherpa/cpp_api/grpc/online-grpc-server-impl.h @@ -0,0 +1,154 @@ +// sherpa/cpp_api/websocket/online-grpc-server-impl.h +// +// Copyright (c) 2022 Xiaomi Corporation +// 2023 y00281951 + +#ifndef SHERPA_CPP_API_GRPC_ONLINE_GRPC_SERVER_IMPL_H_ +#define SHERPA_CPP_API_GRPC_ONLINE_GRPC_SERVER_IMPL_H_ + +#include +#include +#include +#include // NOLINT +#include +#include +#include + +#include "asio.hpp" +#include "sherpa/cpp_api/online-recognizer.h" +#include "sherpa/cpp_api/online-stream.h" +#include "sherpa/cpp_api/parse-options.h" +#include "sherpa/cpp_api/grpc/sherpa.grpc.pb.h" + +namespace sherpa { +using grpc::ServerContext; +using grpc::ServerReaderWriter; +using grpc::Status; + +struct Connection { + // handle to the connection. We can use it to send messages to the client + std::string reqid; + std::shared_ptr> stream; + std::shared_ptr request; + std::shared_ptr response; + std::shared_ptr s; + + // The last time we received a message from the client + // TODO(fangjun): Use it to disconnect from a client if it is inactive + // for a specified time. + std::chrono::steady_clock::time_point last_active; + + std::mutex mutex; // protect sampels + + // Audio samples received from the client. + // + // The I/O threads receive audio samples into this queue + // and invoke work threads to compute features + std::deque samples; + + bool start_flag = false; // first time read request flag + bool finish_flag = false; // connection finish flag + + Connection() = default; + Connection(std::shared_ptr> stream, + std::shared_ptr request, + std::shared_ptr response, + std::shared_ptr s) + : stream(stream), + request(request), + response(response), + s(s), + last_active(std::chrono::steady_clock::now()) {} +}; + +struct OnlineGrpcDecoderConfig { + OnlineRecognizerConfig recognizer_config; + + // It determines how often the decoder loop runs. + int32_t loop_interval_ms = 10; + + int32_t max_batch_size = 5; + + float padding_seconds = 0.8; + + void Register(ParseOptions *po); + void Validate() const; +}; + +class OnlineGrpcServer; + +class OnlineGrpcDecoder { + public: + /** + * @param server Not owned. 
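+   *
+   * The decoder polls all registered connections every `loop_interval_ms`
+   * milliseconds, decodes up to `max_batch_size` ready streams per call to
+   * Decode(), and appends `padding_seconds` of zero samples as tail padding
+   * once a client has finished sending audio (see InputFinished()).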
+ */ + explicit OnlineGrpcDecoder(OnlineGrpcServer *server); + + // Compute features for a stream given audio samples + void AcceptWaveform(std::shared_ptr c); + + // signal that there will be no more audio samples for a stream + void InputFinished(std::shared_ptr c); + + void Run(); + + OnlineGrpcDecoderConfig config_; + std::map> connections_; + std::unique_ptr recognizer_; + // It protects `connections_`, `ready_connections_`, and `active_` + std::mutex mutex_; + + private: + void ProcessConnections(const asio::error_code &ec); + void SerializeResult(std::shared_ptr c); + void OnPartialResult(std::shared_ptr c); + void OnFinalResult(std::shared_ptr c); + void OnSpeechEnd(std::shared_ptr c); + /** It is called by one of the worker thread. + */ + void Decode(); + + private: + OnlineGrpcServer *server_; // not owned + asio::steady_timer timer_; + + // Whenever a connection has enough feature frames for decoding, we put + // it in this queue + std::deque> ready_connections_; + + // If we are decoding a stream, we put it in the active_ set so that + // only one thread can decode a stream at a time. + std::set active_; +}; + +struct OnlineGrpcServerConfig { + OnlineGrpcDecoderConfig decoder_config; + + void Register(sherpa::ParseOptions *po); + void Validate() const; +}; + +class OnlineGrpcServer final : public ASR::Service { + public: + OnlineGrpcServer(asio::io_context &io_work, // NOLINT + const OnlineGrpcServerConfig &config); + Status Recognize(ServerContext* context, + ServerReaderWriter* reader) override; + void Run(); + + const OnlineGrpcServerConfig &GetConfig() const { return config_; } + bool Contains(const std::string& reqid) const; + asio::io_context &GetWorkContext() { return io_work_; } + std::set connections_; + + private: + OnlineGrpcServerConfig config_; + asio::io_context &io_work_; + OnlineGrpcDecoder decoder_; + + mutable std::mutex mutex_; +}; + +} // namespace sherpa + +#endif // SHERPA_CPP_API_GRPC_ONLINE_GRPC_SERVER_IMPL_H_ diff --git a/sherpa/cpp_api/grpc/online-grpc-server.cc b/sherpa/cpp_api/grpc/online-grpc-server.cc new file mode 100644 index 000000000..711f5e6ad --- /dev/null +++ b/sherpa/cpp_api/grpc/online-grpc-server.cc @@ -0,0 +1,108 @@ +// sherpa/cpp_api/grpc/online-grpc-server.cc +// Copyright (c) 2022 Xiaomi Corporation +// 2023 y00281951 + +#include "asio.hpp" +#include "grpcpp/ext/proto_server_reflection_plugin.h" +#include "grpcpp/grpcpp.h" +#include "grpcpp/health_check_service_interface.h" +#include "sherpa/cpp_api/grpc/online-grpc-server-impl.h" +#include "sherpa/csrc/log.h" +#include "torch/all.h" + +using grpc::Server; +using grpc::ServerBuilder; + +static constexpr const char *kUsageMessage = R"( +Automatic speech recognition with sherpa using grpc. 
+ +Usage: + +sherpa-online-grpc-server --help + +sherpa-online-grpc-server \ + --use-gpu=false \ + --port=6006 \ + --num-work-threads=5 \ + --nn-model=/path/to/cpu.jit \ + --tokens=/path/to/tokens.txt \ + --decoding-method=greedy_search \ + --log-file=./log.txt +)"; + +int32_t main(int32_t argc, char *argv[]) { + torch::set_num_threads(1); + torch::set_num_interop_threads(1); + sherpa::InferenceMode no_grad; + + torch::jit::getExecutorMode() = false; + torch::jit::getProfilingMode() = false; + torch::jit::setGraphExecutorOptimize(false); + + sherpa::ParseOptions po(kUsageMessage); + + sherpa::OnlineGrpcServerConfig config; + + // the server will listen on this port, for both grpc and http + int32_t port = 6006; + + // size of the thread pool for neural network computation and decoding + int32_t num_work_threads = 5; + + int32_t num_workers = 1; + + po.Register("num-work-threads", &num_work_threads, + "Number of threads to use for neural network " + "computation and decoding."); + + po.Register("port", &port, "The port on which the server will listen."); + + config.Register(&po); + + if (argc == 1) { + po.PrintUsage(); + exit(EXIT_FAILURE); + } + + po.Read(argc, argv); + + if (po.NumArgs() != 0) { + SHERPA_LOG(ERROR) << "Unrecognized positional arguments!"; + po.PrintUsage(); + exit(EXIT_FAILURE); + } + + config.Validate(); + + asio::io_context io_work; // for neural network and decoding + + sherpa::OnlineGrpcServer service(io_work, config); + service.Run(); + + SHERPA_LOG(INFO) << "Number of work threads: " << num_work_threads << "\n"; + // give some work to do for the io_work pool + auto work_guard = asio::make_work_guard(io_work); + + std::vector work_threads; + for (int32_t i = 0; i < num_work_threads; ++i) { + work_threads.emplace_back([&io_work]() { io_work.run(); }); + } + + grpc::EnableDefaultHealthCheckService(true); + grpc::reflection::InitProtoReflectionServerBuilderPlugin(); + ServerBuilder builder; + std::string address("0.0.0.0:" + std::to_string(port)); + builder.AddListeningPort(address, grpc::InsecureServerCredentials()); + builder.RegisterService(&service); + builder.SetSyncServerOption(ServerBuilder::SyncServerOption::NUM_CQS, + num_workers); + std::unique_ptr server(builder.BuildAndStart()); + SHERPA_LOG(INFO) << "Listening on: " << port << "\n"; + + for (auto &t : work_threads) { + t.join(); + } + + server->Wait(); + return 0; +} diff --git a/sherpa/cpp_api/grpc/sherpa.proto b/sherpa/cpp_api/grpc/sherpa.proto new file mode 100644 index 000000000..2f2fd69dd --- /dev/null +++ b/sherpa/cpp_api/grpc/sherpa.proto @@ -0,0 +1,65 @@ +// Copyright (c) 2021 Ximalaya Speech Team (Xiang Lyu) +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
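Taken together, the Recognize() handler above and the sherpa.proto definition below imply a simple wire protocol: the client first sends a Request whose decode_config carries a unique reqid, then streams raw int16 PCM chunks in audio_data, and finally reads Response messages (partial_result, final_result, speech_end) until the server closes the stream. The snippet below is a minimal, untested C++ client sketch of that flow; the localhost:6006 endpoint, the 16 kHz sample rate, and the one-second buffer of silence are illustrative assumptions, not part of this patch.

// Hypothetical client sketch; not part of this pull request.
#include <cstdint>
#include <iostream>
#include <memory>
#include <vector>

#include "grpcpp/grpcpp.h"
#include "sherpa/cpp_api/grpc/sherpa.grpc.pb.h"

int main() {
  auto channel = grpc::CreateChannel("localhost:6006",
                                     grpc::InsecureChannelCredentials());
  std::unique_ptr<sherpa::ASR::Stub> stub = sherpa::ASR::NewStub(channel);

  grpc::ClientContext context;
  auto stream = stub->Recognize(&context);

  // 1. The first request carries only the decode config with a request id.
  sherpa::Request first;
  first.mutable_decode_config()->set_reqid("demo-0001");
  stream->Write(first);

  // 2. Subsequent requests carry raw int16 PCM samples
  //    (here: one second of silence at an assumed 16 kHz sample rate).
  std::vector<int16_t> pcm(16000, 0);
  sherpa::Request audio;
  audio.set_audio_data(pcm.data(), pcm.size() * sizeof(int16_t));
  stream->Write(audio);
  stream->WritesDone();

  // 3. Read partial/final results until the server ends the stream.
  sherpa::Response response;
  while (stream->Read(&response)) {
    if (response.nbest_size() > 0) {
      std::cout << response.nbest(0).sentence() << "\n";
    }
  }
  return stream->Finish().ok() ? 0 : 1;
}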
+syntax = "proto3"; + + +package sherpa; + +service ASR { + rpc Recognize (stream Request) returns (stream Response) {} +} + +message Request { + + message DecodeConfig { + int32 nbest_config = 1; + string reqid = 2; + } + + oneof RequestPayload { + DecodeConfig decode_config = 1; + bytes audio_data = 2; + } +} + +message Response { + + message OneBest { + string sentence = 1; + repeated OnePiece wordpieces = 2; + } + + message OnePiece { + string word = 1; + int32 start = 2; + int32 end = 3; + } + + enum Status { + ok = 0; + failed = 1; + } + + enum Type { + server_ready = 0; + partial_result = 1; + final_result = 2; + speech_end = 3; + } + + Status status = 1; + Type type = 2; + repeated OneBest nbest = 3; +} + diff --git a/sherpa/python/csrc/rnnt_beam_search.h b/sherpa/cpp_api/macros.h similarity index 64% rename from sherpa/python/csrc/rnnt_beam_search.h rename to sherpa/cpp_api/macros.h index fc24f8da3..1e6c65e5c 100644 --- a/sherpa/python/csrc/rnnt_beam_search.h +++ b/sherpa/cpp_api/macros.h @@ -1,5 +1,5 @@ /** - * Copyright (c) 2022 Xiaomi Corporation (authors: Fangjun Kuang) + * Copyright (c) 2024 Xiaomi Corporation (authors: Fangjun Kuang) * * See LICENSE for clarification regarding multiple authors * @@ -15,15 +15,19 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -#ifndef SHERPA_PYTHON_CSRC_RNNT_BEAM_SEARCH_H_ -#define SHERPA_PYTHON_CSRC_RNNT_BEAM_SEARCH_H_ -#include "sherpa/python/csrc/sherpa.h" +#ifndef SHERPA_CPP_API_MACROS_H_ +#define SHERPA_CPP_API_MACROS_H_ namespace sherpa { -void PybindRnntBeamSearch(py::module &m); // NOLINT +#if SHERPA_TORCH_VERSION_MAJOR > 1 || \ + (SHERPA_TORCH_VERSION_MAJOR == 1 && SHERPA_TORCH_VERSION_MINOR >= 9) +using InferenceMode = torch::InferenceMode; +#else +using InferenceMode = torch::NoGradGuard; +#endif } // namespace sherpa -#endif // SHERPA_PYTHON_CSRC_RNNT_BEAM_SEARCH_H_ +#endif // SHERPA_CPP_API_MACROS_H_ diff --git a/sherpa/cpp_api/offline-recognizer-ctc-impl.h b/sherpa/cpp_api/offline-recognizer-ctc-impl.h new file mode 100644 index 000000000..8c9351dee --- /dev/null +++ b/sherpa/cpp_api/offline-recognizer-ctc-impl.h @@ -0,0 +1,210 @@ +// sherpa/cpp_api/offline-recognizer-ctc-impl.h +// +// Copyright (c) 2022 Xiaomi Corporation + +#ifndef SHERPA_CPP_API_OFFLINE_RECOGNIZER_CTC_IMPL_H_ +#define SHERPA_CPP_API_OFFLINE_RECOGNIZER_CTC_IMPL_H_ + +#include +#include +#include +#include + +#include "sherpa/cpp_api/feature-config.h" +#include "sherpa/cpp_api/offline-recognizer-impl.h" +#include "sherpa/csrc/log.h" +#include "sherpa/csrc/offline-conformer-ctc-model.h" +#include "sherpa/csrc/offline-ctc-decoder.h" +#include "sherpa/csrc/offline-ctc-model.h" +#include "sherpa/csrc/offline-ctc-one-best-decoder.h" +#include "sherpa/csrc/offline-nemo-enc-dec-ctc-model-bpe.h" +#include "sherpa/csrc/offline-wav2vec2-ctc-model.h" +#include "sherpa/csrc/offline-wenet-conformer-ctc-model.h" +#include "sherpa/csrc/symbol-table.h" + +namespace sherpa { + +static OfflineRecognitionResult Convert(const OfflineCtcDecoderResult &src, + const SymbolTable &sym_table, + int32_t frame_shift_ms, + int32_t subsampling_factor) { + OfflineRecognitionResult r; + r.tokens.reserve(src.tokens.size()); + r.timestamps.reserve(src.timestamps.size()); + + std::string text; + for (auto i : src.tokens) { + auto sym = sym_table[i]; + text.append(sym); + + r.tokens.push_back(std::move(sym)); + } + r.text = std::move(text); + + float frame_shift_s = frame_shift_ms / 1000. 
* subsampling_factor; + for (auto t : src.timestamps) { + float time = frame_shift_s * t; + r.timestamps.push_back(time); + } + + return r; +} + +class OfflineRecognizerCtcImpl : public OfflineRecognizerImpl { + public: + explicit OfflineRecognizerCtcImpl(const OfflineRecognizerConfig &config) + : config_(config), + symbol_table_(config.tokens), + fbank_(config.feat_config.fbank_opts), + device_(torch::kCPU) { + config.ctc_decoder_config.Validate(); + + if (config.use_gpu) { + device_ = torch::Device("cuda:0"); + } + + torch::jit::Module m = torch::jit::load(config.nn_model, torch::kCPU); + // We currently support: icefall, wenet, torchaudio. + std::string class_name = m.type()->name()->name(); + if (class_name == "ASRModel") { + // this one is from wenet, see + // https://github.com/wenet-e2e/wenet/blob/main/wenet/transformer/asr_model.py#L42 + model_ = std::make_unique(config.nn_model, + device_); + } else if (class_name == "Conformer") { + // this one is from icefall, see + // https://github.com/k2-fsa/icefall/blob/master/egs/librispeech/ASR/conformer_ctc/conformer.py#L27 + model_ = + std::make_unique(config.nn_model, device_); + } else if (class_name == "Wav2Vec2Model") { + // This one is from torchaudio + // https://github.com/pytorch/audio/blob/main/torchaudio/models/wav2vec2/model.py#L11 + model_ = + std::make_unique(config.nn_model, device_); + config_.feat_config.return_waveform = true; + symbol_table_.Replace(symbol_table_["|"], " ", "|"); + // See Section 4.2 of + // https://arxiv.org/pdf/2006.11477.pdf + config_.feat_config.fbank_opts.frame_opts.frame_shift_ms = 20; + SHERPA_LOG(WARNING) << "Set frame_shift_ms to 20 for wav2vec 2.0"; + } else if (class_name == "EncDecCTCModelBPE") { + // This one is from NeMo + // See + // https://github.com/NVIDIA/NeMo/blob/main/nemo/collections/asr/models/ctc_bpe_models.py#L34 + // + model_ = std::make_unique(config.nn_model, + device_); + } else if (class_name == "EncDecCTCModel") { + // This one is from NeMo + // See + // https://github.com/NVIDIA/NeMo/blob/main/nemo/collections/asr/models/ctc_models.py#L41 + // + model_ = + std::make_unique(config.nn_model, device_); + } else { + std::ostringstream os; + os << "Support only models from icefall, wenet, torchaudio, and NeMo\n" + "https://github.com/k2-fsa/icefall/blob/master/egs/librispeech/" + "ASR/" + "conformer_ctc/conformer.py#L27" + "\n" + "https://github.com/wenet-e2e/wenet/blob/main/wenet/transformer/" + "asr_model.py#L42" + "\n" + "https://github.com/pytorch/audio/blob/main/torchaudio/models/" + "wav2vec2/model.py#L11" + "\n" + "https://github.com/NVIDIA/NeMo/blob/main/nemo/collections/asr/" + "models/ctc_bpe_models.py#L34" + "\n" + "https://github.com/NVIDIA/NeMo/blob/main/nemo/collections/asr/" + "models/ctc_models.py#L41" + << "\n" + << "Given: " << class_name << "\n"; + + TORCH_CHECK(false, os.str()); + } + + WarmUp(); + + decoder_ = std::make_unique( + config.ctc_decoder_config, device_, model_->VocabSize()); + } + + std::unique_ptr CreateStream() override { + return std::make_unique(&fbank_, config_.feat_config); + } + + void DecodeStreams(OfflineStream **ss, int32_t n) override { + InferenceMode no_grad; + + std::vector features_vec(n); + std::vector features_length_vec(n); + for (int32_t i = 0; i != n; ++i) { + const auto &f = ss[i]->GetFeatures(); + features_vec[i] = f; + features_length_vec[i] = f.size(0); + } + + // If return_waveform is false, features_vec contains 2-D tensors of shape + // (num_frames, feature_dim). In this case, we should use the padding + // value -23. 
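+    // (-23.025850929940457f equals log(1e-10), so padded frames look like
+    // (near-)silent frames in the log-mel domain.)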
+ // + // If return_waveform is true, features_vec contains 1-D tensors of shape + // (num_samples,). In this case, we use 0 as the padding value. + auto features = torch::nn::utils::rnn::pad_sequence( + features_vec, /*batch_first*/ true, + /*padding_value*/ return_waveform_ ? 0 : -23.025850929940457f); + + auto features_length = torch::tensor(features_length_vec); + + torch::IValue ivalue = model_->Forward(features, features_length); + torch::Tensor log_prob = model_->GetLogSoftmaxOut(ivalue); + torch::Tensor log_prob_len = model_->GetLogSoftmaxOutLength(ivalue); + if (!log_prob_len.defined()) { + log_prob_len = + torch::floor_divide(features_length, model_->SubsamplingFactor()); + log_prob_len = log_prob_len.to(log_prob.device()); + } + + auto results = + decoder_->Decode(log_prob, log_prob_len, model_->SubsamplingFactor()); + for (int32_t i = 0; i != n; ++i) { + ss[i]->SetResult( + Convert(results[i], symbol_table_, + config_.feat_config.fbank_opts.frame_opts.frame_shift_ms, + model_->SubsamplingFactor())); + } + } + + private: + void WarmUp() { + SHERPA_LOG(INFO) << "WarmUp begins"; + auto s = CreateStream(); + float sample_rate = fbank_.GetFrameOptions().samp_freq; + std::vector samples(2 * sample_rate, 0); + s->AcceptSamples(samples.data(), samples.size()); + auto features = s->GetFeatures(); + auto features_length = torch::tensor({features.size(0)}); + features = features.unsqueeze(0); + + features = features.to(device_); + features_length = features_length.to(device_); + + model_->WarmUp(features, features_length); + SHERPA_LOG(INFO) << "WarmUp ended"; + } + + private: + OfflineRecognizerConfig config_; + SymbolTable symbol_table_; + std::unique_ptr model_; + std::unique_ptr decoder_; + kaldifeat::Fbank fbank_; + torch::Device device_; + bool return_waveform_ = false; +}; + +} // namespace sherpa + +#endif // SHERPA_CPP_API_OFFLINE_RECOGNIZER_CTC_IMPL_H_ diff --git a/sherpa/cpp_api/offline-recognizer-impl.h b/sherpa/cpp_api/offline-recognizer-impl.h new file mode 100644 index 000000000..f828c679c --- /dev/null +++ b/sherpa/cpp_api/offline-recognizer-impl.h @@ -0,0 +1,33 @@ +// sherpa/cpp_api/offline-recognizer.h +// +// Copyright (c) 2022 Xiaomi Corporation + +#ifndef SHERPA_CPP_API_OFFLINE_RECOGNIZER_IMPL_H_ +#define SHERPA_CPP_API_OFFLINE_RECOGNIZER_IMPL_H_ + +#include +#include + +#include "sherpa/cpp_api/offline-recognizer.h" +#include "sherpa/csrc/log.h" + +namespace sherpa { + +class OfflineRecognizerImpl { + public: + virtual ~OfflineRecognizerImpl() = default; + + virtual std::unique_ptr CreateStream() = 0; + + virtual std::unique_ptr CreateStream( + const std::vector> &context_list) { + SHERPA_LOG(FATAL) << "Only transducer models support contextual biasing."; + return nullptr; // just to make compiler happy + } + + virtual void DecodeStreams(OfflineStream **ss, int32_t n) = 0; +}; + +} // namespace sherpa + +#endif // SHERPA_CPP_API_OFFLINE_RECOGNIZER_IMPL_H_ diff --git a/sherpa/cpp_api/offline-recognizer-sense-voice-impl.h b/sherpa/cpp_api/offline-recognizer-sense-voice-impl.h new file mode 100644 index 000000000..994dee4e0 --- /dev/null +++ b/sherpa/cpp_api/offline-recognizer-sense-voice-impl.h @@ -0,0 +1,203 @@ +// sherpa/cpp_api/offline-recognizer-sense-voice-impl.h +// +// Copyright (c) 2025 Xiaomi Corporation + +#ifndef SHERPA_CPP_API_OFFLINE_RECOGNIZER_SENSE_VOICE_IMPL_H_ +#define SHERPA_CPP_API_OFFLINE_RECOGNIZER_SENSE_VOICE_IMPL_H_ +#include +#include +#include +#include + +#include "sherpa/csrc/macros.h" +#include "sherpa/csrc/offline-ctc-decoder.h" +#include 
"sherpa/csrc/offline-ctc-greedy-search-decoder.h" +#include "sherpa/csrc/offline-sense-voice-model.h" +#include "sherpa/csrc/symbol-table.h" + +namespace sherpa { + +static OfflineRecognitionResult ConvertSenseVoice( + const OfflineCtcDecoderResult &src, const SymbolTable &sym_table, + int32_t frame_shift_ms, int32_t subsampling_factor) { + OfflineRecognitionResult r; + r.tokens.reserve(src.tokens.size()); + r.timestamps.reserve(src.timestamps.size()); + + std::string text; + int32_t k = 0; + for (auto i : src.tokens) { + k += 1; + if (k <= 4) { + // skip <|en|><|NEUTRAL|><|Speech|><|woitn|> + continue; + } + auto sym = sym_table[i]; + text.append(sym); + + r.tokens.push_back(std::move(sym)); + } + r.text = std::move(text); + + float frame_shift_s = frame_shift_ms / 1000. * subsampling_factor; + for (auto t : src.timestamps) { + float time = frame_shift_s * t; + r.timestamps.push_back(time); + } + + return r; +} + +class OfflineRecognizerSenseVoiceImpl : public OfflineRecognizerImpl { + public: + explicit OfflineRecognizerSenseVoiceImpl( + const OfflineRecognizerConfig &config) + : config_(config), symbol_table_(config.model.tokens) { + config.ctc_decoder_config.Validate(); + + model_ = std::make_unique(config.model); + + config_.feat_config.fbank_opts.mel_opts.num_bins = 80; + config_.feat_config.normalize_samples = + model_->GetModelMetadata().normalize_samples; + fbank_ = std::make_unique(config_.feat_config.fbank_opts); + + decoder_ = std::make_unique(); + + WarmUp(); + } + + std::unique_ptr CreateStream() override { + return std::make_unique(fbank_.get(), config_.feat_config); + } + + void DecodeStreams(OfflineStream **ss, int32_t n) override { + InferenceMode no_grad; + + std::vector features_vec(n); + std::vector features_length_vec(n); + for (int32_t i = 0; i != n; ++i) { + auto f = ss[i]->GetFeatures(); + f = ApplyLFR(f); + f = ApplyCMVN(f); + features_vec[i] = f; + features_length_vec[i] = f.size(0); + } + + auto device = model_->Device(); + + // If return_waveform is true, features_vec contains 1-D tensors of shape + // (num_samples,). In this case, we use 0 as the padding value. + auto features = + torch::nn::utils::rnn::pad_sequence(features_vec, /*batch_first*/ true, + /*padding_value*/ 0) + .to(device); + + auto features_length = torch::tensor(features_length_vec).to(device); + + /* + {"auto": 0, "zh": 3, "en": 4, "yue": 7, "ja": 11, "ko": 12, "nospeech": 13} + self.textnorm_dict = {"withitn": 14, "woitn": 15} + */ + + const auto &meta_data = model_->GetModelMetadata(); + int32_t language_id = meta_data.lang2id.at("auto"); + if (meta_data.lang2id.count(config_.model.sense_voice.language)) { + language_id = meta_data.lang2id.at(config_.model.sense_voice.language); + } + std::vector language(n, language_id); + + std::vector use_itn(n, config_.model.sense_voice.use_itn + ? 
meta_data.with_itn_id + : meta_data.without_itn_id); + + auto language_tensor = torch::tensor(language, torch::kInt).to(device); + auto use_itn_tensor = torch::tensor(use_itn, torch::kInt).to(device); + + auto outputs = model_->RunForward(features, features_length, + language_tensor, use_itn_tensor); + + auto logits = outputs.first; + auto logits_length = outputs.second; + + auto results = decoder_->Decode(logits, logits_length); + + for (int32_t i = 0; i != n; ++i) { + ss[i]->SetResult(ConvertSenseVoice( + results[i], symbol_table_, + config_.feat_config.fbank_opts.frame_opts.frame_shift_ms, + meta_data.window_shift)); + } + } + + private: + void WarmUp() { + SHERPA_LOG(INFO) << "WarmUp begins"; + auto s = CreateStream(); + float sample_rate = fbank_->GetFrameOptions().samp_freq; + std::vector samples(2 * sample_rate, 0); + s->AcceptSamples(samples.data(), samples.size()); + auto features = s->GetFeatures(); + features = ApplyLFR(features); + features = ApplyCMVN(features); + auto features_length = torch::tensor({features.size(0)}); + features = features.unsqueeze(0); + + auto device = model_->Device(); + + features = features.to(device); + features_length = features_length.to(device); + + const auto &meta_data = model_->GetModelMetadata(); + int32_t language_id = meta_data.lang2id.at("auto"); + + std::vector language(1, language_id); + + std::vector use_itn(1, config_.model.sense_voice.use_itn + ? meta_data.with_itn_id + : meta_data.without_itn_id); + + auto language_tensor = torch::tensor(language, torch::kInt).to(device); + auto use_itn_tensor = torch::tensor(use_itn, torch::kInt).to(device); + + auto outputs = model_->RunForward(features, features_length, + language_tensor, use_itn_tensor); + + SHERPA_LOG(INFO) << "WarmUp ended"; + } + + torch::Tensor ApplyLFR(torch::Tensor features) const { + const auto &meta_data = model_->GetModelMetadata(); + + int32_t lfr_window_size = meta_data.window_size; + int32_t lfr_window_shift = meta_data.window_shift; + + int32_t num_frames = features.size(0); + int32_t feat_dim = features.size(1); + + int32_t new_num_frames = + (num_frames - lfr_window_size) / lfr_window_shift + 1; + + int32_t new_feat_dim = feat_dim * lfr_window_size; + + return features + .as_strided({new_num_frames, new_feat_dim}, + {lfr_window_shift * feat_dim, 1}) + .clone(); + } + + torch::Tensor ApplyCMVN(torch::Tensor features) const { + const auto &meta_data = model_->GetModelMetadata(); + + return (features + meta_data.neg_mean) * meta_data.inv_stddev; + } + + private: + OfflineRecognizerConfig config_; + SymbolTable symbol_table_; + std::unique_ptr fbank_; + std::unique_ptr decoder_; + std::unique_ptr model_; +}; +} // namespace sherpa +#endif // SHERPA_CPP_API_OFFLINE_RECOGNIZER_SENSE_VOICE_IMPL_H_ diff --git a/sherpa/cpp_api/offline-recognizer-transducer-impl.h b/sherpa/cpp_api/offline-recognizer-transducer-impl.h new file mode 100644 index 000000000..307004aea --- /dev/null +++ b/sherpa/cpp_api/offline-recognizer-transducer-impl.h @@ -0,0 +1,178 @@ +// sherpa/cpp_api/offline-recognizer-transducer-impl.h +// +// Copyright (c) 2022 Xiaomi Corporation + +#ifndef SHERPA_CPP_API_OFFLINE_RECOGNIZER_TRANSDUCER_IMPL_H_ +#define SHERPA_CPP_API_OFFLINE_RECOGNIZER_TRANSDUCER_IMPL_H_ + +#include +#include +#include +#include + +#include "sherpa/cpp_api/feature-config.h" +#include "sherpa/cpp_api/offline-recognizer-impl.h" +#include "sherpa/csrc/byte_util.h" +#include "sherpa/csrc/context-graph.h" +#include "sherpa/csrc/offline-conformer-transducer-model.h" +#include 
"sherpa/csrc/offline-transducer-decoder.h" +#include "sherpa/csrc/offline-transducer-fast-beam-search-decoder.h" +#include "sherpa/csrc/offline-transducer-greedy-search-decoder.h" +#include "sherpa/csrc/offline-transducer-model.h" +#include "sherpa/csrc/offline-transducer-modified-beam-search-decoder.h" +#include "sherpa/csrc/symbol-table.h" + +namespace sherpa { + +static OfflineRecognitionResult Convert( + const OfflineTransducerDecoderResult &src, const SymbolTable &sym_table, + int32_t frame_shift_ms, int32_t subsampling_factor, bool use_bbpe) { + OfflineRecognitionResult r; + r.tokens.reserve(src.tokens.size()); + r.timestamps.reserve(src.timestamps.size()); + + std::string text; + for (auto i : src.tokens) { + auto sym = sym_table[i]; + text.append(sym); + + r.tokens.push_back(std::move(sym)); + } + + if (use_bbpe) { + auto bu = GetByteUtil(); + text = bu->Decode(text); + } + + r.text = std::move(text); + + float frame_shift_s = frame_shift_ms / 1000. * subsampling_factor; + for (auto t : src.timestamps) { + float time = frame_shift_s * t; + r.timestamps.push_back(time); + } + + return r; +} + +class OfflineRecognizerTransducerImpl : public OfflineRecognizerImpl { + public: + explicit OfflineRecognizerTransducerImpl( + const OfflineRecognizerConfig &config) + : config_(config), + symbol_table_(config.tokens), + fbank_(config.feat_config.fbank_opts), + device_(torch::kCPU) { + if (config.use_gpu) { + device_ = torch::Device("cuda:0"); + } + model_ = std::make_unique(config.nn_model, + device_); + + WarmUp(); + + if (config.decoding_method == "greedy_search") { + decoder_ = + std::make_unique(model_.get()); + } else if (config.decoding_method == "modified_beam_search") { + decoder_ = std::make_unique( + model_.get(), config.num_active_paths, config.temperature); + } else if (config.decoding_method == "fast_beam_search") { + config.fast_beam_search_config.Validate(); + + decoder_ = std::make_unique( + model_.get(), config.fast_beam_search_config); + } else { + TORCH_CHECK(false, + "Unsupported decoding method: ", config.decoding_method); + } + } + + std::unique_ptr CreateStream() override { + return std::make_unique(&fbank_, config_.feat_config); + } + + std::unique_ptr CreateStream( + const std::vector> &context_list) override { + // We create context_graph at this level, because we might have default + // context_graph(will be added later if needed) that belongs to the whole + // model rather than each stream. + auto context_graph = + std::make_shared(context_list, config_.context_score); + return std::make_unique(&fbank_, config_.feat_config, + context_graph); + } + + void DecodeStreams(OfflineStream **ss, int32_t n) override { + InferenceMode no_grad; + + bool has_context_graph = false; + std::vector features_vec(n); + std::vector features_length_vec(n); + for (int32_t i = 0; i != n; ++i) { + if (!has_context_graph && ss[i]->GetContextGraph()) + has_context_graph = true; + const auto &f = ss[i]->GetFeatures(); + features_vec[i] = f; + features_length_vec[i] = f.size(0); + } + + auto features = torch::nn::utils::rnn::pad_sequence( + features_vec, /*batch_first*/ true, + /*padding_value*/ -23.025850929940457f) + .to(device_); + + auto features_length = torch::tensor(features_length_vec).to(device_); + + torch::Tensor encoder_out; + torch::Tensor encoder_out_length; + + std::tie(encoder_out, encoder_out_length) = + model_->RunEncoder(features, features_length); + encoder_out_length = encoder_out_length.cpu(); + + OfflineStream **streams = has_context_graph ? 
ss : nullptr; + int32_t num_streams = has_context_graph ? n : 0; + auto results = + decoder_->Decode(encoder_out, encoder_out_length, streams, num_streams); + + for (int32_t i = 0; i != n; ++i) { + auto ans = + Convert(results[i], symbol_table_, + config_.feat_config.fbank_opts.frame_opts.frame_shift_ms, + model_->SubsamplingFactor(), config_.use_bbpe); + + ss[i]->SetResult(ans); + } + } + + private: + void WarmUp() { + SHERPA_LOG(INFO) << "WarmUp begins"; + auto s = CreateStream(); + float sample_rate = fbank_.GetFrameOptions().samp_freq; + std::vector samples(2 * sample_rate, 0); + s->AcceptSamples(samples.data(), samples.size()); + auto features = s->GetFeatures(); + auto features_length = torch::tensor({features.size(0)}); + features = features.unsqueeze(0); + + features = features.to(device_); + features_length = features_length.to(device_); + + model_->WarmUp(features, features_length); + SHERPA_LOG(INFO) << "WarmUp ended"; + } + + private: + OfflineRecognizerConfig config_; + SymbolTable symbol_table_; + std::unique_ptr model_; + std::unique_ptr decoder_; + kaldifeat::Fbank fbank_; + torch::Device device_; +}; + +} // namespace sherpa + +#endif // SHERPA_CPP_API_OFFLINE_RECOGNIZER_TRANSDUCER_IMPL_H_ diff --git a/sherpa/cpp_api/offline-recognizer-whisper-impl.h b/sherpa/cpp_api/offline-recognizer-whisper-impl.h new file mode 100644 index 000000000..c44646f31 --- /dev/null +++ b/sherpa/cpp_api/offline-recognizer-whisper-impl.h @@ -0,0 +1,363 @@ +// sherpa/cpp_api/offline-recognizer-whisper-impl.h +// +// Copyright (c) 2025 Xiaomi Corporation + +#ifndef SHERPA_CPP_API_OFFLINE_RECOGNIZER_WHISPER_IMPL_H_ +#define SHERPA_CPP_API_OFFLINE_RECOGNIZER_WHISPER_IMPL_H_ + +#include +#include +#include +#include + +#include "sherpa/csrc/macros.h" +#include "sherpa/csrc/offline-whisper-model.h" +#include "sherpa/csrc/symbol-table.h" + +namespace sherpa { + +static OfflineRecognitionResult Convert(const std::vector &tokens, + const SymbolTable &sym_table) { + OfflineRecognitionResult r; + r.tokens.reserve(tokens.size()); + + std::string text; + for (auto i : tokens) { + auto sym = sym_table[i]; + text.append(sym); + + r.tokens.push_back(std::move(sym)); + } + r.text = std::move(text); + + return r; +} + +class OfflineRecognizerWhisperImpl : public OfflineRecognizerImpl { + public: + explicit OfflineRecognizerWhisperImpl(const OfflineRecognizerConfig &config) + : config_(config), symbol_table_(config.model.tokens) { + symbol_table_.ApplyBase64Decode(); + + model_ = std::make_unique(config.model); + + config_.feat_config.normalize_samples = true; + + auto whisper_opts = kaldifeat::WhisperFbankOptions(); + whisper_opts.num_mels = model_->GetModelMetadata().n_mels; + + whisper_ = std::make_unique(whisper_opts); + } + + std::unique_ptr CreateStream() override { + return std::make_unique(whisper_.get(), config_.feat_config); + } + + void DecodeStreams(OfflineStream **ss, int32_t n) override { + InferenceMode no_grad; + if (n == 1) { + DecodeStream(ss[0]); + return; + } + + auto device = model_->Device(); + +#if 0 + // TODO(fangjun): Figure out why this branch does not work. 
+ // All wave files are decoded into the same result like the first wave file + std::vector features_vec(n); + for (int32_t i = 0; i != n; ++i) { + auto features = ss[i]->GetFeatures(); + features = PadOrTrimFeatures(features); + features_vec[i] = PadOrTrimFeatures(features); + } + + auto features = torch::stack(features_vec, 0).to(device).permute({0, 2, 1}); + + torch::Tensor n_layer_cross_k_cache; + torch::Tensor n_layer_cross_v_cache; + + std::tie(n_layer_cross_k_cache, n_layer_cross_v_cache) = + model_->RunEncoder(features); +#else + std::vector n_layer_cross_k_cache_list; + std::vector n_layer_cross_v_cache_list; + + for (int32_t i = 0; i != n; ++i) { + auto features = ss[i]->GetFeatures(); + features = PadOrTrimFeatures(features).to(device).t().unsqueeze(0); + + torch::Tensor n_layer_cross_k_cache; + torch::Tensor n_layer_cross_v_cache; + + std::tie(n_layer_cross_k_cache, n_layer_cross_v_cache) = + model_->RunEncoder(features); + n_layer_cross_k_cache_list.push_back(n_layer_cross_k_cache); + n_layer_cross_v_cache_list.push_back(n_layer_cross_v_cache); + } + + torch::Tensor n_layer_cross_k_cache = + torch::cat(n_layer_cross_k_cache_list, 1); + torch::Tensor n_layer_cross_v_cache = + torch::cat(n_layer_cross_v_cache_list, 1); +#endif + + auto meta_data = model_->GetModelMetadata(); + auto sot_sequence = meta_data.sot_sequence; + sot_sequence.push_back(meta_data.no_timestamps); + torch::Tensor tokens = + torch::tensor(sot_sequence, torch::dtype(torch::kLong).device(device)) + .reshape({1, -1}) + .repeat({n, 1}); + + if (meta_data.is_multilingual) { + // sot_sequence: [sot, language, task, notimestamp] + auto language = config_.model.whisper.language; + if (!language.empty()) { + if (!meta_data.lang2id.count(language)) { + SHERPA_LOG(FATAL) << "language '" << language << " is not valid"; + } + tokens.index_put_({"...", 1}, meta_data.lang2id.at(language)); + } else { + if (config_.model.debug) { + SHERPA_LOGE("Begin to detect language"); + } + auto detected_language = model_->DetectLanguage(n_layer_cross_k_cache, + n_layer_cross_v_cache); + tokens.index_put_({"...", 1}, detected_language); + + if (config_.model.debug) { + detected_language = detected_language.cpu(); + auto acc = detected_language.accessor(); + for (int32_t i = 0; i != n; ++i) { + SHERPA_LOGE("Wave %d: detected language: %s", i, + meta_data.id2lang.at(acc[i]).c_str()); + } + } + } + + if (config_.model.whisper.task == "translate") { + tokens.index_put_({"...", 2}, meta_data.translate); + } + } + + torch::Tensor logits; + + torch::Tensor n_layer_self_k_cache = + torch::zeros({meta_data.n_text_layer, n, meta_data.n_text_ctx, + meta_data.n_text_state}, + torch::dtype(torch::kFloat).device(device)); + + torch::Tensor n_layer_self_v_cache = + torch::zeros({meta_data.n_text_layer, n, meta_data.n_text_ctx, + meta_data.n_text_state}, + torch::dtype(torch::kFloat).device(device)); + + torch::Tensor offset = + torch::zeros({n}, torch::dtype(torch::kInt).device(device)); + + std::tie(logits, n_layer_self_k_cache, n_layer_self_v_cache) = + model_->RunDecoder(tokens, n_layer_self_k_cache, n_layer_self_v_cache, + n_layer_cross_k_cache, n_layer_cross_v_cache, + offset); + + torch::Tensor eot = torch::tensor( + {meta_data.eot}, torch::dtype(torch::kLong).device(device)); + + torch::Tensor results = + torch::full({n, meta_data.n_text_ctx}, meta_data.eot, + torch::dtype(torch::kLong).device(device)); + + torch::Tensor num_decoded_tokens = + torch::zeros({n}, torch::dtype(torch::kLong).device(device)); + + torch::Tensor new2old = + 
torch::arange(n, torch::dtype(torch::kLong).device(device)); + + for (int32_t i = 0; i < meta_data.n_text_ctx; ++i) { + tokens = logits.slice(1, -1).argmax(-1); + torch::Tensor eot_indexes = (tokens.squeeze() == eot).nonzero().squeeze(); + + if (eot_indexes.numel()) { + num_decoded_tokens.index_put_( + {"...", new2old.index_select(0, eot_indexes)}, i); + + if (eot_indexes.numel() == tokens.size(0)) { + break; + } + + torch::Tensor non_eot_indexes = + (tokens.squeeze() != eot).nonzero().squeeze(); + + tokens = tokens.index_select(0, non_eot_indexes); + + offset = offset.index_select(0, non_eot_indexes); + new2old = new2old.index_select(0, non_eot_indexes); + n_layer_cross_k_cache = + n_layer_cross_k_cache.index_select(1, non_eot_indexes); + n_layer_cross_v_cache = + n_layer_cross_v_cache.index_select(1, non_eot_indexes); + n_layer_self_k_cache = + n_layer_self_k_cache.index_select(1, non_eot_indexes); + n_layer_self_v_cache = + n_layer_self_v_cache.index_select(1, non_eot_indexes); + } + + results.index_put_({new2old, i}, tokens.squeeze()); + offset.add_(logits.size(1)); + + std::tie(logits, n_layer_self_k_cache, n_layer_self_v_cache) = + model_->RunDecoder(tokens, n_layer_self_k_cache, n_layer_self_v_cache, + n_layer_cross_k_cache, n_layer_cross_v_cache, + offset); + } + num_decoded_tokens = num_decoded_tokens.cpu(); + auto acc = num_decoded_tokens.accessor(); + results = results.cpu(); + auto p = results.data_ptr(); + for (int32_t i = 0; i != n; ++i) { + auto token_ids = std::vector{p + i * results.size(1), + p + i * results.size(1) + acc[i]}; + + ss[i]->SetResult(Convert(token_ids, symbol_table_)); + } + } + + private: + void DecodeStream(OfflineStream *s) { + auto device = model_->Device(); + + torch::Tensor features = s->GetFeatures(); + features = PadOrTrimFeatures(features); + features = features.t().unsqueeze(0).to(device); + + torch::Tensor n_layer_cross_k_cache; + torch::Tensor n_layer_cross_v_cache; + + std::tie(n_layer_cross_k_cache, n_layer_cross_v_cache) = + model_->RunEncoder(features); + + auto meta_data = model_->GetModelMetadata(); + auto sot_sequence = meta_data.sot_sequence; + sot_sequence.push_back(meta_data.no_timestamps); + + if (meta_data.is_multilingual) { + // sot_sequence: [sot, language, task, notimestamp] + auto language = config_.model.whisper.language; + if (!language.empty()) { + if (!meta_data.lang2id.count(language)) { + SHERPA_LOG(FATAL) << "language '" << language << " is not valid"; + } + + sot_sequence[1] = meta_data.lang2id.at(language); + } else { + if (config_.model.debug) { + SHERPA_LOGE("Begin to detect language"); + } + sot_sequence[1] = + model_->DetectLanguage(n_layer_cross_k_cache, n_layer_cross_v_cache) + .item() + .toInt(); + if (config_.model.debug) { + SHERPA_LOGE("Detected language: %s", + meta_data.id2lang.at(sot_sequence[1]).c_str()); + } + } + + if (config_.model.whisper.task == "translate") { + sot_sequence[2] = meta_data.translate; + } + } + + torch::Tensor tokens = + torch::from_blob(sot_sequence.data(), + {1, static_cast(sot_sequence.size())}, + torch::kLong) + .to(device); + + torch::Tensor logits; + + torch::Tensor n_layer_self_k_cache = + torch::zeros({meta_data.n_text_layer, 1, meta_data.n_text_ctx, + meta_data.n_text_state}, + torch::dtype(torch::kFloat).device(device)); + + torch::Tensor n_layer_self_v_cache = + torch::zeros({meta_data.n_text_layer, 1, meta_data.n_text_ctx, + meta_data.n_text_state}, + torch::dtype(torch::kFloat).device(device)); + + torch::Tensor offset = + torch::zeros({1}, 
torch::dtype(torch::kInt).device(device)); + + std::tie(logits, n_layer_self_k_cache, n_layer_self_v_cache) = + model_->RunDecoder(tokens, n_layer_self_k_cache, n_layer_self_v_cache, + n_layer_cross_k_cache, n_layer_cross_v_cache, + offset); + + torch::Tensor eot = torch::tensor( + {meta_data.eot}, torch::dtype(torch::kLong).device(device)); + + torch::Tensor results = + torch::full({1, meta_data.n_text_ctx}, meta_data.eot, + torch::dtype(torch::kLong).device(device)); + + int32_t i; + for (i = 0; i < meta_data.n_text_ctx; ++i) { + tokens = logits.slice(1, -1).argmax(-1); + if ((tokens == eot).sum().item().toInt() == 1) { + break; + } + results.slice(1, i, i + 1) = tokens; + offset.add_(logits.size(1)); + + std::tie(logits, n_layer_self_k_cache, n_layer_self_v_cache) = + model_->RunDecoder(tokens, n_layer_self_k_cache, n_layer_self_v_cache, + n_layer_cross_k_cache, n_layer_cross_v_cache, + offset); + } + results = results.slice(1, 0, i).cpu(); + + std::vector token_ids = { + results.data_ptr(), + results.data_ptr() + results.numel()}; + + s->SetResult(Convert(token_ids, symbol_table_)); + } + + private: + void WarmUp() { + SHERPA_LOG(INFO) << "WarmUp begins"; + + SHERPA_LOG(INFO) << "WarmUp ended"; + } + + torch::Tensor PadOrTrimFeatures(const torch::Tensor &feat) { + auto features = feat; + int32_t target_len = 3000; + int32_t src_len = features.size(0); + if (src_len > target_len) { + SHERPA_LOGE( + "\nInput audio is too long (about %.3f seconds). Only the first %d " + "seconds are used.", + src_len * 0.01, static_cast(target_len * 0.01)); + features = features.slice(0, 0, target_len); + } else if (src_len < target_len) { + int32_t padding = target_len - src_len; + features = torch::nn::functional::pad( + features, torch::nn::functional::PadFuncOptions({0, 0, 0, padding}) + .mode(torch::kConstant) + .value(0)); + } + + return features; + } + + private: + OfflineRecognizerConfig config_; + SymbolTable symbol_table_; + std::unique_ptr whisper_; + std::unique_ptr model_; +}; +} // namespace sherpa +#endif // SHERPA_CPP_API_OFFLINE_RECOGNIZER_WHISPER_IMPL_H_ diff --git a/sherpa/cpp_api/offline-recognizer.cc b/sherpa/cpp_api/offline-recognizer.cc new file mode 100644 index 000000000..cab51a3f7 --- /dev/null +++ b/sherpa/cpp_api/offline-recognizer.cc @@ -0,0 +1,229 @@ +// sherpa/cpp_api/offline-recognizer.cc +// +// Copyright (c) 2022-2025 Xiaomi Corporation + +#include "sherpa/cpp_api/offline-recognizer.h" + +#include + +#include "sherpa/cpp_api/feature-config.h" +#include "sherpa/cpp_api/offline-recognizer-ctc-impl.h" +#include "sherpa/cpp_api/offline-recognizer-impl.h" +#include "sherpa/cpp_api/offline-recognizer-sense-voice-impl.h" +#include "sherpa/cpp_api/offline-recognizer-transducer-impl.h" +#include "sherpa/cpp_api/offline-recognizer-whisper-impl.h" +#include "sherpa/csrc/file-utils.h" +#include "sherpa/csrc/log.h" +#include "torch/script.h" + +namespace sherpa { + +void OfflineCtcDecoderConfig::Register(ParseOptions *po) { + po->Register("modified", &modified, + "Used only for decoding with a CTC topology. " + "true to use a modified CTC topology; useful when " + "vocab_size is large, e.g., > 1000. " + "false to use a standard CTC topology."); + + po->Register("hlg", &hlg, "Used only for decoding with an HLG graph. "); + + po->Register("lm-scale", &lm_scale, + "Used only for decoding with an HLG graph. " + "It specifies the scale for HLG.scores"); + + po->Register("search-beam", &search_beam, + "Used only for CTC decoding. " + "Decoding beam, e.g. 20. 
Smaller is faster, larger is " + "more exact (less pruning). This is the default value; " + "it may be modified by `min_active_states` and " + "`max_active_states`. "); + + po->Register("output-beam", &output_beam, + "Used only for CTC decoding. " + "Beam to prune output, similar to lattice-beam in Kaldi. " + "Relative to the best path of output. "); + + po->Register("min-active-states", &min_active_states, + "Minimum number of FSA states that are allowed to " + "be active on any given frame for any given " + "intersection/composition task. This is advisory, " + "in that it will try not to have fewer than this " + "number active. Set it to zero if there is no " + "constraint. "); + + po->Register( + "max-active-states", &max_active_states, + "max_activate_states Maximum number of FSA states that are allowed to " + "be active on any given frame for any given " + "intersection/composition task. This is advisory, " + "in that it will try not to exceed that but may " + "not always succeed. You can use a very large " + "number if no constraint is needed. "); +} + +void OfflineCtcDecoderConfig::Validate() const { + if (!hlg.empty()) { + AssertFileExists(hlg); + } + + SHERPA_CHECK_GT(search_beam, 0); + SHERPA_CHECK_GT(output_beam, 0); + SHERPA_CHECK_GE(min_active_states, 0); + SHERPA_CHECK_GE(max_active_states, 0); +} + +std::string OfflineCtcDecoderConfig::ToString() const { + std::ostringstream os; + + os << "OfflineCtcDecoderConfig("; + os << "modified=" << (modified ? "True" : "False") << ", "; + os << "hlg=" << '\"' << hlg << '\"' << ", "; + os << "lm_scale=" << lm_scale << ", "; + os << "search_beam=" << search_beam << ", "; + os << "output_beam=" << output_beam << ", "; + os << "min_active_states=" << min_active_states << ", "; + os << "max_active_states=" << max_active_states << ")"; + + return os.str(); +} + +void OfflineRecognizerConfig::Register(ParseOptions *po) { + ctc_decoder_config.Register(po); + feat_config.Register(po); + fast_beam_search_config.Register(po); + model.Register(po); + + po->Register("nn-model", &nn_model, "Path to the torchscript model"); + + po->Register("tokens", &tokens, "Path to tokens.txt."); + + po->Register("use-gpu", &use_gpu, + "true to use GPU for computation. false to use CPU.\n" + "If true, it uses the first device. You can use the environment " + "variable CUDA_VISIBLE_DEVICES to select which device to use."); + + po->Register("decoding-method", &decoding_method, + "Decoding method to use. Possible values are: greedy_search, " + "modified_beam_search, and fast_beam_search"); + + po->Register("num-active-paths", &num_active_paths, + "Number of active paths for modified_beam_search. " + "Used only when --decoding-method is modified_beam_search"); + po->Register("context-score", &context_score, + "The bonus score for each token in context word/phrase. " + "Used only when decoding_method is modified_beam_search"); + + po->Register("use-bbpe", &use_bbpe, + "true if the model to use is trained with byte level bpe, " + "The byte level bpe modeling unit is mainly used on CJK " + "languages or multilingual datasets, it can further break " + "the multi-byte unicode characters into byte sequence and " + "then train some kind of sub-char bpes."); + + po->Register("temperature", &temperature, + "Softmax temperature,. 
" + "Used only when decoding_method is modified_beam_search."); +} + +void OfflineRecognizerConfig::Validate() const { + if (tokens.empty()) { + SHERPA_LOG(FATAL) << "Please provide --tokens"; + } + AssertFileExists(tokens); + + if (!model.sense_voice.model.empty() || !model.whisper.model.empty()) { + model.tokens = tokens; + model.use_gpu = use_gpu; + if (!model.Validate()) { + SHERPA_LOG(FATAL) << "Errors in config."; + } + } else if (nn_model.empty()) { + SHERPA_LOG(FATAL) << "Please provide --nn-model"; + } + + if (!nn_model.empty()) { + AssertFileExists(nn_model); + } + + // TODO(fangjun): The following checks about decoding_method are + // used only for transducer models. We should skip it for CTC models + if (decoding_method != "greedy_search" && + decoding_method != "modified_beam_search" && + decoding_method != "fast_beam_search") { + SHERPA_LOG(FATAL) + << "Unsupported decoding method: " << decoding_method + << ". Supported values are: greedy_search, modified_beam_search, " + << "and fast_beam_search."; + } + + // TODO(fangjun): Create a class ModifiedBeamSearchConfig + if (decoding_method == "modified_beam_search") { + SHERPA_CHECK_GT(num_active_paths, 0); + } +} + +std::string OfflineRecognizerConfig::ToString() const { + std::ostringstream os; + + os << "OfflineRecognizerConfig("; + os << "ctc_decoder_config=" << ctc_decoder_config.ToString() << ", "; + os << "feat_config=" << feat_config.ToString() << ", "; + os << "model=" << model.ToString() << ", "; + os << "nn_model=\"" << nn_model << "\", "; + os << "tokens=\"" << tokens << "\", "; + os << "use_gpu=" << (use_gpu ? "True" : "False") << ", "; + os << "decoding_method=\"" << decoding_method << "\", "; + os << "num_active_paths=" << num_active_paths << ", "; + os << "context_score=" << context_score << ", "; + os << "use_bbpe=" << (use_bbpe ? 
"True" : "False") << ", "; + os << "temperature=" << temperature << ")"; + + return os.str(); +} + +std::ostream &operator<<(std::ostream &os, + const OfflineRecognizerConfig &config) { + os << config.ToString(); + return os; +} + +OfflineRecognizer::~OfflineRecognizer() = default; + +OfflineRecognizer::OfflineRecognizer(const OfflineRecognizerConfig &config) { + if (!config.model.sense_voice.model.empty()) { + impl_ = std::make_unique(config); + return; + } + + if (!config.model.whisper.model.empty()) { + impl_ = std::make_unique(config); + return; + } + + if (!config.nn_model.empty()) { + torch::jit::Module m = torch::jit::load(config.nn_model, torch::kCPU); + if (!m.hasattr("joiner")) { + // CTC models do not have a joint network + impl_ = std::make_unique(config); + return; + } + } + + // default to transducer + impl_ = std::make_unique(config); +} + +std::unique_ptr OfflineRecognizer::CreateStream() { + return impl_->CreateStream(); +} + +std::unique_ptr OfflineRecognizer::CreateStream( + const std::vector> &context_list) { + return impl_->CreateStream(context_list); +} + +void OfflineRecognizer::DecodeStreams(OfflineStream **ss, int32_t n) { + impl_->DecodeStreams(ss, n); +} + +} // namespace sherpa diff --git a/sherpa/cpp_api/offline-recognizer.h b/sherpa/cpp_api/offline-recognizer.h new file mode 100644 index 000000000..57b4fabe2 --- /dev/null +++ b/sherpa/cpp_api/offline-recognizer.h @@ -0,0 +1,129 @@ +// sherpa/cpp_api/offline-recognizer.h +// +// Copyright (c) 2022 Xiaomi Corporation + +#ifndef SHERPA_CPP_API_OFFLINE_RECOGNIZER_H_ +#define SHERPA_CPP_API_OFFLINE_RECOGNIZER_H_ + +#include +#include +#include + +#include "sherpa/cpp_api/fast-beam-search-config.h" +#include "sherpa/cpp_api/feature-config.h" +#include "sherpa/cpp_api/macros.h" +#include "sherpa/cpp_api/offline-stream.h" +#include "sherpa/csrc/offline-model-config.h" + +namespace sherpa { + +struct OfflineCtcDecoderConfig { + // Used only for decoding with a CTC topology + // true to use a modified CTC topology. + // false to use a standard CTC topology. + bool modified = true; + + // Used only for HLG decoding + std::string hlg; + float lm_scale = 1.0; + + float search_beam = 20; + float output_beam = 8; + int32_t min_active_states = 30; + int32_t max_active_states = 10000; + + void Register(ParseOptions *po); + void Validate() const; + std::string ToString() const; +}; + +struct OfflineRecognizerConfig { + /// Used only for CTC decoding. + OfflineCtcDecoderConfig ctc_decoder_config; + + /// Config for the feature extractor + FeatureConfig feat_config; + + FastBeamSearchConfig fast_beam_search_config; + + // TODO(fangjun): We will remmove mutable later + mutable OfflineModelConfig model; + + /// Path to the torchscript model + std::string nn_model; + + /// Path to tokens.txt + std::string tokens; + + /// true to use GPU for neural network computation and decoding. + /// false to use CPU. + /// You can use CUDA_VISIBLE_DEVICES to control which device to use. + /// We always use GPU 0 in the code. This also implies it supports only + /// 1 GPU at present. + /// Note: You have to use a CUDA version of PyTorch in order to use + /// GPU for computation + bool use_gpu = false; + + std::string decoding_method = "greedy_search"; + + /// used only for modified_beam_search + int32_t num_active_paths = 4; + + /// used only for modified_beam_search + float context_score = 1.5; + + // True if the model used is trained with byte level bpe. 
+ bool use_bbpe = false; + + // temperature for the softmax in the joiner + float temperature = 1.0; + + void Register(ParseOptions *po); + + void Validate() const; + + /** A string representation for debugging purpose. */ + std::string ToString() const; +}; + +std::ostream &operator<<(std::ostream &os, + const OfflineRecognizerConfig &config); + +class OfflineRecognizerImpl; + +class OfflineRecognizer { + public: + ~OfflineRecognizer(); + + explicit OfflineRecognizer(const OfflineRecognizerConfig &config); + + /// Create a stream for decoding. + std::unique_ptr CreateStream(); + + /// Create a stream with contextual-biasing lists. + std::unique_ptr CreateStream( + const std::vector> &context_list); + + /** Decode a single stream + * + * @param s The stream to decode. + */ + void DecodeStream(OfflineStream *s) { + OfflineStream *ss[1] = {s}; + DecodeStreams(ss, 1); + } + + /** Decode a list of streams. + * + * @param ss Pointer to an array of streams. + * @param n Size of the input array. + */ + void DecodeStreams(OfflineStream **ss, int32_t n); + + private: + std::unique_ptr impl_; +}; + +} // namespace sherpa + +#endif // SHERPA_CPP_API_OFFLINE_RECOGNIZER_H_ diff --git a/sherpa/cpp_api/offline-stream.h b/sherpa/cpp_api/offline-stream.h new file mode 100644 index 000000000..8deec738a --- /dev/null +++ b/sherpa/cpp_api/offline-stream.h @@ -0,0 +1,118 @@ +// sherpa/cpp_api/offline-stream.h +// +// Copyright (c) 2022 Xiaomi Corporation +#ifndef SHERPA_CPP_API_OFFLINE_STREAM_H_ +#define SHERPA_CPP_API_OFFLINE_STREAM_H_ + +#include +#include +#include + +#include "kaldifeat/csrc/feature-fbank.h" +#include "kaldifeat/csrc/whisper-fbank.h" +#include "sherpa/cpp_api/feature-config.h" +#include "sherpa/csrc/context-graph.h" +#include "torch/script.h" + +namespace sherpa { + +struct OfflineRecognitionResult { + // Recognition results. + // For English, it consists of space separated words. + // For Chinese, it consists of Chinese words without spaces. + std::string text; + + // Decoded results at the token level. + // For instance, for BPE-based models it consists of a list of BPE tokens. + std::vector tokens; + + /// timestamps.size() == tokens.size() + /// timestamps[i] records the time in seconds when tokens[i] is decoded. + std::vector timestamps; + + /** Return a json string. + * + * The returned string contains: + * { + * "text": "The recognition result", + * "tokens": [x, x, x], + * "timestamps": [x, x, x], + * "segment": x, + * "start_frame": x, + * "is_final": true|false + * } + */ + std::string AsJsonString() const; +}; + +class OfflineStream { + public: + ~OfflineStream(); + + /** Create a stream. + * + * @param fbank Not owned by this class. + */ + OfflineStream(kaldifeat::Fbank *fbank, const FeatureConfig &feat_config, + ContextGraphPtr context_graph = nullptr); + + OfflineStream(kaldifeat::WhisperFbank *whisper, + const FeatureConfig &feat_config, + ContextGraphPtr context_graph = nullptr); + + /** Create a stream from a WAVE file. + * + * @param wave_file Path to the WAVE file. Its sample frequency should + * match the one from the feature extractor. Only + * WAVEs with a single channel are supported. + */ + void AcceptWaveFile(const std::string &wave_file); + + /** Create a stream from audio samples. + * + * @param fbank_ + * @param samples Pointer to the audio samples. It should be normalized + * to the range [-1, 1]. 
If you model expects unnormalized + * audio samples, please use `normalize_samples=false` when + * invoking the constructor but you still need to pass + * normalized samples `AcceptSamples()`. + * @param n Number of audio samples. + */ + void AcceptSamples(const float *samples, int32_t n); + + /** Create a stream from features. + * + * @param feature Pointer to the 2-D feature matrix of shape + * [num_frames][num_channels]. It should be contiguous + * in memory and stored in row major. + * @param num_frames Number of feature frames. + * @param num_channels It should match the one from the feature extractor. + */ + void AcceptFeatures(const float *feature, int32_t num_frames, + int32_t num_channels); + + /** Get the features of this stream. + * + * @return If return_waveform is false, it returns a 2-D tensor of shape + * (num_frames, num_channels). Otherwise, it returns a 1-D tensor + * of shape (num_samples,). + */ + const torch::Tensor &GetFeatures() const; + + /** Set the recognition result for this stream. */ + void SetResult(const OfflineRecognitionResult &r); + + /** Get the recognition result of this stream */ + const OfflineRecognitionResult &GetResult() const; + + /** Get the ContextGraph of this stream */ + const ContextGraphPtr &GetContextGraph() const; + + private: + class OfflineStreamImpl; + std::unique_ptr impl_; +}; + +} // namespace sherpa + +#endif // SHERPA_CPP_API_OFFLINE_STREAM_H_ diff --git a/sherpa/cpp_api/offline_recognizer.cc b/sherpa/cpp_api/offline_recognizer.cc deleted file mode 100644 index ff656610a..000000000 --- a/sherpa/cpp_api/offline_recognizer.cc +++ /dev/null @@ -1,133 +0,0 @@ -/** - * Copyright 2022 Xiaomi Corporation (authors: Fangjun Kuang) - * - * See LICENSE for clarification regarding multiple authors - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "sherpa/cpp_api/offline_recognizer.h" - -#include - -#include "sherpa/csrc/log.h" -#include "sherpa/csrc/offline_asr.h" -#include "torch/script.h" - -namespace sherpa { - -class OfflineRecognizer::OfflineRecognizerImpl { - public: - OfflineRecognizerImpl(const std::string &nn_model, const std::string &tokens, - const DecodingOptions &decoding_opts, bool use_gpu, - float sample_rate) { - OfflineAsrOptions opts; - opts.nn_model = nn_model; - opts.tokens = tokens; - opts.use_gpu = use_gpu; - - switch (decoding_opts.method) { - case kGreedySearch: - opts.decoding_method = "greedy_search"; - break; - case kModifiedBeamSearch: - opts.decoding_method = "modified_beam_search"; - opts.num_active_paths = decoding_opts.num_active_paths; - break; - default: - SHERPA_LOG(FATAL) << "Unreachable code"; - break; - } - - // options for bank - opts.fbank_opts.frame_opts.dither = 0; - opts.fbank_opts.frame_opts.samp_freq = sample_rate; - opts.fbank_opts.mel_opts.num_bins = 80; - - asr_ = std::make_unique(opts); - expected_sample_rate_ = sample_rate; - } - - std::vector DecodeFileBatch( - const std::vector &filenames) { - std::vector res = - asr_->DecodeWaves(filenames, expected_sample_rate_); - return ToOfflineRecognitionResult(res); - } - - std::vector DecodeSamplesBatch( - const float **samples, const int32_t *samples_length, int32_t n) { - std::vector tensors(n); - for (int i = 0; i != n; ++i) { - auto t = torch::from_blob(const_cast(samples[i]), - {samples_length[i]}, torch::kFloat); - tensors[i] = std::move(t); - } - auto res = asr_->DecodeWaves(tensors); - return ToOfflineRecognitionResult(res); - } - - std::vector DecodeFeaturesBatch( - const float *features, const int32_t *features_length, int32_t N, - int32_t T, int32_t C) { - torch::Tensor tensor = torch::from_blob(const_cast(features), - {N, T, C}, torch::kFloat); - torch::Tensor length = torch::from_blob( - const_cast(features_length), {N}, torch::kInt); - - auto res = asr_->DecodeFeatures(tensor, length); - return ToOfflineRecognitionResult(res); - } - - private: - std::vector ToOfflineRecognitionResult( - const std::vector &res) const { - std::vector ans(res.size()); - for (size_t i = 0; i != res.size(); ++i) { - ans[i].text = std::move(res[i].text); - ans[i].tokens = std::move(res[i].tokens); - ans[i].timestamps = std::move(res[i].timestamps); - } - return ans; - } - - std::unique_ptr asr_; - float expected_sample_rate_; -}; - -OfflineRecognizer::OfflineRecognizer( - const std::string &nn_model, const std::string &tokens, - const DecodingOptions &decoding_opts /*= {}*/, bool use_gpu /*=false*/, - float sample_rate /*= 16000*/) - : impl_(std::make_unique( - nn_model, tokens, decoding_opts, use_gpu, sample_rate)) {} - -OfflineRecognizer::~OfflineRecognizer() = default; - -std::vector OfflineRecognizer::DecodeFileBatch( - const std::vector &filenames) { - return impl_->DecodeFileBatch(filenames); -} - -std::vector OfflineRecognizer::DecodeSamplesBatch( - const float **samples, const int32_t *samples_length, int32_t n) { - return impl_->DecodeSamplesBatch(samples, samples_length, n); -} - -std::vector OfflineRecognizer::DecodeFeaturesBatch( - const float *features, const int32_t *features_length, int32_t N, int32_t T, - int32_t C) { - return impl_->DecodeFeaturesBatch(features, features_length, N, T, C); -} - -} // namespace sherpa diff --git a/sherpa/cpp_api/offline_recognizer.h b/sherpa/cpp_api/offline_recognizer.h deleted file mode 100644 index 13b20841e..000000000 --- a/sherpa/cpp_api/offline_recognizer.h +++ /dev/null @@ 
-1,192 +0,0 @@ -/** - * Copyright 2022 Xiaomi Corporation (authors: Fangjun Kuang) - * - * See LICENSE for clarification regarding multiple authors - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#ifndef SHERPA_CPP_API_OFFLINE_RECOGNIZER_H_ -#define SHERPA_CPP_API_OFFLINE_RECOGNIZER_H_ - -#include -#include -#include - -namespace sherpa { - -enum class DecodingMethod { - kGreedySearch = 0, - kModifiedBeamSearch = 1, -}; - -constexpr auto kGreedySearch = DecodingMethod::kGreedySearch; -constexpr auto kModifiedBeamSearch = DecodingMethod::kModifiedBeamSearch; - -struct DecodingOptions { - DecodingMethod method = kGreedySearch; - // kGreedySearch has no options - - // Options for kModifiedBeamSearch - int32_t num_active_paths = 4; -}; - -struct OfflineRecognitionResult { - // RecognitionResult results. - // For English, it consists of space separated words. - // For Chinese, it consists of Chinese words without spaces. - std::string text; - - // Decoded results at the token level. - // For instance, for BPE-based models it consists of a list of BPE tokens. - std::vector tokens; - - // timestamps.size() == tokens.size() - // timestamps[i] records the frame number on which tokens[i] is decoded. - // Frame numbers are counted after model subsampling. - std::vector timestamps; // not implemented at present -}; - -class OfflineRecognizer { - public: - /** Construct an instance of OfflineRecognizer. - * - * @param nn_model Path to the torchscript model. We assume the model - * is one of pruned_transducer_statelessX from icefall. - * @param tokens Path to the tokens.txt. Each line in this file has - * two columns separated by space(s). The first column is - * a symbol while the second column is the integer ID of - * the symbol. If you have a bpe.model, please convert it - * to tokens.txt first. - * @param decoding_opts Decoding options for this recognizer. - * @param use_gpu true to use GPU for neural network computation. - * false to use CPU. If true, we always select GPU 0. - * You can use the environment variable - * CUDA_VISIBLE_DEVICES to control which device should - * be mapped to GPU 0. - * @param sample_rate The expected audio sample rate of the model. - */ - OfflineRecognizer(const std::string &nn_model, const std::string &tokens, - const DecodingOptions &decoding_opts = {}, - bool use_gpu = false, float sample_rate = 16000); - - ~OfflineRecognizer(); - - /** Decode a single file. - * - * Only ".wav" format is supported. If the input wave file has multiple - * channels, only the first channel is used. - * - * Note that the sample rate of the input wave file must match the one - * expected by the model. No resampling is done if they differ. Instead - * it will abort. - * - * @param filename Path to the wave file. - * - * @return Return the recognition result. - */ - OfflineRecognitionResult DecodeFile(const std::string &filename) { - return DecodeFileBatch({filename})[0]; - } - - /** Decode a batch of files. - * - * Only ".wav" format is supported. 
If the input wave file has multiple - * channels, only the first channel is used. - * - * Note that the sample rate of the input wave file must match the one - * expected by the model. No resampling is done if they differ. Instead - * it will abort. - * - * @param filenames A list of paths to the waves files to be decoded. - * - * @return Return a list of recognition results. ans[i] is the results for - * filenames[i]. - */ - std::vector DecodeFileBatch( - const std::vector &filenames); - - /** Decode audio samples. - * - * The sample rate of the input samples should match the one expected - * by the model, which is 16 kHz for models from icefall. - * - * @param samples Pointer to a 1-D array of length `N` containing audio - * samples which should be normalized to the range [-1, 1] - * if you use a model from icefall. It should be on CPU. - * - * @param n Length of the input samples. - * - * @return Return the recognition result. - */ - OfflineRecognitionResult DecodeSamples(const float *samples, int32_t n) { - const float *samples_array[1] = {samples}; - return DecodeSamplesBatch(samples_array, &n, 1)[0]; - } - - /** Decode a batch of audio samples - * - * The sample rate of the input samples should match the one expected - * by the model, which is 16 kHz for models from icefall. - * - * @param samples Pointer to a 1-D array of length `n` containing pointers to - * 1-D arrays of audio samples. All samples should be on CPU. - * - * @param samples_length Pointer to a 1-D array of length `n`. - * samples_length[i] contains the number of samples - * in samples[i]. It should be on CPU. - * - * @return Return the recognition results. - */ - std::vector DecodeSamplesBatch( - const float **samples, const int32_t *samples_length, int32_t n); - - /** Decode fbank features. - * - * @param features Pointer to a 2-D array of shape (T, C). It is in row-major - * and should be on CPU. - * @param T Number of feature frames in `features`. - * @param C Feature dimension which should match the one expected by the - * model. - * - * @return Return the recognition result. - */ - OfflineRecognitionResult DecodeFeatures(const float *features, int32_t T, - int32_t C) { - return DecodeFeaturesBatch(features, &T, 1, T, C)[0]; - } - - /** Decode a batch of fbank features. - * - * @param features Pointer to a 3-D tensor of shape (N, T, C). It is in - * row-major and should be on CPU. - * @param features_length Pointer to a 1-D tensor of shape (N,) containing - * number of valid frames in `features` before - * padding. It should be on CPU. - * @param N Batch size. - * @param T Number of feature frames. - * @param C Feature dimension. It must match the one expected by the model. - * - * @return Return the recognition results. 
- */ - std::vector DecodeFeaturesBatch( - const float *features, const int32_t *features_length, int32_t N, - int32_t T, int32_t C); - - private: - class OfflineRecognizerImpl; - std::unique_ptr impl_; -}; - -} // namespace sherpa - -#endif // SHERPA_CPP_API_OFFLINE_RECOGNIZER_H_ diff --git a/sherpa/cpp_api/online-recognizer.cc b/sherpa/cpp_api/online-recognizer.cc new file mode 100644 index 000000000..7f0ac934b --- /dev/null +++ b/sherpa/cpp_api/online-recognizer.cc @@ -0,0 +1,566 @@ +// sherpa/cpp_api/online-recognizer.cc +// +// Copyright (c) 2022 Xiaomi Corporation + +#include "sherpa/cpp_api/online-recognizer.h" + +#include +#include +#include + +#include "nlohmann/json.hpp" +#include "sherpa/csrc/byte_util.h" +#include "sherpa/csrc/file-utils.h" +#include "sherpa/csrc/log.h" +#include "sherpa/csrc/online-conformer-transducer-model.h" +#include "sherpa/csrc/online-conv-emformer-transducer-model.h" +#include "sherpa/csrc/online-emformer-transducer-model.h" +#include "sherpa/csrc/online-lstm-transducer-model.h" +#include "sherpa/csrc/online-transducer-decoder.h" +#include "sherpa/csrc/online-transducer-fast-beam-search-decoder.h" +#include "sherpa/csrc/online-transducer-greedy-search-decoder.h" +#include "sherpa/csrc/online-transducer-model.h" +#include "sherpa/csrc/online-transducer-modified-beam-search-decoder.h" +#include "sherpa/csrc/online-zipformer-transducer-model.h" +#include "sherpa/csrc/online-zipformer2-transducer-model.h" +#include "sherpa/csrc/symbol-table.h" + +namespace sherpa { + +std::string OnlineRecognitionResult::AsJsonString() const { + using json = nlohmann::json; + json j; + j["text"] = text; + j["start_time"] = start_time; + j["tokens"] = tokens; + + // std::ostringstream os; + // os << "["; + // std::string sep = ""; + // for (auto t : timestamps) { + // os << sep << std::fixed << std::setprecision(2) << t; + // sep = ","; + // } + // os << "]"; + + // NOTE: We don't use j["timestamps"] = timestamps; + // because we need to control the number of decimal points to keep + j["timestamps"] = timestamps; // os.str(); + + // TODO(fangjun): The key in the json object should be kept + // in sync with sherpa/bin/pruned_transducer_statelessX/streaming_server.py + j["segment"] = segment; // TODO(fangjun): Support endpointing + j["final"] = is_final; + return j.dump(); +} + +void OnlineRecognizerConfig::Register(ParseOptions *po) { + feat_config.Register(po); + endpoint_config.Register(po); + fast_beam_search_config.Register(po); + + po->Register("nn-model", &nn_model, "Path to the torchscript model"); + + po->Register("encoder-model", &encoder_model, + "Path to the encoder model for OnlineLstmTransducerModel."); + + po->Register("decoder-model", &decoder_model, + "Path to the decoder model for OnlineLstmTransducerModel."); + + po->Register("joiner-model", &joiner_model, + "Path to the joiner model for OnlineLstmTransducerModel."); + + po->Register("tokens", &tokens, "Path to tokens.txt."); + + po->Register("use-gpu", &use_gpu, + "true to use GPU for computation. false to use CPU.\n" + "If true, it uses the first device. You can use the environment " + "variable CUDA_VISIBLE_DEVICES to select which device to use."); + + po->Register("use-endpoint", &use_endpoint, + "true to enable Endpoint, false to disable Endpoint, " + "default is false.\n"); + + po->Register("decoding-method", &decoding_method, + "Decoding method to use. Possible values are: greedy_search, " + "modified_beam_search, and fast_beam_search. 
" + "Used only for transducer."); + + po->Register("num-active-paths", &num_active_paths, + "Number of active paths for modified_beam_search. " + "Used only when --decoding-method is modified_beam_search"); + + po->Register("context-score", &context_score, + "The bonus score for each token in context word/phrase. " + "Used only when decoding_method is modified_beam_search"); + + po->Register("decode-left-context", &left_context, + "Used only for streaming Conformer, i.e, models from " + "pruned_transducer_statelessX, " + "and streaming Zipformer, i.e, models from " + "pruned_transducer_stateless7_streaming in icefall." + "Number of frames before subsampling during decoding."); + + po->Register("decode-right-context", &right_context, + "Used only for streaming Conformer, i.e, models from " + "pruned_transducer_statelessX, " + "and streaming Zipformer, i.e, models from " + "pruned_transducer_stateless7_streaming in icefall." + "Number of frames before subsampling during decoding."); + + po->Register("decode-chunk-size", &chunk_size, + "Used only for streaming Conformer, i.e, models from " + "pruned_transducer_statelessX, " + "and streaming Zipformer, i.e, models from " + "pruned_transducer_stateless7_streaming in icefall." + "Number of frames before subsampling during decoding."); + + po->Register("use-bbpe", &use_bbpe, + "true if the model to use is trained with byte level bpe, " + "The byte level bpe modeling unit is mainly used on CJK " + "languages or multilingual datasets, it can further break " + "the multi-byte unicode characters into byte sequence and " + "then train some kind of sub-char bpes."); + + po->Register("temperature", &temperature, + "Softmax temperature,. " + "Used only when decoding_method is modified_beam_search."); +} + +void OnlineRecognizerConfig::Validate() const { + if (!nn_model.empty()) { + SHERPA_CHECK_EQ(encoder_model.empty(), true); + SHERPA_CHECK_EQ(decoder_model.empty(), true); + SHERPA_CHECK_EQ(joiner_model.empty(), true); + + AssertFileExists(nn_model); + } else { + SHERPA_CHECK_EQ(encoder_model.empty(), false) + << "If you don't provide --nn-model, please provide --encoder_model " + "instead"; + SHERPA_CHECK_EQ(decoder_model.empty(), false); + SHERPA_CHECK_EQ(joiner_model.empty(), false); + + AssertFileExists(decoder_model); + AssertFileExists(decoder_model); + AssertFileExists(joiner_model); + } + + if (tokens.empty()) { + SHERPA_LOG(FATAL) << "Please provide --tokens"; + } + AssertFileExists(tokens); + + if (decoding_method != "greedy_search" && + decoding_method != "modified_beam_search" && + decoding_method != "fast_beam_search") { + SHERPA_LOG(FATAL) + << "Unsupported decoding method: " << decoding_method + << ". Supported values are: greedy_search, modified_beam_search, " + << "fast_beam_search."; + } + + if (decoding_method == "modified_beam_search") { + SHERPA_CHECK_GT(num_active_paths, 0); + } +} + +std::string OnlineRecognizerConfig::ToString() const { + std::ostringstream os; + os << "OnlineRecognizerConfig("; + os << "feat_config=" << feat_config.ToString() << ", "; + os << "endpoint_config=" << endpoint_config.ToString() << ", "; + os << "fast_beam_search_config=" << fast_beam_search_config.ToString() + << ", "; + os << "nn_model=\"" << nn_model << "\", "; + os << "tokens=\"" << tokens << "\", "; + os << "encoder_model=\"" << encoder_model << "\", "; + os << "decoder_model=\"" << decoder_model << "\", "; + os << "joiner_model=\"" << joiner_model << "\", "; + os << "use_gpu=" << (use_gpu ? 
"True" : "False") << "\", "; + os << "use_endpoint=" << (use_endpoint ? "True" : "False") << "\", "; + os << "decoding_method=\"" << decoding_method << "\", "; + os << "num_active_paths=" << num_active_paths << ", "; + os << "context_score=" << context_score << ", "; + os << "left_context=" << left_context << ", "; + os << "right_context=" << right_context << ", "; + os << "chunk_size=" << chunk_size << ", "; + os << "use_bbpe=" << (use_bbpe ? "True" : "False") << ", "; + os << "temperature=" << temperature << ")"; + return os.str(); +} + +static OnlineRecognitionResult Convert(const OnlineTransducerDecoderResult &src, + const SymbolTable &sym_table, + int32_t frame_shift_ms, + int32_t subsampling_factor, + bool use_bbpe) { + OnlineRecognitionResult r; + r.tokens.reserve(src.tokens.size()); + r.timestamps.reserve(src.timestamps.size()); + + std::string text; + for (auto i : src.tokens) { + auto sym = sym_table[i]; + text.append(sym); + + r.tokens.push_back(std::move(sym)); + } + + if (use_bbpe) { + auto bu = GetByteUtil(); + text = bu->Decode(text); + } + + r.text = std::move(text); + + float frame_shift_s = frame_shift_ms / 1000. * subsampling_factor; + for (auto t : src.timestamps) { + float time = frame_shift_s * t; + r.timestamps.push_back(time); + } + return r; +} + +class OnlineRecognizer::OnlineRecognizerImpl { + public: + explicit OnlineRecognizerImpl(const OnlineRecognizerConfig &config) + : config_(config), + symbol_table_(config.tokens), + endpoint_(std::make_unique(config.endpoint_config)) { + if (config.use_gpu) { + device_ = torch::Device("cuda:0"); + } + + std::string class_name; + if (config.nn_model.empty()) { + // for torch.jit.trace + torch::jit::Module encoder = + torch::jit::load(config.encoder_model, torch::kCPU); + class_name = encoder.type()->name()->name(); + + if (class_name == "RNN") { + // For OnlineLstmTransducerModel + model_ = std::make_unique( + config.encoder_model, config.decoder_model, config.joiner_model, + device_); + } else if (class_name == "Zipformer") { + // For OnlineZipformerTransducerModel + // model generated by torch.jit.trace() + + model_ = std::make_unique( + config_.encoder_model, config.decoder_model, config.joiner_model, + device_); + } + } else { + torch::jit::Module m = torch::jit::load(config.nn_model, torch::kCPU); + auto encoder = m.attr("encoder").toModule(); + class_name = encoder.type()->name()->name(); + + if (class_name == "Emformer") { + if (encoder.find_method("infer")) { + // Emformer from torchaudio + model_ = std::make_unique( + config.nn_model, device_); + } else { + // ConvEmformer from icefall + model_ = std::make_unique( + config.nn_model, device_); + } + } else if (class_name == "Conformer") { + int32_t left_context = config.left_context; + int32_t right_context = config.right_context; + int32_t chunk_size = config.chunk_size; + SHERPA_CHECK_GT(left_context, 0); + SHERPA_CHECK_GE(right_context, 0); + SHERPA_CHECK_GT(chunk_size, 0); + + model_ = std::make_unique( + config.nn_model, left_context, right_context, chunk_size, device_); + } else if (class_name == "Zipformer") { + // For OnlineZipformerTransducerModel + // model generated by torch.jit.script() + model_ = std::make_unique( + config.nn_model, device_); + } else if (class_name == "StreamingEncoderModel") { + // For OnlineZipformer2TransducerModel + // model generated by torch.jit.script() + model_ = std::make_unique( + config.nn_model, device_); + } + } + + if (!model_) { + std::ostringstream os; + os << "Support only the following streaming models from icefall:" 
+ << "\n" + << "conv_emformer_transducer_stateless2" + << "\n" + << "lstm_transducer_stateless2" + << "\n" + << "pruned_stateless_emformer_rnnt2" + << "\n" + << "pruned_transducer_stateless{2,3,4,5}" + << "\n" + << "pruned_transducer_stateless7_streaming" + << "\n" + << "zipformer" + << "\n" + << "Given: " << class_name << "\n"; + SHERPA_LOG(FATAL) << os.str(); + } + + WarmUp(); + + if (config.decoding_method == "greedy_search") { + decoder_ = + std::make_unique(model_.get()); + } else if (config.decoding_method == "modified_beam_search") { + decoder_ = std::make_unique( + model_.get(), config.num_active_paths, config.temperature); + } else if (config.decoding_method == "fast_beam_search") { + config.fast_beam_search_config.Validate(); + + decoder_ = std::make_unique( + model_.get(), config.fast_beam_search_config); + } else { + TORCH_CHECK(false, + "Unsupported decoding method: ", config.decoding_method); + } + } + + void InitOnlineStream(OnlineStream *stream) const { + auto r = decoder_->GetEmptyResult(); + + if (config_.decoding_method == "modified_beam_search" && + nullptr != stream->GetContextGraph()) { + // r.hyps has only one element. + for (auto it = r.hyps.begin(); it != r.hyps.end(); ++it) { + it->second.context_state = stream->GetContextGraph()->Root(); + } + } + + stream->SetResult(r); + + auto state = model_->GetEncoderInitStates(); + stream->SetState(state); + } + + std::unique_ptr CreateStream() { + auto s = std::make_unique(config_.feat_config); + InitOnlineStream(s.get()); + return s; + } + + std::unique_ptr CreateStream( + const std::vector> &contexts) { + // We create context_graph at this level, because we might have default + // context_graph(will be added later if needed) that belongs to the whole + // model rather than each stream. 
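+      //
+      // Hypothetical caller-side sketch (not part of this patch; the token
+      // IDs below are made up): each biasing word/phrase is first converted
+      // to a sequence of token IDs using the same tokens.txt as the
+      // recognizer, and then passed to the public CreateStream() overload:
+      //
+      //   std::vector<std::vector<int32_t>> contexts = {
+      //       {22, 58, 24},  // token IDs for one phrase (made-up values)
+      //       {501, 7},      // token IDs for another phrase
+      //   };
+      //   auto stream = recognizer.CreateStream(contexts);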
+ auto context_graph = + std::make_shared(contexts, config_.context_score); + auto s = std::make_unique(config_.feat_config, + context_graph); + InitOnlineStream(s.get()); + return s; + } + + bool IsReady(OnlineStream *s) { + // TODO(fangjun): Pass chunk_size to OnlineStream on creation + int32_t chunk_size = model_->ChunkSize(); + return s->NumFramesReady() - s->GetNumProcessedFrames() >= chunk_size; + } + + void DecodeStreams(OnlineStream **ss, int32_t n) { + InferenceMode no_grad; + + SHERPA_CHECK_GT(n, 0); + + auto device = model_->Device(); + int32_t chunk_size = model_->ChunkSize(); + int32_t chunk_shift = model_->ChunkShift(); + + std::vector all_features(n); + std::vector all_states(n); + std::vector all_processed_frames(n); + std::vector all_results(n); + bool has_context_graph = false; + for (int32_t i = 0; i != n; ++i) { + OnlineStream *s = ss[i]; + + if (!has_context_graph && s->GetContextGraph()) has_context_graph = true; + + SHERPA_CHECK(IsReady(s)); + int32_t num_processed_frames = s->GetNumProcessedFrames(); + + std::vector features_vec(chunk_size); + for (int32_t k = 0; k != chunk_size; ++k) { + features_vec[k] = s->GetFrame(num_processed_frames + k); + } + + torch::Tensor features = torch::cat(features_vec, /*dim*/ 0); + + all_features[i] = std::move(features); + all_states[i] = s->GetState(); + all_processed_frames[i] = num_processed_frames; + all_results[i] = s->GetResult(); + } // for (int32_t i = 0; i != n; ++i) { + + auto batched_features = torch::stack(all_features, /*dim*/ 0); + batched_features = batched_features.to(device); + + torch::Tensor features_length = + torch::full({n}, chunk_size, torch::kLong).to(device); + + torch::IValue stacked_states = model_->StackStates(all_states); + torch::Tensor processed_frames = + torch::tensor(all_processed_frames, torch::kLong).to(device); + + torch::Tensor encoder_out; + torch::Tensor encoder_out_lens; + torch::IValue next_states; + + std::tie(encoder_out, encoder_out_lens, next_states) = model_->RunEncoder( + batched_features, features_length, processed_frames, stacked_states); + + if (has_context_graph) { + decoder_->Decode(encoder_out, ss, n, &all_results); + } else { + decoder_->Decode(encoder_out, &all_results); + } + + std::vector unstacked_states = + model_->UnStackStates(next_states); + + for (int32_t i = 0; i != n; ++i) { + OnlineStream *s = ss[i]; + all_results[i].num_processed_frames += chunk_shift; + s->SetResult(all_results[i]); + s->SetState(std::move(unstacked_states[i])); + s->GetNumProcessedFrames() += chunk_shift; // TODO(fangjun): Remove it + } + } + + OnlineRecognitionResult GetResult(OnlineStream *s) { + auto r = s->GetResult(); // we use a copy here as we will change it below + + // Caution: FinalizeResult should be invoked before StripLeadingBlanks. 
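+    // is_final: InputFinished() was called on this stream and no full chunk
+    // of frames is left to decode.  is_endpoint: endpointing is enabled via
+    // --use-endpoint and the endpoint rule fires on the trailing blanks of
+    // this stream.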
+ bool is_endpoint = config_.use_endpoint && IsEndpoint(s); + bool is_final = !IsReady(s) && s->IsLastFrame(s->NumFramesReady() - 1); + + if (is_endpoint || is_final) { + decoder_->FinalizeResult(s, &r); + } + + decoder_->StripLeadingBlanks(&r); + + auto ans = Convert(r, symbol_table_, + config_.feat_config.fbank_opts.frame_opts.frame_shift_ms, + model_->SubsamplingFactor(), config_.use_bbpe); + + ans.is_final = is_final || is_endpoint; + ans.segment = s->GetWavSegment(); + float frame_shift_s = + config_.feat_config.fbank_opts.frame_opts.frame_shift_ms / 1000.; + ans.start_time = s->GetStartFrame() * frame_shift_s; + s->GetNumTrailingBlankFrames() = r.num_trailing_blanks; + + if (is_endpoint) { + auto r = decoder_->GetEmptyResult(); + + if (config_.decoding_method == "modified_beam_search" && + nullptr != s->GetContextGraph()) { + // r.hyps has only one element. + for (auto it = r.hyps.begin(); it != r.hyps.end(); ++it) { + it->second.context_state = s->GetContextGraph()->Root(); + } + } + + s->SetResult(r); + s->GetWavSegment() += 1; + s->GetStartFrame() = s->GetNumProcessedFrames(); + s->GetNumTrailingBlankFrames() = 0; + } + return ans; + } + + bool IsEndpoint(OnlineStream *s) const { + return endpoint_->IsEndpoint( + s->GetNumProcessedFrames() - s->GetStartFrame(), + s->GetNumTrailingBlankFrames() * model_->SubsamplingFactor(), + config_.feat_config.fbank_opts.frame_opts.frame_shift_ms / 1000.0); + } + + const OnlineRecognizerConfig &GetConfig() const { return config_; } + + private: + void WarmUp() { + SHERPA_LOG(INFO) << "WarmUp begins"; + torch::Tensor features = + torch::rand({1, model_->ChunkSize(), + config_.feat_config.fbank_opts.mel_opts.num_bins}, + device_); + torch::Tensor features_length = + torch::full({features.size(0)}, model_->ChunkSize(), torch::kLong) + .to(device_); + model_->WarmUp(features, features_length); + +#if 0 + // We don't use the following code since we want to set `model_->vocab_size` + auto s = CreateStream(); + float sample_rate = config_.feat_config.fbank_opts.frame_opts.samp_freq; + torch::tensor samples({2 * static_cast(sample_rate)}, + torch::kFloat); + + s->AcceptWaveform(sample_rate, samples); + s->InputFinished(); + OnlineStream ss[1] = {s.get()}; + DecodeStreams(ss, 1); +#endif + + SHERPA_LOG(INFO) << "WarmUp ended"; + } + + private: + OnlineRecognizerConfig config_; + torch::Device device_{"cpu"}; + std::unique_ptr model_; + std::unique_ptr decoder_; + SymbolTable symbol_table_; + std::unique_ptr endpoint_; +}; + +OnlineRecognizer::OnlineRecognizer(const OnlineRecognizerConfig &config) + : impl_(std::make_unique(config)) {} + +OnlineRecognizer::~OnlineRecognizer() = default; + +std::unique_ptr OnlineRecognizer::CreateStream() { + return impl_->CreateStream(); +} + +std::unique_ptr OnlineRecognizer::CreateStream( + const std::vector> &contexts_list) { + return impl_->CreateStream(contexts_list); +} + +bool OnlineRecognizer::IsReady(OnlineStream *s) { return impl_->IsReady(s); } + +bool OnlineRecognizer::IsEndpoint(OnlineStream *s) { + return impl_->IsEndpoint(s); +} + +void OnlineRecognizer::DecodeStreams(OnlineStream **ss, int32_t n) { + InferenceMode no_grad; + impl_->DecodeStreams(ss, n); +} + +OnlineRecognitionResult OnlineRecognizer::GetResult(OnlineStream *s) { + return impl_->GetResult(s); +} + +const OnlineRecognizerConfig &OnlineRecognizer::GetConfig() const { + return impl_->GetConfig(); +} + +} // namespace sherpa diff --git a/sherpa/cpp_api/online-recognizer.h b/sherpa/cpp_api/online-recognizer.h new file mode 100644 index 
000000000..9c82f8ac8 --- /dev/null +++ b/sherpa/cpp_api/online-recognizer.h @@ -0,0 +1,140 @@ +// sherpa/cpp_api/online-recognizer.h +// +// Copyright (c) 2022 Xiaomi Corporation + +#ifndef SHERPA_CPP_API_ONLINE_RECOGNIZER_H_ +#define SHERPA_CPP_API_ONLINE_RECOGNIZER_H_ + +#include +#include +#include + +#include "sherpa/cpp_api/endpoint.h" +#include "sherpa/cpp_api/fast-beam-search-config.h" +#include "sherpa/cpp_api/feature-config.h" +#include "sherpa/cpp_api/macros.h" +#include "sherpa/cpp_api/online-stream.h" + +namespace sherpa { + +struct OnlineRecognizerConfig { + /// Config for the feature extractor + FeatureConfig feat_config; + + EndpointConfig endpoint_config; + + FastBeamSearchConfig fast_beam_search_config; + + /// Path to the torchscript model + std::string nn_model; + + /// Path to tokens.txt + std::string tokens; + + // The following three are for RnntLstmModel + std::string encoder_model; + std::string decoder_model; + std::string joiner_model; + + /// true to use GPU for neural network computation and decoding. + /// false to use CPU. + /// You can use CUDA_VISIBLE_DEVICES to control which device to use. + /// We always use GPU 0 in the code. This also implies it supports only + /// 1 GPU at present. + /// Note: You have to use a CUDA version of PyTorch in order to use + /// GPU for computation + bool use_gpu = false; + + bool use_endpoint = false; + + std::string decoding_method = "greedy_search"; + + /// used only for modified_beam_search + int32_t num_active_paths = 4; + + /// used only for modified_beam_search + float context_score = 1.5; + + // For OnlineConformerTransducerModel, i.e., for models from + // pruned_transducer_stateless{2,3,4,5} in icefall + // In number of frames after subsampling + int32_t left_context = 64; + + // For OnlineConformerTransducerModel, i.e., for models from + // pruned_transducer_stateless{2,3,4,5} in icefall + // In number of frames after subsampling + int32_t right_context = 0; + + // For OnlineConformerTransducerModel, i.e., for models from + // pruned_transducer_stateless{2,3,4,5} in icefall + // In number of frames after subsampling + int32_t chunk_size = 12; + + // True if the model used is trained with byte level bpe. + bool use_bbpe = false; + + // temperature for the softmax in the joiner + float temperature = 1.0; + + void Register(ParseOptions *po); + + void Validate() const; + + /** A string representation for debugging purpose. */ + std::string ToString() const; +}; + +class OnlineRecognizer { + public: + /** Construct an instance of OnlineRecognizer. + * + * @param config Configuration for the recognizer. + */ + explicit OnlineRecognizer(const OnlineRecognizerConfig &config); + + ~OnlineRecognizer(); + + const OnlineRecognizerConfig &GetConfig() const; + + // Create a stream for decoding. + std::unique_ptr CreateStream(); + + // Create a stream with context phrases + std::unique_ptr CreateStream( + const std::vector> &context_list); + + /** + * Return true if the given stream has enough frames for decoding. + * Return false otherwise + */ + bool IsReady(OnlineStream *s); + + /** + * Return true if VAD activity + * Return false otherwise + */ + bool IsEndpoint(OnlineStream *s); + + /** Decode a single stream. */ + void DecodeStream(OnlineStream *s) { + OnlineStream *ss[1] = {s}; + DecodeStreams(ss, 1); + } + + /** Decode multiple streams in parallel + * + * @param ss Pointer array containing streams to be decoded. + * @param n Number of streams in `ss`. 
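+   *
+   * A rough single-stream usage sketch (paths below are placeholders, not
+   * part of this header):
+   *
+   *   sherpa::OnlineRecognizerConfig config;
+   *   config.nn_model = "/path/to/cpu_jit.pt";   // placeholder path
+   *   config.tokens = "/path/to/tokens.txt";     // placeholder path
+   *   sherpa::OnlineRecognizer recognizer(config);
+   *
+   *   auto s = recognizer.CreateStream();
+   *   s->AcceptWaveform(16000, samples);  // samples: 1-D float tensor in [-1, 1]
+   *   s->InputFinished();
+   *   while (recognizer.IsReady(s.get())) {
+   *     recognizer.DecodeStream(s.get());  // forwards to DecodeStreams(ss, 1)
+   *   }
+   *   std::cout << recognizer.GetResult(s.get()).AsJsonString() << "\n";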
+ */ + void DecodeStreams(OnlineStream **ss, int32_t n); + + OnlineRecognitionResult GetResult(OnlineStream *s); + + private: + class OnlineRecognizerImpl; + std::unique_ptr impl_; +}; + +} // namespace sherpa + +#endif // SHERPA_CPP_API_ONLINE_RECOGNIZER_H_ diff --git a/sherpa/cpp_api/online-stream.h b/sherpa/cpp_api/online-stream.h new file mode 100644 index 000000000..c591916fe --- /dev/null +++ b/sherpa/cpp_api/online-stream.h @@ -0,0 +1,157 @@ +// sherpa/cpp_api/online-stream.h +// +// Copyright (c) 2022 Xiaomi Corporation +#ifndef SHERPA_CPP_API_ONLINE_STREAM_H_ +#define SHERPA_CPP_API_ONLINE_STREAM_H_ + +#include +#include +#include +#include + +#include "sherpa/cpp_api/feature-config.h" +#include "sherpa/csrc/context-graph.h" +#include "torch/script.h" + +namespace sherpa { + +struct OnlineRecognitionResult { + /// Recognition results. + /// For English, it consists of space separated words. + /// For Chinese, it consists of Chinese words without spaces. + std::string text; + + /// Decoded results at the token level. + /// For instance, for BPE-based models it consists of a list of BPE tokens. + std::vector tokens; + + /// timestamps.size() == tokens.size() + /// timestamps[i] records the time in seconds when tokens[i] is decoded. + std::vector timestamps; + + /// ID of this segment + int32_t segment = 0; + + /// Starting frame of this segment. + float start_time = 0; + + /// True if this is the last segment. + bool is_final = false; + + /** Return a json string. + * + * The returned string contains: + * { + * "text": "The recognition result", + * "tokens": [x, x, x], + * "timestamps": [x, x, x], + * "segment": x, + * "start_time": x, + * "is_final": true|false + * } + */ + std::string AsJsonString() const; +}; + +class Hypotheses; +struct OnlineTransducerDecoderResult; + +class OnlineStream { + public: + explicit OnlineStream(const FeatureConfig &feat_config, + ContextGraphPtr context_graph = nullptr); + ~OnlineStream(); + + /** This would be called from the application, when you get + * more wave data. + * + * @param sampling_rate Sampling rate of the input waveform. If it is + * different from the sampling rate expected by the + * model, we will do resampling inside sherpa. + * @param waveform A 1-D array containing audio samples. For + * models from icefall, the samples should be in the + * range [-1, 1]. + */ + void AcceptWaveform(int32_t sampling_rate, torch::Tensor waveform); + + /** Returns the total number of frames, since the start of the utterance, that + * are now available. In an online-decoding context, this will likely + * increase with time as more data becomes available. + */ + int32_t NumFramesReady() const; + + /** Returns true if this is the last frame. + * + * Frame indices are zero-based, so the first frame is zero. + */ + bool IsLastFrame(int32_t frame) const; + + /** InputFinished() tells the class you won't be providing any more waveform. + * + * It also affects the return value of IsLastFrame(). + */ + void InputFinished(); + + /**Get a frame by its index. + * + * @param frame The frame number. It starts from 0. + * + * @return Return a 2-D array of shape [1, feature_dim] + */ + torch::Tensor GetFrame(int32_t frame); + + /** + * Get the state of the encoder network corresponding to this stream. + * + * @return Return the state of the encoder network for this stream. + */ + torch::IValue GetState() const; + + /** + * Set the state of the encoder network corresponding to this stream. + * + * @param state The state to set. 
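+   *
+   *        (As used by the recognizer: GetState() is collected from every
+   *        stream, the states are stacked into a batch for the encoder, and
+   *        the unstacked next states are written back here after each chunk.)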
+ */ + void SetState(torch::IValue state); + + /** + * Get the context graph corresponding to this stream. + * + * @return Return the context graph for this stream. + */ + const ContextGraphPtr &GetContextGraph() const; + + // Return a reference to the number of processed frames so far. + // Initially, it is 0. It is always less than NumFramesReady(). + // + // The returned reference is valid as long as this object is alive. + int32_t &GetNumProcessedFrames(); + + void SetResult(const OnlineTransducerDecoderResult &r); + const OnlineTransducerDecoderResult &GetResult() const; + + // Return a reference to the decoder output of the last chunk. + // Its shape is [1, decoder_dim] + torch::Tensor &GetDecoderOut(); + + // Used only for greedy search + // + // Get number of trailing blank frames decoded so far + // + // The returned reference is valid as long as this object is alive. + int32_t &GetNumTrailingBlankFrames(); + + // Return ID of this segment in Stream + int32_t &GetWavSegment(); + + // Return Starting frame of this segment. + int32_t &GetStartFrame(); + + private: + class OnlineStreamImpl; + std::unique_ptr impl_; +}; + +} // namespace sherpa + +#endif // SHERPA_CPP_API_ONLINE_STREAM_H_ diff --git a/sherpa/csrc/parse_options.h b/sherpa/cpp_api/parse-options.h similarity index 91% rename from sherpa/csrc/parse_options.h rename to sherpa/cpp_api/parse-options.h index c8bc8edff..d70d1ee59 100644 --- a/sherpa/csrc/parse_options.h +++ b/sherpa/cpp_api/parse-options.h @@ -1,28 +1,13 @@ -/** - * Copyright 2009-2011 Karel Vesely; Microsoft Corporation; - * Saarland University (Author: Arnab Ghoshal); - * Copyright 2012-2013 Frantisek Skala; Arnab Ghoshal - * - * See LICENSE for clarification regarding multiple authors - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - +// sherpa/cpp_api/feature-config.h +// +// Copyright (c) 2022 Xiaomi Corporation +// // This file is copied and modified from kaldi/src/util/parse-options.h -#ifndef SHERPA_CSRC_PARSE_OPTIONS_H_ -#define SHERPA_CSRC_PARSE_OPTIONS_H_ +#ifndef SHERPA_CPP_API_PARSE_OPTIONS_H_ +#define SHERPA_CPP_API_PARSE_OPTIONS_H_ +#include #include #include #include @@ -265,4 +250,4 @@ void ReadConfigsFromFile(const std::string &conf, C1 *c1, C2 *c2) { } // namespace sherpa -#endif // SHERPA_CSRC_PARSE_OPTIONS_H_ +#endif // SHERPA_CPP_API_PARSE_OPTIONS_H_ diff --git a/sherpa/cpp_api/test-feature-config.cc b/sherpa/cpp_api/test-feature-config.cc new file mode 100644 index 000000000..638b1a099 --- /dev/null +++ b/sherpa/cpp_api/test-feature-config.cc @@ -0,0 +1,18 @@ +// sherpa/cpp_api/test-feature-config.h +// +// Copyright (c) 2022 Xiaomi Corporation + +#include "sherpa/cpp_api/feature-config.h" +#include "sherpa/cpp_api/parse-options.h" + +int main(int argc, char *argv[]) { + sherpa::ParseOptions po(""); + sherpa::FeatureConfig feat_config; + feat_config.Register(&po); + po.Read(argc, argv); + po.PrintUsage(); + + std::cout << feat_config << "\n"; + + return 0; +} diff --git a/sherpa/cpp_api/test-offline-stream.cc b/sherpa/cpp_api/test-offline-stream.cc new file mode 100644 index 000000000..8193158bf --- /dev/null +++ b/sherpa/cpp_api/test-offline-stream.cc @@ -0,0 +1,49 @@ +// sherpa/cpp_api/test-offline-stream.cc +// +// Copyright (c) 2022 Xiaomi Corporation + +#include "sherpa/cpp_api/feature-config.h" +#include "sherpa/cpp_api/offline-stream.h" + +int main(int argc, char *argv[]) { + sherpa::FeatureConfig feat_config; + kaldifeat::Fbank fbank(feat_config.fbank_opts); + sherpa::OfflineRecognitionResult r; + r.text = "hello world"; + + if (argc == 2) { + std::cout << "===test from wave file===\n"; + sherpa::OfflineStream s(&fbank, feat_config); + s.AcceptWaveFile(argv[1]); + auto f = s.GetFeatures(); + std::cout << "f.sizes(): " << f.sizes() << "\n"; + s.SetResult(r); + std::cout << s.GetResult().text << "\n"; + } + + { + std::cout << "===test from samples===\n"; + torch::Tensor samples = torch::rand({160000}, torch::kFloat); + sherpa::OfflineStream s(&fbank, feat_config); + s.AcceptSamples(samples.data_ptr(), samples.numel()); + auto f = s.GetFeatures(); + std::cout << "f.sizes(): " << f.sizes() << "\n"; + s.SetResult(r); + std::cout << s.GetResult().text << "\n"; + } + + { + std::cout << "===test from features===\n"; + torch::Tensor features = torch::rand( + {50, feat_config.fbank_opts.mel_opts.num_bins}, torch::kFloat); + sherpa::OfflineStream s(&fbank, feat_config); + s.AcceptFeatures(features.data_ptr(), features.size(0), + features.size(1)); + auto f = s.GetFeatures(); + std::cout << "f.sizes(): " << f.sizes() << "\n"; + s.SetResult(r); + std::cout << s.GetResult().text << "\n"; + } + + return 0; +} diff --git a/sherpa/cpp_api/test_decode_features.cc b/sherpa/cpp_api/test_decode_features.cc deleted file mode 100644 index 14d87d28c..000000000 --- a/sherpa/cpp_api/test_decode_features.cc +++ /dev/null @@ -1,127 +0,0 @@ -/** - * Copyright 2022 Xiaomi Corporation (authors: Fangjun Kuang) - * - * See LICENSE for clarification regarding multiple authors - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#include - -#include "kaldi_native_io/csrc/kaldi-io.h" -#include "kaldi_native_io/csrc/wave-reader.h" -#include "sherpa/cpp_api/offline_recognizer.h" -#include "sherpa/csrc/fbank_features.h" -#include "torch/script.h" - -/** Read wave samples from a file. - * - * If the file has multiple channels, only the first channel is returned. - * Samples are normalized to the range [-1, 1). - * - * @param filename Path to the wave file. Only "*.wav" format is supported. - * @param expected_sample_rate Expected sample rate of the wave file. It aborts - * if the sample rate of the given file is not - * equal to this value. - * - * @return Return a 1-D torch.float32 tensor containing audio samples - * in the range [-1, 1) - */ -static torch::Tensor ReadWave(const std::string &filename, - float expected_sample_rate) { - bool binary = true; - kaldiio::Input ki(filename, &binary); - kaldiio::WaveHolder wh; - if (!wh.Read(ki.Stream())) { - std::cerr << "Failed to read " << filename; - exit(EXIT_FAILURE); - } - - auto &wave_data = wh.Value(); - if (wave_data.SampFreq() != expected_sample_rate) { - std::cerr << filename << "is expected to have sample rate " - << expected_sample_rate << ". Given " << wave_data.SampFreq(); - exit(EXIT_FAILURE); - } - - auto &d = wave_data.Data(); - - if (d.NumRows() > 1) { - std::cerr << "Only the first channel from " << filename << " is used"; - } - - auto tensor = torch::from_blob(const_cast(d.RowData(0)), - {d.NumCols()}, torch::kFloat); - - return tensor / 32768; -} - -int main(int argc, char *argv[]) { - if (argc < 4) { - std::cerr << "Usage: ./bin/test_decode_file /path/to/nn_model " - "/path/to/tokens.txt foo.wav [bar.wav [foobar.wav] ... 
]\n"; - exit(EXIT_FAILURE); - } - std::string nn_model = argv[1]; - std::string tokens = argv[2]; - float sample_rate = 16000; - bool use_gpu = false; - - sherpa::DecodingOptions opts; - opts.method = sherpa::kGreedySearch; - sherpa::OfflineRecognizer recognizer(nn_model, tokens, opts, use_gpu, - sample_rate); - - kaldifeat::FbankOptions fbank_opts; - fbank_opts.frame_opts.dither = 0; - fbank_opts.frame_opts.samp_freq = sample_rate; - fbank_opts.mel_opts.num_bins = 80; - - kaldifeat::Fbank fbank(fbank_opts); // always on CPU - - if (argc == 4) { - std::cout << "Decode single file\n"; - - auto samples = ReadWave(argv[3], sample_rate); - auto feature = fbank.ComputeFeatures(samples, 1.0); - - auto result = recognizer.DecodeFeatures(feature.data_ptr(), - feature.size(0), 80); - std::cout << argv[3] << "\n" << result.text << "\n"; - return 0; - } - - std::cout << "Decode multiple files\n"; - - std::vector features; - std::vector features_length; - for (int i = 3; i != argc; ++i) { - auto samples = ReadWave(argv[i], sample_rate); - auto feature = fbank.ComputeFeatures(samples, 1.0); - features.push_back(feature); - - features_length.push_back(feature.size(0)); - } - torch::Tensor padded_features = torch::nn::utils::rnn::pad_sequence( - features, /*batch_first*/ true, - /*padding_value*/ -23.025850929940457f); - - auto results = recognizer.DecodeFeaturesBatch( - padded_features.data_ptr(), features_length.data(), - padded_features.size(0), padded_features.size(1), 80); - - for (size_t i = 0; i != features_length.size(); ++i) { - std::cout << argv[i + 3] << "\n" << results[i].text << "\n\n"; - } - - return 0; -} diff --git a/sherpa/cpp_api/test_decode_files.cc b/sherpa/cpp_api/test_decode_files.cc deleted file mode 100644 index 090fac59d..000000000 --- a/sherpa/cpp_api/test_decode_files.cc +++ /dev/null @@ -1,57 +0,0 @@ -/** - * Copyright 2022 Xiaomi Corporation (authors: Fangjun Kuang) - * - * See LICENSE for clarification regarding multiple authors - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#include - -#include "sherpa/cpp_api/offline_recognizer.h" - -int main(int argc, char *argv[]) { - if (argc < 4) { - std::cerr << "Usage: ./bin/test_decode_files /path/to/nn_model " - "/path/to/tokens.txt foo.wav [bar.wav [foobar.wav] ... 
]\n"; - exit(EXIT_FAILURE); - } - std::string nn_model = argv[1]; - std::string tokens = argv[2]; - float sample_rate = 16000; - bool use_gpu = false; - - sherpa::DecodingOptions opts; - opts.method = sherpa::kGreedySearch; - sherpa::OfflineRecognizer recognizer(nn_model, tokens, opts, use_gpu, - sample_rate); - - if (argc == 4) { - std::cout << "Decode single file\n"; - auto result = recognizer.DecodeFile(argv[3]); - std::cout << argv[3] << "\n" << result.text << "\n"; - return 0; - } - - std::cout << "Decode multiple files\n"; - - std::vector filenames; - for (int i = 3; i != argc; ++i) { - filenames.push_back(argv[i]); - } - - auto results = recognizer.DecodeFileBatch(filenames); - for (size_t i = 0; i != filenames.size(); ++i) { - std::cout << filenames[i] << "\n" << results[i].text << "\n\n"; - } - return 0; -} diff --git a/sherpa/cpp_api/test_decode_samples.cc b/sherpa/cpp_api/test_decode_samples.cc deleted file mode 100644 index 6deb1b658..000000000 --- a/sherpa/cpp_api/test_decode_samples.cc +++ /dev/null @@ -1,110 +0,0 @@ -/** - * Copyright 2022 Xiaomi Corporation (authors: Fangjun Kuang) - * - * See LICENSE for clarification regarding multiple authors - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#include - -#include "kaldi_native_io/csrc/kaldi-io.h" -#include "kaldi_native_io/csrc/wave-reader.h" -#include "sherpa/cpp_api/offline_recognizer.h" -#include "torch/script.h" - -/** Read wave samples from a file. - * - * If the file has multiple channels, only the first channel is returned. - * Samples are normalized to the range [-1, 1). - * - * @param filename Path to the wave file. Only "*.wav" format is supported. - * @param expected_sample_rate Expected sample rate of the wave file. It aborts - * if the sample rate of the given file is not - * equal to this value. - * - * @return Return a 1-D torch.float32 tensor containing audio samples - * in the range [-1, 1) - */ -static torch::Tensor ReadWave(const std::string &filename, - float expected_sample_rate) { - bool binary = true; - kaldiio::Input ki(filename, &binary); - kaldiio::WaveHolder wh; - if (!wh.Read(ki.Stream())) { - std::cerr << "Failed to read " << filename; - exit(EXIT_FAILURE); - } - - auto &wave_data = wh.Value(); - if (wave_data.SampFreq() != expected_sample_rate) { - std::cerr << filename << "is expected to have sample rate " - << expected_sample_rate << ". Given " << wave_data.SampFreq(); - exit(EXIT_FAILURE); - } - - auto &d = wave_data.Data(); - - if (d.NumRows() > 1) { - std::cerr << "Only the first channel from " << filename << " is used"; - } - - auto tensor = torch::from_blob(const_cast(d.RowData(0)), - {d.NumCols()}, torch::kFloat); - - return tensor / 32768; -} - -int main(int argc, char *argv[]) { - if (argc < 4) { - std::cerr << "Usage: ./bin/test_decode_file /path/to/nn_model " - "/path/to/tokens.txt foo.wav [bar.wav [foobar.wav] ... 
]\n"; - exit(EXIT_FAILURE); - } - std::string nn_model = argv[1]; - std::string tokens = argv[2]; - float sample_rate = 16000; - bool use_gpu = false; - - sherpa::DecodingOptions opts; - opts.method = sherpa::kGreedySearch; - sherpa::OfflineRecognizer recognizer(nn_model, tokens, opts, use_gpu, - sample_rate); - - if (argc == 4) { - std::cout << "Decode single file\n"; - auto tensor = ReadWave(argv[3], sample_rate); - auto result = - recognizer.DecodeSamples(tensor.data_ptr(), tensor.size(0)); - std::cout << argv[3] << "\n" << result.text << "\n"; - return 0; - } - - std::cout << "Decode multiple files\n"; - - std::vector tensors; - std::vector tensors_addr; - std::vector tensors_length; - for (int i = 3; i != argc; ++i) { - tensors.push_back(ReadWave(argv[i], sample_rate)); - tensors_addr.push_back(tensors.back().data_ptr()); - tensors_length.push_back(tensors.back().size(0)); - } - - auto results = recognizer.DecodeSamplesBatch( - tensors_addr.data(), tensors_length.data(), tensors_length.size()); - - for (size_t i = 0; i != tensors_length.size(); ++i) { - std::cout << argv[i + 3] << "\n" << results[i].text << "\n\n"; - } - return 0; -} diff --git a/sherpa/cpp_api/websocket/CMakeLists.txt b/sherpa/cpp_api/websocket/CMakeLists.txt new file mode 100644 index 000000000..6d7c94bd3 --- /dev/null +++ b/sherpa/cpp_api/websocket/CMakeLists.txt @@ -0,0 +1,144 @@ +add_definitions(-DASIO_STANDALONE) +add_definitions(-D_WEBSOCKETPP_CPP11_STL_) + +add_executable(sherpa-offline-websocket-server + offline-websocket-server.cc + offline-websocket-server-impl.cc +) +target_link_libraries(sherpa-offline-websocket-server + sherpa_cpp_api +) +if(NOT WIN32) + target_link_libraries(sherpa-offline-websocket-server -pthread) + target_compile_options(sherpa-offline-websocket-server PRIVATE -Wno-deprecated-declarations) +endif() + +add_executable(sherpa-offline-websocket-client + offline-websocket-client.cc +) + +target_link_libraries(sherpa-offline-websocket-client + sherpa_core + kaldi_native_io_core +) + +if(NOT WIN32) + target_link_libraries(sherpa-offline-websocket-client -pthread) + target_compile_options(sherpa-offline-websocket-client PRIVATE -Wno-deprecated-declarations) +endif() + +add_executable(sherpa-online-websocket-server + online-websocket-server.cc + online-websocket-server-impl.cc +) +target_link_libraries(sherpa-online-websocket-server sherpa_cpp_api) + +if(NOT WIN32) + target_link_libraries(sherpa-online-websocket-server -pthread) + target_compile_options(sherpa-online-websocket-server PRIVATE -Wno-deprecated-declarations) +endif() + +add_executable(sherpa-online-websocket-client + online-websocket-client.cc +) + +target_link_libraries(sherpa-online-websocket-client + sherpa_core + kaldi_native_io_core +) + +if(NOT WIN32) + target_link_libraries(sherpa-online-websocket-client -pthread) +endif() + +if(SHERPA_ENABLE_PORTAUDIO) + add_executable(sherpa-online-websocket-client-microphone + online-websocket-client-from-microphone.cc + microphone.cc + ) + + target_link_libraries(sherpa-online-websocket-client-microphone sherpa_core) + + if(BUILD_SHARED_LIBS) + target_link_libraries(sherpa-online-websocket-client-microphone portaudio) + else() + target_link_libraries(sherpa-online-websocket-client-microphone portaudio_static) + endif() + + if(NOT WIN32) + target_link_libraries(sherpa-online-websocket-client-microphone -pthread) + target_compile_options(sherpa-online-websocket-client-microphone PRIVATE -Wno-deprecated-declarations) + endif() +endif() + +set(bins + sherpa-offline-websocket-server + 
sherpa-offline-websocket-client + sherpa-online-websocket-server + sherpa-online-websocket-client +) +if(SHERPA_ENABLE_PORTAUDIO) + list(APPEND bins sherpa-online-websocket-client-microphone) +endif() + +if(NOT WIN32) + if(NOT DEFINED ENV{VIRTUAL_ENV}) + message(STATUS "Outside a virtual environment") + execute_process( + COMMAND "${PYTHON_EXECUTABLE}" -c "import site; print(';'.join(site.getsitepackages()))" + OUTPUT_STRIP_TRAILING_WHITESPACE + OUTPUT_VARIABLE path_list + ) + else() + message(STATUS "Inside a virtual environment") + execute_process( + COMMAND "${PYTHON_EXECUTABLE}" -c "from distutils.sysconfig import get_python_lib; print(get_python_lib())" + OUTPUT_STRIP_TRAILING_WHITESPACE + OUTPUT_VARIABLE PYTHON_SITE_PACKAGE_DIR + ) + set(path_list ${PYTHON_SITE_PACKAGE_DIR}) + endif() + + message(STATUS "path list: ${path_list}") + foreach(p IN LISTS path_list) + foreach(exe IN LISTS bins) + target_link_libraries(${exe} "-Wl,-rpath,${p}/sherpa/lib") + target_link_libraries(${exe} "-Wl,-rpath,${p}/../lib") + endforeach() + endforeach() + + foreach(exe IN LISTS bins) + target_link_libraries(${exe} "-Wl,-rpath,${SHERPA_RPATH_ORIGIN}/../lib") + endforeach() + + # add additional paths + set(additional_paths + ${SHERPA_RPATH_ORIGIN}/../lib/python${PYTHON_VERSION_MAJOR}.${PYTHON_VERSION_MINOR}/site-packages/torch/lib + ${SHERPA_RPATH_ORIGIN}/../lib/python${PYTHON_VERSION_MAJOR}.${PYTHON_VERSION_MINOR}/site-packages/torch/lib64 + ${SHERPA_RPATH_ORIGIN}/../lib/python${PYTHON_VERSION_MAJOR}.${PYTHON_VERSION_MINOR}/site-packages/k2/lib + ${SHERPA_RPATH_ORIGIN}/../lib/python${PYTHON_VERSION_MAJOR}.${PYTHON_VERSION_MINOR}/site-packages/k2/lib64 + ${SHERPA_RPATH_ORIGIN}/../lib/python${PYTHON_VERSION_MAJOR}.${PYTHON_VERSION_MINOR}/site-packages/kaldifeat/lib + ${SHERPA_RPATH_ORIGIN}/../lib/python${PYTHON_VERSION_MAJOR}.${PYTHON_VERSION_MINOR}/site-packages/kaldifeat/lib64 + ${SHERPA_RPATH_ORIGIN}/../lib/python${PYTHON_VERSION_MAJOR}.${PYTHON_VERSION_MINOR}/site-packages/sherpa/lib + ${SHERPA_RPATH_ORIGIN}/../lib/python${PYTHON_VERSION_MAJOR}.${PYTHON_VERSION_MINOR}/site-packages/sherpa/lib64 + ${SHERPA_RPATH_ORIGIN}/../lib/python${PYTHON_VERSION_MAJOR}.${PYTHON_VERSION_MINOR}/dist-packages/torch/lib + ${SHERPA_RPATH_ORIGIN}/../lib/python${PYTHON_VERSION_MAJOR}.${PYTHON_VERSION_MINOR}/dist-packages/torch/lib64 + ${SHERPA_RPATH_ORIGIN}/../lib/python${PYTHON_VERSION_MAJOR}.${PYTHON_VERSION_MINOR}/dist-packages/k2/lib + ${SHERPA_RPATH_ORIGIN}/../lib/python${PYTHON_VERSION_MAJOR}.${PYTHON_VERSION_MINOR}/dist-packages/k2/lib64 + ${SHERPA_RPATH_ORIGIN}/../lib/python${PYTHON_VERSION_MAJOR}.${PYTHON_VERSION_MINOR}/dist-packages/kaldifeat/lib + ${SHERPA_RPATH_ORIGIN}/../lib/python${PYTHON_VERSION_MAJOR}.${PYTHON_VERSION_MINOR}/dist-packages/kaldifeat/lib64 + ${SHERPA_RPATH_ORIGIN}/../lib/python${PYTHON_VERSION_MAJOR}.${PYTHON_VERSION_MINOR}/dist-packages/sherpa/lib + ${SHERPA_RPATH_ORIGIN}/../lib/python${PYTHON_VERSION_MAJOR}.${PYTHON_VERSION_MINOR}/dist-packages/sherpa/lib64 + ) + message(STATUS "additional_paths: ${additional_paths}") + foreach(p IN LISTS additional_paths) + foreach(exe IN LISTS bins) + target_link_libraries(${exe} "-Wl,-rpath,${p}") + target_link_libraries(${exe} "-Wl,-rpath,${p}") + endforeach() + endforeach() +endif() + +install(TARGETS ${bins} + DESTINATION bin +) diff --git a/sherpa/cpp_api/websocket/CPPLINT.cfg b/sherpa/cpp_api/websocket/CPPLINT.cfg new file mode 100644 index 000000000..d01294419 --- /dev/null +++ b/sherpa/cpp_api/websocket/CPPLINT.cfg @@ -0,0 +1 @@ 
+exclude_files=tee-stream.h diff --git a/sherpa/cpp_api/websocket/http-server.h b/sherpa/cpp_api/websocket/http-server.h new file mode 100644 index 000000000..350904dde --- /dev/null +++ b/sherpa/cpp_api/websocket/http-server.h @@ -0,0 +1,117 @@ +/** + * Copyright 2022 Xiaomi Corporation (authors: Fangjun Kuang) + * + * See LICENSE for clarification regarding multiple authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef SHERPA_CPP_API_WEBSOCKET_HTTP_SERVER_H_ +#define SHERPA_CPP_API_WEBSOCKET_HTTP_SERVER_H_ + +#include +#include +#include + +namespace sherpa { + +/** Read a text or a binary file. + * + * @param filename The file to read. + * @return Return the file content in a string. + */ +static std::string ReadFile(const std::string &filename) { + std::ifstream file(filename); + + std::string ans; + file.seekg(0, std::ios::end); + ans.reserve(file.tellg()); + file.seekg(0, std::ios::beg); + ans.assign((std::istreambuf_iterator(file)), + std::istreambuf_iterator()); + return ans; +} + +static const char *kKnownFiles[] = { + // Please sort it alphabetically + "/css/bootstrap.min.css", + "/css/bootstrap.min.css.map", + "/index.html", + "/js/bootstrap.min.js", + "/js/bootstrap.min.js.map", + "/js/jquery-3.6.0.min.js", + "/js/offline_record.js", + "/js/offline_record.js", + "/js/popper.min.js", + "/js/popper.min.js.map", + "/js/streaming_record.js", + "/js/upload.js", + "/k2-logo.png", + "/nav-partial.html", + "/offline_record.html", + "/streaming_record.html", + "/upload.html", +}; + +/** A very simple http server. + * + * It serves only static files, e.g., html, js., css, etc. + */ +class HttpServer { + public: + explicit HttpServer(const std::string &root) { + for (const auto filename : kKnownFiles) { + content_.emplace(filename, ReadFile(root + filename)); + } + + error_content_ = R"( + +Speech recognition with next-gen Kaldi +

404 ERROR! Please re-check your URL

+ + )"; + } + + /** Handle a request from the client. + * + * @param filename The filename the client is requesting. + * @param content On return, it contains the content of the file if found. + * Otherwise, it contains the 404 page. + * + * @return Return true if the given file is found; return false otherwise. + */ + bool ProcessRequest(const std::string &filename, std::string *content) const { + auto it = content_.find(filename); + if (it == content_.end()) { + *content = error_content_; + return false; + } + + *content = it->second; + return true; + } + + /** Return a string for 404. */ + const std::string &GetErrorContent() const { return error_content_; } + + private: + /**Return this string to the client for 404 page.*/ + std::string error_content_; + + /** Map filename to its content.*/ + std::unordered_map content_; +}; + +} // namespace sherpa + +#endif // SHERPA_CPP_API_WEBSOCKET_HTTP_SERVER_H_ diff --git a/sherpa/cpp_api/websocket/microphone.cc b/sherpa/cpp_api/websocket/microphone.cc new file mode 100644 index 000000000..90f66e686 --- /dev/null +++ b/sherpa/cpp_api/websocket/microphone.cc @@ -0,0 +1,143 @@ +/** + * Copyright 2022 Xiaomi Corporation (authors: Fangjun Kuang) + * + * See LICENSE for clarification regarding multiple authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "sherpa/cpp_api/websocket/microphone.h" + +#include + +#include + +#include "portaudio.h" // NOLINT +#include "torch/script.h" + +namespace sherpa { + +static int RecordCallback(const void *input_buffer, void * /*output_buffer*/, + unsigned long frames_per_buffer, // NOLINT + const PaStreamCallbackTimeInfo * /*time_info*/, + PaStreamCallbackFlags /*status_flags*/, + void *user_data) { + Microphone *mic = reinterpret_cast(user_data); + + auto samples = + torch::from_blob(static_cast(const_cast(input_buffer)), + {static_cast(frames_per_buffer)}, torch::kFloat) + .clone(); + + mic->Push(samples); + return paContinue; +} + +Microphone::Microphone() { + PaError err = Pa_Initialize(); + if (err != paNoError) { + fprintf(stderr, "portaudio error: %s\n", Pa_GetErrorText(err)); + exit(EXIT_FAILURE); + } +} + +Microphone::~Microphone() { + PaError err = paNoError; + + if (stream_) { + err = Pa_CloseStream(stream_); + if (err != paNoError) { + fprintf(stderr, "portaudio error: %s\n", Pa_GetErrorText(err)); + exit(EXIT_FAILURE); + } + } + + err = Pa_Terminate(); + if (err != paNoError) { + fprintf(stderr, "portaudio error: %s\n", Pa_GetErrorText(err)); + exit(EXIT_FAILURE); + } + + if (t_.joinable()) { + t_.join(); + } +} + +void Microphone::_StartMicrophone() { + PaDeviceIndex num_devices = Pa_GetDeviceCount(); + fprintf(stderr, "num devices: %d\n", num_devices); + + PaStreamParameters param; + + param.device = Pa_GetDefaultInputDevice(); + if (param.device == paNoDevice) { + fprintf(stderr, "No default input device found\n"); + exit(EXIT_FAILURE); + } + fprintf(stderr, "Use default device: %d\n", param.device); + + const PaDeviceInfo *info = Pa_GetDeviceInfo(param.device); + fprintf(stderr, " Name: %s\n", info->name); + fprintf(stderr, " Max input channels: %d\n", info->maxInputChannels); + + param.channelCount = 1; + param.sampleFormat = paFloat32; + + param.suggestedLatency = info->defaultLowInputLatency; + param.hostApiSpecificStreamInfo = nullptr; + + PaError err = + Pa_OpenStream(&stream_, ¶m, nullptr, /* &outputParameters, */ + sample_rate_, + 0, // frames per buffer + paClipOff, // we won't output out of range samples + // so don't bother clipping them + RecordCallback, this); + if (err != paNoError) { + fprintf(stderr, "portaudio error: %s\n", Pa_GetErrorText(err)); + exit(EXIT_FAILURE); + } + + err = Pa_StartStream(stream_); + fprintf(stderr, "Started\n"); + + if (err != paNoError) { + fprintf(stderr, "portaudio error: %s\n", Pa_GetErrorText(err)); + exit(EXIT_FAILURE); + } +} + +void Microphone::Push(torch::Tensor samples) { + if (!samples_.defined()) { + samples_ = samples; + } else { + samples_ = torch::cat({samples_, samples}, /*dim*/ 0); + } + + // We buffer some samples to reduce the number of packets to send + if (samples_.numel() > 100) { + asio::post(c_->get_io_service(), [this, samples = std::move(samples_)]() { + int32_t num_samples = samples.numel(); + int32_t num_bytes = num_samples * sizeof(float); + websocketpp::lib::error_code ec; + c_->send(hdl_, samples.data_ptr(), num_bytes, + websocketpp::frame::opcode::binary, ec); + if (ec) { + std::cerr << "Failed to send audio samples\n"; + exit(EXIT_FAILURE); + } + }); + } +} + +} // namespace sherpa diff --git a/sherpa/cpp_api/websocket/microphone.h b/sherpa/cpp_api/websocket/microphone.h new file mode 100644 index 000000000..7ce0545f4 --- /dev/null +++ b/sherpa/cpp_api/websocket/microphone.h @@ -0,0 +1,77 @@ +/** + * Copyright 2022 Xiaomi Corporation (authors: Fangjun Kuang) + * + * See LICENSE for clarification 
regarding multiple authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef SHERPA_CPP_API_WEBSOCKET_MICROPHONE_H_ +#define SHERPA_CPP_API_WEBSOCKET_MICROPHONE_H_ + +#include + +#include "portaudio.h" // NOLINT +#include "torch/script.h" +#include "websocketpp/client.hpp" +#include "websocketpp/config/asio_no_tls_client.hpp" + +using client = websocketpp::client; + +namespace sherpa { + +class Microphone { + public: + Microphone(); + ~Microphone(); + Microphone(const Microphone &) = delete; + Microphone &operator=(const Microphone &) = delete; + + /* Start the microphone. + * + * Once there is data available, it will invoke `Push`. + * + * @param c Responsible for sending the data. + * @param hdl Handle to the connection to the server. + */ + void StartMicrophone(client *c, websocketpp::connection_hdl hdl) { + c_ = c; + hdl_ = hdl; + + t_ = std::thread([&]() { _StartMicrophone(); }); + } + + /** Invoked by the callback of the microphone. + * + * @param samples 1-D torch.float32 tensor containing samples + * in the range [-1, 1]. + */ + void Push(torch::Tensor samples); + + private: + void _StartMicrophone(); + + private: + torch::Tensor samples_; + std::function callback_; + PaStream *stream_ = nullptr; + + float sample_rate_ = 16000; + + client *c_; + websocketpp::connection_hdl hdl_; + std::thread t_; +}; + +} // namespace sherpa + +#endif // SHERPA_CPP_API_WEBSOCKET_MICROPHONE_H_ diff --git a/sherpa/cpp_api/websocket/offline-websocket-client.cc b/sherpa/cpp_api/websocket/offline-websocket-client.cc new file mode 100644 index 000000000..001e1ddb0 --- /dev/null +++ b/sherpa/cpp_api/websocket/offline-websocket-client.cc @@ -0,0 +1,254 @@ +// sherpa/cpp_api/websocket/offline-websocket-client.cc +// +// Copyright (c) 2022 Xiaomi Corporation +#include + +#include "kaldi_native_io/csrc/kaldi-io.h" +#include "kaldi_native_io/csrc/wave-reader.h" +#include "sherpa/cpp_api/parse-options.h" +#include "sherpa/csrc/log.h" +#include "torch/script.h" +#include "websocketpp/client.hpp" +#include "websocketpp/config/asio_no_tls_client.hpp" +#include "websocketpp/uri.hpp" + +using client = websocketpp::client; + +using message_ptr = client::message_ptr; +using websocketpp::connection_hdl; + +static constexpr const char *kUsageMessage = R"( +Automatic speech recognition with sherpa using websocket. + +Usage: + +sherpa-offline-websocket-client --help + +sherpa-offline-websocket-client \ + --server-ip=127.0.0.1 \ + --server-port=6006 \ + /path/to/foo.wav +)"; + +// Sample rate of the input wave. No resampling is made. +static constexpr int32_t kSampleRate = 16000; + +/** Read wave samples from a file. + * + * If the file has multiple channels, only the first channel is returned. + * Samples are normalized to the range [-1, 1). + * + * @param filename Path to the wave file. Only "*.wav" format is supported. + * @param expected_sample_rate Expected sample rate of the wave file. It aborts + * if the sample rate of the given file is not + * equal to this value. 
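 *
 * Illustrative call (editor's sketch, not part of this patch; the wave path
 * is hypothetical):
 *   torch::Tensor samples = ReadWave("/path/to/foo.wav", 16000);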
+ * + * @return Return a 1-D torch.float32 tensor containing audio samples + * in the range [-1, 1) + */ +static torch::Tensor ReadWave(const std::string &filename, + float expected_sample_rate) { + bool binary = true; + kaldiio::Input ki(filename, &binary); + kaldiio::WaveHolder wh; + if (!wh.Read(ki.Stream())) { + std::cerr << "Failed to read " << filename; + exit(EXIT_FAILURE); + } + + auto &wave_data = wh.Value(); + if (wave_data.SampFreq() != expected_sample_rate) { + std::cerr << filename << "is expected to have sample rate " + << expected_sample_rate << ". Given " << wave_data.SampFreq(); + exit(EXIT_FAILURE); + } + + auto &d = wave_data.Data(); + + if (d.NumRows() > 1) { + std::cerr << "Only the first channel from " << filename << " is used"; + } + + auto tensor = torch::from_blob(const_cast(d.RowData(0)), + {d.NumCols()}, torch::kFloat); + + return tensor / 32768; +} + +class Client { + public: + Client(asio::io_context &io, // NOLINT + const std::string &ip, int16_t port, const std::string &wave_filename, + float num_seconds_per_message) + : io_(io), + uri_(/*secure*/ false, ip, port, /*resource*/ "/"), + samples_(ReadWave(wave_filename, kSampleRate)), + samples_per_message_(num_seconds_per_message * kSampleRate) { + c_.clear_access_channels(websocketpp::log::alevel::all); + c_.set_access_channels(websocketpp::log::alevel::connect); + c_.set_access_channels(websocketpp::log::alevel::disconnect); + + c_.init_asio(&io_); + + c_.set_open_handler([this](connection_hdl hdl) { OnOpen(hdl); }); + + c_.set_close_handler( + [](connection_hdl /*hdl*/) { SHERPA_LOG(INFO) << "Disconnected"; }); + + c_.set_message_handler( + [this](connection_hdl hdl, message_ptr msg) { OnMessage(hdl, msg); }); + + Run(); + } + + private: + void Run() { + websocketpp::lib::error_code ec; + client::connection_ptr con = c_.get_connection(uri_.str(), ec); + if (ec) { + SHERPA_LOG(ERROR) << "Could not create connection to " << uri_.str() + << " because: " << ec.message() << "\n"; + exit(EXIT_FAILURE); + } + + c_.connect(con); + } + + void OnOpen(connection_hdl hdl) { + int32_t num_samples = samples_.numel(); + int32_t num_bytes = num_samples * sizeof(float); + + SHERPA_LOG(INFO) << "Sending " << num_bytes << " bytes\n"; + websocketpp::lib::error_code ec; + c_.send(hdl, &num_bytes, sizeof(int32_t), + websocketpp::frame::opcode::binary, ec); + if (ec) { + SHERPA_LOG(ERROR) << "Failed to send number of bytes because: " + << ec.message(); + exit(EXIT_FAILURE); + } + + asio::post(io_, [this, hdl]() { this->SendMessage(hdl); }); + } + + void OnMessage(connection_hdl hdl, message_ptr msg) { + SHERPA_LOG(INFO) << "Decoding results:\n" << msg->get_payload(); + + websocketpp::lib::error_code ec; + c_.send(hdl, "Done", websocketpp::frame::opcode::text, ec); + + if (ec) { + SHERPA_LOG(ERROR) << "Failed to send Done because " << ec.message(); + exit(EXIT_FAILURE); + } + + ec.clear(); + c_.close(hdl, websocketpp::close::status::normal, "I'm exiting now", ec); + if (ec) { + SHERPA_LOG(ERROR) << "Failed to close because " << ec.message(); + exit(EXIT_FAILURE); + } + } + + void SendMessage(connection_hdl hdl) { + int32_t num_samples = samples_.numel(); + int32_t num_messages = num_samples / samples_per_message_; + + websocketpp::lib::error_code ec; + + if (num_sent_messages_ < num_messages) { + SHERPA_LOG(INFO) << "Sending " << num_sent_messages_ << "/" + << num_messages << "\n"; + c_.send(hdl, + samples_.data_ptr() + + num_sent_messages_ * samples_per_message_, + samples_per_message_ * sizeof(float), + 
websocketpp::frame::opcode::binary, ec); + + if (ec) { + SHERPA_LOG(INFO) << "Failed to send audio samples because " + << ec.message(); + exit(EXIT_FAILURE); + } + ec.clear(); + + ++num_sent_messages_; + } + + if (num_sent_messages_ == num_messages) { + int32_t remaining_samples = num_samples % samples_per_message_; + if (remaining_samples) { + c_.send(hdl, + samples_.data_ptr() + + num_sent_messages_ * samples_per_message_, + remaining_samples * sizeof(float), + websocketpp::frame::opcode::binary, ec); + + if (ec) { + SHERPA_LOG(INFO) << "Failed to send audio samples because " + << ec.message(); + exit(EXIT_FAILURE); + } + } + } else { + asio::post(io_, [this, hdl]() { this->SendMessage(hdl); }); + } + } + + private: + client c_; + asio::io_context &io_; + websocketpp::uri uri_; + torch::Tensor samples_; + + int32_t samples_per_message_; + int32_t num_sent_messages_ = 0; +}; + +int32_t main(int32_t argc, char *argv[]) { + std::string server_ip = "127.0.0.1"; + int32_t server_port = 6006; + float num_seconds_per_message = 10; + + sherpa::ParseOptions po(kUsageMessage); + + po.Register("server-ip", &server_ip, "IP address of the websocket server"); + po.Register("server-port", &server_port, "Port of the websocket server"); + po.Register("num-seconds-per-message", &num_seconds_per_message, + "The number of samples per message equals to " + "num_seconds_per_message*sample_rate"); + + po.Read(argc, argv); + SHERPA_CHECK_GT(num_seconds_per_message, 0); + + SHERPA_CHECK_GT(static_cast(num_seconds_per_message * kSampleRate), + 0) + << "num_seconds_per_message: " << num_seconds_per_message + << ", kSampleRate: " << kSampleRate; + + if (!websocketpp::uri_helper::ipv4_literal(server_ip.begin(), + server_ip.end())) { + SHERPA_LOG(FATAL) << "Invalid server IP: " << server_ip; + } + + if (server_port <= 0 || server_port > 65535) { + SHERPA_LOG(FATAL) << "Invalid server port: " << server_port; + } + + if (po.NumArgs() != 1) { + po.PrintUsage(); + exit(EXIT_FAILURE); + } + + std::string wave_filename = po.GetArg(1); + + asio::io_context io_conn; // for network connections + + Client c(io_conn, server_ip, server_port, wave_filename, + num_seconds_per_message); + + io_conn.run(); // will exit when the above connection is closed + + SHERPA_LOG(INFO) << "Done!"; + return 0; +} diff --git a/sherpa/cpp_api/websocket/offline-websocket-server-impl.cc b/sherpa/cpp_api/websocket/offline-websocket-server-impl.cc new file mode 100644 index 000000000..a46bd0dc8 --- /dev/null +++ b/sherpa/cpp_api/websocket/offline-websocket-server-impl.cc @@ -0,0 +1,336 @@ +// sherpa/cpp_api/websocket/offline-websocket-server-impl.cc +// +// Copyright (c) 2022 Xiaomi Corporation + +#include "sherpa/cpp_api/websocket/offline-websocket-server-impl.h" + +#include +#include +#include +#include + +#include "sherpa/csrc/file-utils.h" +#include "sherpa/csrc/log.h" + +namespace sherpa { + +void OfflineWebsocketDecoderConfig::Register(ParseOptions *po) { + recognizer_config.Register(po); + + po->Register( + "max-batch-size", &max_batch_size, + "Max batch size for decoding. If you are using CPU, increasing " + "it will increase the memory usage if there are many active " + "connections. We suggest that you use a small value for it for " + "CPU decoding, e.g., 5, since it is pretty fast for CPU decoding"); + + po->Register( + "max-utterance-length", &max_utterance_length, + "Max utterance length in seconds. If we receive an utterance " + "longer than this value, we will reject the connection. 
" + "If you have enough memory, you can select a large value for it."); +} + +void OfflineWebsocketDecoderConfig::Validate() const { + recognizer_config.Validate(); + + SHERPA_CHECK_GT(max_batch_size, 0); + + SHERPA_CHECK_GT(max_utterance_length, 0); +} + +OfflineWebsocketDecoder::OfflineWebsocketDecoder( + const OfflineWebsocketDecoderConfig &config, OfflineWebsocketServer *server) + : config_(config), server_(server), recognizer_(config.recognizer_config) {} + +void OfflineWebsocketDecoder::Push(connection_hdl hdl, ConnectionDataPtr d) { + std::lock_guard lock(mutex_); + streams_.push_back({hdl, d}); +} + +void OfflineWebsocketDecoder::Decode() { + std::unique_lock lock(mutex_); + if (streams_.empty()) { + return; + } + + int32_t size = + std::min(static_cast(streams_.size()), config_.max_batch_size); + + // We first lock the mutex for streams_, take items from it, and then + // unlock the mutex; in doing so we don't need to lock the mutex to + // access hdl and connection_data later. + std::vector handles(size); + + // Store connection_data here to prevent the data from being freed + // while we are still using it. + std::vector connection_data(size); + + std::vector samples(size); + std::vector samples_length(size); + std::vector> ss(size); + std::vector p_ss(size); + + for (int32_t i = 0; i != size; ++i) { + auto &p = streams_.front(); + handles[i] = p.first; + connection_data[i] = p.second; + streams_.pop_front(); + + auto samples = + reinterpret_cast(&connection_data[i]->data[0]); + auto num_samples = connection_data[i]->expected_byte_size / sizeof(float); + auto s = recognizer_.CreateStream(); + s->AcceptSamples(samples, num_samples); + + ss[i] = std::move(s); + p_ss[i] = ss[i].get(); + } + + lock.unlock(); + + // Note: DecodeStreams is thread-safe + recognizer_.DecodeStreams(p_ss.data(), size); + + for (int32_t i = 0; i != size; ++i) { + connection_hdl hdl = handles[i]; + asio::post(server_->GetConnectionContext(), + [this, hdl, text = ss[i]->GetResult().text]() { + websocketpp::lib::error_code ec; + server_->GetServer().send( + hdl, text, websocketpp::frame::opcode::text, ec); + if (ec) { + server_->GetServer().get_alog().write( + websocketpp::log::alevel::app, ec.message()); + } + }); + } +} +void OfflineWebsocketServerConfig::Register(ParseOptions *po) { + po->Register("doc-root", &doc_root, + "Path to the directory where " + "files like index.html for the HTTP server locate"); + + po->Register("log-file", &log_file, + "Path to the log file. 
Logs are " + "appended to this file"); +} + +void OfflineWebsocketServerConfig::Validate() const { + if (doc_root.empty()) { + SHERPA_LOG(FATAL) << "Please provide --doc-root, e.g., sherpa/bin/web"; + } + + if (!FileExists(doc_root + "/index.html")) { + SHERPA_LOG(FATAL) << "\n--doc-root=" << doc_root << "\n" + << doc_root << "/index.html does not exist!"; + } +} + +OfflineWebsocketServer::OfflineWebsocketServer( + asio::io_context &io_conn, // NOLINT + asio::io_context &io_work, // NOLINT + const OfflineWebsocketServerConfig &config, + const OfflineWebsocketDecoderConfig &decoder_config) + : io_conn_(io_conn), + io_work_(io_work), + http_server_(config.doc_root), + config_(config), + log_(config.log_file, std::ios::app), + tee_(std::cout, log_), + decoder_(decoder_config, this) { + SetupLog(); + + server_.init_asio(&io_conn_); + + server_.set_open_handler([this](connection_hdl hdl) { OnOpen(hdl); }); + + server_.set_close_handler([this](connection_hdl hdl) { OnClose(hdl); }); + + server_.set_http_handler([this](connection_hdl hdl) { OnHttp(hdl); }); + + server_.set_message_handler( + [this](connection_hdl hdl, server::message_ptr msg) { + OnMessage(hdl, msg); + }); + + auto sample_rate = decoder_config.recognizer_config.feat_config.fbank_opts + .frame_opts.samp_freq; + + max_byte_size_ = + decoder_config.max_utterance_length * sample_rate * sizeof(float); + + SHERPA_LOG(INFO) << "max_utterance_length: " + << decoder_config.max_utterance_length << " s," + << "max_byte_size_: " << max_byte_size_; +} + +void OfflineWebsocketServer::SetupLog() { + server_.clear_access_channels(websocketpp::log::alevel::all); + server_.set_access_channels(websocketpp::log::alevel::connect); + server_.set_access_channels(websocketpp::log::alevel::disconnect); + + // So that it also prints to std::cout and std::cerr + server_.get_alog().set_ostream(&tee_); + server_.get_elog().set_ostream(&tee_); +} + +void OfflineWebsocketServer::OnOpen(connection_hdl hdl) { + std::lock_guard lock(mutex_); + connections_.emplace(hdl, std::make_shared()); + + SHERPA_LOG(INFO) << "Number of active connections: " << connections_.size() + << "\n"; +} + +void OfflineWebsocketServer::OnClose(connection_hdl hdl) { + std::lock_guard lock(mutex_); + connections_.erase(hdl); + + SHERPA_LOG(INFO) << "Number of active connections: " << connections_.size() + << "\n"; +} + +void OfflineWebsocketServer::OnHttp(connection_hdl hdl) { + auto con = server_.get_con_from_hdl(hdl); + + std::string filename = con->get_resource(); + if (filename == "/") filename = "/index.html"; + + std::string content; + bool found = false; + if (filename != "/streaming_record.html") { + found = http_server_.ProcessRequest(filename, &content); + } else { + content = R"( + +Speech recognition with next-gen Kaldi +

/streaming_record.html is not available for the offline server

+
+
+Go back to /upload.html or +/offline_record.html + + )"; + } + + if (found) { + con->set_status(websocketpp::http::status_code::ok); + } else { + con->set_status(websocketpp::http::status_code::not_found); + } + + con->set_body(std::move(content)); +} + +void OfflineWebsocketServer::OnMessage(connection_hdl hdl, + server::message_ptr msg) { + std::unique_lock lock(mutex_); + auto connection_data = connections_.find(hdl)->second; + lock.unlock(); + const std::string &payload = msg->get_payload(); + + switch (msg->get_opcode()) { + case websocketpp::frame::opcode::text: + if (payload == "Done") { + // The client will not send any more data. We can close the + // connection now. + Close(hdl, websocketpp::close::status::normal, "Done"); + } else { + Close(hdl, websocketpp::close::status::normal, + std::string("Invalid payload: ") + payload); + } + break; + + case websocketpp::frame::opcode::binary: { + auto p = reinterpret_cast(payload.data()); + + if (connection_data->expected_byte_size == 0) { + if (payload.size() < 4) { + Close(hdl, websocketpp::close::status::normal, + "Payload is too short"); + break; + } + + // the first packet (assume the current machine is little endian) + connection_data->expected_byte_size = + *reinterpret_cast(p); + + if (connection_data->expected_byte_size > max_byte_size_) { + float num_samples = + connection_data->expected_byte_size / sizeof(float); + + auto sample_rate = decoder_.GetConfig() + .recognizer_config.feat_config.fbank_opts + .frame_opts.samp_freq; + + float duration = num_samples / sample_rate; + + std::ostringstream os; + os << "Max utterance length is configured to " + << decoder_.GetConfig().max_utterance_length + << " seconds, received length is " << duration << " seconds. " + << "Payload is too large!"; + SHERPA_LOG(INFO) << os.str(); + Close(hdl, websocketpp::close::status::message_too_big, os.str()); + break; + } + + connection_data->data.resize(connection_data->expected_byte_size); + std::copy(payload.begin() + 4, payload.end(), + connection_data->data.data()); + connection_data->cur = payload.size() - 4; + } else { + std::copy(payload.begin(), payload.end(), + connection_data->data.data() + connection_data->cur); + connection_data->cur += payload.size(); + } + + if (connection_data->expected_byte_size == connection_data->cur) { + auto d = std::make_shared(std::move(*connection_data)); + // Clear it so that we can handle the next audio file from the client. + // The client can send multiple audio files for recognition without + // the need to create another connection. + connection_data->expected_byte_size = 0; + connection_data->cur = 0; + + decoder_.Push(hdl, d); + + connection_data->Clear(); + + asio::post(io_work_, [this]() { decoder_.Decode(); }); + } + break; + } + + default: + // Unexpected message, ignore it + break; + } +} + +void OfflineWebsocketServer::Close(connection_hdl hdl, + websocketpp::close::status::value code, + const std::string &reason) { + auto con = server_.get_con_from_hdl(hdl); + + std::ostringstream os; + os << "Closing " << con->get_remote_endpoint() << " with reason: " << reason + << "\n"; + + websocketpp::lib::error_code ec; + server_.close(hdl, code, reason, ec); + if (ec) { + os << "Failed to close" << con->get_remote_endpoint() << ". 
" + << ec.message() << "\n"; + } + server_.get_alog().write(websocketpp::log::alevel::app, os.str()); +} + +void OfflineWebsocketServer::Run(uint16_t port) { + server_.set_reuse_addr(true); + server_.listen(asio::ip::tcp::v4(), port); + server_.start_accept(); +} + +} // namespace sherpa diff --git a/sherpa/cpp_api/websocket/offline-websocket-server-impl.h b/sherpa/cpp_api/websocket/offline-websocket-server-impl.h new file mode 100644 index 000000000..f405fa052 --- /dev/null +++ b/sherpa/cpp_api/websocket/offline-websocket-server-impl.h @@ -0,0 +1,192 @@ +// sherpa/cpp_api/websocket/offline-websocket-server-impl.h +// +// Copyright (c) 2022 Xiaomi Corporation + +#ifndef SHERPA_CPP_API_WEBSOCKET_OFFLINE_WEBSOCKET_SERVER_IMPL_H_ +#define SHERPA_CPP_API_WEBSOCKET_OFFLINE_WEBSOCKET_SERVER_IMPL_H_ + +#include +#include +#include +#include +#include +#include + +#include "asio.hpp" +#include "sherpa/cpp_api/offline-recognizer.h" +#include "sherpa/cpp_api/parse-options.h" +#include "sherpa/cpp_api/websocket/http-server.h" +#include "sherpa/cpp_api/websocket/tee-stream.h" +#include "websocketpp/config/asio_no_tls.hpp" // TODO(fangjun): support TLS +#include "websocketpp/server.hpp" + +using server = websocketpp::server; +using connection_hdl = websocketpp::connection_hdl; + +namespace sherpa { + +struct ConnectionData { + // Number of expected bytes sent from the client + int32_t expected_byte_size = 0; + + // Number of bytes received so far + int32_t cur = 0; + + // It saves the received contents from the client + std::vector data; + + void Clear() { + expected_byte_size = 0; + cur = 0; + data.clear(); + } +}; +using ConnectionDataPtr = std::shared_ptr; + +struct OfflineWebsocketDecoderConfig { + OfflineRecognizerConfig recognizer_config; + + int32_t max_batch_size = 5; + + float max_utterance_length = 300; // seconds + + void Register(ParseOptions *po); + void Validate() const; +}; + +class OfflineWebsocketServer; + +class OfflineWebsocketDecoder { + public: + /** + * @param config Configuraion for the decoder. + * @param server Borrowed from outside. + */ + OfflineWebsocketDecoder(const OfflineWebsocketDecoderConfig &config, + OfflineWebsocketServer *server); + + /** Insert received data to the queue for decoding. + * + * @param hdl A handle to the connection. We can use it to send the result + * back to the client once it finishes decoding. + * @param d The received data + */ + void Push(connection_hdl hdl, ConnectionDataPtr d); + + /** It is called by one of the work thread. + */ + void Decode(); + + const OfflineWebsocketDecoderConfig &GetConfig() const { return config_; } + + private: + OfflineWebsocketDecoderConfig config_; + + /** When we have received all the data from the client, we put it into + * this queue, the worker threads will get items from this queue for + * decoding. + * + * Number of items to take from this queue is determined by + * `--max-batch-size`. If there are not enough items in the queue, we won't + * wait and take whatever we have for decoding. + */ + std::mutex mutex_; + std::deque> streams_; + + OfflineWebsocketServer *server_; // Not owned + OfflineRecognizer recognizer_; +}; + +struct OfflineWebsocketServerConfig { + // assume you run it inside the ./build directory. 
+ std::string doc_root = "../sherpa/bin/web"; // root for the http server + std::string log_file = "./log.txt"; + + void Register(sherpa::ParseOptions *po); + void Validate() const; +}; + +class OfflineWebsocketServer { + public: + OfflineWebsocketServer(asio::io_context &io_conn, // NOLINT + asio::io_context &io_work, // NOLINT + const OfflineWebsocketServerConfig &config, + const OfflineWebsocketDecoderConfig &decoder_config); + + asio::io_context &GetConnectionContext() { return io_conn_; } + server &GetServer() { return server_; } + + void Run(uint16_t port); + + private: + void SetupLog(); + + // When a websocket client is connected, it will invoke this method + // (Not for HTTP) + void OnOpen(connection_hdl hdl); + + // Whena a websocket client is disconnected, it will invoke this method + void OnClose(connection_hdl hdl); + + // When a HTTP client is connected, it will invoke this method + void OnHttp(connection_hdl hdl); + + // When a message received from a websocket client, this method will + // be invoked. + // + // The protocol between the client and the server is as follows: + // + // (1) The client connects to the server + // (2) The client sends a binary message telling the server how many bytes + // it will send to the server. It contains 4-byte in little endian. + // (3) The client sends a binary message containing the audio samples. + // If there are many audio samples, the client may split it into + // multiple binary messages. + // (4) When the server receives all the samples from the client, it will + // start to decode them. Once decoded, the server sends a text message + // to the client containing the decoded results + // (5) After receiving the decoded results from the server, if the client has + // another audio file to send, it repeats (2), (3), (4) + // (6) If the client has no more audio files to decode, the client sends a + // text message containing "Done" to the server and closes the connection + // (7) The server receives a text message "Done" and closes the connection + // + // Note: + // (a) All models in icefall are trained using audio samples at sampling + // rate 16 kHz. Please send audio samples with a sampling rate matching + // the one expected by the model. + // (b) All models in icefall use features extracted from audio samples + // normalized to the range [-1, 1]. Please send normalized audio samples + // if you use models from icefall. + // (c) Only sound files with a single channel is supported + // (d) Step (2) and step (3) can be merged into one step to send bandwidth. + // (e) Only audio samples are sent. For instance, if we want to decode + // a WAVE file, the header of the WAVE is not sent. 
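  // Editor's sketch of the client side of steps (2), (3) and (6) above,
  // assuming a websocketpp client `c`, a connection handle `hdl`, and a
  // torch::Tensor `samples` holding normalized float32 audio (illustrative
  // only, not part of this patch; see offline-websocket-client.cc for the
  // full implementation):
  //
  //   websocketpp::lib::error_code ec;
  //   int32_t num_bytes = samples.numel() * sizeof(float);
  //   c.send(hdl, &num_bytes, sizeof(int32_t),
  //          websocketpp::frame::opcode::binary, ec);             // step (2)
  //   c.send(hdl, samples.data_ptr<float>(), num_bytes,
  //          websocketpp::frame::opcode::binary, ec);             // step (3)
  //   // ... wait for the text message carrying the result, step (4) ...
  //   c.send(hdl, "Done", websocketpp::frame::opcode::text, ec);  // step (6)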
+ void OnMessage(connection_hdl hdl, server::message_ptr msg); + + // Close a websocket connection with given code and reason + void Close(connection_hdl hdl, websocketpp::close::status::value code, + const std::string &reason); + + private: + asio::io_context &io_conn_; + asio::io_context &io_work_; + HttpServer http_server_; + server server_; + + std::map> + connections_; + std::mutex mutex_; + + OfflineWebsocketServerConfig config_; + + std::ofstream log_; + sherpa::TeeStream tee_; + + OfflineWebsocketDecoder decoder_; + int32_t max_byte_size_; +}; + +} // namespace sherpa + +#endif // SHERPA_CPP_API_WEBSOCKET_OFFLINE_WEBSOCKET_SERVER_IMPL_H_ diff --git a/sherpa/cpp_api/websocket/offline-websocket-server.cc b/sherpa/cpp_api/websocket/offline-websocket-server.cc new file mode 100644 index 000000000..ed9c68216 --- /dev/null +++ b/sherpa/cpp_api/websocket/offline-websocket-server.cc @@ -0,0 +1,141 @@ +// sherpa/cpp_api/websocket/offline-websocket-server.cc +// +// Copyright (c) 2022 Xiaomi Corporation + +#include // NOLINT + +#include "asio.hpp" +#include "sherpa/cpp_api/parse-options.h" +#include "sherpa/cpp_api/websocket/offline-websocket-server-impl.h" +#include "sherpa/csrc/log.h" +#include "torch/all.h" + +static constexpr const char *kUsageMessage = R"( +Automatic speech recognition with sherpa using websocket. + +Usage: + +sherpa-offline-websocket-server --help + +sherpa-offline-websocket-server \ + --use-gpu=false \ + --port=6006 \ + --num-io-threads=3 \ + --num-work-threads=5 \ + --max-batch-size=5 \ + --nn-model=/path/to/cpu.jit \ + --tokens=/path/to/tokens.txt \ + --decoding-method=greedy_search \ + --max-utterance-length=300 \ + --doc-root=../sherpa/bin/web \ + --log-file=./log.txt +)"; + +int32_t main(int32_t argc, char *argv[]) { + torch::set_num_threads(1); + torch::set_num_interop_threads(1); + sherpa::InferenceMode no_grad; + + torch::jit::getExecutorMode() = false; + torch::jit::getProfilingMode() = false; + torch::jit::setGraphExecutorOptimize(false); + + sherpa::ParseOptions po(kUsageMessage); + + sherpa::OfflineWebsocketServerConfig config; + sherpa::OfflineWebsocketDecoderConfig decoder_config; + + // the server will listen on this port, for both websocket and http + int32_t port = 6006; + + // size of the thread pool for handling network connections + int32_t num_io_threads = 3; + + // size of the thread pool for neural network computation and decoding + int32_t num_work_threads = 5; + + po.Register("num-io-threads", &num_io_threads, + "Number of threads to use for network connections."); + + po.Register("num-work-threads", &num_work_threads, + "Number of threads to use for neural network " + "computation and decoding."); + + po.Register("port", &port, "The port on which the server will listen."); + + config.Register(&po); + decoder_config.Register(&po); + + if (argc == 1) { + po.PrintUsage(); + exit(EXIT_FAILURE); + } + + po.Read(argc, argv); + + if (po.NumArgs() != 0) { + SHERPA_LOG(ERROR) << "Unrecognized positional arguments!"; + po.PrintUsage(); + exit(EXIT_FAILURE); + } + + config.Validate(); + decoder_config.Validate(); + + asio::io_context io_conn; // for network connections + asio::io_context io_work; // for neural network and decoding + + sherpa::OfflineWebsocketServer server(io_conn, io_work, config, + decoder_config); + server.Run(port); + + SHERPA_LOG(INFO) << "Listening on: " << port << "\n"; + SHERPA_LOG(INFO) << "Number of I/O threads: " << num_io_threads << "\n"; + SHERPA_LOG(INFO) << "Number of work threads: " << num_work_threads << "\n"; + + // 
give some work to do for the io_work pool + auto work_guard = asio::make_work_guard(io_work); + + std::vector io_threads; + + // decrement since the main thread is also used for network communications + for (int32_t i = 0; i < num_io_threads - 1; ++i) { + io_threads.emplace_back([&io_conn]() { io_conn.run(); }); + } + + std::vector work_threads; + for (int32_t i = 0; i < num_work_threads; ++i) { + work_threads.emplace_back([&io_work]() { io_work.run(); }); + } + + // Print a message telling users how to access the HTTP service + std::ostringstream os; + os << "\nPlease access the HTTP server using the following address: \n\n"; + os << "http://localhost:" << port << "\n"; + os << "http://127.0.0.1:" << port << "\n"; + + asio::ip::tcp::resolver resolver(io_conn); + auto iter = resolver.resolve(asio::ip::host_name(), ""); + asio::ip::tcp::resolver::iterator end; + for (; iter != end; ++iter) { + asio::ip::tcp::endpoint ep = *iter; + asio::error_code ec; + os << "http://" << ep.address().to_string(ec) << ":" << port << "\n"; + if (ec) { + std::cout << "Error message: " << ec << "\n"; + } + } + SHERPA_LOG(INFO) << os.str(); + + io_conn.run(); + + for (auto &t : io_threads) { + t.join(); + } + + for (auto &t : work_threads) { + t.join(); + } + + return 0; +} diff --git a/sherpa/cpp_api/websocket/online-websocket-client-from-microphone.cc b/sherpa/cpp_api/websocket/online-websocket-client-from-microphone.cc new file mode 100644 index 000000000..7e3b514aa --- /dev/null +++ b/sherpa/cpp_api/websocket/online-websocket-client-from-microphone.cc @@ -0,0 +1,123 @@ +/** + * Copyright 2022 Xiaomi Corporation (authors: Fangjun Kuang) + * + * See LICENSE for clarification regarding multiple authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include + +#include "sherpa/cpp_api/parse-options.h" +#include "sherpa/cpp_api/websocket/microphone.h" +#include "sherpa/csrc/log.h" +#include "torch/script.h" +#include "websocketpp/client.hpp" +#include "websocketpp/config/asio_no_tls_client.hpp" +#include "websocketpp/uri.hpp" + +using client = websocketpp::client; + +using message_ptr = client::message_ptr; +using websocketpp::connection_hdl; + +static constexpr const char *kUsageMessage = R"( +Automatic speech recognition with sherpa using websocket. 
+ +Usage: + +./bin/sherpa-online-websocket-client-microphone --help + +./bin/sherpa-online-websocket-client-microphone \ + --server-ip=127.0.0.1 \ + --server-port=6006 +)"; + +static void OnMessage(client *c, connection_hdl hdl, message_ptr msg) { + static std::string last; + const std::string &payload = msg->get_payload(); + if (payload == "Done") { + websocketpp::lib::error_code ec; + c->close(hdl, websocketpp::close::status::normal, "I'm exiting now", ec); + if (ec) { + std::cerr << "Failed to close\n"; + exit(EXIT_FAILURE); + } + } else if (payload.size() != last.size() || payload != last) { + SHERPA_LOG(INFO) << payload; + last = payload; + } +} + +int32_t main(int32_t argc, char *argv[]) { + std::string server_ip = "127.0.0.1"; + int32_t server_port = 6006; + + sherpa::ParseOptions po(kUsageMessage); + + po.Register("server-ip", &server_ip, "IP address of the websocket server"); + po.Register("server-port", &server_port, "Port of the websocket server"); + + if (argc == 1) { + po.PrintUsage(); + exit(EXIT_FAILURE); + } + + po.Read(argc, argv); + + if (!websocketpp::uri_helper::ipv4_literal(server_ip.begin(), + server_ip.end())) { + SHERPA_LOG(FATAL) << "Invalid server IP: " << server_ip; + } + + if (server_port <= 0 || server_port > 65535) { + SHERPA_LOG(FATAL) << "Invalid server port: " << server_port; + } + + if (po.NumArgs() != 1) { + po.PrintUsage(); + exit(EXIT_FAILURE); + } + + bool secure = false; + std::string resource = "/"; + websocketpp::uri uri(secure, server_ip, server_port, resource); + + client c; + + c.clear_access_channels(websocketpp::log::alevel::all); + c.set_access_channels(websocketpp::log::alevel::connect); + c.set_access_channels(websocketpp::log::alevel::disconnect); + + c.init_asio(); + sherpa::Microphone mic; + + c.set_open_handler( + [&c, &mic](connection_hdl hdl) { mic.StartMicrophone(&c, hdl); }); + + c.set_message_handler( + [&c](connection_hdl hdl, message_ptr msg) { OnMessage(&c, hdl, msg); }); + + websocketpp::lib::error_code ec; + client::connection_ptr con = c.get_connection(uri.str(), ec); + if (ec) { + std::cerr << "Could not create connection to " << uri.str() + << " because: " << ec.message() << "\n"; + exit(EXIT_FAILURE); + } + c.connect(con); + + c.run(); // will exit when the above connection is closed + + SHERPA_LOG(INFO) << "Done!"; + return 0; +} diff --git a/sherpa/cpp_api/websocket/online-websocket-client.cc b/sherpa/cpp_api/websocket/online-websocket-client.cc new file mode 100644 index 000000000..9573589e3 --- /dev/null +++ b/sherpa/cpp_api/websocket/online-websocket-client.cc @@ -0,0 +1,339 @@ +// sherpa/cpp_api/websocket/online-websocket-client.cc +// +// Copyright (c) 2022 Xiaomi Corporation +#include // NOLINT +#include +#include + +#include "kaldi_native_io/csrc/kaldi-io.h" +#include "kaldi_native_io/csrc/wave-reader.h" +#include "nlohmann/json.hpp" +#include "sherpa/cpp_api/parse-options.h" +#include "sherpa/csrc/log.h" +#include "torch/script.h" +#include "websocketpp/client.hpp" +#include "websocketpp/config/asio_no_tls_client.hpp" +#include "websocketpp/uri.hpp" + +using json = nlohmann::json; +using client = websocketpp::client; + +using message_ptr = client::message_ptr; +using websocketpp::connection_hdl; + +static constexpr const char *kUsageMessage = R"( +Automatic speech recognition with sherpa using websocket. + +Usage: + +sherpa-online-websocket-client --help + +sherpa-online-websocket-client \ + --server-ip=127.0.0.1 \ + --server-port=6006 \ + /path/to/foo.wav +)"; + +/** Read wave samples from a file. 
+ * + * If the file has multiple channels, only the first channel is returned. + * Samples are normalized to the range [-1, 1). + * + * @param filename Path to the wave file. Only "*.wav" format is supported. + * @param expected_sample_rate Expected sample rate of the wave file. It aborts + * if the sample rate of the given file is not + * equal to this value. + * + * @return Return a 1-D torch.float32 tensor containing audio samples + * in the range [-1, 1) + */ +static torch::Tensor ReadWave(const std::string &filename, + float expected_sample_rate) { + bool binary = true; + kaldiio::Input ki(filename, &binary); + kaldiio::WaveHolder wh; + if (!wh.Read(ki.Stream())) { + std::cerr << "Failed to read " << filename; + exit(EXIT_FAILURE); + } + std::cout << filename; + auto &wave_data = wh.Value(); + if (wave_data.SampFreq() != expected_sample_rate) { + std::cerr << filename << "is expected to have sample rate " + << expected_sample_rate << ". Given " << wave_data.SampFreq(); + exit(EXIT_FAILURE); + } + + auto &d = wave_data.Data(); + + if (d.NumRows() > 1) { + std::cerr << "Only the first channel from " << filename << " is used"; + } + + auto tensor = torch::from_blob(const_cast(d.RowData(0)), + {d.NumCols()}, torch::kFloat); + + return tensor / 32768; +} + +class Client { + public: + Client(asio::io_context &io, // NOLINT + const std::string &ip, int16_t port, const std::string &wave_filename, + float seconds_per_message, int32_t SampleRate, + std::string ctm_filename) + : io_(io), + uri_(/*secure*/ false, ip, port, /*resource*/ "/"), + samples_(ReadWave(wave_filename, SampleRate)), + samples_per_message_(seconds_per_message * SampleRate), + seconds_per_message_(seconds_per_message), + ctm_filename_(ctm_filename) { + c_.clear_access_channels(websocketpp::log::alevel::all); + // c_.set_access_channels(websocketpp::log::alevel::connect); + // c_.set_access_channels(websocketpp::log::alevel::disconnect); + of_ = std::ofstream(ctm_filename); + of_ << std::fixed << std::setprecision(2); + std::string base_filename = + wave_filename.substr(wave_filename.find_last_of("/\\") + 1); + wave_filename_ = base_filename.substr(0, base_filename.find_last_of('.')); + + c_.init_asio(&io_); + c_.set_open_handler([this](connection_hdl hdl) { OnOpen(hdl); }); + c_.set_close_handler( + [this](connection_hdl /*hdl*/) { SHERPA_LOG(INFO) << "Disconnected"; }); + c_.set_message_handler( + [this](connection_hdl hdl, message_ptr msg) { OnMessage(hdl, msg); }); + + Run(); + } + + private: + void Run() { + websocketpp::lib::error_code ec; + client::connection_ptr con = c_.get_connection(uri_.str(), ec); + if (ec) { + SHERPA_LOG(ERROR) << "Could not create connection to " << uri_.str() + << " because: " << ec.message() << "\n"; + exit(EXIT_FAILURE); + } + + c_.connect(con); + } + + void DumpCtm(nlohmann::json result) { + int i = 0; + std::vector tokens = + result["tokens"].get>(); + int length = tokens.size(); + if (length < 1) { + return; + } + std::vector timestamps = + result["timestamps"].get>(); + if (tokens[0].at(0) != ' ') { + SHERPA_LOG(WARNING) << "First word is not a new word " << tokens[0]; + } + + std::string word = tokens[0]; + float start_time = result["start_time"]; + float start = timestamps[0] + start_time; + float duration = 0.01; + if (length > 2) { + duration = timestamps[1] - timestamps[0]; + } + int word_start_index = i; + while (i < length) { + // SHERPA_LOG(INFO) < i + 2) { + duration = timestamps[i + 2] - timestamps[word_start_index]; + } + i++; + } + if (word.compare(" ") != 0) { + of_ << 
wave_filename_ << " 0 " << start << " " << duration << " " + << word << std::endl; + } + if (i >= length - 1) { + break; + } + i++; + word_start_index = i; + word = tokens[i]; + start = timestamps[i] + start_time; + duration = 0.01; + if (length > i + 1) { + duration = timestamps[i + 1] - timestamps[word_start_index]; + } + } + } + + void OnOpen(connection_hdl hdl) { + auto start_time = std::chrono::steady_clock::now(); + asio::post( + io_, [this, hdl, start_time]() { this->SendMessage(hdl, start_time); }); + } + + void OnMessage(connection_hdl hdl, message_ptr msg) { + const std::string &payload = msg->get_payload(); + auto result = json::parse(payload); + std::string res = result.dump(); + SHERPA_LOG(INFO) << res; + if (result["segment"] > segment_id_) { + segment_id_ = result["segment"]; + std::cout << text_; + if (ctm_filename_.length() > 0) { + DumpCtm(old_result_); + } + } + text_ = result["text"].get(); + old_result_ = result; + if (result["final"]) { + std::cout << result["text"].get() << std::endl; + if (ctm_filename_.length() > 0) { + DumpCtm(result); + } + websocketpp::lib::error_code ec; + c_.close(hdl, websocketpp::close::status::normal, "I'm exiting now", ec); + if (ec) { + SHERPA_LOG(INFO) << "Failed to close because " << ec.message(); + exit(EXIT_FAILURE); + } + } + } + + void SendMessage( + connection_hdl hdl, + std::chrono::time_point start_time) { + int32_t num_samples = samples_.numel(); + int32_t num_messages = num_samples / samples_per_message_; + + websocketpp::lib::error_code ec; + auto time = std::chrono::steady_clock::now(); + int elapsed_time_ms = + std::chrono::duration_cast(time - start_time) + .count(); + if (elapsed_time_ms < + static_cast(seconds_per_message_ * num_sent_messages_ * 1000)) { + std::this_thread::sleep_for(std::chrono::milliseconds(int( + seconds_per_message_ * num_sent_messages_ * 1000 - elapsed_time_ms))); + } + if (num_sent_messages_ < 1) { + SHERPA_LOG(INFO) << "Starting to send audio"; + } + if (num_sent_messages_ < num_messages) { + // SHERPA_LOG(DEBUG) << "Sending " << num_sent_messages_ << "/" + // << num_messages << "\n"; + c_.send(hdl, + samples_.data_ptr() + + num_sent_messages_ * samples_per_message_, + samples_per_message_ * sizeof(float), + websocketpp::frame::opcode::binary, ec); + + if (ec) { + SHERPA_LOG(INFO) << "Failed to send audio samples because " + << ec.message(); + exit(EXIT_FAILURE); + } + ec.clear(); + + ++num_sent_messages_; + } + + if (num_sent_messages_ == num_messages) { + int32_t remaining_samples = num_samples % samples_per_message_; + if (remaining_samples) { + c_.send(hdl, + samples_.data_ptr() + + num_sent_messages_ * samples_per_message_, + remaining_samples * sizeof(float), + websocketpp::frame::opcode::binary, ec); + + if (ec) { + SHERPA_LOG(INFO) << "Failed to send audio samples because " + << ec.message(); + exit(EXIT_FAILURE); + } + ec.clear(); + } + c_.send(hdl, "Done", websocketpp::frame::opcode::text, ec); + SHERPA_LOG(INFO) << "Sent Done Signal"; + if (ec) { + SHERPA_LOG(INFO) << "Failed to send Done because " << ec.message(); + exit(EXIT_FAILURE); + } + } else { + asio::post(io_, [this, hdl, start_time]() { + this->SendMessage(hdl, start_time); + }); + } + } + + private: + client c_; + asio::io_context &io_; + websocketpp::uri uri_; + torch::Tensor samples_; + nlohmann::json old_result_; + int32_t samples_per_message_; + int32_t num_sent_messages_ = 0; + float seconds_per_message_; + int32_t segment_id_ = 0; + std::string text_; + std::string wave_filename_; + std::string ctm_filename_; + 
std::ofstream of_; +}; + +int32_t main(int32_t argc, char *argv[]) { + std::string server_ip = "127.0.0.1"; + int32_t server_port = 6006; + float seconds_per_message = 10; + // Sample rate of the input wave. No resampling is made. + int32_t SampleRate = 16000; + std::string ctm_filename = ""; + + sherpa::ParseOptions po(kUsageMessage); + + po.Register("server-ip", &server_ip, "IP address of the websocket server"); + po.Register("server-port", &server_port, "Port of the websocket server"); + po.Register("samplerate", &SampleRate, + "SampleRate of the recorded audio (expecting wav, no resampling " + "is done)"); + po.Register("num-seconds-per-message", &seconds_per_message, + "The number of samples per message equals to " + "seconds_per_message*sample_rate"); + po.Register("ctm-filename", &ctm_filename, "Name of the CTM output file"); + + po.Read(argc, argv); + SHERPA_CHECK_GT(seconds_per_message, 0); + SHERPA_CHECK_GT(static_cast(seconds_per_message * SampleRate), 0) + << "seconds_per_message: " << seconds_per_message + << ", SampleRate: " << SampleRate; + + if (!websocketpp::uri_helper::ipv4_literal(server_ip.begin(), + server_ip.end())) { + SHERPA_LOG(FATAL) << "Invalid server IP: " << server_ip; + } + + if (server_port <= 0 || server_port > 65535) { + SHERPA_LOG(FATAL) << "Invalid server port: " << server_port; + } + + if (po.NumArgs() != 1) { + po.PrintUsage(); + exit(EXIT_FAILURE); + } + + std::string wave_filename = po.GetArg(1); + + asio::io_context io_conn; // for network connections + Client c(io_conn, server_ip, server_port, wave_filename, seconds_per_message, + SampleRate, ctm_filename); + + io_conn.run(); // will exit when the above connection is closed + + SHERPA_LOG(INFO) << "Done!"; + return 0; +} diff --git a/sherpa/cpp_api/websocket/online-websocket-server-impl.cc b/sherpa/cpp_api/websocket/online-websocket-server-impl.cc new file mode 100644 index 000000000..e65227bd4 --- /dev/null +++ b/sherpa/cpp_api/websocket/online-websocket-server-impl.cc @@ -0,0 +1,370 @@ +// sherpa/cpp_api/websocket/online-websocket-server-impl.cc +// +// Copyright (c) 2022 Xiaomi Corporation + +#include "sherpa/cpp_api/websocket/online-websocket-server-impl.h" + +#include + +#include "sherpa/csrc/file-utils.h" +#include "sherpa/csrc/log.h" + +namespace sherpa { + +void OnlineWebsocketDecoderConfig::Register(ParseOptions *po) { + recognizer_config.Register(po); + + po->Register("loop-interval-ms", &loop_interval_ms, + "It determines how often the decoder loop runs. "); + + po->Register("max-batch-size", &max_batch_size, + "Max batch size for recognition."); +} + +void OnlineWebsocketDecoderConfig::Validate() const { + recognizer_config.Validate(); + SHERPA_CHECK_GT(loop_interval_ms, 0); + SHERPA_CHECK_GT(max_batch_size, 0); +} + +void OnlineWebsocketServerConfig::Register(sherpa::ParseOptions *po) { + decoder_config.Register(po); + po->Register("doc-root", &doc_root, + "Path to the directory where " + "files like index.html for the HTTP server locate."); + + po->Register("log-file", &log_file, + "Path to the log file. 
Logs are " + "appended to this file"); +} + +void OnlineWebsocketServerConfig::Validate() const { + decoder_config.Validate(); + + if (doc_root.empty()) { + SHERPA_LOG(FATAL) << "Please provide --doc-root, e.g., sherpa/bin/web"; + } + + if (!FileExists(doc_root + "/index.html")) { + SHERPA_LOG(FATAL) << "\n--doc-root=" << doc_root << "\n" + << doc_root << "/index.html does not exist!\n" + << "Make sure that you use sherpa/bin/web/ as --doc-root"; + } +} + +OnlineWebsocketDecoder::OnlineWebsocketDecoder(OnlineWebsocketServer *server) + : server_(server), + config_(server->GetConfig().decoder_config), + timer_(server->GetWorkContext()) { + recognizer_ = std::make_unique(config_.recognizer_config); +} + +std::shared_ptr OnlineWebsocketDecoder::GetOrCreateConnection( + connection_hdl hdl) { + std::lock_guard lock(mutex_); + auto it = connections_.find(hdl); + if (it != connections_.end()) { + return it->second; + } else { + // create a new connection + std::shared_ptr s = recognizer_->CreateStream(); + auto c = std::make_shared(hdl, s); + connections_.insert({hdl, c}); + return c; + } +} + +void OnlineWebsocketDecoder::AcceptWaveform(std::shared_ptr c) { + std::lock_guard lock(c->mutex); + float sample_rate = + config_.recognizer_config.feat_config.fbank_opts.frame_opts.samp_freq; + while (!c->samples.empty()) { + c->s->AcceptWaveform(sample_rate, c->samples.front()); + c->samples.pop_front(); + } +} + +void OnlineWebsocketDecoder::InputFinished(std::shared_ptr c) { + std::lock_guard lock(c->mutex); + + float sample_rate = + config_.recognizer_config.feat_config.fbank_opts.frame_opts.samp_freq; + + while (!c->samples.empty()) { + c->s->AcceptWaveform(sample_rate, c->samples.front()); + c->samples.pop_front(); + } + + // TODO(fangjun): Change the amount of paddings to be configurable + torch::Tensor tail_padding = + torch::zeros({static_cast(0.8 * sample_rate)}).to(torch::kFloat); + + c->s->AcceptWaveform(sample_rate, tail_padding); + + c->s->InputFinished(); +} + +void OnlineWebsocketDecoder::Run() { + timer_.expires_after(std::chrono::milliseconds(config_.loop_interval_ms)); + + timer_.async_wait( + [this](const asio::error_code &ec) { ProcessConnections(ec); }); +} + +void OnlineWebsocketDecoder::ProcessConnections(const asio::error_code &ec) { + if (ec) { + SHERPA_LOG(FATAL) << "The decoder loop is aborted!"; + } + + std::lock_guard lock(mutex_); + std::vector to_remove; + for (auto &p : connections_) { + auto hdl = p.first; + auto c = p.second; + + // The order of `if` below matters! 
+ if (!server_->Contains(hdl)) { + // If the connection is disconnected, we stop processing it + to_remove.push_back(hdl); + continue; + } + + if (active_.count(hdl)) { + // Another thread is decoding this stream, so skip it + continue; + } + + if (!recognizer_->IsReady(c->s.get())) { + // this stream has not enough frames to decode, so skip it + continue; + } + + // TODO(fangun): If the connection is timed out, we need to also + // add it to `to_remove` + + // this stream has enough frames and is currently not processed by any + // threads, so put it into the ready queue + ready_connections_.push_back(c); + + // In `Decode()`, it will remove hdl from `active_` + active_.insert(c->hdl); + } + + for (auto hdl : to_remove) { + connections_.erase(hdl); + } + + if (!ready_connections_.empty()) { + asio::post(server_->GetWorkContext(), [this]() { Decode(); }); + } + + // Schedule another call + timer_.expires_after(std::chrono::milliseconds(config_.loop_interval_ms)); + + timer_.async_wait( + [this](const asio::error_code &ec) { ProcessConnections(ec); }); +} + +void OnlineWebsocketDecoder::Decode() { + std::unique_lock lock(mutex_); + if (ready_connections_.empty()) { + // There are no connections that are ready for decoding, + // so we return directly + return; + } + + std::vector> c_vec; + std::vector s_vec; + while (!ready_connections_.empty() && + static_cast(s_vec.size()) < config_.max_batch_size) { + auto c = ready_connections_.front(); + ready_connections_.pop_front(); + + c_vec.push_back(c); + s_vec.push_back(c->s.get()); + } + + if (!ready_connections_.empty()) { + // there are too many ready connections but this thread can only handle + // max_batch_size connections at a time, so we schedule another call + // to Decode() and let other threads to process the ready connections + asio::post(server_->GetWorkContext(), [this]() { Decode(); }); + } + + lock.unlock(); + recognizer_->DecodeStreams(s_vec.data(), s_vec.size()); + lock.lock(); + + for (auto c : c_vec) { + auto result = recognizer_->GetResult(c->s.get()); + + asio::post(server_->GetConnectionContext(), + [this, hdl = c->hdl, json = result.AsJsonString()]() { + server_->Send(hdl, json); + }); + active_.erase(c->hdl); + } +} + +OnlineWebsocketServer::OnlineWebsocketServer( + asio::io_context &io_conn, asio::io_context &io_work, + const OnlineWebsocketServerConfig &config) + : config_(config), + io_conn_(io_conn), + io_work_(io_work), + http_server_(config.doc_root), + log_(config.log_file, std::ios::app), + tee_(std::cout, log_), + decoder_(this) { + SetupLog(); + + server_.init_asio(&io_conn_); + + server_.set_open_handler([this](connection_hdl hdl) { OnOpen(hdl); }); + + server_.set_close_handler([this](connection_hdl hdl) { OnClose(hdl); }); + + server_.set_http_handler([this](connection_hdl hdl) { OnHttp(hdl); }); + + server_.set_message_handler( + [this](connection_hdl hdl, server::message_ptr msg) { + OnMessage(hdl, msg); + }); +} + +void OnlineWebsocketServer::Run(uint16_t port) { + server_.set_reuse_addr(true); + server_.listen(asio::ip::tcp::v4(), port); + server_.start_accept(); + decoder_.Run(); +} + +void OnlineWebsocketServer::SetupLog() { + server_.clear_access_channels(websocketpp::log::alevel::all); + // server_.set_access_channels(websocketpp::log::alevel::connect); + // server_.set_access_channels(websocketpp::log::alevel::disconnect); + + // So that it also prints to std::cout and std::cerr + server_.get_alog().set_ostream(&tee_); + server_.get_elog().set_ostream(&tee_); +} + +void 
OnlineWebsocketServer::Send(connection_hdl hdl, const std::string &text) { + websocketpp::lib::error_code ec; + if (!Contains(hdl)) { + return; + } + + server_.send(hdl, text, websocketpp::frame::opcode::text, ec); + if (ec) { + server_.get_alog().write(websocketpp::log::alevel::app, ec.message()); + } +} + +void OnlineWebsocketServer::OnOpen(connection_hdl hdl) { + std::lock_guard lock(mutex_); + connections_.insert(hdl); + + std::ostringstream os; + os << "New connection: " + << server_.get_con_from_hdl(hdl)->get_remote_endpoint() << ". " + << "Number of active connections: " << connections_.size() << ".\n"; + SHERPA_LOG(INFO) << os.str(); +} + +void OnlineWebsocketServer::OnClose(connection_hdl hdl) { + std::lock_guard lock(mutex_); + connections_.erase(hdl); + + SHERPA_LOG(INFO) << "Number of active connections: " << connections_.size() + << "\n"; +} + +bool OnlineWebsocketServer::Contains(connection_hdl hdl) const { + std::lock_guard lock(mutex_); + return connections_.count(hdl); +} + +void OnlineWebsocketServer::OnHttp(connection_hdl hdl) { + auto con = server_.get_con_from_hdl(hdl); + + std::string filename = con->get_resource(); + if (filename == "/") filename = "/index.html"; + + std::string content; + bool found = false; + + if (filename != "/upload.html" && filename != "/offline_record.html") { + found = http_server_.ProcessRequest(filename, &content); + } else { + content = R"( + +Speech recognition with next-gen Kaldi +

Only /streaming_record.html is available for the online server.

+
+
+Go back to /streaming_record.html + + )"; + } + + if (found) { + con->set_status(websocketpp::http::status_code::ok); + } else { + con->set_status(websocketpp::http::status_code::not_found); + } + + con->set_body(std::move(content)); +} + +void OnlineWebsocketServer::OnMessage(connection_hdl hdl, + server::message_ptr msg) { + auto c = decoder_.GetOrCreateConnection(hdl); + + const std::string &payload = msg->get_payload(); + + switch (msg->get_opcode()) { + case websocketpp::frame::opcode::text: + if (payload == "Done") { + asio::post(io_work_, [this, c]() { decoder_.InputFinished(c); }); + } + break; + case websocketpp::frame::opcode::binary: { + auto p = reinterpret_cast(payload.data()); + int32_t num_samples = payload.size() / sizeof(float); + torch::Tensor samples = torch::from_blob(const_cast(p), + {num_samples}, torch::kFloat); + // Caution(fangjun): We have to make a copy here since the tensor + // is referenced inside the fbank computer. + // Otherwise, it will cause segfault for the next invocation + // of AcceptWaveform since payload is freed after this function returns + samples = samples.clone(); + c->samples.push_back(samples); + + asio::post(io_work_, [this, c]() { decoder_.AcceptWaveform(c); }); + break; + } + default: + break; + } +} + +void OnlineWebsocketServer::Close(connection_hdl hdl, + websocketpp::close::status::value code, + const std::string &reason) { + auto con = server_.get_con_from_hdl(hdl); + + std::ostringstream os; + os << "Closing " << con->get_remote_endpoint() << " with reason: " << reason + << "\n"; + + websocketpp::lib::error_code ec; + server_.close(hdl, code, reason, ec); + if (ec) { + os << "Failed to close" << con->get_remote_endpoint() << ". " + << ec.message() << "\n"; + } + server_.get_alog().write(websocketpp::log::alevel::app, os.str()); +} + +} // namespace sherpa diff --git a/sherpa/cpp_api/websocket/online-websocket-server-impl.h b/sherpa/cpp_api/websocket/online-websocket-server-impl.h new file mode 100644 index 000000000..bd91cbe5f --- /dev/null +++ b/sherpa/cpp_api/websocket/online-websocket-server-impl.h @@ -0,0 +1,180 @@ +// sherpa/cpp_api/websocket/online-websocket-server-impl.h +// +// Copyright (c) 2022 Xiaomi Corporation + +#ifndef SHERPA_CPP_API_WEBSOCKET_ONLINE_WEBSOCKET_SERVER_IMPL_H_ +#define SHERPA_CPP_API_WEBSOCKET_ONLINE_WEBSOCKET_SERVER_IMPL_H_ + +#include +#include +#include +#include +#include // NOLINT +#include +#include +#include +#include + +#include "asio.hpp" +#include "sherpa/cpp_api/online-recognizer.h" +#include "sherpa/cpp_api/online-stream.h" +#include "sherpa/cpp_api/parse-options.h" +#include "sherpa/cpp_api/websocket/http-server.h" +#include "sherpa/cpp_api/websocket/tee-stream.h" +#include "websocketpp/config/asio_no_tls.hpp" // TODO(fangjun): support TLS +#include "websocketpp/server.hpp" +using server = websocketpp::server; +using connection_hdl = websocketpp::connection_hdl; + +namespace sherpa { + +struct Connection { + // handle to the connection. We can use it to send messages to the client + connection_hdl hdl; + std::shared_ptr s; + + // The last time we received a message from the client + // TODO(fangjun): Use it to disconnect from a client if it is inactive + // for a specified time. + std::chrono::steady_clock::time_point last_active; + + std::mutex mutex; // protect sampels + + // Audio samples received from the client. 
+ // + // The I/O threads receive audio samples into this queue + // and invoke work threads to compute features + std::deque samples; + + Connection() = default; + Connection(connection_hdl hdl, std::shared_ptr s) + : hdl(hdl), s(s), last_active(std::chrono::steady_clock::now()) {} +}; + +struct OnlineWebsocketDecoderConfig { + OnlineRecognizerConfig recognizer_config; + + // It determines how often the decoder loop runs. + int32_t loop_interval_ms = 10; + + int32_t max_batch_size = 5; + + void Register(ParseOptions *po); + void Validate() const; +}; + +class OnlineWebsocketServer; + +class OnlineWebsocketDecoder { + public: + /** + * @param server Not owned. + */ + explicit OnlineWebsocketDecoder(OnlineWebsocketServer *server); + + std::shared_ptr GetOrCreateConnection(connection_hdl hdl); + + // Compute features for a stream given audio samples + void AcceptWaveform(std::shared_ptr c); + + // signal that there will be no more audio samples for a stream + void InputFinished(std::shared_ptr c); + + void Run(); + + private: + void ProcessConnections(const asio::error_code &ec); + + /** It is called by one of the worker thread. + */ + void Decode(); + + private: + OnlineWebsocketServer *server_; // not owned + std::unique_ptr recognizer_; + OnlineWebsocketDecoderConfig config_; + asio::steady_timer timer_; + + // It protects `connections_`, `ready_connections_`, and `active_` + std::mutex mutex_; + + std::map, + std::owner_less> + connections_; + + // Whenever a connection has enough feature frames for decoding, we put + // it in this queue + std::deque> ready_connections_; + + // If we are decoding a stream, we put it in the active_ set so that + // only one thread can decode a stream at a time. + std::set> active_; +}; + +struct OnlineWebsocketServerConfig { + OnlineWebsocketDecoderConfig decoder_config; + + // assume you run it inside the ./build directory. 
+ std::string doc_root = "../sherpa/bin/web"; // root for the http server + std::string log_file = "./log.txt"; + + void Register(sherpa::ParseOptions *po); + void Validate() const; +}; + +class OnlineWebsocketServer { + public: + explicit OnlineWebsocketServer(asio::io_context &io_conn, // NOLINT + asio::io_context &io_work, // NOLINT + const OnlineWebsocketServerConfig &config); + + void Run(uint16_t port); + + const OnlineWebsocketServerConfig &GetConfig() const { return config_; } + asio::io_context &GetConnectionContext() { return io_conn_; } + asio::io_context &GetWorkContext() { return io_work_; } + server &GetServer() { return server_; } + + void Send(connection_hdl hdl, const std::string &text); + + bool Contains(connection_hdl hdl) const; + + private: + void SetupLog(); + + // When a websocket client is connected, it will invoke this method + // (Not for HTTP) + void OnOpen(connection_hdl hdl); + + // When a websocket client is disconnected, it will invoke this method + void OnClose(connection_hdl hdl); + + // When a HTTP client is connected, it will invoke this method + void OnHttp(connection_hdl hdl); + + void OnMessage(connection_hdl hdl, server::message_ptr msg); + + // Close a websocket connection with given code and reason + void Close(connection_hdl hdl, websocketpp::close::status::value code, + const std::string &reason); + + private: + OnlineWebsocketServerConfig config_; + asio::io_context &io_conn_; + asio::io_context &io_work_; + HttpServer http_server_; + server server_; + + std::ofstream log_; + sherpa::TeeStream tee_; + + OnlineWebsocketDecoder decoder_; + + mutable std::mutex mutex_; + + std::set> connections_; +}; + +} // namespace sherpa + +#endif // SHERPA_CPP_API_WEBSOCKET_ONLINE_WEBSOCKET_SERVER_IMPL_H_ diff --git a/sherpa/cpp_api/websocket/online-websocket-server.cc b/sherpa/cpp_api/websocket/online-websocket-server.cc new file mode 100644 index 000000000..511e414d3 --- /dev/null +++ b/sherpa/cpp_api/websocket/online-websocket-server.cc @@ -0,0 +1,132 @@ +// sherpa/cpp_api/websocket/online-websocket-server.cc +// +// Copyright (c) 2022 Xiaomi Corporation + +#include "asio.hpp" +#include "sherpa/cpp_api/websocket/online-websocket-server-impl.h" +#include "sherpa/csrc/log.h" +#include "torch/all.h" + +static constexpr const char *kUsageMessage = R"( +Automatic speech recognition with sherpa using websocket. 
+ +Usage: + +sherpa-online-websocket-server --help + +sherpa-online-websocket-server \ + --use-gpu=false \ + --port=6006 \ + --num-work-threads=5 \ + --nn-model=/path/to/cpu.jit \ + --tokens=/path/to/tokens.txt \ + --decoding-method=greedy_search \ + --log-file=./log.txt +)"; + +int32_t main(int32_t argc, char *argv[]) { + torch::set_num_threads(1); + torch::set_num_interop_threads(1); + sherpa::InferenceMode no_grad; + + torch::jit::getExecutorMode() = false; + torch::jit::getProfilingMode() = false; + torch::jit::setGraphExecutorOptimize(false); + + sherpa::ParseOptions po(kUsageMessage); + + sherpa::OnlineWebsocketServerConfig config; + + // the server will listen on this port, for both websocket and http + int32_t port = 6006; + + // size of the thread pool for handling network connections + int32_t num_io_threads = 1; + + // size of the thread pool for neural network computation and decoding + int32_t num_work_threads = 5; + + po.Register("num-io-threads", &num_io_threads, + "Number of threads to use for network connections."); + + po.Register("num-work-threads", &num_work_threads, + "Number of threads to use for neural network " + "computation and decoding."); + + po.Register("port", &port, "The port on which the server will listen."); + + config.Register(&po); + + if (argc == 1) { + po.PrintUsage(); + exit(EXIT_FAILURE); + } + + po.Read(argc, argv); + + if (po.NumArgs() != 0) { + SHERPA_LOG(ERROR) << "Unrecognized positional arguments!"; + po.PrintUsage(); + exit(EXIT_FAILURE); + } + + config.Validate(); + + asio::io_context io_conn; // for network connections + asio::io_context io_work; // for neural network and decoding + + sherpa::OnlineWebsocketServer server(io_conn, io_work, config); + server.Run(port); + + SHERPA_LOG(INFO) << "Listening on: " << port << "\n"; + // SHERPA_LOG(INFO) << "Number of I/O threads: " << num_io_threads << "\n"; + SHERPA_LOG(INFO) << "Number of work threads: " << num_work_threads << "\n"; + + // give some work to do for the io_work pool + auto work_guard = asio::make_work_guard(io_work); + + std::vector io_threads; + + // decrement since the main thread is also used for network communications + for (int32_t i = 0; i < num_io_threads - 1; ++i) { + io_threads.emplace_back([&io_conn]() { io_conn.run(); }); + } + + std::vector work_threads; + for (int32_t i = 0; i < num_work_threads; ++i) { + work_threads.emplace_back([&io_work]() { io_work.run(); }); + } + + // Print a message telling users how to access the HTTP service + std::ostringstream os; + os << "\nPlease access the HTTP server using the following address: \n\n"; + os << "http://localhost:" << port << "\n"; +#if 0 + // TODO(fangjun): Enable it for HTTPS + os << "http://127.0.0.1:" << port << "\n"; + asio::ip::tcp::resolver resolver(io_conn); + auto iter = resolver.resolve(asio::ip::host_name(), ""); + asio::ip::tcp::resolver::iterator end; + for (; iter != end; ++iter) { + asio::ip::tcp::endpoint ep = *iter; + asio::error_code ec; + os << "http://" << ep.address().to_string(ec) << ":" << port << "\n"; + if (ec) { + std::cout << "Error message: " << ec << "\n"; + } + } +#endif + SHERPA_LOG(INFO) << os.str(); + + io_conn.run(); + + for (auto &t : io_threads) { + t.join(); + } + + for (auto &t : work_threads) { + t.join(); + } + + return 0; +} diff --git a/sherpa/cpp_api/websocket/tee-stream.h b/sherpa/cpp_api/websocket/tee-stream.h new file mode 100644 index 000000000..855f63c31 --- /dev/null +++ b/sherpa/cpp_api/websocket/tee-stream.h @@ -0,0 +1,61 @@ +// Code in this file is copied and modified 
from +// https://wordaligned.org/articles/cpp-streambufs + +#ifndef SHERPA_CPP_API_WEBSOCKET_TEE_STREAM_H_ +#define SHERPA_CPP_API_WEBSOCKET_TEE_STREAM_H_ +#include +#include +#include + +namespace sherpa { + +template > +class basic_teebuf : public std::basic_streambuf { + public: + using int_type = typename traits::int_type; + + basic_teebuf(std::basic_streambuf *sb1, + std::basic_streambuf *sb2) + : sb1(sb1), sb2(sb2) {} + + private: + int sync() override { + int const r1 = sb1->pubsync(); + int const r2 = sb2->pubsync(); + return r1 == 0 && r2 == 0 ? 0 : -1; + } + + int_type overflow(int_type c) override { + int_type const eof = traits::eof(); + + if (traits::eq_int_type(c, eof)) { + return traits::not_eof(c); + } else { + char_type const ch = traits::to_char_type(c); + int_type const r1 = sb1->sputc(ch); + int_type const r2 = sb2->sputc(ch); + + return traits::eq_int_type(r1, eof) || traits::eq_int_type(r2, eof) ? eof + : c; + } + } + + private: + std::basic_streambuf *sb1; + std::basic_streambuf *sb2; +}; + +using teebuf = basic_teebuf; + +class TeeStream : public std::ostream { + public: + TeeStream(std::ostream &o1, std::ostream &o2) + : std::ostream(&tbuf), tbuf(o1.rdbuf(), o2.rdbuf()) {} + + private: + teebuf tbuf; +}; + +} // namespace sherpa + +#endif // SHERPA_CPP_API_WEBSOCKET_TEE_STREAM_H_ diff --git a/sherpa/csrc/CMakeLists.txt b/sherpa/csrc/CMakeLists.txt index 3d3eff7a6..66573a422 100644 --- a/sherpa/csrc/CMakeLists.txt +++ b/sherpa/csrc/CMakeLists.txt @@ -1,16 +1,53 @@ # Please sort the filenames alphabetically set(sherpa_srcs - fbank_features.cc - file_utils.cc + base64-decode.cc + byte_util.cc + context-graph.cc + fbank-features.cc + file-utils.cc hypothesis.cc log.cc - offline_asr.cc - parse_options.cc - rnnt_beam_search.cc - rnnt_conformer_model.cc - rnnt_conv_emformer_model.cc - rnnt_emformer_model.cc - symbol_table.cc + offline-conformer-ctc-model.cc + offline-conformer-transducer-model.cc + offline-ctc-greedy-search-decoder.cc + offline-ctc-one-best-decoder.cc + offline-model-config.cc + offline-nemo-enc-dec-ctc-model-bpe.cc + offline-sense-voice-model-config.cc + offline-sense-voice-model-meta-data.cc + offline-sense-voice-model.cc + offline-stream.cc + offline-transducer-fast-beam-search-decoder.cc + offline-transducer-greedy-search-decoder.cc + offline-transducer-modified-beam-search-decoder.cc + offline-wav2vec2-ctc-model.cc + offline-wenet-conformer-ctc-model.cc + offline-whisper-model-config.cc + offline-whisper-model-meta-data.cc + offline-whisper-model.cc + online-conformer-transducer-model.cc + online-conv-emformer-transducer-model.cc + online-emformer-transducer-model.cc + online-lstm-transducer-model.cc + online-stream.cc + online-transducer-fast-beam-search-decoder.cc + online-transducer-greedy-search-decoder.cc + online-transducer-modified-beam-search-decoder.cc + online-zipformer-transducer-model.cc + online-zipformer2-transducer-model.cc + parse-options.cc + resample.cc + silero-vad-model-config.cc + silero-vad-model.cc + symbol-table.cc + text-utils.cc + vad-model-config.cc + voice-activity-detector-impl.cc + voice-activity-detector.cc + # + speaker-embedding-extractor-model.cc + speaker-embedding-extractor.cc + speaker-embedding-extractor-impl.cc ) add_library(sherpa_core ${sherpa_srcs}) @@ -46,9 +83,15 @@ endif() if(SHERPA_ENABLE_TESTS) set(sherpa_test_srcs - test_hypothesis.cc - test_log.cc - test_parse_options.cc + # test-offline-conformer-transducer-model.cc + # test-online-conv-emformer-transducer-model.cc + + test-byte-util.cc + 
test-context-graph.cc + test-hypothesis.cc + test-log.cc + test-online-stream.cc + test-parse-options.cc ) function(sherpa_add_test source) @@ -80,22 +123,6 @@ if(SHERPA_ENABLE_TESTS) endforeach() endif() -add_executable(sherpa sherpa.cc) -target_link_libraries(sherpa sherpa_core) - -if(NOT WIN32) - execute_process( - COMMAND "${PYTHON_EXECUTABLE}" -c "from distutils.sysconfig import get_python_lib; print(get_python_lib())" - OUTPUT_STRIP_TRAILING_WHITESPACE - OUTPUT_VARIABLE PYTHON_SITE_PACKAGE_DIR - ) - message(STATUS "PYTHON_SITE_PACKAGE_DIR: ${PYTHON_SITE_PACKAGE_DIR}") - target_link_libraries(sherpa "-Wl,-rpath,${PYTHON_SITE_PACKAGE_DIR}/sherpa/lib") - - target_link_libraries(sherpa "-Wl,-rpath,${SHERPA_RPATH_ORIGIN}/../lib") -endif() - - configure_file(version.h.in ${CMAKE_CURRENT_BINARY_DIR}/version.h @ONLY) message(STATUS "Generated ${CMAKE_CURRENT_BINARY_DIR}/version.h") # sherpa-version does not have any dependencies. Its purpose is to help us debug @@ -103,11 +130,21 @@ message(STATUS "Generated ${CMAKE_CURRENT_BINARY_DIR}/version.h") add_executable(sherpa-version version.cc) target_include_directories(sherpa-version PRIVATE ${CMAKE_BINARY_DIR}) +add_executable(sherpa-vad sherpa-vad.cc) +target_link_libraries(sherpa-vad sherpa_core) + +add_executable(sherpa-compute-speaker-similarity sherpa-compute-speaker-similarity.cc) +target_link_libraries(sherpa-compute-speaker-similarity sherpa_core) + install(TARGETS - sherpa_core kaldi_native_io_core + sherpa_core DESTINATION lib ) -install(TARGETS sherpa sherpa-version +install( + TARGETS + sherpa-version + sherpa-vad + sherpa-compute-speaker-similarity DESTINATION bin ) diff --git a/sherpa/csrc/base64-decode.cc b/sherpa/csrc/base64-decode.cc new file mode 100644 index 000000000..0b9a35df8 --- /dev/null +++ b/sherpa/csrc/base64-decode.cc @@ -0,0 +1,67 @@ +// sherpa/csrc/base64-decode.cc +// +// Copyright (c) 2022-2025 Xiaomi Corporation + +#include "sherpa/csrc/base64-decode.h" + +#include "sherpa/csrc/macros.h" + +namespace sherpa { + +static int32_t Ord(char c) { + if (c >= 'A' && c <= 'Z') { + return c - 'A'; + } else if (c >= 'a' && c <= 'z') { + return c - 'a' + ('Z' - 'A') + 1; + } else if (c >= '0' && c <= '9') { + return c - '0' + ('Z' - 'A') + ('z' - 'a') + 2; + } else if (c == '+') { + return 62; + } else if (c == '/') { + return 63; + } + + SHERPA_LOGE("Unknown character %d, %c\n", c, c); + + exit(-1); +} + +// see +// https://github.com/ReneNyffenegger/cpp-base64/blob/master/base64.cpp#L243 +std::string Base64Decode(const std::string &s) { + if (s.empty()) { + SHERPA_LOGE("Empty string!"); + exit(-1); + } + + int32_t n = static_cast(s.size()) / 4 * 3; + + std::string ans; + ans.reserve(n); + + int32_t i = 0; + while (i < static_cast(s.size())) { + if (s[i] == '=') { + return " "; + } + + int32_t first = (Ord(s[i]) << 2) + ((Ord(s[i + 1]) & 0x30) >> 4); + ans.push_back(static_cast(first)); + + if (i + 2 < static_cast(s.size()) && s[i + 2] != '=') { + int32_t second = + ((Ord(s[i + 1]) & 0x0f) << 4) + ((Ord(s[i + 2]) & 0x3c) >> 2); + ans.push_back(static_cast(second)); + + if (i + 3 < static_cast(s.size()) && s[i + 3] != '=') { + int32_t third = ((Ord(s[i + 2]) & 0x03) << 6) + Ord(s[i + 3]); + ans.push_back(static_cast(third)); + } + } + i += 4; + } + + return ans; +} + +} // namespace sherpa diff --git a/sherpa/csrc/base64-decode.h b/sherpa/csrc/base64-decode.h new file mode 100644 index 000000000..f922c94dd --- /dev/null +++ b/sherpa/csrc/base64-decode.h @@ -0,0 +1,19 @@ +// sherpa/csrc/base64-decode.h +// +// Copyright (c) 
2022-2025 Xiaomi Corporation + +#ifndef SHERPA_CSRC_BASE64_DECODE_H_ +#define SHERPA_CSRC_BASE64_DECODE_H_ + +#include + +namespace sherpa { + +/** @param s A base64 encoded string. + * @return Return the decoded string. + */ +std::string Base64Decode(const std::string &s); + +} // namespace sherpa + +#endif // SHERPA_CSRC_BASE64_DECODE_H_ diff --git a/sherpa/csrc/byte_util.cc b/sherpa/csrc/byte_util.cc new file mode 100644 index 000000000..98560db20 --- /dev/null +++ b/sherpa/csrc/byte_util.cc @@ -0,0 +1,221 @@ +/** Copyright 2023 Xiaomi Corporation (authors: Wei Kang) + * See LICENSE for clarification regarding multiple authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "sherpa/csrc/byte_util.h" + +#include // NOLINT +#include + +#include "sherpa/csrc/log.h" + +namespace sherpa { + +ByteUtil::ByteUtil() { + // The table below is copied from + // https://github.com/k2-fsa/icefall/blob/master/icefall/byte_utils.py + // which is used to train byte level bpe, if you change the table in icefall + // you have to change the table below accordingly. + byte2token_ = std::vector( + {256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269, + 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, + 284, 285, 286, 287, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, + 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, + 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, + 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, + 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, + 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, + 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, + 126, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299, 300, + 301, 302, 303, 304, 305, 308, 309, 310, 311, 312, 313, 314, 315, 316, + 317, 318, 321, 322, 323, 324, 325, 326, 327, 328, 330, 331, 332, 333, + 334, 335, 336, 337, 338, 339, 340, 341, 342, 343, 344, 345, 346, 347, + 348, 349, 350, 351, 352, 353, 354, 355, 356, 357, 358, 359, 360, 361, + 362, 363, 364, 365, 366, 367, 368, 369, 370, 371, 372, 373, 374, 375, + 376, 377, 378, 379, 380, 381, 382, 384, 385, 386, 387, 388, 389, 390, + 391, 392, 393, 394, 395, 396, 397, 398, 399, 400, 401, 402, 403, 404, + 405, 406, 407, 408, 409, 410, 411, 412, 413, 414, 415, 416, 417, 418, + 419, 420, 421, 422}); + max_token_ = 422; // the max number in above table + token2byte_ = + std::vector(max_token_ + 1, -1); // the max token in byte2token_ + // is 422, so we set the length + // of token2bytes_ 423. 
+ for (size_t i = 0; i < byte2token_.size(); ++i) { + token2byte_[byte2token_[i]] = i; + } +} + +std::string ByteUtil::Encode(const std::string &str) const { + std::ostringstream oss; + const uint8_t *p = reinterpret_cast(str.data()); + for (size_t i = 0; i < str.size(); ++i) { + oss << CodePointToUTF8String(byte2token_[p[i]]); + } + return oss.str(); +} + +std::string ByteUtil::Decode(const std::string &str) const { + std::vector bytes; + UTF8StringToTokensAndMapToBytes(str, &bytes); + std::vector codes; + BytesToCodePoints(bytes.data(), bytes.size(), &codes); + std::ostringstream oss; + for (size_t i = 0; i < codes.size(); ++i) { + oss << CodePointToUTF8String(codes[i]); + } + return oss.str(); +} + +void ByteUtil::UTF8StringToTokensAndMapToBytes( + const std::string &str, std::vector *bytes) const { + const char *data = str.data(); + bytes->clear(); + const size_t length = str.size(); + for (size_t i = 0; i < length; /* no update */) { + int32_t c = data[i++] & 0xff; + if ((c & 0x80) == 0) { + if (c > max_token_ || token2byte_[c] == -1) { + SHERPA_LOG(WARNING) << "Skip OOV token, code point : " << c + << " utf8 char : " << CodePointToUTF8String(c); + continue; + } + bytes->push_back(token2byte_[c]); + } else { + if ((c & 0xc0) == 0x80) { + SHERPA_LOG(FATAL) << "Invalid utf8 string : " << str + << ", code point : " << c; + } + int32_t count = + (c >= 0xc0) + (c >= 0xe0) + (c >= 0xf0) + (c >= 0xf8) + (c >= 0xfc); + int32_t code = c & ((1 << (6 - count)) - 1); + while (count != 0) { + if (i == length) { + SHERPA_LOG(FATAL) + << "Invalid utf8 string : " << str << ", code point : " << code; + } + char cb = data[i++]; + if ((cb & 0xc0) != 0x80) { + SHERPA_LOG(FATAL) + << "Invalid utf8 string : " << str << ", code point : " << code; + } + code = (code << 6) | (cb & 0x3f); + count--; + } + if (code < 0) { + // This should not be able to happen. + SHERPA_LOG(FATAL) << "Invalid utf8 string : " << str + << ", code point : " << code; + } + if (code > max_token_ || token2byte_[code] == -1) { + SHERPA_LOG(WARNING) << "Skip OOV token, code point : " << code + << " utf8 char : " << CodePointToUTF8String(code); + continue; + } + bytes->push_back(token2byte_[code]); + } + } +} + +void ByteUtil::BytesToCodePoints(const uint8_t *bytes, int32_t length, + std::vector *codes) const { + if (length <= 0) { + return; + } + const char *data = reinterpret_cast(bytes); + int32_t idx = 1; // means starting from the next byte + for (int32_t i = 0; i < length; /* no update */) { + int32_t c = data[i++] & 0xff; + if ((c & 0x80) == 0) { + codes->push_back(c); + idx = i + 1; + } else { + if ((c & 0xc0) == 0x80) { + BytesToCodePoints(bytes + idx, length - idx, codes); + return; + } + int32_t count = + (c >= 0xc0) + (c >= 0xe0) + (c >= 0xf0) + (c >= 0xf8) + (c >= 0xfc); + int32_t code = c & ((1 << (6 - count)) - 1); + while (count != 0) { + if (i == length) { + BytesToCodePoints(bytes + idx, length - idx, codes); + return; + } + char cb = data[i++]; + if ((cb & 0xc0) != 0x80) { + BytesToCodePoints(bytes + idx, length - idx, codes); + return; + } + code = (code << 6) | (cb & 0x3f); + count--; + } + if (code < 0) { + BytesToCodePoints(bytes + idx, length - idx, codes); + return; + } + codes->push_back(code); + idx = i + 1; + } + } +} + +std::string ByteUtil::CodePointToUTF8String(int32_t code) const { + std::ostringstream ostr; + if (code < 0) { + SHERPA_LOG(FATAL) << "Invalid utf8 code point : " << code; + return ostr.str(); // Unreachable code. 
+ } else if (code < 0x80) { + ostr << static_cast(code); + } else if (code < 0x800) { + ostr << static_cast((code >> 6) | 0xc0); + ostr << static_cast((code & 0x3f) | 0x80); + } else if (code < 0x10000) { + ostr << static_cast((code >> 12) | 0xe0); + ostr << static_cast(((code >> 6) & 0x3f) | 0x80); + ostr << static_cast((code & 0x3f) | 0x80); + } else if (code < 0x200000) { + ostr << static_cast((code >> 18) | 0xf0); + ostr << static_cast(((code >> 12) & 0x3f) | 0x80); + ostr << static_cast(((code >> 6) & 0x3f) | 0x80); + ostr << static_cast((code & 0x3f) | 0x80); + } else if (code < 0x4000000) { + ostr << static_cast((code >> 24) | 0xf8); + ostr << static_cast(((code >> 18) & 0x3f) | 0x80); + ostr << static_cast(((code >> 12) & 0x3f) | 0x80); + ostr << static_cast(((code >> 6) & 0x3f) | 0x80); + ostr << static_cast((code & 0x3f) | 0x80); + } else { + ostr << static_cast((code >> 30) | 0xfc); + ostr << static_cast(((code >> 24) & 0x3f) | 0x80); + ostr << static_cast(((code >> 18) & 0x3f) | 0x80); + ostr << static_cast(((code >> 12) & 0x3f) | 0x80); + ostr << static_cast(((code >> 6) & 0x3f) | 0x80); + ostr << static_cast((code & 0x3f) | 0x80); + } + return ostr.str(); +} + +const ByteUtilPtr GetByteUtil() { + static ByteUtilPtr bu = nullptr; + static std::once_flag init_flag; + + std::call_once(init_flag, + []() { bu = std::make_shared(ByteUtil()); }); + SHERPA_CHECK_NE(bu, nullptr); + return bu; +} + +} // namespace sherpa diff --git a/sherpa/csrc/byte_util.h b/sherpa/csrc/byte_util.h new file mode 100644 index 000000000..c89702ee7 --- /dev/null +++ b/sherpa/csrc/byte_util.h @@ -0,0 +1,115 @@ +/** + * Copyright 2023 Xiaomi Corporation (authors: Wei Kang) + * + * See LICENSE for clarification regarding multiple authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef SHERPA_CSRC_BYTE_UTIL_H_ +#define SHERPA_CSRC_BYTE_UTIL_H_ + +#include +#include +#include + +namespace sherpa { + +class ByteUtil; +using ByteUtilPtr = std::shared_ptr; + +/* The class implements the functions in byte_utils.py + * (https://github.com/k2-fsa/icefall/blob/master/icefall/byte_utils.py) + * It will be used to decode the output hypothesis of model trained with byte + * level bpe. + * + * Caution: The base characters (the byte token table) in the constructor MUST + * be the same as the `PRINTABLE_BASE_CHARS` in icefall. + */ +class ByteUtil { + public: + ByteUtil(); + /* + * Encode the normal string (for example, the transcripts in dataset) to a + * special utf8 characters sequence, the characters are all in the byte2token_ + * table (see in the constructor). It breaks the non-ascii characters into + * several characters (each byte a character), while the printable ascii will + * keep the same. + * + * @param str The original string. + * + * @returns Returns the encoded string. + */ + std::string Encode(const std::string &str) const; + + /* Decode the string encoded by Encode to its original one. + * str should be equal to Decode(Encode(str)). 
+ * + * Note: The str here actually represents a sequence of bytes; the number of + * bytes equals the number of utf8 characters. We will re-map these utf8 + * characters back to bytes with token2byte_ and then convert the bytes array + * to a string. Sometimes, there will be some invalid bytes in the array; we + * will drop these invalid bytes when decoding the bytes array. See more + * examples in test-byte-util.cc. + * + * @returns Return the decoded string. + */ + std::string Decode(const std::string &str) const; + + private: + int32_t max_token_; // The max token in byte2token_. + std::vector token2byte_; // map token to byte. + std::vector byte2token_; // map byte to token. + + /* Convert a utf8 code point to the corresponding character. + * @param code The utf8 code point. + * + * @return Returns the corresponding character (as std::string). + */ + std::string CodePointToUTF8String(int32_t code) const; + + /* Convert bytes to corresponding utf8 code points. + * + * Note: We will skip invalid bytes (i.e., bytes that cannot combine into a + * valid utf8 character). + * + * @param bytes The pointer to the bytes array. + * @param length The length of the bytes array. + * @param codes The utf8 code points will be written here. + */ + void BytesToCodePoints(const uint8_t *bytes, int32_t length, + std::vector *codes) const; + /* + * The utf8 string here is expected to be the encoded string (the string + * encoded by Encode or the recognition result from an ASR system built with + * byte level bpe). + * + * This function first extracts the utf8 characters from the str, then maps them + * to bytes with token2byte_. + * + * @param str The input string. + * @param bytes The converted bytes will be written here. + */ + void UTF8StringToTokensAndMapToBytes(const std::string &str, + std::vector *bytes) const; +}; + +/* + * Get the ByteUtil pointer; this guarantees that the ByteUtil object is only + * initialized once. + */ +const ByteUtilPtr GetByteUtil(); + +} // namespace sherpa + +#endif // SHERPA_CSRC_BYTE_UTIL_H_ diff --git a/sherpa/csrc/context-graph.cc b/sherpa/csrc/context-graph.cc new file mode 100644 index 000000000..ab1b87d32 --- /dev/null +++ b/sherpa/csrc/context-graph.cc @@ -0,0 +1,110 @@ +/** + * Copyright 2023 Xiaomi Corporation (authors: Wei Kang) + * + * See LICENSE for clarification regarding multiple authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "sherpa/csrc/context-graph.h" + +#include +#include +#include + +namespace sherpa { +void ContextGraph::Build( + const std::vector> &token_ids) const { + for (size_t i = 0; i < token_ids.size(); ++i) { + auto node = root_.get(); + for (size_t j = 0; j < token_ids[i].size(); ++j) { + int32_t token = token_ids[i][j]; + if (0 == node->next.count(token)) { + bool is_end = j == (token_ids[i].size() - 1); + node->next[token] = std::make_unique( + token, context_score_, node->node_score + context_score_, + is_end ?
node->node_score + context_score_ : 0, is_end); + } + node = node->next[token].get(); + } + } + FillFailOutput(); +} + +std::pair ContextGraph::ForwardOneStep( + const ContextState *state, int32_t token) const { + const ContextState *node; + float score; + if (1 == state->next.count(token)) { + node = state->next.at(token).get(); + score = node->token_score; + } else { + node = state->fail; + while (0 == node->next.count(token)) { + node = node->fail; + if (-1 == node->token) break; // root + } + if (1 == node->next.count(token)) { + node = node->next.at(token).get(); + } + score = node->node_score - state->node_score; + } + SHERPA_CHECK(nullptr != node); + return std::make_pair(score + node->output_score, node); +} + +std::pair ContextGraph::Finalize( + const ContextState *state) const { + float score = -state->node_score; + return std::make_pair(score, root_.get()); +} + +void ContextGraph::FillFailOutput() const { + std::queue node_queue; + for (auto &kv : root_->next) { + kv.second->fail = root_.get(); + node_queue.push(kv.second.get()); + } + while (!node_queue.empty()) { + auto current_node = node_queue.front(); + node_queue.pop(); + for (auto &kv : current_node->next) { + auto fail = current_node->fail; + if (1 == fail->next.count(kv.first)) { + fail = fail->next.at(kv.first).get(); + } else { + fail = fail->fail; + while (0 == fail->next.count(kv.first)) { + fail = fail->fail; + if (-1 == fail->token) break; + } + if (1 == fail->next.count(kv.first)) + fail = fail->next.at(kv.first).get(); + } + kv.second->fail = fail; + // fill the output arc + auto output = fail; + while (!output->is_end) { + output = output->fail; + if (-1 == output->token) { + output = nullptr; + break; + } + } + kv.second->output = output; + kv.second->output_score += output == nullptr ? 0 : output->output_score; + node_queue.push(kv.second.get()); + } + } +} +} // namespace sherpa diff --git a/sherpa/csrc/context-graph.h b/sherpa/csrc/context-graph.h new file mode 100644 index 000000000..ab591b0c7 --- /dev/null +++ b/sherpa/csrc/context-graph.h @@ -0,0 +1,80 @@ +/** + * Copyright 2023 Xiaomi Corporation (authors: Wei Kang) + * + * See LICENSE for clarification regarding multiple authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef SHERPA_CSRC_CONTEXT_GRAPH_H_ +#define SHERPA_CSRC_CONTEXT_GRAPH_H_ + +#include +#include +#include +#include + +#include "sherpa/csrc/log.h" + +namespace sherpa { + +class ContextGraph; +using ContextGraphPtr = std::shared_ptr; + +struct ContextState { + int32_t token; + float token_score; + float node_score; + float output_score; + bool is_end; + std::unordered_map> next; + const ContextState *fail = nullptr; + const ContextState *output = nullptr; + + ContextState() = default; + ContextState(int32_t token, float token_score, float node_score, + float output_score, bool is_end) + : token(token), + token_score(token_score), + node_score(node_score), + output_score(output_score), + is_end(is_end) {} +}; + +class ContextGraph { + public: + ContextGraph() = default; + ContextGraph(const std::vector> &token_ids, + float context_score) + : context_score_(context_score) { + root_ = std::make_unique(-1, 0, 0, 0, false); + root_->fail = root_.get(); + Build(token_ids); + } + + std::pair ForwardOneStep( + const ContextState *state, int32_t token_id) const; + std::pair Finalize( + const ContextState *state) const; + + const ContextState *Root() const { return root_.get(); } + + private: + float context_score_; + std::unique_ptr root_; + void Build(const std::vector> &token_ids) const; + void FillFailOutput() const; +}; + +} // namespace sherpa +#endif // SHERPA_CSRC_CONTEXT_GRAPH_H_ diff --git a/sherpa/csrc/fbank_features.cc b/sherpa/csrc/fbank-features.cc similarity index 83% rename from sherpa/csrc/fbank_features.cc rename to sherpa/csrc/fbank-features.cc index 83f138471..9aa6b4e4a 100644 --- a/sherpa/csrc/fbank_features.cc +++ b/sherpa/csrc/fbank-features.cc @@ -16,10 +16,12 @@ * limitations under the License. */ -#include "sherpa/csrc/fbank_features.h" +#include "sherpa/csrc/fbank-features.h" #include "kaldi_native_io/csrc/kaldi-io.h" #include "kaldi_native_io/csrc/wave-reader.h" +#include "kaldifeat/csrc/feature-fbank.h" +#include "kaldifeat/csrc/whisper-fbank.h" #include "sherpa/csrc/log.h" #include "torch/script.h" @@ -54,8 +56,9 @@ std::pair ReadWave(const std::string &filename, return {tensor / 32768, wave_data.Duration()}; } +template std::vector ComputeFeatures( - kaldifeat::Fbank &fbank, // NOLINT + FbankComputer &fbank, // NOLINT const std::vector &wave_data, std::vector *num_frames /*=nullptr*/) { const auto &frame_opts = fbank.GetOptions().frame_opts; @@ -84,4 +87,14 @@ std::vector ComputeFeatures( return ans; } +template std::vector ComputeFeatures( + kaldifeat::Fbank &fbank, // NOLINT + const std::vector &wave_data, + std::vector *num_frames = nullptr); + +template std::vector ComputeFeatures( + kaldifeat::WhisperFbank &fbank, // NOLINT + const std::vector &wave_data, + std::vector *num_frames = nullptr); + } // namespace sherpa diff --git a/sherpa/csrc/fbank_features.h b/sherpa/csrc/fbank-features.h similarity index 97% rename from sherpa/csrc/fbank_features.h rename to sherpa/csrc/fbank-features.h index bd05bf87e..9cf6c95b8 100644 --- a/sherpa/csrc/fbank_features.h +++ b/sherpa/csrc/fbank-features.h @@ -23,7 +23,6 @@ #include #include -#include "kaldifeat/csrc/feature-fbank.h" #include "torch/script.h" namespace sherpa { @@ -60,8 +59,9 @@ std::pair ReadWave(const std::string &filename, * number of feature frames and the number of columns equals to the * feature dimension. 
*/ +template std::vector ComputeFeatures( - kaldifeat::Fbank &fbank, // NOLINT + FbankComputer &fbank, // NOLINT const std::vector &wave_data, std::vector *num_frames = nullptr); } // namespace sherpa diff --git a/sherpa/csrc/file_utils.cc b/sherpa/csrc/file-utils.cc similarity index 80% rename from sherpa/csrc/file_utils.cc rename to sherpa/csrc/file-utils.cc index f5c421c8b..1569cabf8 100644 --- a/sherpa/csrc/file_utils.cc +++ b/sherpa/csrc/file-utils.cc @@ -16,15 +16,23 @@ * limitations under the License. */ -#include "sherpa/csrc/file_utils.h" +#include "sherpa/csrc/file-utils.h" #include #include +#include "sherpa/csrc/log.h" + namespace sherpa { bool FileExists(const std::string &filename) { return std::ifstream(filename).good(); } +void AssertFileExists(const std::string &filename) { + if (!FileExists(filename)) { + SHERPA_LOG(FATAL) << filename << " does not exist!"; + } +} + } // namespace sherpa diff --git a/sherpa/csrc/file_utils.h b/sherpa/csrc/file-utils.h similarity index 88% rename from sherpa/csrc/file_utils.h rename to sherpa/csrc/file-utils.h index f2e8db166..b64af0ef3 100644 --- a/sherpa/csrc/file_utils.h +++ b/sherpa/csrc/file-utils.h @@ -31,6 +31,12 @@ namespace sherpa { */ bool FileExists(const std::string &filename); +/** Abort if the file does not exist. + * + * @param filename The file to check. + */ +void AssertFileExists(const std::string &filename); + } // namespace sherpa #endif // SHERPA_CSRC_FILE_UTILS_H_ diff --git a/sherpa/csrc/hypothesis.h b/sherpa/csrc/hypothesis.h index 3132267eb..cf29a5c9a 100644 --- a/sherpa/csrc/hypothesis.h +++ b/sherpa/csrc/hypothesis.h @@ -24,6 +24,7 @@ #include #include +#include "sherpa/csrc/context-graph.h" #include "torch/all.h" namespace sherpa { @@ -32,14 +33,22 @@ struct Hypothesis { // The predicted tokens so far. Newly predicted tokens are appended. std::vector ys; + // timestamps[i] contains the frame number after subsampling + // on which ys[i] is decoded. + std::vector timestamps; + // The total score of ys in log space. double log_prob = 0; + // The state of the contextual-biasing graph + const ContextState *context_state; + int32_t num_trailing_blanks = 0; Hypothesis() = default; - Hypothesis(const std::vector &ys, double log_prob) - : ys(ys), log_prob(log_prob) {} + Hypothesis(const std::vector &ys, double log_prob, + const ContextState *context_state = nullptr) + : ys(ys), log_prob(log_prob), context_state(context_state) {} // If two Hypotheses have the same `Key`, then they contain // the same token sequence. 
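The ContextGraph added above is an Aho-Corasick-style trie over hot-phrase token sequences, and Hypothesis now carries a context_state pointer into that trie so the beam-search decoders can grant a bonus while a phrase is being matched. The following is only a minimal usage sketch against the interface declared in context-graph.h; the phrase token IDs, the context score value, and the standalone main() are made up for illustration and are not part of this patch.

#include <cstdint>
#include <iostream>
#include <vector>

#include "sherpa/csrc/context-graph.h"

int main() {
  // Two hypothetical hot phrases, already converted to token IDs by the
  // tokenizer. The IDs and the per-token bonus below are illustrative only.
  std::vector<std::vector<int32_t>> phrases = {{23, 57, 9}, {23, 101}};
  float context_score = 1.5f;

  sherpa::ContextGraph graph(phrases, context_score);

  // Walk one decoded token sequence through the graph, the way a beam-search
  // hypothesis would via its context_state field.
  const sherpa::ContextState *state = graph.Root();
  float bonus = 0;
  for (int32_t token : std::vector<int32_t>{23, 57, 9}) {
    // ForwardOneStep() returns the score delta for this token (including the
    // bonus of any phrase that completes at this step) and the next state.
    auto p = graph.ForwardOneStep(state, token);
    bonus += p.first;
    state = p.second;
  }

  // At the end of decoding, Finalize() returns a score that cancels any
  // partial match that is still pending, together with the root state.
  auto fin = graph.Finalize(state);
  bonus += fin.first;

  std::cout << "total context bonus: " << bonus << "\n";
  return 0;
}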
diff --git a/sherpa/csrc/log.cc b/sherpa/csrc/log.cc index 014ed2cb6..251b503f5 100644 --- a/sherpa/csrc/log.cc +++ b/sherpa/csrc/log.cc @@ -31,20 +31,43 @@ // Useful to decode the stack trace, but only used if we have execinfo.h #endif // SHERPA_HAVE_CXXABI_H #endif // SHERPA_HAVE_EXECINFO_H - #include +#include // NOLINT +#include #include #include +#include #include +#include + +class log_watch { + public: + static constexpr auto decimal_width = 3; + explicit log_watch(const std::string &format = "%FT%T") : m_format(format) {} + friend std::ostream &operator<<(std::ostream &, const log_watch &); + + private: + std::string m_format; +}; + +std::ostream &operator<<(std::ostream &os, const log_watch &lw) { + auto time_point = std::chrono::system_clock::now(); + std::time_t t = std::chrono::system_clock::to_time_t(time_point); + os << std::put_time(std::localtime(&t), lw.m_format.c_str()); + auto dur = time_point.time_since_epoch(); + auto ss = std::chrono::duration_cast(dur) % + std::chrono::seconds{1}; + os << std::setfill('0') << std::setw(lw.decimal_width) << ss.count(); + return os; +} namespace sherpa { std::string GetDateTimeStr() { + log_watch ms("%F %T."); std::ostringstream os; - std::time_t t = std::time(nullptr); - std::tm tm = *std::localtime(&t); - os << std::put_time(&tm, "%F %T"); // yyyy-mm-dd hh:mm:ss + os << ms; // yyyy-mm-dd hh:mm:ss.sss return os.str(); } diff --git a/sherpa/csrc/log.h b/sherpa/csrc/log.h index d27179069..698d92727 100644 --- a/sherpa/csrc/log.h +++ b/sherpa/csrc/log.h @@ -51,7 +51,31 @@ enum class LogLevel { // // SHERPA_LOG(TRACE) << "some message"; // SHERPA_LOG(DEBUG) << "some message"; -// + +#ifdef TRACE +#undef TRACE +#endif + +#ifdef DEBUG +#undef DEBUG +#endif + +#ifdef INFO +#undef INFO +#endif + +#ifdef WARNING +#undef WARNING +#endif + +#ifdef ERROR +#undef ERROR +#endif + +#ifdef FATAL +#undef FATAL +#endif + constexpr LogLevel TRACE = LogLevel::kTrace; constexpr LogLevel DEBUG = LogLevel::kDebug; constexpr LogLevel INFO = LogLevel::kInfo; @@ -121,9 +145,8 @@ class Logger { public: Logger(const char *filename, const char *func_name, uint32_t line_num, LogLevel level) - : filename_(filename), - func_name_(func_name), - line_num_(line_num), + : + level_(level) { cur_level_ = GetCurrentLogLevel(); switch (level) { @@ -244,15 +267,12 @@ class Logger { } // specialization to fix compile error: `stringstream << nullptr` is ambiguous - const Logger &operator<<(const std::nullptr_t &null) const { + const Logger &operator<<(const std::nullptr_t & /*null*/) const { if (cur_level_ <= level_) *this << "(null)"; return *this; } private: - const char *filename_; - const char *func_name_; - uint32_t line_num_; LogLevel level_; LogLevel cur_level_; }; diff --git a/sherpa/csrc/macros.h b/sherpa/csrc/macros.h new file mode 100644 index 000000000..b66b31aff --- /dev/null +++ b/sherpa/csrc/macros.h @@ -0,0 +1,22 @@ +// sherpa/csrc/macros.h +// +// Copyright 2025 Xiaomi Corporation + +#ifndef SHERPA_CSRC_MACROS_H_ +#define SHERPA_CSRC_MACROS_H_ +#include +#include + +#include + +#define SHERPA_LOGE(...) 
\ + do { \ + fprintf(stderr, "%s:%s:%d ", __FILE__, __func__, \ + static_cast(__LINE__)); \ + fprintf(stderr, ##__VA_ARGS__); \ + fprintf(stderr, "\n"); \ + } while (0) + +#define SHERPA_EXIT(code) exit(code) + +#endif // SHERPA_CSRC_MACROS_H_ diff --git a/sherpa/csrc/offline-conformer-ctc-model.cc b/sherpa/csrc/offline-conformer-ctc-model.cc new file mode 100644 index 000000000..37c86cd72 --- /dev/null +++ b/sherpa/csrc/offline-conformer-ctc-model.cc @@ -0,0 +1,49 @@ +// sherpa/csrc/offline-conformer-ctc-model.cc +// +// Copyright (c) 2022 Xiaomi Corporation +#include "sherpa/csrc/offline-conformer-ctc-model.h" + +#include +#include + +#include "sherpa/cpp_api/macros.h" + +namespace sherpa { + +OfflineConformerCtcModel::OfflineConformerCtcModel( + const std::string &filename, torch::Device device /*= torch::kCPU*/) + : device_(device) { + model_ = torch::jit::load(filename, device); + model_.eval(); +} + +torch::IValue OfflineConformerCtcModel::Forward(torch::Tensor features, + torch::Tensor features_length) { + InferenceMode no_grad; + + int32_t batch_size = features.size(0); + + torch::Dict sup; + sup.insert("sequence_idx", torch::arange(batch_size, torch::kInt)); + sup.insert("start_frame", torch::zeros({batch_size}, torch::kInt)); + sup.insert("num_frames", features_length.cpu().to(torch::kInt)); + + torch::IValue supervisions(sup); + + return model_.run_method("forward", features.to(device_), sup); +} + +torch::Tensor OfflineConformerCtcModel::GetLogSoftmaxOut( + torch::IValue forward_out) const { + return forward_out.toTuple()->elements()[0].toTensor(); +} + +torch::Tensor OfflineConformerCtcModel::GetLogSoftmaxOutLength( + torch::IValue forward_out) const { + InferenceMode no_grad; + + auto mask = forward_out.toTuple()->elements()[2].toTensor(); + return (~mask).sum(1); +} + +} // namespace sherpa diff --git a/sherpa/csrc/offline-conformer-ctc-model.h b/sherpa/csrc/offline-conformer-ctc-model.h new file mode 100644 index 000000000..c017074c8 --- /dev/null +++ b/sherpa/csrc/offline-conformer-ctc-model.h @@ -0,0 +1,51 @@ +// sherpa/csrc/offline-conformer-ctc-model.h +// +// Copyright (c) 2022 Xiaomi Corporation +#ifndef SHERPA_CSRC_OFFLINE_CONFORMER_CTC_MODEL_H_ +#define SHERPA_CSRC_OFFLINE_CONFORMER_CTC_MODEL_H_ + +#include +#include + +#include "sherpa/csrc/offline-ctc-model.h" +namespace sherpa { + +/** This class models the Conformer model from icefall. + * + * See + * https://github.com/k2-fsa/icefall/blob/master/egs/librispeech/ASR/conformer_ctc/train.py#L668 + */ +class OfflineConformerCtcModel : public OfflineCtcModel { + public: + /** + * @param filename Path name of the torch script model. + * @param device The model will be moved to this device + */ + explicit OfflineConformerCtcModel(const std::string &filename, + torch::Device device = torch::kCPU); + + torch::Device Device() const override { return device_; } + + int32_t SubsamplingFactor() const override { return 4; } + + /** Run the forward method of the model. + * See + * https://github.com/k2-fsa/icefall/blob/master/egs/librispeech/ASR/conformer_ctc/transformer.py#L162 + * for its documentation in Python. 
+ */ + torch::IValue Forward(torch::Tensor features, + torch::Tensor features_length) override; + + torch::Tensor GetLogSoftmaxOut(torch::IValue forward_out) const override; + + torch::Tensor GetLogSoftmaxOutLength( + torch::IValue forward_out) const override; + + private: + torch::Device device_; + torch::jit::Module model_; +}; + +} // namespace sherpa + +#endif // SHERPA_CSRC_OFFLINE_CONFORMER_CTC_MODEL_H_ diff --git a/sherpa/csrc/offline-conformer-transducer-model.cc b/sherpa/csrc/offline-conformer-transducer-model.cc new file mode 100644 index 000000000..1fa674f3d --- /dev/null +++ b/sherpa/csrc/offline-conformer-transducer-model.cc @@ -0,0 +1,73 @@ +// sherpa/csrc/offline-conformer-transducer-model.cc +// +// Copyright (c) 2022 Xiaomi Corporation + +#include "sherpa/csrc/offline-conformer-transducer-model.h" + +#include +#include + +#include "sherpa/cpp_api/macros.h" + +namespace sherpa { + +OfflineConformerTransducerModel::OfflineConformerTransducerModel( + const std::string &filename, torch::Device device /*= torch::kCPU*/) + : device_(device) { + // See + // https://github.com/k2-fsa/icefall/blob/master/egs/librispeech/ASR/pruned_transducer_stateless2/model.py#L29 + // for the definition of `model_`. + // + // Note: pruned_transducer_statelessX where X>=2 has the same model + // architecture. We use pruned_transducer_stateless2 as an exmaple here, but + // it applies also to pruned_transducer_stateless3, + // pruned_transducer_stateless4, etc. + model_ = torch::jit::load(filename, device); + model_.eval(); + + encoder_ = model_.attr("encoder").toModule(); + decoder_ = model_.attr("decoder").toModule(); + joiner_ = model_.attr("joiner").toModule(); + + encoder_proj_ = joiner_.attr("encoder_proj").toModule(); + decoder_proj_ = joiner_.attr("decoder_proj").toModule(); + + context_size_ = decoder_.attr("context_size").toInt(); +} + +std::pair +OfflineConformerTransducerModel::RunEncoder( + const torch::Tensor &features, const torch::Tensor &features_length) { + InferenceMode no_grad; + + auto outputs = + encoder_.run_method("forward", features, features_length).toTuple(); + + auto encoder_out = outputs->elements()[0]; + auto encoder_out_length = outputs->elements()[1].toTensor(); + + auto projected_encoder_out = + encoder_proj_.run_method("forward", encoder_out).toTensor(); + + return {projected_encoder_out, encoder_out_length}; +} + +torch::Tensor OfflineConformerTransducerModel::RunDecoder( + const torch::Tensor &decoder_input) { + InferenceMode no_grad; + auto decoder_out = + decoder_.run_method("forward", decoder_input, /*need_pad*/ false); + + return decoder_proj_.run_method("forward", decoder_out).toTensor(); +} + +torch::Tensor OfflineConformerTransducerModel::RunJoiner( + const torch::Tensor &encoder_out, const torch::Tensor &decoder_out) { + InferenceMode no_grad; + return joiner_ + .run_method("forward", encoder_out, decoder_out, + /*project_input*/ false) + .toTensor(); +} + +} // namespace sherpa diff --git a/sherpa/csrc/offline-conformer-transducer-model.h b/sherpa/csrc/offline-conformer-transducer-model.h new file mode 100644 index 000000000..62f9c0cdc --- /dev/null +++ b/sherpa/csrc/offline-conformer-transducer-model.h @@ -0,0 +1,94 @@ +// sherpa/csrc/offline-conformer-transducer-model.h +// +// Copyright (c) 2022 Xiaomi Corporation +#ifndef SHERPA_CSRC_OFFLINE_CONFORMER_TRANSDUCER_MODEL_H_ +#define SHERPA_CSRC_OFFLINE_CONFORMER_TRANSDUCER_MODEL_H_ + +#include +#include + +#include "sherpa/csrc/offline-transducer-model.h" + +namespace sherpa { + +/** This class 
implements models from pruned_transducer_statelessX + * where X>=2 from icefall. + * + * See + * https://github.com/k2-fsa/icefall/blob/master/egs/librispeech/ASR/pruned_transducer_stateless2/model.py + * for an instance. + * + * You can find the interface and implementation details of the + * encoder, decoder, and joiner network in the above Python code. + */ +class OfflineConformerTransducerModel : public OfflineTransducerModel { + public: + explicit OfflineConformerTransducerModel(const std::string &filename, + torch::Device device = torch::kCPU); + + /** + * See + * https://github.com/k2-fsa/icefall/blob/master/egs/librispeech/ASR/pruned_transducer_stateless2/conformer.py#L127 + * for the interface of the encoder module. + * Note that we use the default value warmup 1.0 here. + * + * Also, the output is transformed by using the projection module from + * the joiner, please see + * https://github.com/k2-fsa/icefall/blob/master/egs/librispeech/ASR/pruned_transducer_stateless2/joiner.py#L34 + */ + std::pair RunEncoder( + const torch::Tensor &features, + const torch::Tensor &features_length) override; + + // It returns the projected decoder out. + /** + * See + * https://github.com/k2-fsa/icefall/blob/master/egs/librispeech/ASR/pruned_transducer_stateless2/decoder.py#L82 + * for the interface of the decoder module. + * + * We set `need_pad` to false inside this method. + * + * Also, the output is transformed by using the projection module from + * the joiner, please see + * https://github.com/k2-fsa/icefall/blob/master/egs/librispeech/ASR/pruned_transducer_stateless2/joiner.py#L35 + */ + torch::Tensor RunDecoder(const torch::Tensor &decoder_input) override; + + /** + * See + * https://github.com/k2-fsa/icefall/blob/master/egs/librispeech/ASR/pruned_transducer_stateless2/joiner.py#L38 + * for the interface of the joiner module. + * + * We set `project_input` to false inside this method. + * + * Both inputs are of shape (N, joiner_dim). The output shape is + * (N, vocab_size). + */ + torch::Tensor RunJoiner(const torch::Tensor &encoder_out, + const torch::Tensor &decoder_out) override; + + torch::Device Device() const override { return device_; } + + /* See + * https://github.com/k2-fsa/icefall/blob/master/egs/librispeech/ASR/pruned_transducer_stateless2/decoder.py#L67 + * for the definition and usage of context_size. + */ + int32_t ContextSize() const override { return context_size_; } + + private: + torch::jit::Module model_; + + // The following modules are just aliases to modules in model_ + torch::jit::Module encoder_; + torch::jit::Module decoder_; + torch::jit::Module joiner_; + torch::jit::Module encoder_proj_; + torch::jit::Module decoder_proj_; + + torch::Device device_{"cpu"}; + int32_t context_size_; +}; + +} // namespace sherpa + +#endif // SHERPA_CSRC_OFFLINE_CONFORMER_TRANSDUCER_MODEL_H_ diff --git a/sherpa/csrc/offline-ctc-decoder.h b/sherpa/csrc/offline-ctc-decoder.h new file mode 100644 index 000000000..68acfbb5e --- /dev/null +++ b/sherpa/csrc/offline-ctc-decoder.h @@ -0,0 +1,42 @@ +// sherpa/csrc/offline-ctc-decoder.h +// +// Copyright (c) 2022 Xiaomi Corporation +#ifndef SHERPA_CSRC_OFFLINE_CTC_DECODER_H_ +#define SHERPA_CSRC_OFFLINE_CTC_DECODER_H_ + +#include + +#include "sherpa/cpp_api/parse-options.h" +#include "torch/script.h" + +namespace sherpa { + +struct OfflineCtcDecoderResult { + /// The decoded token IDs + std::vector tokens; + + /// timestamps[i] contains the output frame index where tokens[i] is decoded. 
+ std::vector timestamps; +}; + +class OfflineCtcDecoder { + public: + virtual ~OfflineCtcDecoder() = default; + + /** Run CTC decoder given the output from the encoder model. + * + * @param log_prob A 3-D tensor of shape (N, T, vocab_size) + * @param log_prob_len A 1-D tensor of shape (N,) containing number + * of valid frames in encoder_out before padding. + * @param subsampling_factor Subsampling factor of the model. + * + * @return Return a vector of size `N` containing the decoded results. + */ + virtual std::vector Decode( + torch::Tensor log_prob, torch::Tensor log_prob_len, + int32_t subsampling_factor = 1) = 0; +}; + +} // namespace sherpa + +#endif // SHERPA_CSRC_OFFLINE_CTC_DECODER_H_ diff --git a/sherpa/csrc/offline-ctc-greedy-search-decoder.cc b/sherpa/csrc/offline-ctc-greedy-search-decoder.cc new file mode 100644 index 000000000..50e16acc5 --- /dev/null +++ b/sherpa/csrc/offline-ctc-greedy-search-decoder.cc @@ -0,0 +1,45 @@ +// sherpa/csrc/offline-ctc-greedy-search-decoder.cc +// +// Copyright (c) 2025 Xiaomi Corporation + +#include "sherpa/csrc/offline-ctc-greedy-search-decoder.h" + +#include + +#include "sherpa/cpp_api/macros.h" + +namespace sherpa { + +std::vector OfflineCtcGreedySearchDecoder::Decode( + torch::Tensor logits, torch::Tensor logits_len, + int32_t subsampling_factor /*= 1*/) { + InferenceMode no_grad; + + int32_t batch_size = logits.size(0); + + torch::Tensor indexes = logits.argmax(-1); + + logits_len = logits_len.to(torch::kInt).cpu(); + + auto p_len = logits_len.accessor(); + + std::vector results(batch_size); + + for (int32_t i = 0; i != batch_size; ++i) { + torch::Tensor this_indexes = indexes.index({i}).slice(0, 0, p_len[i]); + + this_indexes = std::get<0>(torch::unique_consecutive(this_indexes)); + + // assume that the blank id is 0 + torch::Tensor non_zero_indexes = this_indexes.nonzero().squeeze(); + torch::Tensor tokens = + this_indexes.index_select(0, non_zero_indexes).cpu().to(torch::kInt); + + results[i].tokens = {tokens.data_ptr(), + tokens.data_ptr() + tokens.numel()}; + } + + return results; +} + +} // namespace sherpa diff --git a/sherpa/csrc/offline-ctc-greedy-search-decoder.h b/sherpa/csrc/offline-ctc-greedy-search-decoder.h new file mode 100644 index 000000000..f6b0c90a6 --- /dev/null +++ b/sherpa/csrc/offline-ctc-greedy-search-decoder.h @@ -0,0 +1,22 @@ +// sherpa/csrc/offline-ctc-greedy-search-decoder.h +// +// Copyright (c) 2025 Xiaomi Corporation +#ifndef SHERPA_CSRC_OFFLINE_CTC_GREEDY_SEARCH_DECODER_H_ +#define SHERPA_CSRC_OFFLINE_CTC_GREEDY_SEARCH_DECODER_H_ + +#include + +#include "sherpa/csrc/offline-ctc-decoder.h" + +namespace sherpa { + +class OfflineCtcGreedySearchDecoder : public OfflineCtcDecoder { + public: + std::vector Decode( + torch::Tensor logits, torch::Tensor logits_len, + int32_t subsampling_factor = 1) override; +}; + +} // namespace sherpa + +#endif // SHERPA_CSRC_OFFLINE_CTC_GREEDY_SEARCH_DECODER_H_ diff --git a/sherpa/csrc/offline-ctc-model.h b/sherpa/csrc/offline-ctc-model.h new file mode 100644 index 000000000..61c4856be --- /dev/null +++ b/sherpa/csrc/offline-ctc-model.h @@ -0,0 +1,58 @@ +// sherpa/csrc/offline-ctc-model.h +// +// Copyright (c) 2022 Xiaomi Corporation +#ifndef SHERPA_CSRC_OFFLINE_CTC_MODEL_H_ +#define SHERPA_CSRC_OFFLINE_CTC_MODEL_H_ + +#include + +#include "torch/script.h" + +namespace sherpa { + +class OfflineCtcModel { + public: + virtual ~OfflineCtcModel() = default; + + // Subsampling factor of the model + virtual int32_t SubsamplingFactor() const = 0; + + // Number of modeling unit. 
Should be equal to + // GetLogSoftmaxOut().size(-1) + int32_t VocabSize() const { return vocab_size_; } + + // Return the underlying device where computation would happen + virtual torch::Device Device() const = 0; + + /** Run the model with a given input. + * + * @param features A 3-D tensor of shape (N, T, C). + * @param features_length A 1-D tensor of shape (N,). + */ + virtual torch::IValue Forward(torch::Tensor features, + torch::Tensor features_length) = 0; + + // Get the log softmax output of the network from the output of Forward + // method. + // The returned tensor has shape (N, T, C). + virtual torch::Tensor GetLogSoftmaxOut(torch::IValue forward_out) const = 0; + + // Get the output length before padding from the output of Forward method. + // The returned tensor has shape (N,) + virtual torch::Tensor GetLogSoftmaxOutLength( + torch::IValue forward_out) const = 0; + + // Send some fake data to the model for computation + virtual void WarmUp(torch::Tensor features, torch::Tensor features_length) { + auto ivalue = Forward(features, features_length); + auto log_prob = GetLogSoftmaxOut(ivalue); + vocab_size_ = log_prob.size(-1); + } + + protected: + int32_t vocab_size_ = -1; +}; + +} // namespace sherpa + +#endif // SHERPA_CSRC_OFFLINE_CTC_MODEL_H_ diff --git a/sherpa/csrc/offline-ctc-one-best-decoder.cc b/sherpa/csrc/offline-ctc-one-best-decoder.cc new file mode 100644 index 000000000..795ef9085 --- /dev/null +++ b/sherpa/csrc/offline-ctc-one-best-decoder.cc @@ -0,0 +1,87 @@ +// sherpa/csrc/offline-ctc-one-best-decoder.cc +// +// Copyright (c) 2022 Xiaomi Corporation + +#include "sherpa/csrc/offline-ctc-one-best-decoder.h" + +#include + +#include "sherpa/cpp_api/macros.h" +#include "sherpa/csrc/log.h" + +namespace sherpa { + +OfflineCtcOneBestDecoder::OfflineCtcOneBestDecoder( + const OfflineCtcDecoderConfig &config, torch::Device device, + int32_t vocab_size) + : config_(config), vocab_size_(vocab_size) { + if (config.hlg.empty()) { + // Use CTC topo since no HLG is provided + SHERPA_CHECK_GT(vocab_size, 1); + + decoding_graph_ = k2::GetCtcTopo(vocab_size - 1, config.modified, device); + } else { + decoding_graph_ = k2::LoadFsaClass(config.hlg, device); + + k2::ScaleTensorAttribute(decoding_graph_, config.lm_scale, "scores"); + } +} + +std::vector OfflineCtcOneBestDecoder::Decode( + torch::Tensor log_prob, torch::Tensor log_prob_len, + int32_t subsampling_factor /*= 1*/) { + if (vocab_size_ > 0) { + SHERPA_CHECK_EQ(log_prob.size(2), vocab_size_); + } + + InferenceMode no_grad; + + auto lattice = k2::GetLattice(log_prob, log_prob_len.cpu(), decoding_graph_, + config_.search_beam, config_.output_beam, + config_.min_active_states, + config_.max_active_states, subsampling_factor); + + lattice = k2::ShortestPath(lattice); + std::vector results(log_prob.size(0)); + + // Get tokens and timestamps from the lattice + auto labels = k2::GetTensorAttr(lattice, "labels").cpu().contiguous(); + auto acc = labels.accessor(); + + OfflineCtcDecoderResult *p = results.data(); + + bool last_token_is_blank = false; + + for (int32_t i = 0, t = 0; i != labels.numel(); ++i) { + int32_t token = acc[i]; + + if (token == -1) { + // end of this utterance. + t = 0; + ++p; + + continue; + } + + if (token == 0) { + ++t; + last_token_is_blank = true; + continue; + } + if (t != 0 && !p->tokens.empty() && token == p->tokens.back() && + (!last_token_is_blank)) { + // This is a repeat, skip it. 
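+      // (CTC collapse rule: consecutive identical non-blank labels belong to
+      // the same token unless they are separated by at least one blank frame.)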
+ ++t; + continue; + } + + p->tokens.push_back(token); + p->timestamps.push_back(t); + ++t; + last_token_is_blank = false; + } // for (int32_t i = 0, t = 0; i != labels.numel(); ++i) + + return results; +} + +} // namespace sherpa diff --git a/sherpa/csrc/offline-ctc-one-best-decoder.h b/sherpa/csrc/offline-ctc-one-best-decoder.h new file mode 100644 index 000000000..d3a35672e --- /dev/null +++ b/sherpa/csrc/offline-ctc-one-best-decoder.h @@ -0,0 +1,35 @@ +// sherpa/csrc/offline-ctc-one-best-decoder.h +// +// Copyright (c) 2022 Xiaomi Corporation +#ifndef SHERPA_CSRC_OFFLINE_CTC_ONE_BEST_DECODER_H_ +#define SHERPA_CSRC_OFFLINE_CTC_ONE_BEST_DECODER_H_ + +#include + +#include "k2/torch_api.h" +#include "sherpa/cpp_api/offline-recognizer.h" +#include "sherpa/csrc/offline-ctc-decoder.h" + +namespace sherpa { + +class OfflineCtcOneBestDecoder : public OfflineCtcDecoder { + public: + /** + * @param vocab_size Output dimension of the model. + */ + OfflineCtcOneBestDecoder(const OfflineCtcDecoderConfig &config, + torch::Device device, int32_t vocab_size); + + std::vector Decode( + torch::Tensor log_prob, torch::Tensor log_prob_len, + int32_t subsampling_factor = 1) override; + + private: + OfflineCtcDecoderConfig config_; + k2::FsaClassPtr decoding_graph_; + int32_t vocab_size_; +}; + +} // namespace sherpa + +#endif // SHERPA_CSRC_OFFLINE_CTC_ONE_BEST_DECODER_H_ diff --git a/sherpa/csrc/offline-model-config.cc b/sherpa/csrc/offline-model-config.cc new file mode 100644 index 000000000..588a605e6 --- /dev/null +++ b/sherpa/csrc/offline-model-config.cc @@ -0,0 +1,57 @@ +// sherpa/csrc/offline-model-config.cc +// +// Copyright (c) 2025 Xiaomi Corporation +#include "sherpa/csrc/offline-model-config.h" + +#include + +#include "sherpa/csrc/file-utils.h" +#include "sherpa/csrc/macros.h" + +namespace sherpa { + +void OfflineModelConfig::Register(ParseOptions *po) { + sense_voice.Register(po); + whisper.Register(po); + + // TODO(fangjun): enable it + // po->Register("tokens", &tokens, "Path to tokens.txt"); + + po->Register("debug", &debug, + "true to print model information while loading it."); + + // TODO(fangjun): Enable it + // po->Register("use-gpu", &use_gpu "true to CUDA. false to use CPU."); +} + +bool OfflineModelConfig::Validate() const { + if (!FileExists(tokens)) { + SHERPA_LOGE("tokens: '%s' does not exist", tokens.c_str()); + return false; + } + + if (!sense_voice.model.empty()) { + return sense_voice.Validate(); + } + + if (!whisper.model.empty()) { + return whisper.Validate(); + } + + return true; +} + +std::string OfflineModelConfig::ToString() const { + std::ostringstream os; + + os << "OfflineModelConfig("; + os << "sense_voice=" << sense_voice.ToString() << ", "; + os << "whisper=" << whisper.ToString() << ", "; + os << "tokens=\"" << tokens << "\", "; + os << "debug=" << (debug ? "True" : "False") << ", "; + os << "use_gpu=" << (debug ? 
"True" : "False") << ")"; + + return os.str(); +} + +} // namespace sherpa diff --git a/sherpa/csrc/offline-model-config.h b/sherpa/csrc/offline-model-config.h new file mode 100644 index 000000000..310649f17 --- /dev/null +++ b/sherpa/csrc/offline-model-config.h @@ -0,0 +1,41 @@ +// sherpa/csrc/offline-model-config.h +// +// Copyright (c) 2025 Xiaomi Corporation +#ifndef SHERPA_CSRC_OFFLINE_MODEL_CONFIG_H_ +#define SHERPA_CSRC_OFFLINE_MODEL_CONFIG_H_ + +#include + +#include "sherpa/cpp_api/parse-options.h" +#include "sherpa/csrc/offline-sense-voice-model-config.h" +#include "sherpa/csrc/offline-whisper-model-config.h" + +namespace sherpa { + +struct OfflineModelConfig { + OfflineSenseVoiceModelConfig sense_voice; + OfflineWhisperModelConfig whisper; + + std::string tokens; + bool debug = false; + bool use_gpu = false; + + OfflineModelConfig() = default; + OfflineModelConfig(const OfflineSenseVoiceModelConfig &sense_voice, + const OfflineWhisperModelConfig &whisper, + const std::string &tokens, bool debug, bool use_gpu) + : sense_voice(sense_voice), + whisper(whisper), + tokens(tokens), + debug(debug), + use_gpu(use_gpu) {} + + void Register(ParseOptions *po); + bool Validate() const; + + std::string ToString() const; +}; + +} // namespace sherpa + +#endif // SHERPA_CSRC_OFFLINE_MODEL_CONFIG_H_ diff --git a/sherpa/csrc/offline-nemo-enc-dec-ctc-model-bpe.cc b/sherpa/csrc/offline-nemo-enc-dec-ctc-model-bpe.cc new file mode 100644 index 000000000..dee6ea221 --- /dev/null +++ b/sherpa/csrc/offline-nemo-enc-dec-ctc-model-bpe.cc @@ -0,0 +1,56 @@ +// sherpa/csrc/offline-nemo-enc-dec-ctc-model-bpe.cc +// +// Copyright (c) 2023 Xiaomi Corporation + +#include "sherpa/csrc/offline-nemo-enc-dec-ctc-model-bpe.h" + +#include "sherpa/cpp_api/macros.h" + +namespace sherpa { + +OfflineNeMoEncDecCTCModelBPE::OfflineNeMoEncDecCTCModelBPE( + const std::string &filename, torch::Device device /*= torch::kCPU*/) + : device_(device) { + model_ = torch::jit::load(filename, device); + model_.eval(); +} + +torch::IValue OfflineNeMoEncDecCTCModelBPE::Forward( + torch::Tensor features, torch::Tensor features_length) { + InferenceMode no_grad; + + // Change (N, T, C) to (N, C, T) + features = features.permute({0, 2, 1}); + + return model_.run_method("forward", features.to(device_), + features_length.to(device_)); +} + +torch::Tensor OfflineNeMoEncDecCTCModelBPE::GetLogSoftmaxOut( + torch::IValue forward_out) const { + auto logit = forward_out.toTensor(); + return logit.roll(1 /*shift right with 1 column*/, 2 /*dim*/); +} + +torch::Tensor OfflineNeMoEncDecCTCModelBPE::GetLogSoftmaxOutLength( + torch::IValue forward_out) const { + // We return an undefined tensor and the caller should use + // the features_length and subsampling_factor_ to figure out + // the actual length + return {}; +} + +void OfflineNeMoEncDecCTCModelBPE::WarmUp(torch::Tensor features, + torch::Tensor features_length) { + // For Citrinet, the subsampling_factor_ is 8 + // For Conformer CTC, the subsampling_factor_ is 4. 
+ auto ivalue = Forward(features, features_length); + auto log_prob = GetLogSoftmaxOut(ivalue); + + vocab_size_ = log_prob.size(-1); + subsampling_factor_ = + (features_length.cpu().to(torch::kInt).item() + 7) / + log_prob.size(1); +} + +} // namespace sherpa diff --git a/sherpa/csrc/offline-nemo-enc-dec-ctc-model-bpe.h b/sherpa/csrc/offline-nemo-enc-dec-ctc-model-bpe.h new file mode 100644 index 000000000..c870c498a --- /dev/null +++ b/sherpa/csrc/offline-nemo-enc-dec-ctc-model-bpe.h @@ -0,0 +1,68 @@ +// sherpa/csrc/offline-nemo-enc-dec-ctc-model-bpe.h +// +// Copyright (c) 2023 Xiaomi Corporation +#ifndef SHERPA_CSRC_OFFLINE_NEMO_ENC_DEC_CTC_MODEL_BPE_H_ +#define SHERPA_CSRC_OFFLINE_NEMO_ENC_DEC_CTC_MODEL_BPE_H_ + +#include +#include + +#include "sherpa/csrc/offline-ctc-model.h" +namespace sherpa { + +/** This class models the EncDecCTCModelBPE model from NeMo. + * + * See + * https://github.com/NVIDIA/NeMo/blob/main/nemo/collections/asr/models/ctc_bpe_models.py + * https://github.com/NVIDIA/NeMo/blob/main/nemo/collections/asr/models/ctc_models.py + */ +class OfflineNeMoEncDecCTCModelBPE : public OfflineCtcModel { + public: + /** + * @param filename Path name of the torch script model. + * @param device The model will be moved to this device + */ + explicit OfflineNeMoEncDecCTCModelBPE(const std::string &filename, + torch::Device device = torch::kCPU); + + torch::Device Device() const override { return device_; } + + int32_t SubsamplingFactor() const override { return subsampling_factor_; } + + /** Run the encoder of the model. + * + * See + * https://github.com/NVIDIA/NeMo/blob/main/nemo/collections/asr/models/ctc_models.py#L196 + * for its documentation in Python. + * + * @param features A 3-D tensor of shape (N, T, C). + * Caution: We permute it to (N, C, T) inside. + * @param features_length A 3-D tensor of shape (N,) + * @return Return a 3-D tensor of shape (N, T, C). It represents + * the log_prob. + */ + torch::IValue Forward(torch::Tensor features, + torch::Tensor features_length) override; + + /** Note: In NeMo, the last column of forward_out represent blank. + * We move it to the first column in this function. + */ + torch::Tensor GetLogSoftmaxOut(torch::IValue forward_out) const override; + + torch::Tensor GetLogSoftmaxOutLength( + torch::IValue forward_out) const override; + + // we need to set the subsampling_factor_ inside it + void WarmUp(torch::Tensor features, torch::Tensor features_length) override; + + private: + torch::Device device_; + torch::jit::Module model_; + int32_t subsampling_factor_ = 0; +}; + +using OfflineNeMoEncDecCTCModel = OfflineNeMoEncDecCTCModelBPE; + +} // namespace sherpa + +#endif // SHERPA_CSRC_OFFLINE_NEMO_ENC_DEC_CTC_MODEL_BPE_H_ diff --git a/sherpa/csrc/offline-sense-voice-model-config.cc b/sherpa/csrc/offline-sense-voice-model-config.cc new file mode 100644 index 000000000..efe6440d4 --- /dev/null +++ b/sherpa/csrc/offline-sense-voice-model-config.cc @@ -0,0 +1,54 @@ +// sherpa/csrc/offline-sense-voice-model-config.cc +// +// Copyright (c) 2025 Xiaomi Corporation + +#include "sherpa/csrc/offline-sense-voice-model-config.h" + +#include "sherpa/csrc/file-utils.h" +#include "sherpa/csrc/macros.h" + +namespace sherpa { + +void OfflineSenseVoiceModelConfig::Register(ParseOptions *po) { + po->Register("sense-voice-model", &model, "Path to model.pt of SenseVoice."); + po->Register( + "sense-voice-language", &language, + "Valid values: auto, zh, en, ja, ko, yue. 
If left empty, auto is used"); + po->Register( + "sense-voice-use-itn", &use_itn, + "True to enable inverse text normalization. False to disable it."); +} + +bool OfflineSenseVoiceModelConfig::Validate() const { + if (!FileExists(model)) { + SHERPA_LOGE("SenseVoice model '%s' does not exist", model.c_str()); + return false; + } + + if (!language.empty()) { + if (language != "auto" && language != "zh" && language != "en" && + language != "ja" && language != "ko" && language != "yue") { + SHERPA_LOGE( + "Invalid sense-voice-language: '%s'. Valid values are: auto, zh, en, " + "ja, ko, yue. Or you can leave it empty to use 'auto'", + language.c_str()); + + return false; + } + } + + return true; +} + +std::string OfflineSenseVoiceModelConfig::ToString() const { + std::ostringstream os; + + os << "OfflineSenseVoiceModelConfig("; + os << "model=\"" << model << "\", "; + os << "language=\"" << language << "\", "; + os << "use_itn=" << (use_itn ? "True" : "False") << ")"; + + return os.str(); +} + +} // namespace sherpa diff --git a/sherpa/csrc/offline-sense-voice-model-config.h b/sherpa/csrc/offline-sense-voice-model-config.h new file mode 100644 index 000000000..9205b9c13 --- /dev/null +++ b/sherpa/csrc/offline-sense-voice-model-config.h @@ -0,0 +1,39 @@ +// sherpa/csrc/offline-sense-voice-model-config.h +// +// Copyright (c) 2025 Xiaomi Corporation +#ifndef SHERPA_CSRC_OFFLINE_SENSE_VOICE_MODEL_CONFIG_H_ +#define SHERPA_CSRC_OFFLINE_SENSE_VOICE_MODEL_CONFIG_H_ + +#include + +#include "sherpa/cpp_api/parse-options.h" + +namespace sherpa { + +struct OfflineSenseVoiceModelConfig { + std::string model; + + // "" or "auto" to let the model recognize the language + // valid values: + // zh, en, ja, ko, yue, auto + std::string language = "auto"; + + // true to use inverse text normalization + // false to not use inverse text normalization + bool use_itn = false; + + OfflineSenseVoiceModelConfig() = default; + explicit OfflineSenseVoiceModelConfig(const std::string &model, + const std::string &language, + bool use_itn) + : model(model), language(language), use_itn(use_itn) {} + + void Register(ParseOptions *po); + bool Validate() const; + + std::string ToString() const; +}; + +} // namespace sherpa + +#endif // SHERPA_CSRC_OFFLINE_SENSE_VOICE_MODEL_CONFIG_H_ diff --git a/sherpa/csrc/offline-sense-voice-model-meta-data.cc b/sherpa/csrc/offline-sense-voice-model-meta-data.cc new file mode 100644 index 000000000..2f1ac1070 --- /dev/null +++ b/sherpa/csrc/offline-sense-voice-model-meta-data.cc @@ -0,0 +1,41 @@ +// sherpa/csrc/offline-sense-voice-model-meta-data.cc +// +// Copyright (c) 2025 Xiaomi Corporation +#include "sherpa/csrc/offline-sense-voice-model-meta-data.h" + +#include +namespace sherpa { + +std::string OfflineSenseVoiceModelMetaData::ToString() const { + std::ostringstream os; + os << "----SenseVoice metadata----\n"; + os << " with_itn_id: " << with_itn_id << "\n"; + os << " without_itn_id: " << without_itn_id << "\n"; + os << " window_size: " << window_size << "\n"; + os << " window_shift: " << window_shift << "\n"; + os << " vocab_size: " << vocab_size << "\n"; + os << " subsampling_factor: " << subsampling_factor << "\n"; + os << " normalize_samples: " << normalize_samples << "\n"; + os << " blank_id: " << blank_id << "\n"; + for (const auto &p : lang2id) { + os << " " << p.first << ": " << p.second << "\n"; + } + os << " neg_mean (" << neg_mean.size(1) << "): "; + + auto p = neg_mean.data_ptr(); + for (int32_t i = 0; i < 10; ++i) { + os << p[i] << ", "; + } + os << "\n"; + + os << " 
inv_stddev (" << inv_stddev.size(1) << "): "; + p = inv_stddev.data_ptr(); + for (int32_t i = 0; i < 10; ++i) { + os << p[i] << ", "; + } + os << "\n"; + + return os.str(); +} + +} // namespace sherpa diff --git a/sherpa/csrc/offline-sense-voice-model-meta-data.h b/sherpa/csrc/offline-sense-voice-model-meta-data.h new file mode 100644 index 000000000..ba796ae75 --- /dev/null +++ b/sherpa/csrc/offline-sense-voice-model-meta-data.h @@ -0,0 +1,53 @@ +// sherpa/csrc/offline-sense-voice-model-meta-data.h +// +// Copyright (c) 2025 Xiaomi Corporation +#ifndef SHERPA_CSRC_OFFLINE_SENSE_VOICE_MODEL_META_DATA_H_ +#define SHERPA_CSRC_OFFLINE_SENSE_VOICE_MODEL_META_DATA_H_ + +#include +#include +#include + +#include "torch/script.h" + +namespace sherpa { + +struct OfflineSenseVoiceModelMetaData { + // ID for using inverse text normalization + int32_t with_itn_id; + + // ID for not using inverse text normalization + int32_t without_itn_id; + + int32_t window_size; // lfr_m + int32_t window_shift; // lfr_n + int32_t vocab_size; + + int32_t subsampling_factor = 1; + + // Usually 0 for SenseVoice models. + // 0 means samples are scaled to [-32768, 32767] before they are sent to the + // feature extractor + int32_t normalize_samples = 0; + + int32_t blank_id = 0; + + // possible values: + // zh, en, ja, ko, yue, auto + // where + // zh is Chinese (Mandarin) + // en is English + // ja is Japanese + // ko is Korean + // yue is Cantonese + // auto is to let the model recognize the language + std::unordered_map lang2id; + + torch::Tensor neg_mean; // 2-d float32, (1, feat_dim) + torch::Tensor inv_stddev; // 2-d float32, (1, feat_dim) + + std::string ToString() const; +}; +} // namespace sherpa + +#endif // SHERPA_CSRC_OFFLINE_SENSE_VOICE_MODEL_META_DATA_H_ diff --git a/sherpa/csrc/offline-sense-voice-model.cc b/sherpa/csrc/offline-sense-voice-model.cc new file mode 100644 index 000000000..07f681e98 --- /dev/null +++ b/sherpa/csrc/offline-sense-voice-model.cc @@ -0,0 +1,137 @@ +// sherpa/csrc/offline-sense-voice-model.cc +// +// Copyright (c) 2025 Xiaomi Corporation +#include "sherpa/csrc/offline-sense-voice-model.h" + +#include +#include +#include + +#include "sherpa/cpp_api/macros.h" +#include "sherpa/csrc/macros.h" + +namespace sherpa { + +static std::vector ToFloat(const std::string &s) { + const float *p = reinterpret_cast(s.data()); + int32_t n = s.size() / 4; + + // assume little endian + return {p, p + n}; +} + +class OfflineSenseVoiceModel::Impl { + public: + explicit Impl(const OfflineModelConfig &config) { + torch::jit::ExtraFilesMap meta_data{ + {"model_type", {}}, {"lfr_window_size", {}}, + {"lfr_window_shift", {}}, {"neg_mean", {}}, + {"inv_stddev", {}}, {"vocab_size", {}}, + {"normalize_samples", {}}, {"version", {}}, + {"model_author", {}}, {"maintainer", {}}, + {"lang_auto", {}}, {"lang_zh", {}}, + {"lang_en", {}}, {"lang_yue", {}}, + {"lang_ja", {}}, {"lang_ko", {}}, + {"lang_nospeech", {}}, {"with_itn", {}}, + {"without_itn", {}}, {"url", {}}, + }; + if (config.use_gpu) { + device_ = torch::Device{torch::kCUDA}; + } + + model_ = torch::jit::load(config.sense_voice.model, device_, meta_data); + model_.eval(); + + if (meta_data.at("model_type") != "SenseVoiceSmall") { + SHERPA_LOGE("Expect a SenseVoiceSmall model. 
Given: '%s'", + meta_data.at("model_type").c_str()); + SHERPA_EXIT(-1); + } + + InitMetaData(meta_data); + + if (config.debug) { + SHERPA_LOGE("%s", meta_data_.ToString().c_str()); + } + } + + const OfflineSenseVoiceModelMetaData &GetModelMetadata() const { + return meta_data_; + } + + torch::Device Device() const { return device_; } + + std::pair RunForward( + const torch::Tensor &features, const torch::Tensor &features_length, + const torch::Tensor &language, const torch::Tensor &use_itn) { + InferenceMode no_grad; + + auto outputs = + model_ + .run_method("forward", features, features_length, language, use_itn) + .toTuple(); + + auto logits = outputs->elements()[0].toTensor(); + auto logits_length = outputs->elements()[1].toTensor(); + + return {logits, logits_length}; + } + + private: + void InitMetaData(const torch::jit::ExtraFilesMap &meta_data) { + meta_data_.with_itn_id = atoi(meta_data.at("with_itn").c_str()); + meta_data_.without_itn_id = atoi(meta_data.at("without_itn").c_str()); + meta_data_.window_size = atoi(meta_data.at("lfr_window_size").c_str()); + meta_data_.window_shift = atoi(meta_data.at("lfr_window_shift").c_str()); + meta_data_.vocab_size = atoi(meta_data.at("vocab_size").c_str()); + meta_data_.normalize_samples = + atoi(meta_data.at("normalize_samples").c_str()); + + meta_data_.lang2id["auto"] = atoi(meta_data.at("lang_auto").c_str()); + meta_data_.lang2id["zh"] = atoi(meta_data.at("lang_zh").c_str()); + meta_data_.lang2id["en"] = atoi(meta_data.at("lang_en").c_str()); + meta_data_.lang2id["yue"] = atoi(meta_data.at("lang_yue").c_str()); + meta_data_.lang2id["ko"] = atoi(meta_data.at("lang_ko").c_str()); + meta_data_.lang2id["ja"] = atoi(meta_data.at("lang_ja").c_str()); + + auto neg_mean = ToFloat(meta_data.at("neg_mean")); + auto inv_stddev = ToFloat(meta_data.at("inv_stddev")); + + meta_data_.neg_mean = + torch::from_blob(neg_mean.data(), + {1, static_cast(neg_mean.size())}, + torch::kFloat32) + .clone(); + + meta_data_.inv_stddev = + torch::from_blob(inv_stddev.data(), + {1, static_cast(inv_stddev.size())}, + torch::kFloat32) + .clone(); + } + + private: + torch::jit::Module model_; + OfflineSenseVoiceModelMetaData meta_data_; + torch::Device device_{torch::kCPU}; +}; + +OfflineSenseVoiceModel::OfflineSenseVoiceModel(const OfflineModelConfig &config) + : impl_(std::make_unique(config)) {} + +OfflineSenseVoiceModel::~OfflineSenseVoiceModel() = default; + +const OfflineSenseVoiceModelMetaData &OfflineSenseVoiceModel::GetModelMetadata() + const { + return impl_->GetModelMetadata(); +} + +torch::Device OfflineSenseVoiceModel::Device() const { return impl_->Device(); } + +std::pair OfflineSenseVoiceModel::RunForward( + const torch::Tensor &features, const torch::Tensor &features_length, + const torch::Tensor &language, const torch::Tensor &use_itn) { + return impl_->RunForward(features, features_length, language, use_itn); +} + +} // namespace sherpa diff --git a/sherpa/csrc/offline-sense-voice-model.h b/sherpa/csrc/offline-sense-voice-model.h new file mode 100644 index 000000000..0bdd66362 --- /dev/null +++ b/sherpa/csrc/offline-sense-voice-model.h @@ -0,0 +1,38 @@ +// sherpa/csrc/offline-sense-voice-model.h +// +// Copyright (c) 2025 Xiaomi Corporation +#ifndef SHERPA_CSRC_OFFLINE_SENSE_VOICE_MODEL_H_ +#define SHERPA_CSRC_OFFLINE_SENSE_VOICE_MODEL_H_ + +#include +#include +#include + +#include "sherpa/csrc/offline-model-config.h" +#include "sherpa/csrc/offline-sense-voice-model-meta-data.h" +#include "torch/script.h" + +namespace sherpa { + +class 
OfflineSenseVoiceModel { + public: + explicit OfflineSenseVoiceModel(const OfflineModelConfig &config); + + ~OfflineSenseVoiceModel(); + + const OfflineSenseVoiceModelMetaData &GetModelMetadata() const; + + torch::Device Device() const; + + std::pair RunForward( + const torch::Tensor &features, const torch::Tensor &features_length, + const torch::Tensor &language, const torch::Tensor &use_itn); + + private: + class Impl; + std::unique_ptr impl_; +}; + +} // namespace sherpa + +#endif // SHERPA_CSRC_OFFLINE_SENSE_VOICE_MODEL_H_ diff --git a/sherpa/csrc/offline-stream.cc b/sherpa/csrc/offline-stream.cc new file mode 100644 index 000000000..b62b7bca9 --- /dev/null +++ b/sherpa/csrc/offline-stream.cc @@ -0,0 +1,218 @@ +// sherpa/cpp_api/offline-stream.cc +// +// Copyright (c) 2022 Xiaomi Corporation +#include "sherpa/cpp_api/offline-stream.h" + +#include +#include +#include + +#include "sherpa/cpp_api/feature-config.h" +#include "sherpa/csrc/fbank-features.h" +#include "sherpa/csrc/log.h" + +namespace sherpa { + +std::string OfflineRecognitionResult::AsJsonString() const { + std::ostringstream os; + os << "{"; + + os << "\"text\"" + << ": "; + os << std::quoted(text) << ", "; + + std::string sep = ""; + for (auto t : timestamps) { + os << sep << std::fixed << std::setprecision(2) << t; + sep = ", "; + } + os << "], "; + + os << "\"" + << "tokens" + << "\"" + << ":"; + os << "["; + + sep = ""; + auto oldFlags = os.flags(); + for (const auto &t : tokens) { + if (t.size() == 1 && static_cast(t[0]) > 0x7f) { + const uint8_t *p = reinterpret_cast(t.c_str()); + os << sep << "\"" + << "<0x" << std::hex << std::uppercase << static_cast(p[0]) + << ">" + << "\""; + os.flags(oldFlags); + } else { + os << sep << std::quoted(t); + } + sep = ", "; + } + os << "]"; + + os << "}"; + + return os.str(); +} + +class OfflineStream::OfflineStreamImpl { + public: + OfflineStreamImpl(kaldifeat::Fbank *fbank, const FeatureConfig &feat_config, + ContextGraphPtr context_graph) + : fbank_(fbank), + feat_config_(feat_config), + context_graph_(context_graph) { + if (!feat_config_.nemo_normalize.empty()) { + SHERPA_CHECK_EQ(feat_config_.nemo_normalize, "per_feature") + << "Only per_feature is implemented at present"; + } + } + + OfflineStreamImpl(kaldifeat::WhisperFbank *whisper, + const FeatureConfig &feat_config, + ContextGraphPtr context_graph) + : whisper_(whisper), + feat_config_(feat_config), + context_graph_(context_graph) { + if (!feat_config_.nemo_normalize.empty()) { + SHERPA_CHECK_EQ(feat_config_.nemo_normalize, "per_feature") + << "Only per_feature is implemented at present"; + } + } + + void AcceptWaveFile(const std::string &wave_file) { + torch::Tensor samples; + if (fbank_) { + samples = ReadWave(wave_file, fbank_->GetFrameOptions().samp_freq).first; + } else { + samples = + ReadWave(wave_file, whisper_->GetFrameOptions().samp_freq).first; + } + + if (!feat_config_.normalize_samples) { + samples.mul_(32767); + } + + if (feat_config_.return_waveform) { + // We return audio samples directly, e.g., for Wav2Vec2.0 + features_ = samples; + } else { + if (fbank_) { + features_ = ComputeFeatures(*fbank_, {samples})[0]; + } else { + features_ = ComputeFeatures(*whisper_, {samples})[0]; + } + features_ = Normalize(features_); + } + } + + void AcceptSamples(const float *samples, int32_t n) { + torch::Tensor tensor = + torch::from_blob(const_cast(samples), {n}, torch::kFloat); + + if (!feat_config_.normalize_samples) { + tensor.mul_(32767); + } + + if (feat_config_.return_waveform) { + // We return audio samples 
directly, e.g., for Wav2Vec2.0 + features_ = tensor.clone(); + } else { + if (fbank_) { + features_ = ComputeFeatures(*fbank_, {tensor})[0]; + } else { + features_ = ComputeFeatures(*whisper_, {tensor})[0]; + } + features_ = Normalize(features_); + } + } + + void AcceptFeatures(const float *features, int32_t num_frames, + int32_t num_channels) { + features_ = torch::from_blob(const_cast(features), + {num_frames, num_channels}, torch::kFloat) + .clone(); + } + + const torch::Tensor &GetFeatures() const { return features_; } + + void SetResult(const OfflineRecognitionResult &r) { result_ = r; } + + const OfflineRecognitionResult &GetResult() const { return result_; } + + const ContextGraphPtr &GetContextGraph() const { return context_graph_; } + + private: + torch::Tensor Normalize(torch::Tensor features) const { + if (feat_config_.nemo_normalize.empty()) { + return features; + } + + if (feat_config_.nemo_normalize == "per_feature") { + torch::Tensor mean = features.mean(0 /*dim*/, true /*keepdim*/); + torch::Tensor std = features.std(0 /*dim*/, true /*keepdim*/); + + return (features - mean) / (std + 1e-5f); + } + + SHERPA_LOG(FATAL) << "Unsupported nemo_normalize: " + << feat_config_.nemo_normalize; + return {}; // unreachable code; to make the compiler happy + } + + private: + torch::Tensor features_; + OfflineRecognitionResult result_; + kaldifeat::Fbank *fbank_ = nullptr; // not owned + kaldifeat::WhisperFbank *whisper_ = nullptr; // not owned + FeatureConfig feat_config_; + ContextGraphPtr context_graph_; +}; + +OfflineStream::~OfflineStream() = default; + +OfflineStream::OfflineStream(kaldifeat::Fbank *fbank, + const FeatureConfig &feat_config, + ContextGraphPtr context_graph /* nullptr */) + : impl_(std::make_unique(fbank, feat_config, + context_graph)) {} + +OfflineStream::OfflineStream(kaldifeat::WhisperFbank *whisper, + const FeatureConfig &feat_config, + ContextGraphPtr context_graph /* nullptr */) + : impl_(std::make_unique(whisper, feat_config, + context_graph)) {} + +void OfflineStream::AcceptWaveFile(const std::string &filename) { + impl_->AcceptWaveFile(filename); +} + +void OfflineStream::AcceptSamples(const float *samples, int32_t n) { + impl_->AcceptSamples(samples, n); +} + +void OfflineStream::AcceptFeatures(const float *features, int32_t num_frames, + int32_t num_channels) { + impl_->AcceptFeatures(features, num_frames, num_channels); +} + +const torch::Tensor &OfflineStream::GetFeatures() const { + return impl_->GetFeatures(); +} + +const ContextGraphPtr &OfflineStream::GetContextGraph() const { + return impl_->GetContextGraph(); +} + +/** Set the recognition result for this stream. 
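+ *  (Typically called by the recognizer once decoding of this stream has
+ *  finished.)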
*/ +void OfflineStream::SetResult(const OfflineRecognitionResult &r) { + impl_->SetResult(r); +} + +/** Get the recognition result of this stream */ +const OfflineRecognitionResult &OfflineStream::GetResult() const { + return impl_->GetResult(); +} + +} // namespace sherpa diff --git a/sherpa/csrc/offline-transducer-decoder.h b/sherpa/csrc/offline-transducer-decoder.h new file mode 100644 index 000000000..c4dabe5a1 --- /dev/null +++ b/sherpa/csrc/offline-transducer-decoder.h @@ -0,0 +1,43 @@ +// sherpa/csrc/offline-transducer-decoder.h +// +// Copyright (c) 2022 Xiaomi Corporation +#ifndef SHERPA_CSRC_OFFLINE_TRANSDUCER_DECODER_H_ +#define SHERPA_CSRC_OFFLINE_TRANSDUCER_DECODER_H_ + +#include + +#include "sherpa/cpp_api/offline-stream.h" +#include "torch/script.h" + +namespace sherpa { + +struct OfflineTransducerDecoderResult { + /// The decoded token IDs + std::vector tokens; + + /// timestamps[i] contains the output frame index where tokens[i] is decoded. + std::vector timestamps; +}; + +class OfflineTransducerDecoder { + public: + virtual ~OfflineTransducerDecoder() = default; + + /** Run transducer beam search given the output from the encoder model. + * + * @param encoder_out A 3-D tensor of shape (N, T, joiner_dim) + * @param encoder_out_length A 1-D tensor of shape (N,) containing number + * of valid frames in encoder_out before padding. + * @param ss Pointer to an array of streams. + * @param n Size of the input array. + * + * @return Return a vector of size `N` containing the decoded results. + */ + virtual std::vector Decode( + torch::Tensor encoder_out, torch::Tensor encoder_out_length, + OfflineStream **ss = nullptr, int32_t n = 0) = 0; +}; + +} // namespace sherpa + +#endif // SHERPA_CSRC_OFFLINE_TRANSDUCER_DECODER_H_ diff --git a/sherpa/csrc/offline-transducer-fast-beam-search-decoder.cc b/sherpa/csrc/offline-transducer-fast-beam-search-decoder.cc new file mode 100644 index 000000000..15a1f93b0 --- /dev/null +++ b/sherpa/csrc/offline-transducer-fast-beam-search-decoder.cc @@ -0,0 +1,122 @@ +// sherpa/csrc/offline-transducer-fast-beam-search-decoder.cc +// +// Copyright (c) 2022 Xiaomi Corporation +#include "sherpa/csrc/offline-transducer-fast-beam-search-decoder.h" + +#include + +#include "k2/torch_api.h" + +namespace sherpa { + +OfflineTransducerFastBeamSearchDecoder::OfflineTransducerFastBeamSearchDecoder( + OfflineTransducerModel *model, const FastBeamSearchConfig &config) + : model_(model), config_(config), vocab_size_(model->VocabSize()) { + if (config.lg.empty()) { + // Use a trivial graph + decoding_graph_ = k2::GetTrivialGraph(vocab_size_ - 1, model_->Device()); + } else { + decoding_graph_ = k2::LoadFsaClass(config.lg, model_->Device()); + k2::ScaleTensorAttribute(decoding_graph_, config.ngram_lm_scale, "scores"); + } +} + +std::vector +OfflineTransducerFastBeamSearchDecoder::Decode(torch::Tensor encoder_out, + torch::Tensor encoder_out_length, + OfflineStream **ss /*= nullptr*/, + int32_t n /*= 0*/) { + TORCH_CHECK(encoder_out.dim() == 3, encoder_out.dim(), " vs ", 3); + + auto device = model_->Device(); + int32_t context_size = model_->ContextSize(); + + int32_t batch_size = encoder_out.size(0); + int32_t num_frames = encoder_out.size(1); + + std::vector stream_vec; + stream_vec.reserve(batch_size); + for (int32_t i = 0; i != batch_size; ++i) { + stream_vec.push_back(k2::CreateRnntStream(decoding_graph_)); + } + + k2::RnntStreamsPtr streams = + k2::CreateRnntStreams(stream_vec, vocab_size_, context_size, config_.beam, + config_.max_contexts, config_.max_states); + 
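+  // The loop below performs frame-synchronous RNN-T decoding: for each output
+  // frame t it gathers the decoder contexts of all active hypotheses, runs the
+  // decoder and joiner in one batched call, and lets k2 advance the streams
+  // against the decoding graph using the resulting log-probs.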
+ k2::RaggedShapePtr shape; + torch::Tensor contexts; + + for (int32_t t = 0; t != num_frames; ++t) { + std::tie(shape, contexts) = k2::GetRnntContexts(streams); + contexts = contexts.to(torch::kLong); + // contexts.shape: (num_hyps, context_size) + + auto decoder_out = model_->RunDecoder(contexts).unsqueeze(1); + // decoder_out.shape: (num_hyps, 1, 1, joiner_dim) + + auto cur_encoder_out = encoder_out.index({torch::indexing::Slice(), t}); + // cur_encoder_out has shape (N, joiner_dim) + + auto index = k2::RowIds(shape, 1).to(torch::kLong).to(device); + cur_encoder_out = cur_encoder_out.index_select(/*dim*/ 0, /*index*/ index); + // cur_encoder_out has shape (num_hyps, joiner_dim) + + cur_encoder_out = cur_encoder_out.unsqueeze(1).unsqueeze(1); + // cur_encoder_out.shape (num_hyps, 1, 1, joiner_dim) + + auto logits = model_->RunJoiner(cur_encoder_out, decoder_out); + // logits.shape: (num_hyps, 1, 1, vocab_size) + + logits = logits.squeeze(1).squeeze(1); + // logits.shape: (num_hyps, vocab_size) + + auto log_probs = logits.log_softmax(-1); + k2::AdvanceRnntStreams(streams, log_probs); + } + + k2::TerminateAndFlushRnntStreams(streams); + + encoder_out_length = encoder_out_length.cpu().to(torch::kInt); + std::vector processed_frames_vec( + encoder_out_length.data_ptr(), + encoder_out_length.data_ptr() + encoder_out_length.numel()); + + auto lattice = + k2::FormatOutput(streams, processed_frames_vec, config_.allow_partial); + + lattice = k2::ShortestPath(lattice); + + std::vector results(batch_size); + + // Get tokens and timestamps from the lattice + auto labels = k2::GetTensorAttr(lattice, "labels").cpu().contiguous(); + auto acc = labels.accessor(); + + OfflineTransducerDecoderResult *p = results.data(); + + for (int32_t i = 0, t = 0; i != labels.numel(); ++i) { + int32_t token = acc[i]; + + if (token == -1) { + // end of this utterance. + t = 0; + ++p; + + continue; + } + + if (token == 0) { + ++t; + continue; + } + + p->tokens.push_back(token); + p->timestamps.push_back(t); + ++t; + } // for (int32_t i = 0, t = 0; i != labels.numel(); ++i) + + return results; +} + +} // namespace sherpa diff --git a/sherpa/csrc/offline-transducer-fast-beam-search-decoder.h b/sherpa/csrc/offline-transducer-fast-beam-search-decoder.h new file mode 100644 index 000000000..fae08b592 --- /dev/null +++ b/sherpa/csrc/offline-transducer-fast-beam-search-decoder.h @@ -0,0 +1,45 @@ +// sherpa/csrc/offline-transducer-fast-beam-search-decoder.h +// +// Copyright (c) 2022 Xiaomi Corporation +#ifndef SHERPA_CSRC_OFFLINE_TRANSDUCER_FAST_BEAM_SEARCH_DECODER_H_ +#define SHERPA_CSRC_OFFLINE_TRANSDUCER_FAST_BEAM_SEARCH_DECODER_H_ +#include + +#include "k2/torch_api.h" +#include "sherpa/cpp_api/fast-beam-search-config.h" +#include "sherpa/cpp_api/offline-stream.h" +#include "sherpa/csrc/offline-transducer-decoder.h" +#include "sherpa/csrc/offline-transducer-model.h" + +namespace sherpa { + +class OfflineTransducerFastBeamSearchDecoder : public OfflineTransducerDecoder { + public: + OfflineTransducerFastBeamSearchDecoder(OfflineTransducerModel *model, + const FastBeamSearchConfig &config); + + /** Run fast_beam_search given the output from the encoder model. + * + * @param encoder_out A 3-D tensor of shape (N, T, joiner_dim) + * @param encoder_out_length A 1-D tensor of shape (N,) containing number + * of valid frames in encoder_out before padding. + * @param ss Pointer to an array of streams. + * @param n Size of the input array. + * + * @return Return a vector of size `N` containing the decoded results. 
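+   *  If config.lg is empty, a trivial decoding graph over the vocabulary is
+   *  used; otherwise the LG graph is loaded from config.lg and its scores are
+   *  scaled by config.ngram_lm_scale (see the constructor).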
+ */ + std::vector Decode( + torch::Tensor encoder_out, torch::Tensor encoder_out_length, + OfflineStream **ss = nullptr, int32_t n = 0) override; + + private: + OfflineTransducerModel *model_; // Not owned + k2::FsaClassPtr decoding_graph_; + + FastBeamSearchConfig config_; + int32_t vocab_size_; +}; + +} // namespace sherpa + +#endif // SHERPA_CSRC_OFFLINE_TRANSDUCER_FAST_BEAM_SEARCH_DECODER_H_ diff --git a/sherpa/csrc/offline-transducer-greedy-search-decoder.cc b/sherpa/csrc/offline-transducer-greedy-search-decoder.cc new file mode 100644 index 000000000..d3fb8dec0 --- /dev/null +++ b/sherpa/csrc/offline-transducer-greedy-search-decoder.cc @@ -0,0 +1,149 @@ +// sherpa/csrc/offline-transducer-greedy-search-decoder.cc +// +// Copyright (c) 2022 Xiaomi Corporation + +#include "sherpa/csrc/offline-transducer-greedy-search-decoder.h" + +#include +#include +#include + +#include "sherpa/cpp_api/macros.h" +#include "torch/all.h" + +namespace sherpa { + +/** + * Construct the decoder input from the current hypothesis. + * + * @param hyps A list-of-list of token IDs containing the current decoding + * results. Its length is `batch_size` + * @param decoder_input A 2-D tensor of shape (batch_size, context_size). + */ +static void BuildDecoderInput( + const std::vector &r, + torch::Tensor *decoder_input) { + int32_t batch_size = decoder_input->size(0); + int32_t context_size = decoder_input->size(1); + int64_t *p = decoder_input->data_ptr(); + for (int32_t i = 0; i != batch_size; ++i) { + auto start = r[i].tokens.end() - context_size; + auto end = r[i].tokens.end(); + std::copy(start, end, p); + p += context_size; + } +} + +std::vector +OfflineTransducerGreedySearchDecoder::Decode(torch::Tensor encoder_out, + torch::Tensor encoder_out_length, + OfflineStream **ss /*= nullptr*/, + int32_t n /*= 0*/) { + InferenceMode no_grad; + + TORCH_CHECK(encoder_out.dim() == 3, "encoder_out.dim() is ", + encoder_out.dim(), "Expected value is 3"); + TORCH_CHECK(encoder_out.scalar_type() == torch::kFloat, + "encoder_out.scalar_type() is ", encoder_out.scalar_type()); + + TORCH_CHECK(encoder_out_length.dim() == 1, "encoder_out_length.dim() is", + encoder_out_length.dim()); + TORCH_CHECK(encoder_out_length.scalar_type() == torch::kLong, + "encoder_out_length.scalar_type() is ", + encoder_out_length.scalar_type()); + + TORCH_CHECK(encoder_out_length.device().is_cpu()); + + torch::Device device = model_->Device(); + + torch::nn::utils::rnn::PackedSequence packed_seq = + torch::nn::utils::rnn::pack_padded_sequence(encoder_out, + encoder_out_length, + /*batch_first*/ true, + /*enforce_sorted*/ false); + + int32_t blank_id = 0; // hard-code + int32_t context_size = model_->ContextSize(); + + int32_t N = encoder_out_length.size(0); + + std::vector results(N); + + std::vector padding(context_size, -1); + padding.back() = blank_id; + + for (auto &r : results) { + // We will remove the padding at the end + r.tokens = padding; + } + + auto decoder_input = + torch::full({N, context_size}, -1, + torch::dtype(torch::kLong) + .memory_format(torch::MemoryFormat::Contiguous)); + + // set the last column to blank_id, i.e., decoder_input[:, -1] = blank_id + decoder_input.index({torch::indexing::Slice(), -1}) = blank_id; + + // its shape is (N, 1, joiner_dim) + auto decoder_out = model_->RunDecoder(decoder_input.to(device)); + + using torch::indexing::Slice; + auto batch_sizes_accessor = packed_seq.batch_sizes().accessor(); + + int32_t max_T = packed_seq.batch_sizes().numel(); + + int32_t offset = 0; + for (int32_t t = 0; t != max_T; 
++t) { + int32_t cur_batch_size = batch_sizes_accessor[t]; + int32_t start = offset; + int32_t end = start + cur_batch_size; + auto cur_encoder_out = packed_seq.data().index({Slice(start, end)}); + offset = end; + + cur_encoder_out = cur_encoder_out.unsqueeze(1).unsqueeze(1); + // Now cur_encoder_out is of shape (cur_batch_size, 1, 1, joiner_dim) + if (cur_batch_size < decoder_out.size(0)) { + decoder_out = decoder_out.index({Slice(0, cur_batch_size)}); + } + + auto logits = model_->RunJoiner(cur_encoder_out, decoder_out.unsqueeze(1)); + // logits' shape is (cur_batch_size, 1, 1, vocab_size) + // logits is the output of nn.Linear. Since we are using greedy search + // and only the magnitude matters, we don't invoke log_softmax here + + logits = logits.squeeze(1).squeeze(1); + auto max_indices = logits.argmax(/*dim*/ -1).cpu(); + auto max_indices_accessor = max_indices.accessor(); + bool emitted = false; + for (int32_t k = 0; k != cur_batch_size; ++k) { + auto index = max_indices_accessor[k]; + if (index != blank_id) { + emitted = true; + results[k].tokens.push_back(index); + results[k].timestamps.push_back(t); + } + } + + if (emitted) { + BuildDecoderInput(results, &decoder_input); + decoder_out = model_->RunDecoder(decoder_input.to(device)); + } + } // for (int32_t t = 0; t != max_T; ++t) { + + auto unsorted_indices = packed_seq.unsorted_indices().cpu(); + auto unsorted_indices_accessor = unsorted_indices.accessor(); + + std::vector ans(N); + + for (int32_t i = 0; i != N; ++i) { + int32_t k = unsorted_indices_accessor[i]; + torch::ArrayRef arr(results[k].tokens); + ans[i].tokens = arr.slice(context_size).vec(); + ans[i].timestamps = std::move(results[k].timestamps); + } + + return ans; +} + +} // namespace sherpa diff --git a/sherpa/csrc/offline-transducer-greedy-search-decoder.h b/sherpa/csrc/offline-transducer-greedy-search-decoder.h new file mode 100644 index 000000000..f36713602 --- /dev/null +++ b/sherpa/csrc/offline-transducer-greedy-search-decoder.h @@ -0,0 +1,40 @@ +// sherpa/csrc/offline-transducer-greedy-search-decoder.h +// +// Copyright (c) 2022 Xiaomi Corporation +#ifndef SHERPA_CSRC_OFFLINE_TRANSDUCER_GREEDY_SEARCH_DECODER_H_ +#define SHERPA_CSRC_OFFLINE_TRANSDUCER_GREEDY_SEARCH_DECODER_H_ + +#include + +#include "sherpa/cpp_api/offline-stream.h" +#include "sherpa/csrc/offline-transducer-decoder.h" +#include "sherpa/csrc/offline-transducer-model.h" + +namespace sherpa { + +class OfflineTransducerGreedySearchDecoder : public OfflineTransducerDecoder { + public: + explicit OfflineTransducerGreedySearchDecoder(OfflineTransducerModel *model) + : model_(model) {} + + /** Run greedy search given the output from the encoder model. + * + * @param encoder_out A 3-D tensor of shape (N, T, joiner_dim) + * @param encoder_out_length A 1-D tensor of shape (N,) containing number + * of valid frames in encoder_out before padding. + * @param ss Pointer to an array of streams. + * @param n Size of the input array. + * + * @return Return a vector of size `N` containing the decoded results. 
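+   *  Decoding starts from a context filled with blanks; the decoder output is
+   *  only recomputed for frames on which at least one utterance emits a
+   *  non-blank token (see the implementation for details).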
+ */ + std::vector Decode( + torch::Tensor encoder_out, torch::Tensor encoder_out_length, + OfflineStream **ss = nullptr, int32_t n = 0) override; + + private: + OfflineTransducerModel *model_; // Not owned +}; + +} // namespace sherpa + +#endif // SHERPA_CSRC_OFFLINE_TRANSDUCER_GREEDY_SEARCH_DECODER_H_ diff --git a/sherpa/csrc/offline-transducer-model.h b/sherpa/csrc/offline-transducer-model.h new file mode 100644 index 000000000..a8a8aaa62 --- /dev/null +++ b/sherpa/csrc/offline-transducer-model.h @@ -0,0 +1,106 @@ +// sherpa/csrc/offline-transducer-model.h +// +// Copyright (c) 2022 Xiaomi Corporation +#ifndef SHERPA_CSRC_OFFLINE_TRANSDUCER_MODEL_H_ +#define SHERPA_CSRC_OFFLINE_TRANSDUCER_MODEL_H_ + +#include + +#include "torch/script.h" + +namespace sherpa { + +class OfflineTransducerModel { + public: + virtual ~OfflineTransducerModel() = default; + + /** Run the encoder network. + * + * @param features A 3-D tensor of shape (N, T, C) + * @param features_length A 1-D tensor of shape (N,) containing number of + * valid frames in `features` before padding. + * + * @return Return a pair containing: + * - encoder_out: A 3-D tensor of shape (N, T', encoder_dim) + * - encoder_out_length: A 1-D tensor of shape (N,) containing number + * of frames in `encoder_out` before padding. + */ + virtual std::pair RunEncoder( + const torch::Tensor &features, const torch::Tensor &features_length) = 0; + + /** Run the decoder network. + * + * Caution: We assume there are no recurrent connections in the decoder and + * the decoder is stateless. See + * https://github.com/k2-fsa/icefall/blob/master/egs/librispeech/ASR/pruned_transducer_stateless2/decoder.py + * for an example + * + * @param decoder_input It is usually of shape (N, context_size) + * @return Return a tensor of shape (N, 1, decoder_dim). + */ + virtual torch::Tensor RunDecoder(const torch::Tensor &decoder_input) = 0; + + /** Run the joint network. + * + * @param encoder_out Output of the encoder network. A tensor of shape + * (N, encoder_dim). + * @param decoder_out Output of the decoder network. A tensor of shape + * (N, decoder_dim). + * @return Return a tensor of shape (N, vocab_size). In icefall, the last + * last layer of the joint network is `nn.Linear`, + * not `nn.LogSoftmax`. + */ + virtual torch::Tensor RunJoiner(const torch::Tensor &encoder_out, + const torch::Tensor &decoder_out) = 0; + + /** Return the device where computation takes place. + * + * Note: We don't support moving the model to a different device + * after construction. + */ + virtual torch::Device Device() const = 0; + + /** If we are using a stateless decoder and if it contains a + * Conv1D, this function returns the kernel size of the convolution layer. 
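+   *  For the stateless decoders in icefall this is typically 2, i.e., the
+   *  decoder conditions on the previous two tokens.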
+ */ + virtual int32_t ContextSize() const = 0; + virtual int32_t SubsamplingFactor() const { return 4; } + + int32_t VocabSize() const { return vocab_size_; } + + void WarmUp(torch::Tensor features, torch::Tensor features_length) { + torch::Tensor encoder_out; + torch::Tensor encoder_out_length; + + std::tie(encoder_out, encoder_out_length) = + RunEncoder(features, features_length); + // encoder_out.shape (N, T, joiner_dim) + // encoder_out_length.shape (N,) + + auto cur_encoder_out = encoder_out.index({torch::indexing::Slice(), 0}); + // cur_encoder_out.shape (N, joiner_dim) + + cur_encoder_out = cur_encoder_out.unsqueeze(1).unsqueeze(1); + // cur_encoder_out.shape (N, 1, 1, joiner_dim) + + torch::Tensor decoder_input = + torch::zeros({features_length.size(0), ContextSize()}, torch::kLong) + .to(Device()); + // decoder_input.shape (N, context_size) + + auto decoder_out = RunDecoder(decoder_input).unsqueeze(1); + // decoder_out.shape (N, 1, 1, joiner_dim) + + auto logits = RunJoiner(cur_encoder_out, decoder_out); + // logits.shape (N, 1, 1, vocab_size) + + vocab_size_ = logits.size(-1); + } + + private: + int32_t vocab_size_ = -1; +}; + +} // namespace sherpa + +#endif // SHERPA_CSRC_OFFLINE_TRANSDUCER_MODEL_H_ diff --git a/sherpa/csrc/offline-transducer-modified-beam-search-decoder.cc b/sherpa/csrc/offline-transducer-modified-beam-search-decoder.cc new file mode 100644 index 000000000..09c30ebf4 --- /dev/null +++ b/sherpa/csrc/offline-transducer-modified-beam-search-decoder.cc @@ -0,0 +1,276 @@ +// sherpa/csrc/offline-transducer-modified-beam-search-decoder.cc +// +// Copyright (c) 2022 Xiaomi Corporation + +#include "sherpa/csrc/offline-transducer-modified-beam-search-decoder.h" + +#include +#include +#include + +#include "k2/torch_api.h" +#include "sherpa/csrc/hypothesis.h" +#include "sherpa/csrc/log.h" + +namespace sherpa { + +static torch::Tensor FloorDivide(torch::Tensor a, int32_t b) { +#if SHERPA_TORCH_VERSION_MAJOR > 1 || \ + (SHERPA_TORCH_VERSION_MAJOR == 1 && SHERPA_TORCH_VERSION_MINOR > 7) + return torch::div(a, b, /*rounding_mode*/ "trunc"); +#else + return torch::floor_divide(a, b); +#endif +} + +static torch::Tensor BuildDecoderInput(const std::vector &hyps, + int32_t context_size) { + int32_t num_hyps = hyps.size(); + torch::Tensor decoder_input = + torch::empty({num_hyps, context_size}, + torch::dtype(torch::kLong) + .memory_format(torch::MemoryFormat::Contiguous)); + + int64_t *p = decoder_input.data_ptr(); + for (const auto &h : hyps) { + auto start = h.ys.end() - context_size; + auto end = h.ys.end(); + + std::copy(start, end, p); + p += context_size; + } + + return decoder_input; +} + +/** Return a ragged shape with axes [utt][num_hyps]. + * + * @param hyps hyps.size() == batch_size. Each entry contains the active + * hypotheses of an utterance. + * @return Return a ragged shape with 2 axes [utt][num_hyps]. Note that the + * shape is on CPU. 
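+ *  The row-splits are built from the number of active hypotheses in each
+ *  utterance via an exclusive sum (see the implementation below).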
+ */ +static k2::RaggedShapePtr GetHypsShape(const std::vector &hyps) { + int32_t num_utt = hyps.size(); + torch::Tensor row_splits = torch::empty( + {num_utt + 1}, + torch::dtype(torch::kInt).memory_format(torch::MemoryFormat::Contiguous)); + auto row_splits_acc = row_splits.accessor(); + for (int32_t i = 0; i != num_utt; ++i) { + row_splits_acc[i] = hyps[i].Size(); + } + + k2::ExclusiveSum(row_splits, &row_splits); + + return k2::RaggedShape2(row_splits, torch::Tensor(), row_splits_acc[num_utt]); +} + +std::vector +OfflineTransducerModifiedBeamSearchDecoder::Decode( + torch::Tensor encoder_out, torch::Tensor encoder_out_length, + OfflineStream **ss /*= nullptr*/, int32_t n /*= 0*/) { + TORCH_CHECK(encoder_out.dim() == 3, "encoder_out.dim() is ", + encoder_out.dim(), "Expected value is 3"); + TORCH_CHECK(encoder_out.scalar_type() == torch::kFloat, + "encoder_out.scalar_type() is ", encoder_out.scalar_type()); + + TORCH_CHECK(encoder_out_length.dim() == 1, "encoder_out_length.dim() is", + encoder_out_length.dim()); + TORCH_CHECK(encoder_out_length.scalar_type() == torch::kLong, + "encoder_out_length.scalar_type() is ", + encoder_out_length.scalar_type()); + + TORCH_CHECK(encoder_out_length.device().is_cpu()); + + torch::Device device = model_->Device(); + encoder_out = encoder_out.to(device); + + torch::nn::utils::rnn::PackedSequence packed_seq = + torch::nn::utils::rnn::pack_padded_sequence(encoder_out, + encoder_out_length, + /*batch_first*/ true, + /*enforce_sorted*/ false); + + auto packed_encoder_out = packed_seq.data(); + + int32_t blank_id = 0; + int32_t context_size = model_->ContextSize(); + + int32_t batch_size = encoder_out_length.size(0); + + if (ss != nullptr) SHERPA_CHECK_EQ(batch_size, n); + + std::vector blanks(context_size, -1); + blanks.back() = blank_id; + + Hypotheses blank_hyp({{blanks, 0}}); + + std::deque finalized; + std::vector cur; + std::vector prev; + + std::vector context_graphs(batch_size, nullptr); + + auto sorted_indices = packed_seq.sorted_indices().cpu(); + auto sorted_indices_accessor = sorted_indices.accessor(); + + for (int32_t i = 0; i < batch_size; ++i) { + const ContextState *context_state = nullptr; + if (ss != nullptr) { + context_graphs[i] = ss[sorted_indices_accessor[i]]->GetContextGraph(); + if (context_graphs[i] != nullptr) + context_state = context_graphs[i]->Root(); + } + Hypotheses blank_hyp({{blanks, 0, context_state}}); + cur.emplace_back(std::move(blank_hyp)); + } + + using torch::indexing::Slice; + auto batch_sizes_acc = packed_seq.batch_sizes().accessor(); + int32_t max_T = packed_seq.batch_sizes().numel(); + int32_t offset = 0; + + for (int32_t t = 0; t != max_T; ++t) { + int32_t cur_batch_size = batch_sizes_acc[t]; + int32_t start = offset; + int32_t end = start + cur_batch_size; + auto cur_encoder_out = packed_encoder_out.index({Slice(start, end)}); + offset = end; + + cur_encoder_out = cur_encoder_out.unsqueeze(1).unsqueeze(1); + // Now cur_encoder_out's shape is (cur_batch_size, 1, 1, joiner_dim) + + if (cur_batch_size < static_cast(cur.size())) { + for (int32_t k = static_cast(cur.size()) - 1; + k >= cur_batch_size; --k) { + finalized.push_front(std::move(cur[k])); + } + cur.erase(cur.begin() + cur_batch_size, cur.end()); + } + + // Due to merging paths with identical token sequences, + // not all utterances have "num_active_paths" paths. 
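+    // Flatten the per-utterance hypotheses into a single list so that the
+    // decoder and joiner can be evaluated in one batched call; hyps_shape
+    // records which hypotheses belong to which utterance.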
+ auto hyps_shape = GetHypsShape(cur); + int32_t num_hyps = k2::TotSize(hyps_shape, 1); + + prev.clear(); + prev.reserve(num_hyps); + for (auto &hyps : cur) { + for (auto &h : hyps) { + prev.push_back(std::move(h.second)); + } + } + cur.clear(); + cur.reserve(cur_batch_size); + + auto ys_log_probs = torch::empty({num_hyps, 1}, torch::kFloat); + + auto ys_log_probs_acc = ys_log_probs.accessor(); + for (int32_t k = 0; k != static_cast(prev.size()); ++k) { + ys_log_probs_acc[k][0] = prev[k].log_prob; + } + + auto decoder_input = BuildDecoderInput(prev, context_size).to(device); + + auto decoder_out = model_->RunDecoder(decoder_input); + // decoder_out is of shape (num_hyps, 1, joiner_dim) + + auto index = k2::RowIds(hyps_shape, 1).to(torch::kLong).to(device); + + cur_encoder_out = cur_encoder_out.index_select(/*dim*/ 0, /*index*/ index); + // cur_encoder_out is of shape (num_hyps, 1, 1, joiner_dim) + + auto logits = model_->RunJoiner(cur_encoder_out, decoder_out.unsqueeze(1)); + + // logits' shape is (num_hyps, 1, 1, vocab_size) + logits = logits.squeeze(1).squeeze(1); + // now logits' shape is (num_hyps, vocab_size) + + auto log_probs = (logits / temperature_).log_softmax(-1).cpu(); + + log_probs.add_(ys_log_probs); + + int32_t vocab_size = log_probs.size(1); + log_probs = log_probs.reshape(-1); + auto row_splits = k2::RowSplits(hyps_shape, 1); + auto row_splits_acc = row_splits.accessor(); + + for (int32_t k = 0; k != cur_batch_size; ++k) { + int32_t start = row_splits_acc[k]; + int32_t end = row_splits_acc[k + 1]; + + torch::Tensor values, indexes; + std::tie(values, indexes) = + log_probs.slice(/*dim*/ 0, start * vocab_size, end * vocab_size) + .topk(/*k*/ num_active_paths_, /*dim*/ 0, + /*largest*/ true, /*sorted*/ true); + + auto topk_hyp_indexes = FloorDivide(indexes, vocab_size); + auto topk_token_indexes = torch::remainder(indexes, vocab_size); + + auto values_acc = values.accessor(); + auto topk_hyp_indexes_acc = topk_hyp_indexes.accessor(); + auto topk_token_indexes_acc = topk_token_indexes.accessor(); + + Hypotheses hyps; + for (int32_t j = 0; j != values.numel(); ++j) { + int32_t hyp_idx = topk_hyp_indexes_acc[j]; + Hypothesis new_hyp = prev[start + hyp_idx]; // note: hyp_idx is 0 based + + int32_t new_token = topk_token_indexes_acc[j]; + + float context_score = 0; + auto context_state = new_hyp.context_state; + + if (new_token != blank_id) { + new_hyp.ys.push_back(new_token); + new_hyp.timestamps.push_back(t); + if (context_graphs[k] != nullptr) { + auto context_res = + context_graphs[k]->ForwardOneStep(context_state, new_token); + context_score = context_res.first; + new_hyp.context_state = context_res.second; + } + } + + // We already added log_prob of the path to log_probs before, so + // we use values_acc[j] here directly. + new_hyp.log_prob = values_acc[j] + context_score; + hyps.Add(std::move(new_hyp)); + } + cur.push_back(std::move(hyps)); + } + } + + for (auto &h : finalized) { + cur.push_back(std::move(h)); + } + + // Finalize context biasing matching.. 
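+  // ContextGraph::Finalize returns the score adjustment for any partially
+  // matched context phrase; it is added to the hypothesis so that the final
+  // scores are consistent before picking the most probable path.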
+ for (int32_t i = 0; i < static_cast(cur.size()); ++i) { + for (auto iter = cur[i].begin(); iter != cur[i].end(); ++iter) { + if (context_graphs[i] != nullptr) { + auto context_res = + context_graphs[i]->Finalize(iter->second.context_state); + iter->second.log_prob += context_res.first; + iter->second.context_state = context_res.second; + } + } + } + + auto unsorted_indices = packed_seq.unsorted_indices().cpu(); + auto unsorted_indices_accessor = unsorted_indices.accessor(); + + std::vector ans(batch_size); + for (int32_t i = 0; i != batch_size; ++i) { + int32_t k = unsorted_indices_accessor[i]; + Hypothesis hyp = cur[k].GetMostProbable(true); + torch::ArrayRef arr(hyp.ys); + ans[i].tokens = arr.slice(context_size).vec(); + ans[i].timestamps = std::move(hyp.timestamps); + } + + return ans; +} + +} // namespace sherpa diff --git a/sherpa/csrc/offline-transducer-modified-beam-search-decoder.h b/sherpa/csrc/offline-transducer-modified-beam-search-decoder.h new file mode 100644 index 000000000..7effc3b5b --- /dev/null +++ b/sherpa/csrc/offline-transducer-modified-beam-search-decoder.h @@ -0,0 +1,47 @@ +// sherpa/csrc/offline-transducer-modified-beam-search-decoder.h +// +// Copyright (c) 2022 Xiaomi Corporation +#ifndef SHERPA_CSRC_OFFLINE_TRANSDUCER_MODIFIED_BEAM_SEARCH_DECODER_H_ +#define SHERPA_CSRC_OFFLINE_TRANSDUCER_MODIFIED_BEAM_SEARCH_DECODER_H_ + +#include + +#include "sherpa/cpp_api/offline-stream.h" +#include "sherpa/csrc/offline-transducer-decoder.h" +#include "sherpa/csrc/offline-transducer-model.h" + +namespace sherpa { + +class OfflineTransducerModifiedBeamSearchDecoder + : public OfflineTransducerDecoder { + public: + OfflineTransducerModifiedBeamSearchDecoder(OfflineTransducerModel *model, + int32_t num_active_paths, + float temperature) + : model_(model), + num_active_paths_(num_active_paths), + temperature_(temperature) {} + + /** Run modified beam search given the output from the encoder model. + * + * @param encoder_out A 3-D tensor of shape (N, T, joiner_dim) + * @param encoder_out_length A 1-D tensor of shape (N,) containing number + * of valid frames in encoder_out before padding. + * @param ss Pointer to an array of streams. + * @param n Size of the input array. + * + * @return Return a vector of size `N` containing the decoded results. 
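+   *  At most num_active_paths hypotheses are kept per utterance after each
+   *  frame, and the joiner logits are divided by temperature before the
+   *  log-softmax (see the implementation).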
+ */ + std::vector Decode( + torch::Tensor encoder_out, torch::Tensor encoder_out_length, + OfflineStream **ss = nullptr, int32_t n = 0) override; + + private: + OfflineTransducerModel *model_; // Not owned + int32_t num_active_paths_; + float temperature_ = 1.0; +}; + +} // namespace sherpa + +#endif // SHERPA_CSRC_OFFLINE_TRANSDUCER_MODIFIED_BEAM_SEARCH_DECODER_H_ diff --git a/sherpa/csrc/offline-wav2vec2-ctc-model.cc b/sherpa/csrc/offline-wav2vec2-ctc-model.cc new file mode 100644 index 000000000..6dbfebe46 --- /dev/null +++ b/sherpa/csrc/offline-wav2vec2-ctc-model.cc @@ -0,0 +1,41 @@ +// sherpa/csrc/offline-wav2vec2-ctc-model.cc +// +// Copyright (c) 2022 Xiaomi Corporation + +#include "sherpa/csrc/offline-wav2vec2-ctc-model.h" + +#include "sherpa/cpp_api/macros.h" + +namespace sherpa { + +OfflineWav2Vec2CtcModel::OfflineWav2Vec2CtcModel( + const std::string &filename, torch::Device device /*= torch::kCPU*/) + : device_(device) { + model_ = torch::jit::load(filename, device); + model_.eval(); +} + +torch::IValue OfflineWav2Vec2CtcModel::Forward(torch::Tensor waveforms, + torch::Tensor lengths) { + InferenceMode no_grad; + + return model_.run_method("forward", waveforms.to(device_), + lengths.to(device_)); +} + +torch::Tensor OfflineWav2Vec2CtcModel::GetLogSoftmaxOut( + torch::IValue forward_out) const { + InferenceMode no_grad; + + auto logit = forward_out.toTuple()->elements()[0].toTensor(); + return logit.log_softmax(-1); +} + +torch::Tensor OfflineWav2Vec2CtcModel::GetLogSoftmaxOutLength( + torch::IValue forward_out) const { + InferenceMode no_grad; + + return forward_out.toTuple()->elements()[1].toTensor(); +} + +} // namespace sherpa diff --git a/sherpa/csrc/offline-wav2vec2-ctc-model.h b/sherpa/csrc/offline-wav2vec2-ctc-model.h new file mode 100644 index 000000000..6f4a8e89c --- /dev/null +++ b/sherpa/csrc/offline-wav2vec2-ctc-model.h @@ -0,0 +1,55 @@ +// sherpa/csrc/offline-wav2vec2-ctc-model.h +// +// Copyright (c) 2022 Xiaomi Corporation +#ifndef SHERPA_CSRC_OFFLINE_WAV2VEC2_CTC_MODEL_H_ +#define SHERPA_CSRC_OFFLINE_WAV2VEC2_CTC_MODEL_H_ + +#include +#include + +#include "sherpa/csrc/offline-ctc-model.h" +namespace sherpa { + +/** This class models the Conformer model from icefall. + * + * See + * https://github.com/pytorch/audio/blob/main/torchaudio/models/wav2vec2/model.py#L11 + */ +class OfflineWav2Vec2CtcModel : public OfflineCtcModel { + public: + /** + * @param filename Path name of the torch script model. + * @param device The model will be moved to this device + */ + explicit OfflineWav2Vec2CtcModel(const std::string &filename, + torch::Device device = torch::kCPU); + + torch::Device Device() const override { return device_; } + + int32_t SubsamplingFactor() const override { + // See Section 4.2 of + // https://arxiv.org/pdf/2006.11477.pdf + return 1; + } + + /** Run the forward method of the model. + * See + * https://github.com/pytorch/audio/blob/main/torchaudio/models/wav2vec2/model.py#L90 + * for its documentation in Python. 
+ */ + torch::IValue Forward(torch::Tensor waveforms, + torch::Tensor lengths) override; + + torch::Tensor GetLogSoftmaxOut(torch::IValue forward_out) const override; + + torch::Tensor GetLogSoftmaxOutLength( + torch::IValue forward_out) const override; + + private: + torch::Device device_; + torch::jit::Module model_; +}; + +} // namespace sherpa + +#endif // SHERPA_CSRC_OFFLINE_WAV2VEC2_CTC_MODEL_H_ diff --git a/sherpa/csrc/offline-wenet-conformer-ctc-model.cc b/sherpa/csrc/offline-wenet-conformer-ctc-model.cc new file mode 100644 index 000000000..2d1926daf --- /dev/null +++ b/sherpa/csrc/offline-wenet-conformer-ctc-model.cc @@ -0,0 +1,47 @@ +// sherpa/csrc/offline-wenet-conformer-ctc-model.cc +// +// Copyright (c) 2022 Xiaomi Corporation + +#include "sherpa/csrc/offline-wenet-conformer-ctc-model.h" + +#include "sherpa/cpp_api/macros.h" + +namespace sherpa { + +OfflineWenetConformerCtcModel::OfflineWenetConformerCtcModel( + const std::string &filename, torch::Device device /*= torch::kCPU*/) + : device_(device) { + model_ = torch::jit::load(filename, device); + model_.eval(); + + subsampling_factor_ = model_.run_method("subsampling_rate").toInt(); +} + +torch::IValue OfflineWenetConformerCtcModel::Forward( + torch::Tensor features, torch::Tensor features_length) { + InferenceMode no_grad; + + return model_.attr("encoder").toModule().run_method( + "forward", features.to(device_), features_length.to(device_)); +} + +torch::Tensor OfflineWenetConformerCtcModel::GetLogSoftmaxOut( + torch::IValue forward_out) const { + InferenceMode no_grad; + + auto logit = forward_out.toTuple()->elements()[0]; + return model_.attr("ctc") + .toModule() + .run_method("log_softmax", logit) + .toTensor(); +} + +torch::Tensor OfflineWenetConformerCtcModel::GetLogSoftmaxOutLength( + torch::IValue forward_out) const { + InferenceMode no_grad; + + auto mask = forward_out.toTuple()->elements()[1].toTensor(); + return mask.sum({1, 2}); +} + +} // namespace sherpa diff --git a/sherpa/csrc/offline-wenet-conformer-ctc-model.h b/sherpa/csrc/offline-wenet-conformer-ctc-model.h new file mode 100644 index 000000000..eab5c3fdf --- /dev/null +++ b/sherpa/csrc/offline-wenet-conformer-ctc-model.h @@ -0,0 +1,53 @@ +// sherpa/csrc/offline-wenet-conformer-ctc-model.h +// +// Copyright (c) 2022 Xiaomi Corporation +#ifndef SHERPA_CSRC_OFFLINE_WENET_CONFORMER_CTC_MODEL_H_ +#define SHERPA_CSRC_OFFLINE_WENET_CONFORMER_CTC_MODEL_H_ + +#include +#include + +#include "sherpa/csrc/offline-ctc-model.h" +namespace sherpa { + +/** This class models the Conformer model from wenet. + * + * See + * https://github.com/wenet-e2e/wenet/blob/main/wenet/transformer/asr_model.py + */ +class OfflineWenetConformerCtcModel : public OfflineCtcModel { + public: + /** + * @param filename Path name of the torch script model. + * @param device The model will be moved to this device + */ + explicit OfflineWenetConformerCtcModel(const std::string &filename, + torch::Device device = torch::kCPU); + + torch::Device Device() const override { return device_; } + + int32_t SubsamplingFactor() const override { return subsampling_factor_; } + + /** Run the encoder of the model. + * + * See + * https://github.com/wenet-e2e/wenet/blob/main/wenet/transformer/asr_model.py#L42 + * for its documentation in Python. 
+ */ + torch::IValue Forward(torch::Tensor features, + torch::Tensor features_length) override; + + torch::Tensor GetLogSoftmaxOut(torch::IValue forward_out) const override; + + torch::Tensor GetLogSoftmaxOutLength( + torch::IValue forward_out) const override; + + private: + torch::Device device_; + torch::jit::Module model_; + int32_t subsampling_factor_; +}; + +} // namespace sherpa + +#endif // SHERPA_CSRC_OFFLINE_WENET_CONFORMER_CTC_MODEL_H_ diff --git a/sherpa/csrc/offline-whisper-model-config.cc b/sherpa/csrc/offline-whisper-model-config.cc new file mode 100644 index 000000000..81b5d8d0c --- /dev/null +++ b/sherpa/csrc/offline-whisper-model-config.cc @@ -0,0 +1,65 @@ +// sherpa/csrc/offline-whisper-model-config.cc +// +// Copyright (c) 2025 Xiaomi Corporation + +#include "sherpa/csrc/offline-whisper-model-config.h" + +#include "sherpa/csrc/file-utils.h" +#include "sherpa/csrc/macros.h" + +namespace sherpa { + +void OfflineWhisperModelConfig::Register(ParseOptions *po) { + po->Register("whisper-model", &model, + "Path to the torchscript model of whisper"); + + po->Register( + "whisper-language", &language, + "The spoken language in the input audio file. Example values: " + "en, de, fr, zh, jp. If it is not given for a multilingual model, we will" + " infer the language from the input audio file. " + "Please refer to " + "https://github.com/openai/whisper/blob/main/whisper/tokenizer.py#L10" + " for valid values. Note that for non-multilingual models, it supports " + "only 'en'"); + + po->Register("whisper-task", &task, + "Valid values: transcribe, translate. " + "Note that for non-multilingual models, it supports " + "only 'transcribe'"); +} + +bool OfflineWhisperModelConfig::Validate() const { + if (model.empty()) { + SHERPA_LOGE("Please provide --whisper-model"); + return false; + } + + if (!FileExists(model)) { + SHERPA_LOGE("whisper model file '%s' does not exist", model.c_str()); + return false; + } + + if (task != "translate" && task != "transcribe") { + SHERPA_LOGE( + "--whisper-task supports only translate and transcribe. Given: %s", + task.c_str()); + + return false; + } + + return true; +} + +std::string OfflineWhisperModelConfig::ToString() const { + std::ostringstream os; + + os << "OfflineWhisperModelConfig("; + os << "model=\"" << model << "\", "; + os << "language=\"" << language << "\", "; + os << "task=\"" << task << "\")"; + + return os.str(); +} + +} // namespace sherpa diff --git a/sherpa/csrc/offline-whisper-model-config.h b/sherpa/csrc/offline-whisper-model-config.h new file mode 100644 index 000000000..ee6dbe1f9 --- /dev/null +++ b/sherpa/csrc/offline-whisper-model-config.h @@ -0,0 +1,44 @@ +// sherpa/csrc/offline-whisper-model-config.h +// +// Copyright (c) 2025 Xiaomi Corporation +#ifndef SHERPA_CSRC_OFFLINE_WHISPER_MODEL_CONFIG_H_ +#define SHERPA_CSRC_OFFLINE_WHISPER_MODEL_CONFIG_H_ + +#include + +#include "sherpa/cpp_api/parse-options.h" + +namespace sherpa { + +struct OfflineWhisperModelConfig { + std::string model; + + // Available languages can be found at + // https://github.com/openai/whisper/blob/main/whisper/tokenizer.py#L10 + // + // Note: For non-multilingual models, it supports only "en" + // + // If empty, we will infer it from the input audio file when + // the model is multilingual. 
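An aside on how this configuration is meant to be consumed: Register() above exposes the --whisper-model, --whisper-language, and --whisper-task flags, and Validate() checks that the model file exists and that the task is transcribe or translate. The following hypothetical driver is not part of the patch; it assumes the Kaldi-style ParseOptions interface (usage-string constructor, Read(), PrintUsage()) used elsewhere in sherpa:

#include "sherpa/cpp_api/parse-options.h"
#include "sherpa/csrc/offline-whisper-model-config.h"

int main(int argc, char *argv[]) {
  const char *usage = "Usage: ./bin --whisper-model=model.pt --whisper-task=transcribe foo.wav";
  sherpa::ParseOptions po(usage);

  sherpa::OfflineWhisperModelConfig config;
  config.Register(&po);  // adds --whisper-model, --whisper-language, --whisper-task
  po.Read(argc, argv);

  if (!config.Validate()) {  // model file must exist; task must be transcribe or translate
    po.PrintUsage();
    return 1;
  }
  // config.ToString() can be logged to confirm what was parsed.
  return 0;
}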
+ std::string language; + + // Valid values are transcribe and translate + // + // Note: For non-multilingual models, it supports only "transcribe" + std::string task = "transcribe"; + + OfflineWhisperModelConfig() = default; + OfflineWhisperModelConfig(const std::string &model, + const std::string &language, + const std::string &task) + : model(model), language(language), task(task) {} + + void Register(ParseOptions *po); + bool Validate() const; + + std::string ToString() const; +}; + +} // namespace sherpa + +#endif // SHERPA_CSRC_OFFLINE_WHISPER_MODEL_CONFIG_H_ diff --git a/sherpa/csrc/offline-whisper-model-meta-data.cc b/sherpa/csrc/offline-whisper-model-meta-data.cc new file mode 100644 index 000000000..d1be779c1 --- /dev/null +++ b/sherpa/csrc/offline-whisper-model-meta-data.cc @@ -0,0 +1,68 @@ +// sherpa/csrc/offline-whisper-model-meta-data.h +// +// Copyright (c) 2025 Xiaomi Corporation + +#include "sherpa/csrc/offline-whisper-model-meta-data.h" + +#include +#include +#include + +namespace sherpa { + +std::string OfflineWhisperModelMetaData::ToString() const { + std::ostringstream os; + + os << "----------whisper meta data----------\n"; + + os << " comment: " << comment << "\n"; + os << " n_mels: " << n_mels << "\n"; + os << " n_audio_ctx: " << n_audio_ctx << "\n"; + os << " n_audio_state: " << n_audio_state << "\n"; + os << " n_audio_head: " << n_audio_head << "\n"; + os << " n_audio_layer: " << n_audio_layer << "\n"; + os << " n_vocab: " << n_vocab << "\n"; + os << " n_text_ctx: " << n_text_ctx << "\n"; + os << " n_text_state: " << n_text_state << "\n"; + os << " n_text_head: " << n_text_head << "\n"; + os << " n_text_layer: " << n_text_layer << "\n"; + os << " sot: " << sot << "\n"; + os << " sot_index: " << sot_index << "\n"; + os << " eot: " << eot << "\n"; + os << " blank_id: " << blank_id << "\n"; + os << " is_multilingual: " << is_multilingual << "\n"; + os << " no_speech: " << no_speech << "\n"; + os << " non_speech_tokens: " << non_speech_tokens << "\n"; + os << " transcribe: " << transcribe << "\n"; + os << " translate: " << translate << "\n"; + os << " sot_prev: " << sot_prev << "\n"; + os << " sot_lm: " << sot_lm << "\n"; + os << " no_timestamps: " << no_timestamps << "\n"; + os << " sot_sequence:"; + for (auto i : sot_sequence) { + os << " " << i; + } + os << "\n"; + + std::vector langs; + langs.reserve(lang2id.size()); + for (const auto &p : lang2id) { + langs.push_back(p.first); + } + std::sort(langs.begin(), langs.end()); + + os << " lang2id: (" << lang2id.size() << ")" << "\n "; + int32_t k = 0; + for (const auto &lang : langs) { + os << lang << " -> " << lang2id.at(lang) << ", "; + k += 1; + if (k % 10 == 0) { + os << "\n "; + } + } + os << "\n"; + + return os.str(); +} + +} // namespace sherpa diff --git a/sherpa/csrc/offline-whisper-model-meta-data.h b/sherpa/csrc/offline-whisper-model-meta-data.h new file mode 100644 index 000000000..b5df0cedd --- /dev/null +++ b/sherpa/csrc/offline-whisper-model-meta-data.h @@ -0,0 +1,50 @@ +// sherpa/csrc/offline-whisper-model-meta-data.h +// +// Copyright (c) 2025 Xiaomi Corporation +#ifndef SHERPA_CSRC_OFFLINE_WHISPER_MODEL_META_DATA_H_ +#define SHERPA_CSRC_OFFLINE_WHISPER_MODEL_META_DATA_H_ + +#include +#include +#include + +#include "torch/script.h" + +namespace sherpa { + +struct OfflineWhisperModelMetaData { + int32_t n_mels; + int32_t n_audio_ctx; + int32_t n_audio_state; + int32_t n_audio_head; + int32_t n_audio_layer; + int32_t n_vocab; + int32_t n_text_ctx; + int32_t n_text_state; + int32_t n_text_head; + int32_t 
n_text_layer; + int32_t sot; + int32_t sot_index; + int32_t eot; + int32_t blank_id; + int32_t is_multilingual; + int32_t no_speech; + int32_t non_speech_tokens; + int32_t transcribe; + int32_t translate; + int32_t sot_prev; + int32_t sot_lm; + int32_t no_timestamps; + + std::string comment; + std::vector sot_sequence; + std::unordered_map lang2id; + std::unordered_map id2lang; + std::vector all_languages_id; + + std::string ToString() const; +}; + +} // namespace sherpa + +#endif // SHERPA_CSRC_OFFLINE_WHISPER_MODEL_META_DATA_H_ diff --git a/sherpa/csrc/offline-whisper-model.cc b/sherpa/csrc/offline-whisper-model.cc new file mode 100644 index 000000000..55b5ed424 --- /dev/null +++ b/sherpa/csrc/offline-whisper-model.cc @@ -0,0 +1,243 @@ +// sherpa/csrc/offline-whisper-model.cc +// +// Copyright (c) 2025 Xiaomi Corporation + +#include "sherpa/csrc/offline-whisper-model.h" + +#include +#include +#include +#include + +#include "sherpa/cpp_api/macros.h" +#include "sherpa/csrc/macros.h" +#include "sherpa/csrc/offline-whisper-model-meta-data.h" +#include "sherpa/csrc/text-utils.h" +namespace sherpa { + +class OfflineWhisperModel::Impl { + public: + explicit Impl(const OfflineModelConfig &config) { + torch::jit::ExtraFilesMap meta_data{ + {"model_type", {}}, + {"comment", {}}, + {"version", {}}, + {"maintainer", {}}, + {"n_mels", {}}, + {"n_audio_ctx", {}}, + {"n_audio_state", {}}, + {"n_audio_head", {}}, + {"n_audio_layer", {}}, + {"n_vocab", {}}, + {"n_text_ctx", {}}, + {"n_text_state", {}}, + {"n_text_head", {}}, + {"n_text_layer", {}}, + {"sot_sequence", {}}, + {"all_language_tokens", {}}, + {"all_language_codes", {}}, + {"sot", {}}, + {"sot_index", {}}, + {"eot", {}}, + {"blank_id", {}}, + {"is_multilingual", {}}, + {"no_speech", {}}, + {"non_speech_tokens", {}}, + {"transcribe", {}}, + {"translate", {}}, + {"sot_prev", {}}, + {"sot_lm", {}}, + {"no_timestamps", {}}, + }; + + if (config.use_gpu) { + device_ = torch::Device{torch::kCUDA}; + } + + model_ = torch::jit::load(config.whisper.model, device_, meta_data); + model_.eval(); + + if (meta_data.at("model_type") != "whisper" && + meta_data.at("model_type") != "Whisper") { + SHERPA_LOGE("Expect a whisper model. 
Given: '%s'", + meta_data.at("model_type").c_str()); + SHERPA_EXIT(-1); + } + + InitMetaData(meta_data); + + if (config.debug) { + SHERPA_LOGE("%s", meta_data_.ToString().c_str()); + } + } + + const OfflineWhisperModelMetaData &GetModelMetadata() const { + return meta_data_; + } + + torch::Device Device() const { return device_; } + + std::pair RunEncoder( + const torch::Tensor &features) { + InferenceMode no_grad; + + auto outputs = model_.run_method("run_encoder", features).toTuple(); + + auto n_layer_cross_k_cache = outputs->elements()[0].toTensor(); + auto n_layer_cross_v_cache = outputs->elements()[1].toTensor(); + + return {n_layer_cross_k_cache, n_layer_cross_v_cache}; + } + + std::tuple RunDecoder( + const torch::Tensor &tokens, torch::Tensor n_layer_self_k_cache, + torch::Tensor n_layer_self_v_cache, torch::Tensor n_layer_cross_k_cache, + torch::Tensor n_layer_cross_v_cache, const torch::Tensor &offset) { + InferenceMode no_grad; + + auto outputs = model_ + .run_method("run_decoder", tokens, n_layer_self_k_cache, + n_layer_self_v_cache, n_layer_cross_k_cache, + n_layer_cross_v_cache, offset) + .toTuple(); + + auto logits = outputs->elements().vec()[0].toTensor(); + n_layer_self_k_cache = outputs->elements().vec()[1].toTensor(); + n_layer_self_v_cache = outputs->elements().vec()[2].toTensor(); + + return std::make_tuple(logits, n_layer_self_k_cache, n_layer_self_v_cache); + } + + torch::Tensor DetectLanguage(const torch::Tensor &n_layer_cross_k_cache, + const torch::Tensor &n_layer_cross_v_cache) { + InferenceMode no_grad; + + int32_t batch_size = n_layer_cross_v_cache.size(1); + torch::Tensor tokens = + torch::tensor({meta_data_.sot}, + torch::dtype(torch::kInt).device(device_)) + .unsqueeze(0) + .repeat({batch_size, 1}); + + torch::Tensor offset = + torch::zeros({batch_size}, torch::dtype(torch::kInt).device(device_)); + + torch::Tensor n_layer_self_k_cache = + torch::zeros({meta_data_.n_text_layer, batch_size, + meta_data_.n_text_ctx, meta_data_.n_text_state}, + torch::dtype(torch::kFloat).device(device_)); + + torch::Tensor n_layer_self_v_cache = + torch::zeros({meta_data_.n_text_layer, batch_size, + meta_data_.n_text_ctx, meta_data_.n_text_state}, + torch::dtype(torch::kFloat).device(device_)); + + auto out = RunDecoder(tokens, n_layer_self_k_cache, n_layer_self_v_cache, + n_layer_cross_k_cache, n_layer_cross_v_cache, offset); + auto logits = std::get<0>(out); + + torch::Tensor all_languages_id = + torch::tensor(meta_data_.all_languages_id, + torch::dtype(torch::kLong).device(device_)); + torch::Tensor mask = + torch::ones(logits.size(2), torch::dtype(torch::kLong).device(device_)); + + mask.index_put_({all_languages_id}, 0); + + torch::Tensor non_language_indexes = mask.nonzero().squeeze(); + + logits.index_put_({"...", non_language_indexes}, + -std::numeric_limits::infinity()); + + return logits.argmax(-1).squeeze(); + } + + private: + void InitMetaData(const torch::jit::ExtraFilesMap &meta_data) { + meta_data_.comment = meta_data.at("comment"); + meta_data_.n_mels = atoi(meta_data.at("n_mels").c_str()); + meta_data_.n_audio_ctx = atoi(meta_data.at("n_audio_ctx").c_str()); + meta_data_.n_audio_state = atoi(meta_data.at("n_audio_state").c_str()); + meta_data_.n_audio_head = atoi(meta_data.at("n_audio_head").c_str()); + meta_data_.n_audio_layer = atoi(meta_data.at("n_audio_layer").c_str()); + meta_data_.n_vocab = atoi(meta_data.at("n_vocab").c_str()); + meta_data_.n_text_ctx = atoi(meta_data.at("n_text_ctx").c_str()); + meta_data_.n_text_state = 
atoi(meta_data.at("n_text_state").c_str()); + meta_data_.n_text_head = atoi(meta_data.at("n_text_head").c_str()); + meta_data_.n_text_layer = atoi(meta_data.at("n_text_layer").c_str()); + meta_data_.sot = atoi(meta_data.at("sot").c_str()); + meta_data_.sot_index = atoi(meta_data.at("sot_index").c_str()); + meta_data_.eot = atoi(meta_data.at("eot").c_str()); + meta_data_.blank_id = atoi(meta_data.at("blank_id").c_str()); + meta_data_.is_multilingual = atoi(meta_data.at("is_multilingual").c_str()); + meta_data_.no_speech = atoi(meta_data.at("no_speech").c_str()); + meta_data_.non_speech_tokens = + atoi(meta_data.at("non_speech_tokens").c_str()); + meta_data_.transcribe = atoi(meta_data.at("transcribe").c_str()); + meta_data_.translate = atoi(meta_data.at("translate").c_str()); + meta_data_.sot_prev = atoi(meta_data.at("sot_prev").c_str()); + meta_data_.sot_lm = atoi(meta_data.at("sot_lm").c_str()); + meta_data_.no_timestamps = atoi(meta_data.at("no_timestamps").c_str()); + + std::vector all_language_codes; + SplitStringToIntegers(meta_data.at("sot_sequence"), ",", true, + &meta_data_.sot_sequence); + + SplitStringToVector(meta_data.at("all_language_codes"), ",", true, + &all_language_codes); + + SplitStringToIntegers(meta_data.at("all_language_tokens"), ",", true, + &meta_data_.all_languages_id); + + for (int32_t i = 0; i < static_cast(all_language_codes.size()); + ++i) { + meta_data_.lang2id[all_language_codes[i]] = + meta_data_.all_languages_id[i]; + + meta_data_.id2lang[meta_data_.all_languages_id[i]] = + std::move(all_language_codes[i]); + } + } + + private: + torch::jit::Module model_; + OfflineWhisperModelMetaData meta_data_; + torch::Device device_{torch::kCPU}; +}; + +OfflineWhisperModel::OfflineWhisperModel(const OfflineModelConfig &config) + : impl_(std::make_unique(config)) {} + +OfflineWhisperModel::~OfflineWhisperModel() = default; + +const OfflineWhisperModelMetaData &OfflineWhisperModel::GetModelMetadata() + const { + return impl_->GetModelMetadata(); +} + +torch::Device OfflineWhisperModel::Device() const { return impl_->Device(); } + +std::pair OfflineWhisperModel::RunEncoder( + const torch::Tensor &features) const { + return impl_->RunEncoder(features); +} + +std::tuple +OfflineWhisperModel::RunDecoder(const torch::Tensor &tokens, + const torch::Tensor &n_layer_self_k_cache, + const torch::Tensor &n_layer_self_v_cache, + const torch::Tensor &n_layer_cross_k_cache, + const torch::Tensor &n_layer_cross_v_cache, + const torch::Tensor &offset) const { + return impl_->RunDecoder(tokens, n_layer_self_k_cache, n_layer_self_v_cache, + n_layer_cross_k_cache, n_layer_cross_v_cache, + offset); +} + +torch::Tensor OfflineWhisperModel::DetectLanguage( + const torch::Tensor &n_layer_cross_k_cache, + const torch::Tensor &n_layer_cross_v_cache) const { + return impl_->DetectLanguage(n_layer_cross_k_cache, n_layer_cross_v_cache); +} + +} // namespace sherpa diff --git a/sherpa/csrc/offline-whisper-model.h b/sherpa/csrc/offline-whisper-model.h new file mode 100644 index 000000000..c4045a30d --- /dev/null +++ b/sherpa/csrc/offline-whisper-model.h @@ -0,0 +1,68 @@ +// sherpa/csrc/offline-whisper-model.h +// +// Copyright (c) 2025 Xiaomi Corporation +#ifndef SHERPA_CSRC_OFFLINE_WHISPER_MODEL_H_ +#define SHERPA_CSRC_OFFLINE_WHISPER_MODEL_H_ + +#include +#include +#include + +#include "sherpa/csrc/offline-model-config.h" +#include "sherpa/csrc/offline-whisper-model-meta-data.h" +#include "torch/script.h" + +namespace sherpa { + +class OfflineWhisperModel { + public: + explicit 
OfflineWhisperModel(const OfflineModelConfig &config); + + ~OfflineWhisperModel(); + + const OfflineWhisperModelMetaData &GetModelMetadata() const; + + torch::Device Device() const; + + /** + * @params features 3-D tensor of shape (N, C, T). + * @returns Return two tensors: + * - n_layer_cross_k_cache, 4-D tensor (num_layers, N, T, C) + * - n_layer_cross_v_cache, 4-D tensor (num_layers, N, T, C) + */ + std::pair RunEncoder( + const torch::Tensor &features) const; + + /* + * + * @params tokens A 2-D tensor of shape (N, num_tokens) + * @param n_layer_self_k_cache (num_layers, N, dim1, dim2) + * @param n_layer_self_v_cache (num_layers, N, dim1, dim2) + * @param n_layer_cross_k_cache (num_layers, N, T, dim) + * @param n_layer_cross_v_cache (num_layers, N, T, dim) + * @param offset A 1-D int32 tensor of shape (N,) + * + * @returns Return a tuple of 3 tensors: + * - logits, (N, num_tokens, dim) + * - n_layer_self_k_cache, (num_layers, batch-size, dim1, dim2) + * - n_layer_self_v_cache, (num_layers, batch-size, dim1, dim2) + */ + std::tuple RunDecoder( + const torch::Tensor &tokens, const torch::Tensor &n_layer_self_k_cache, + const torch::Tensor &n_layer_self_v_cache, + const torch::Tensor &n_layer_cross_k_cache, + const torch::Tensor &n_layer_cross_v_cache, + const torch::Tensor &offset) const; + + torch::Tensor DetectLanguage( + const torch::Tensor &n_layer_cross_k_cache, + const torch::Tensor &n_layer_cross_v_cache) const; + + private: + class Impl; + std::unique_ptr impl_; +}; + +} // namespace sherpa + +#endif // SHERPA_CSRC_OFFLINE_WHISPER_MODEL_H_ diff --git a/sherpa/csrc/offline_asr.cc b/sherpa/csrc/offline_asr.cc deleted file mode 100644 index 9a175bcb4..000000000 --- a/sherpa/csrc/offline_asr.cc +++ /dev/null @@ -1,202 +0,0 @@ -/** - * Copyright 2022 Xiaomi Corporation (authors: Fangjun Kuang) - * - * See LICENSE for clarification regarding multiple authors - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#include "sherpa/csrc/offline_asr.h" - -#include - -#include "sherpa/csrc/fbank_features.h" -#include "sherpa/csrc/file_utils.h" -#include "sherpa/csrc/log.h" -#include "sherpa/csrc/rnnt_beam_search.h" - -namespace sherpa { - -static void RegisterFrameExtractionOptions( - ParseOptions *po, kaldifeat::FrameExtractionOptions *opts) { - po->Register("sample-frequency", &opts->samp_freq, - "Waveform data sample frequency (must match the waveform file, " - "if specified there)"); - - po->Register("frame-length", &opts->frame_length_ms, - "Frame length in milliseconds"); - - po->Register("frame-shift", &opts->frame_shift_ms, - "Frame shift in milliseconds"); - - po->Register( - "dither", &opts->dither, - "Dithering constant (0.0 means no dither). " - "Caution: Samples are normalized to the range [-1, 1). 
" - "Please select a small value for dither if you want to enable it"); -} - -static void RegisterMelBanksOptions(ParseOptions *po, - kaldifeat::MelBanksOptions *opts) { - po->Register("num-mel-bins", &opts->num_bins, - "Number of triangular mel-frequency bins"); -} - -void OfflineAsrOptions::Register(ParseOptions *po) { - po->Register("nn-model", &nn_model, "Path to the torchscript model"); - - po->Register("tokens", &tokens, "Path to tokens.txt."); - - po->Register("decoding-method", &decoding_method, - "Decoding method to use. Possible values are: greedy_search, " - "modified_beam_search"); - - po->Register("num-active-paths", &num_active_paths, - "Number of active paths for modified_beam_search. " - "Used only when --decoding-method is modified_beam_search"); - - po->Register("use-gpu", &use_gpu, - "true to use GPU for computation. false to use CPU.\n" - "If true, it uses the first device. You can use the environment " - "variable CUDA_VISIBLE_DEVICES to select which device to use."); - - fbank_opts.frame_opts.dither = 0; - RegisterFrameExtractionOptions(po, &fbank_opts.frame_opts); - - fbank_opts.mel_opts.num_bins = 80; - RegisterMelBanksOptions(po, &fbank_opts.mel_opts); -} - -void OfflineAsrOptions::Validate() const { - if (nn_model.empty()) { - SHERPA_LOG(FATAL) << "Please provide --nn-model"; - } - - if (!FileExists(nn_model)) { - SHERPA_LOG(FATAL) << "\n--nn-model=" << nn_model << "\n" - << nn_model << " does not exist!"; - } - - if (tokens.empty()) { - SHERPA_LOG(FATAL) << "Please provide --tokens"; - } - - if (!FileExists(tokens)) { - SHERPA_LOG(FATAL) << "\n--tokens=" << tokens << "\n" - << tokens << " does not exist!"; - } - - if (decoding_method != "greedy_search" && - decoding_method != "modified_beam_search") { - SHERPA_LOG(FATAL) - << "Unsupported decoding method: " << decoding_method - << ". Supported values are: greedy_search, modified_beam_search"; - } - - if (decoding_method == "modified_beam_search") { - SHERPA_CHECK_GT(num_active_paths, 0); - } -} - -std::string OfflineAsrOptions::ToString() const { - std::ostringstream os; - os << "--nn-model=" << nn_model << "\n"; - os << "--tokens=" << tokens << "\n"; - - os << "--decoding-method=" << decoding_method << "\n"; - - if (decoding_method == "modified_beam_search") { - os << "--num-active-paths=" << num_active_paths << "\n"; - } - - os << "--use-gpu=" << std::boolalpha << use_gpu << "\n"; - - return os.str(); -} - -OfflineAsr::OfflineAsr(const OfflineAsrOptions &opts) - : opts_(opts), - model_(opts.nn_model, - opts.use_gpu ? 
torch::Device("cuda:0") : torch::Device("cpu")), - sym_(opts.tokens), - fbank_(opts.fbank_opts) {} - -std::vector OfflineAsr::DecodeWaves( - const std::vector &filenames, float expected_sample_rate) { - std::vector waves; - for (const auto &f : filenames) { - waves.push_back(ReadWave(f, expected_sample_rate).first); - } - - return DecodeWaves(waves); -} - -std::vector OfflineAsr::DecodeWaves( - const std::vector &waves) { - std::vector features = ComputeFeatures(fbank_, waves); - return DecodeFeatures(features); -} - -std::vector OfflineAsr::DecodeFeatures( - const std::vector &features) { - torch::Tensor padded_features = torch::nn::utils::rnn::pad_sequence( - features, /*batch_first*/ true, - /*padding_value*/ -23.025850929940457f); - - std::vector feature_length_vec(features.size()); - for (size_t i = 0; i != features.size(); ++i) { - feature_length_vec[i] = features[i].size(0); - } - - torch::Tensor feature_lengths = torch::tensor(feature_length_vec); - - return DecodeFeatures(padded_features, feature_lengths); -} - -std::vector OfflineAsr::DecodeFeatures( - torch::Tensor features, torch::Tensor features_length) { - auto device = model_.Device(); - features = features.to(device); - features_length = features_length.to(device).to(torch::kLong); - - torch::Tensor encoder_out; - torch::Tensor encoder_out_length; - - std::tie(encoder_out, encoder_out_length) = - model_.ForwardEncoder(features, features_length); - encoder_out_length = encoder_out_length.cpu(); - - std::vector> token_ids; - - if (opts_.decoding_method == "greedy_search") { - token_ids = GreedySearch(model_, encoder_out, encoder_out_length); - } else if (opts_.decoding_method == "modified_beam_search") { - token_ids = ModifiedBeamSearch(model_, encoder_out, encoder_out_length, - opts_.num_active_paths); - } else { - SHERPA_LOG(FATAL) << "Unsupported decoding method: " - << opts_.decoding_method; - } - - int32_t batch_size = features.size(0); - std::vector results(batch_size); - for (int32_t i = 0; i != batch_size; ++i) { - auto &text = results[i].text; - for (auto t : token_ids[i]) { - text += sym_[t]; - } - } - - return results; -} - -} // namespace sherpa diff --git a/sherpa/csrc/offline_asr.h b/sherpa/csrc/offline_asr.h deleted file mode 100644 index dc19fa186..000000000 --- a/sherpa/csrc/offline_asr.h +++ /dev/null @@ -1,196 +0,0 @@ -/** - * Copyright 2022 Xiaomi Corporation (authors: Fangjun Kuang) - * - * See LICENSE for clarification regarding multiple authors - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#ifndef SHERPA_CSRC_OFFLINE_ASR_H_ -#define SHERPA_CSRC_OFFLINE_ASR_H_ - -#include -#include - -#include "kaldifeat/csrc/feature-fbank.h" -#include "sherpa/csrc/parse_options.h" -#include "sherpa/csrc/rnnt_conformer_model.h" -#include "sherpa/csrc/symbol_table.h" - -namespace sherpa { - -struct OfflineAsrOptions { - /// Path to torchscript model - std::string nn_model; - - /// Path to tokens.txt. 
- /// Each line the tokens.txt consists of two columms separated by a space: - /// - column 1: symbol - /// - column 2: integer ID of the symbol - std::string tokens; - - /// Decoding method to use. - /// Possible values are: greedy_search, modified_beam_search. - std::string decoding_method = "greedy_search"; - - /// Number of active paths in modified_beam_search. - /// Used only when decoding_method is modified_beam_search. - int32_t num_active_paths = 4; - - // true to use GPU for computation. Always selects the first device. - // false to use CPU. - // Note: Only neural network computation and decoding are done on CPU. - // Feature extraction is performed on CPU. - bool use_gpu = false; - - kaldifeat::FbankOptions fbank_opts; - - void Register(ParseOptions *po); - - // Check that option values are valid - void Validate() const; - - // For debugging - std::string ToString() const; -}; - -struct OfflineAsrResult { - // Decoded results. - // For English, it consists of space separated words. - // For Chinese, it consists of Chinese words without spaces. - std::string text; - - // Decoded results at the token level. - // For BPE-based models, it consists of a list of BPE tokens. - std::vector tokens; - - // timestamps.size() == tokens.size() - // timestamps[i] records the frame number on which tokens[i] is decoded. - // Frame numbers are counted after model subsampling. - std::vector timestamps; -}; - -class OfflineAsr { - public: - explicit OfflineAsr(const OfflineAsrOptions &opts); - - /** Decode a single wave file. - * - * If the input wave has multiple channels, only the first channel is used - * for decoding. - * - * @param filename Path to the wave file. Note: We only support "*.wav" - * format. - * @param expected_sample_rate Expected sample rate of the input wave file. - * If the input wave file has a different sample - * rate from this value, it will abort. - * - * @return Return the recognition result. - */ - OfflineAsrResult DecodeWave(const std::string &filename, - float expected_sample_rate) { - return DecodeWaves({filename}, expected_sample_rate)[0]; - } - - /** Decode a batch of wave files in parallel. - * - * If an input wave has multiple channels, only the first channel is used - * for decoding. - * - * @param filenames A list of wave filenames. We only support "*.wav" at - * present. - * @param expected_sample_rate Expected sample rate of each input wave file. - * If an input wave file has a different sample - * rate from this value, it will abort. - * - * @return Return the recognition results. ans[i] contains the recognition - * result for filenames[i] - */ - std::vector DecodeWaves( - const std::vector &filenames, float expected_sample_rate); - - /** Decode audio samples. - * - * @param wave A 1-D torch.float32 tensor containing audio samples, which are - * normalized to the range [-1, 1). Its sample rate must match - * the one for the training data that is used to train the model. - * It is 16 kHz for all models trained by icefall. - * - * @return Return the recognition result. - */ - OfflineAsrResult DecodeWave(torch::Tensor wave) { - return DecodeWaves({wave})[0]; - } - - /** Decode a batch of audio samples in parallel. - * - * @param waves Each entry is a 1-D torch.float32 tensor containing audio - * samples, which are normalized to the range [-1, 1). - * Its sample rate must match the one for the training data that - * is used to train the model. It is 16 kHz for all models - * trained by icefall. - * - * @return Return the recognition result. 
ans[i] is the recognition result - * for wave[i]. - */ - std::vector DecodeWaves( - const std::vector &waves); - - /** Decode input fbank feature. - * - * @param feature A 2-D tensor containing the fbank feature of a wave. Its - * number of rows equals to the number of feature frames and - * the number of columns equals to the feature dimension. - * - * @return Return the recognition result. - */ - OfflineAsrResult DecodeFeature(torch::Tensor feature) { - return DecodeFeatures({feature})[0]; - } - - /** Decode a batch of input fbank features in parallel. - * - * @param features Each entry is a 2-D tensor containing the fbank feature - * of a wave. Its number of rows equals to the number of - * feature frames and the number of columns equals to the - * feature dimension. - * - * @return Return the recognition result. ans[i] contains the recognition - * result for features[i]. - */ - std::vector DecodeFeatures( - const std::vector &features); - - /** Decode from pre-computed features. - * - * @param features A 3-D tensor of shape (N, T, C) containing pre-computed - * features. - * @param features_length A 1-D tensor of shape (N,) containing number of - * valid feature frames in `features` before padding. - * - * @return Return the recognition result. ans[i] contains the recognition - * result for features[i]. - */ - std::vector DecodeFeatures(torch::Tensor features, - torch::Tensor features_length); - - private: - OfflineAsrOptions opts_; - RnntConformerModel model_; - SymbolTable sym_; - - kaldifeat::Fbank fbank_; // always on CPU -}; - -} // namespace sherpa - -#endif // SHERPA_CSRC_OFFLINE_ASR_H_ diff --git a/sherpa/csrc/online-conformer-transducer-model.cc b/sherpa/csrc/online-conformer-transducer-model.cc new file mode 100644 index 000000000..1512d07d0 --- /dev/null +++ b/sherpa/csrc/online-conformer-transducer-model.cc @@ -0,0 +1,146 @@ +// sherpa/csrc/online-conformer-transducer-model.cc +// +// Copyright (c) 2022 Xiaomi Corporation + +#include "sherpa/csrc/online-conformer-transducer-model.h" + +#include +#include +#include + +#include "sherpa/cpp_api/macros.h" + +namespace sherpa { + +OnlineConformerTransducerModel::OnlineConformerTransducerModel( + const std::string &filename, int32_t left_context, int32_t right_context, + int32_t decode_chunk_size, torch::Device device /*= torch::kCPU*/) + : device_(device), + left_context_(left_context), + right_context_(right_context) { + model_ = torch::jit::load(filename, device); + model_.eval(); + + encoder_ = model_.attr("encoder").toModule(); + decoder_ = model_.attr("decoder").toModule(); + joiner_ = model_.attr("joiner").toModule(); + + encoder_proj_ = joiner_.attr("encoder_proj").toModule(); + decoder_proj_ = joiner_.attr("decoder_proj").toModule(); + + int32_t subsampling_factor = encoder_.attr("subsampling_factor").toInt(); + + context_size_ = decoder_.attr("context_size").toInt(); + + // We add 3 here since the subsampling method is using + // ((len - 1) // 2 - 1) // 2) + // We plus 2 here because we will cut off one frame on each side + // of encoder_embed output (in conformer.py) to avoid a training + // and decoding mismatch by seeing padding values. + int32_t pad_length = + 2 * subsampling_factor + right_context + (subsampling_factor - 1); + chunk_shift_ = decode_chunk_size; + chunk_size_ = chunk_shift_ + pad_length; + // Note: Differences from the conv-emformer: + // right_context in streaming conformer is specified by users during + // decoding and it is a value before subsampling. 
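To make the chunk bookkeeping above concrete, here is a worked example; the subsampling factor of 4 matches the icefall pruned_transducer_stateless recipes, while the other numbers are assumptions chosen purely for illustration:

// Assumed: subsampling_factor = 4, right_context = 8, decode_chunk_size = 16
//   pad_length   = 2 * 4 + 8 + (4 - 1) = 19  // extra feature frames kept around each chunk
//   chunk_shift_ = 16                         // how far the analysis window advances
//   chunk_size_  = 16 + 19 = 35               // feature frames fed to the encoder per chunk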
+} + +torch::IValue OnlineConformerTransducerModel::StateToIValue( + const State &s) const { + return torch::IValue(s); +} + +OnlineConformerTransducerModel::State +OnlineConformerTransducerModel::StateFromIValue(torch::IValue ivalue) const { + torch::List list = ivalue.toList(); + + return {list.get(0).toTensor(), list.get(1).toTensor()}; +} + +torch::IValue OnlineConformerTransducerModel::StackStates( + const std::vector &states) const { + int32_t batch_size = states.size(); + std::vector attn; + std::vector conv; + attn.reserve(batch_size); + conv.reserve(batch_size); + + for (const auto &s : states) { + torch::List list = s.toList(); + attn.push_back(list.get(0).toTensor()); + conv.push_back(list.get(1).toTensor()); + } + torch::Tensor stacked_attn = torch::stack(attn, /*dim*/ 2); + torch::Tensor stacked_conv = torch::stack(conv, /*dim*/ 2); + + return torch::List({stacked_attn, stacked_conv}); +} + +std::vector OnlineConformerTransducerModel::UnStackStates( + torch::IValue ivalue) const { + State states = StateFromIValue(ivalue); + int32_t batch_size = states[0].size(2); + std::vector ans; + ans.reserve(batch_size); + + auto unstacked_attn = torch::unbind(states[0], /*dim*/ 2); + auto unstacked_conv = torch::unbind(states[1], /*dim*/ 2); + for (int32_t i = 0; i != batch_size; ++i) { + auto attn = unstacked_attn[i]; + auto conv = unstacked_conv[i]; + ans.push_back(StateToIValue({attn, conv})); + } + + return ans; +} + +torch::IValue OnlineConformerTransducerModel::GetEncoderInitStates( + int32_t /*unused=1*/) { + InferenceMode no_grad; + return encoder_.run_method("get_init_state", left_context_, device_); +} + +std::tuple +OnlineConformerTransducerModel::RunEncoder( + const torch::Tensor &features, const torch::Tensor &features_length, + const torch::Tensor &num_processed_frames, torch::IValue states) { + InferenceMode no_grad; + + auto outputs = + encoder_ + .run_method("streaming_forward", features, features_length, states, + num_processed_frames, left_context_, right_context_) + .toTuple(); + + torch::IValue encoder_out = outputs->elements()[0]; + auto encoder_out_length = outputs->elements()[1].toTensor(); + + auto next_states = outputs->elements()[2]; + + auto projected_encoder_out = + encoder_proj_.run_method("forward", encoder_out).toTensor(); + + return std::make_tuple(projected_encoder_out, encoder_out_length, + next_states); +} + +torch::Tensor OnlineConformerTransducerModel::RunDecoder( + const torch::Tensor &decoder_input) { + InferenceMode no_grad; + auto decoder_out = + decoder_.run_method("forward", decoder_input, /*need_pad*/ false); + + return decoder_proj_.run_method("forward", decoder_out).toTensor(); +} + +torch::Tensor OnlineConformerTransducerModel::RunJoiner( + const torch::Tensor &encoder_out, const torch::Tensor &decoder_out) { + InferenceMode no_grad; + return joiner_ + .run_method("forward", encoder_out, decoder_out, + /*project_input*/ false) + .toTensor(); +} + +} // namespace sherpa diff --git a/sherpa/csrc/online-conformer-transducer-model.h b/sherpa/csrc/online-conformer-transducer-model.h new file mode 100644 index 000000000..cfe9fb803 --- /dev/null +++ b/sherpa/csrc/online-conformer-transducer-model.h @@ -0,0 +1,101 @@ +// sherpa/csrc/online-conformer-transducer-model.h +// +// Copyright (c) 2022 Xiaomi Corporation +#ifndef SHERPA_CSRC_ONLINE_CONFORMER_TRANSDUCER_MODEL_H_ +#define SHERPA_CSRC_ONLINE_CONFORMER_TRANSDUCER_MODEL_H_ + +#include +#include +#include + +#include "sherpa/csrc/online-transducer-model.h" + +namespace sherpa { + +/** This class 
implements models from pruned_transducer_stateless{2,3,4,5} + * from icefall. + * + * See + * https://github.com/k2-fsa/icefall/blob/master/egs/librispeech/ASR/pruned_transducer_stateless2/model.py + * for an instance. + * + * You can find the interface and implementation details of the + * encoder, decoder, and joiner network in the above Python code. + */ +class OnlineConformerTransducerModel : public OnlineTransducerModel { + public: + /** Constructor. + * + * @param filename Path to the torchscript model. See + * https://github.com/k2-fsa/icefall/blob/master/egs/librispeech/ASR/pruned_transducer_stateless2/export.py + * for how to export a model. + * @param left_context A value after subsampling + * @param right_context A value after subsampling + * @param decode_chunk_size A value after subsampling + * @param device Move the model to this device on loading. + */ + OnlineConformerTransducerModel(const std::string &filename, + int32_t left_context, int32_t right_context, + int32_t decode_chunk_size, + torch::Device device = torch::kCPU); + + torch::IValue StackStates( + const std::vector &states) const override; + + std::vector UnStackStates(torch::IValue states) const override; + + torch::IValue GetEncoderInitStates(int32_t unused = 1) override; + + std::tuple RunEncoder( + const torch::Tensor &features, const torch::Tensor &features_length, + const torch::Tensor &num_processed_frames, torch::IValue states) override; + + torch::Tensor RunDecoder(const torch::Tensor &decoder_input) override; + + torch::Tensor RunJoiner(const torch::Tensor &encoder_out, + const torch::Tensor &decoder_out) override; + + torch::Device Device() const override { return device_; } + + int32_t ContextSize() const override { return context_size_; } + + int32_t ChunkSize() const override { return chunk_size_; } + + int32_t ChunkShift() const override { return chunk_shift_; } + + // Non virtual methods that used by Python bindings. + + // See + // https://github.com/k2-fsa/icefall/blob/master/egs/librispeech/ASR/pruned_transducer_stateless2/conformer.py#L200 + // for what state contains for details. 
+ // A vector contains two tensors: + // - a 3-d tensor: (num_encoder_layers, left_context, encoder_dim) + // - a 3-d tensor: (num_encoder_layers, cnn_module_kernel - 1, encoder_dim) + using State = std::vector; + torch::IValue StateToIValue(const State &s) const; + State StateFromIValue(torch::IValue ivalue) const; + + private: + torch::jit::Module model_; + + // The following modules are just aliases to modules in model_ + torch::jit::Module encoder_; + torch::jit::Module decoder_; + torch::jit::Module joiner_; + torch::jit::Module encoder_proj_; + torch::jit::Module decoder_proj_; + + torch::Device device_{"cpu"}; + int32_t left_context_; // after subsampling + int32_t right_context_; // after subsampling + + int32_t context_size_; + int32_t chunk_size_; + int32_t chunk_shift_; + + private: +}; + +} // namespace sherpa + +#endif // SHERPA_CSRC_ONLINE_CONFORMER_TRANSDUCER_MODEL_H_ diff --git a/sherpa/csrc/online-conv-emformer-transducer-model.cc b/sherpa/csrc/online-conv-emformer-transducer-model.cc new file mode 100644 index 000000000..84108ed0e --- /dev/null +++ b/sherpa/csrc/online-conv-emformer-transducer-model.cc @@ -0,0 +1,250 @@ +// sherpa/csrc/online-conv-emformer-transducer-model.cc +// +// Copyright (c) 2022 Xiaomi Corporation + +#include "sherpa/csrc/online-conv-emformer-transducer-model.h" + +#include +#include +#include +#include + +#include "sherpa/cpp_api/macros.h" + +namespace sherpa { + +OnlineConvEmformerTransducerModel::OnlineConvEmformerTransducerModel( + const std::string &filename, torch::Device device /*= torch::kCPU*/) + : device_(device) { + model_ = torch::jit::load(filename, device); + model_.eval(); + + encoder_ = model_.attr("encoder").toModule(); + decoder_ = model_.attr("decoder").toModule(); + joiner_ = model_.attr("joiner").toModule(); + + encoder_proj_ = joiner_.attr("encoder_proj").toModule(); + decoder_proj_ = joiner_.attr("decoder_proj").toModule(); + + context_size_ = decoder_.attr("context_size").toInt(); + + auto chunk_length = encoder_.attr("chunk_length").toInt(); + + auto right_context_length = encoder_.attr("right_context_length").toInt(); + // Add 2 here since we will drop the first and last frame after subsampling; + // Add 3 here since the subsampling is ((len - 1) // 2 - 1) // 2. 
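The padding computation just below can be checked the same way; a worked example with assumed numbers (right_context_length = 8, subsampling_factor = 4, chunk_length = 32), given only to illustrate the formula:

//   pad_length   = 8 + 2 * 4 + 3 = 19
//   chunk_size_  = 32 + 19 = 51  // feature frames fed to the encoder per chunk
//   chunk_shift_ = 32            // feature frames consumed per chunk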
+ auto pad_length = right_context_length + + 2 * encoder_.attr("subsampling_factor").toInt() + 3; + + chunk_size_ = chunk_length + pad_length; + chunk_shift_ = chunk_length; +} + +torch::IValue OnlineConvEmformerTransducerModel::StateToIValue( + const State &s) const { + return torch::ivalue::Tuple::create(s.first, s.second); +} + +OnlineConvEmformerTransducerModel::State +OnlineConvEmformerTransducerModel::StateFromIValue(torch::IValue ivalue) const { + auto tuple_ptr_states = ivalue.toTuple(); + + torch::List list_attn = + tuple_ptr_states->elements()[0].toList(); + torch::List list_conv = + tuple_ptr_states->elements()[1].toList(); + + int32_t num_layers = list_attn.size(); + + std::vector> next_state_attn; + next_state_attn.reserve(num_layers); + for (int32_t i = 0; i != num_layers; ++i) { + next_state_attn.emplace_back( + c10::impl::toTypedList(list_attn.get(i).toList()).vec()); + } + + std::vector next_state_conv; + next_state_conv.reserve(num_layers); + for (int32_t i = 0; i != num_layers; ++i) { + next_state_conv.emplace_back(list_conv.get(i).toTensor()); + } + + return {next_state_attn, next_state_conv}; +} + +torch::IValue OnlineConvEmformerTransducerModel::StackStates( + const std::vector &states) const { + int32_t batch_size = states.size(); + + // attn_caches.size() == num_layers + std::vector>> attn_caches; + // We will call torch.stack(attn_caches[i][j]) later + + // conv_caches.size() == num_layers + std::vector> conv_caches; + // we will call torch.stack(conv_caches[i]) later + int32_t num_layers = 0; + + for (auto &s : states) { + // s is a Tuple + // s[0] contains attn_caches : List[List[torch.Tensor]] + // s[1] contains conv_caches: List[torch.Tensor] + // + // len(attn_caches) == num_layers == len(conv_caches) + // + // len(attn_caches[i]) == 3 + // attn_caches[i][0] is a 2-D tensor of shape [memory_size, d_mode] + // attn_caches[i][1] and attn_caches[i][2] are 2-D tensors of shape + // [context_size, d_mode] + auto tuple_ptr = s.toTuple(); + torch::List list_attn = tuple_ptr->elements()[0].toList(); + torch::List list_conv = tuple_ptr->elements()[1].toList(); + + // attn.size() == num_layers + torch::List> attn = + c10::impl::toTypedList>(list_attn); + + torch::List conv = + c10::impl::toTypedList(list_conv); + + num_layers = attn.size(); + + if (attn_caches.empty()) { + attn_caches.resize(num_layers); + conv_caches.resize(num_layers); + } + + for (int32_t l = 0; l != num_layers; ++l) { + const torch::List &attn_l = attn[l]; + int32_t num_states_this_layer = attn_l.size(); + + auto &attn_caches_l = attn_caches[l]; + if (attn_caches_l.empty()) { + attn_caches_l.resize(num_states_this_layer); + } + + for (int32_t k = 0; k != num_states_this_layer; ++k) { + attn_caches_l[k].push_back(attn_l[k]); + } + + conv_caches[l].push_back(conv[l]); + } // for (int32_t l = 0; l != num_layers; ++l) + } // for (auto &s : states) + + std::vector> stacked_attn_caches(num_layers); + std::vector stacked_conv_caches(num_layers); + + for (int32_t l = 0; l != num_layers; ++l) { + auto &attn_caches_l = attn_caches[l]; + auto &stacked_attn_caches_l = stacked_attn_caches[l]; + for (int32_t i = 0; i != static_cast(attn_caches_l.size()); ++i) { + stacked_attn_caches_l.push_back( + torch::stack(attn_caches_l[i], /*dim*/ 1)); + } + + stacked_conv_caches[l] = torch::stack(conv_caches[l], /*dim*/ 0); + } + + return torch::ivalue::Tuple::create(stacked_attn_caches, stacked_conv_caches); +} + +std::vector OnlineConvEmformerTransducerModel::UnStackStates( + torch::IValue states) const { + 
TORCH_CHECK(states.isTuple(), "Expect a tuple. Given ", states.tagKind()); + + auto tuple_ptr = states.toTuple(); + torch::List list_attn = tuple_ptr->elements()[0].toList(); + torch::List list_conv = tuple_ptr->elements()[1].toList(); + + torch::List> stacked_attn = + c10::impl::toTypedList>(list_attn); + + torch::List stacked_conv = + c10::impl::toTypedList(list_conv); + + int32_t batch_size = + static_cast(stacked_conv[0]).size(0); + int32_t num_layers = stacked_conv.size(); + int32_t num_states_per_layer = + static_cast &>(stacked_attn[0]).size(); + + std::vector>> unstacked_attn( + batch_size); + + for (auto &v : unstacked_attn) { + v.resize(num_layers); + } + + std::vector> unstacked_conv(batch_size); + + for (int32_t l = 0; l != num_layers; ++l) { + const torch::List &stacked_attn_l = stacked_attn[l]; + std::vector> layer_states(num_states_per_layer); + for (int32_t k = 0; k != num_states_per_layer; ++k) { + std::vector s = + torch::unbind(stacked_attn_l[k], /*dim*/ 1); + for (int32_t b = 0; b != batch_size; ++b) { + unstacked_attn[b][l].push_back(std::move(s[b])); + } + } // for (int32_t k = 0; k != num_states_per_layer; ++k) + + auto v = torch::unbind(stacked_conv[l], /*dim*/ 0); + for (int32_t b = 0; b != batch_size; ++b) { + unstacked_conv[b].push_back(v[b]); + } + } // for (int32_t l = 0; l != num_layers; ++l) + + std::vector ans(batch_size); + for (int32_t b = 0; b != batch_size; ++b) { + ans[b] = torch::ivalue::Tuple::create(unstacked_attn[b], unstacked_conv[b]); + } + + return ans; +} + +torch::IValue OnlineConvEmformerTransducerModel::GetEncoderInitStates( + int32_t /*unused = 1*/) { + InferenceMode no_grad; + return encoder_.run_method("init_states", device_); +} + +std::tuple +OnlineConvEmformerTransducerModel::RunEncoder( + const torch::Tensor &features, const torch::Tensor &features_length, + const torch::Tensor &num_processed_frames, torch::IValue states) { + InferenceMode no_grad; + + torch::IValue ivalue = encoder_.run_method("infer", features, features_length, + num_processed_frames, states); + auto tuple_ptr = ivalue.toTuple(); + torch::IValue encoder_out = tuple_ptr->elements()[0]; + + torch::Tensor encoder_out_length = tuple_ptr->elements()[1].toTensor(); + torch::IValue next_states = tuple_ptr->elements()[2]; + + auto projected_encoder_out = + encoder_proj_.run_method("forward", encoder_out).toTensor(); + + return std::make_tuple(projected_encoder_out, encoder_out_length, + next_states); +} + +torch::Tensor OnlineConvEmformerTransducerModel::RunDecoder( + const torch::Tensor &decoder_input) { + InferenceMode no_grad; + auto decoder_out = + decoder_.run_method("forward", decoder_input, /*need_pad*/ false); + + return decoder_proj_.run_method("forward", decoder_out).toTensor(); +} + +torch::Tensor OnlineConvEmformerTransducerModel::RunJoiner( + const torch::Tensor &encoder_out, const torch::Tensor &decoder_out) { + InferenceMode no_grad; + return joiner_ + .run_method("forward", encoder_out, decoder_out, + /*project_input*/ false) + .toTensor(); +} + +} // namespace sherpa diff --git a/sherpa/csrc/online-conv-emformer-transducer-model.h b/sherpa/csrc/online-conv-emformer-transducer-model.h new file mode 100644 index 000000000..dd2d0fd7e --- /dev/null +++ b/sherpa/csrc/online-conv-emformer-transducer-model.h @@ -0,0 +1,91 @@ +// sherpa/csrc/online-conv-emformer-transducer-model.h +// +// Copyright (c) 2022 Xiaomi Corporation +#ifndef SHERPA_CSRC_ONLINE_CONV_EMFORMER_TRANSDUCER_MODEL_H_ +#define SHERPA_CSRC_ONLINE_CONV_EMFORMER_TRANSDUCER_MODEL_H_ + +#include 
+#include +#include +#include + +#include "sherpa/csrc/online-transducer-model.h" + +namespace sherpa { + +/** This class implements models from conv_emformer_transducer_stateless2 + * from icefall. + * + * See + * https://github.com/k2-fsa/icefall/blob/master/egs/librispeech/ASR/conv_emformer_transducer_stateless2/emformer.py + * for an instance. + * + * You can find the interface and implementation details of the + * encoder, decoder, and joiner network in the above Python code. + */ +class OnlineConvEmformerTransducerModel : public OnlineTransducerModel { + public: + /** Constructor. + * + * @param filename Path to the torchscript model. See + * https://github.com/k2-fsa/icefall/blob/master/egs/librispeech/ASR/conv_emformer_transducer_stateless2/export.py + * for how to export a model. + * @param device Move the model to this device on loading. + */ + explicit OnlineConvEmformerTransducerModel( + const std::string &filename, torch::Device device = torch::kCPU); + + torch::IValue StackStates( + const std::vector &states) const override; + + std::vector UnStackStates(torch::IValue states) const override; + + torch::IValue GetEncoderInitStates(int32_t unused = 1) override; + + std::tuple RunEncoder( + const torch::Tensor &features, const torch::Tensor &features_length, + const torch::Tensor &num_processed_frames, torch::IValue states) override; + + torch::Tensor RunDecoder(const torch::Tensor &decoder_input) override; + + torch::Tensor RunJoiner(const torch::Tensor &encoder_out, + const torch::Tensor &decoder_out) override; + + torch::Device Device() const override { return device_; } + + int32_t ContextSize() const override { return context_size_; } + + int32_t ChunkSize() const override { return chunk_size_; } + + int32_t ChunkShift() const override { return chunk_shift_; } + + // Non virtual methods that used by Python bindings. + + // See + // https://github.com/k2-fsa/icefall/blob/master/egs/librispeech/ASR/conv_emformer_transducer_stateless2/emformer.py#L1547 + // for what state contains for details. 
+ using State = std::pair>, + std::vector>; + torch::IValue StateToIValue(const State &s) const; + State StateFromIValue(torch::IValue ivalue) const; + + private: + torch::jit::Module model_; + + // The following modules are just aliases to modules in model_ + torch::jit::Module encoder_; + torch::jit::Module decoder_; + torch::jit::Module joiner_; + torch::jit::Module encoder_proj_; + torch::jit::Module decoder_proj_; + + torch::Device device_{"cpu"}; + + int32_t context_size_; + int32_t chunk_size_; + int32_t chunk_shift_; +}; + +} // namespace sherpa + +#endif // SHERPA_CSRC_ONLINE_CONV_EMFORMER_TRANSDUCER_MODEL_H_ diff --git a/sherpa/csrc/online-emformer-transducer-model.cc b/sherpa/csrc/online-emformer-transducer-model.cc new file mode 100644 index 000000000..8e4922f82 --- /dev/null +++ b/sherpa/csrc/online-emformer-transducer-model.cc @@ -0,0 +1,179 @@ +// sherpa/csrc/online-emformer-transducer-model.cc +// +// Copyright (c) 2022 Xiaomi Corporation + +#include "sherpa/csrc/online-emformer-transducer-model.h" + +#include +#include +#include +#include + +#include "sherpa/cpp_api/macros.h" + +namespace sherpa { + +OnlineEmformerTransducerModel::OnlineEmformerTransducerModel( + const std::string &filename, torch::Device device /*= torch::kCPU*/) + : device_(device) { + model_ = torch::jit::load(filename, device); + model_.eval(); + + encoder_ = model_.attr("encoder").toModule(); + decoder_ = model_.attr("decoder").toModule(); + joiner_ = model_.attr("joiner").toModule(); + + context_size_ = decoder_.attr("context_size").toInt(); + + int32_t subsampling_factor = encoder_.attr("subsampling_factor").toInt(); + int32_t chunk_length = encoder_.attr("segment_length").toInt(); + int32_t right_context_length = encoder_.attr("right_context_length").toInt(); + int32_t pad_length = right_context_length + subsampling_factor - 1; + + chunk_size_ = chunk_length + pad_length; + chunk_shift_ = chunk_length; +} + +torch::IValue OnlineEmformerTransducerModel::StateToIValue( + const State &states) const { + torch::List> ans; + ans.reserve(states.size()); + for (const auto &s : states) { + ans.push_back(torch::List{s}); + } + return ans; +} + +OnlineEmformerTransducerModel::State +OnlineEmformerTransducerModel::StateFromIValue(torch::IValue ivalue) const { + torch::List list = ivalue.toList(); + + int32_t num_layers = list.size(); + State ans; + ans.reserve(num_layers); + for (int32_t i = 0; i != num_layers; ++i) { + ans.push_back( + c10::impl::toTypedList(list.get(i).toList()).vec()); + } + return ans; +} + +torch::IValue OnlineEmformerTransducerModel::StackStates( + const std::vector &ivalue) const { + int32_t batch_size = ivalue.size(); + int32_t num_layers = 0; + int32_t num_states = 0; + + // [layer][state][state_from_batch_i] + std::vector>> buf; + for (const auto &v : ivalue) { + std::vector> s = StateFromIValue(v); + num_layers = s.size(); + if (buf.empty()) { + buf.resize(num_layers); + } + + for (int32_t layer = 0; layer != num_layers; ++layer) { + const auto &layer_state = s[layer]; + num_states = layer_state.size(); + if (buf[layer].empty()) { + buf[layer].resize(num_states); + } + + for (int32_t n = 0; n != num_states; ++n) { + if (buf[layer][n].empty()) { + buf[layer][n].reserve(batch_size); + } + + buf[layer][n].push_back(layer_state[n]); + } + } + } + + State ans(num_layers); + + for (int32_t layer = 0; layer != num_layers; ++layer) { + const auto &layer_state = buf[layer]; + ans[layer].reserve(num_states); + for (const auto &s : layer_state) { + auto stacked = torch::stack(s, /*dim*/ 1); + 
ans[layer].push_back(stacked); + } + } + + return StateToIValue(ans); +} + +std::vector OnlineEmformerTransducerModel::UnStackStates( + torch::IValue ivalue) const { + auto states = StateFromIValue(ivalue); + int32_t num_layers = states.size(); + int32_t batch_size = states[0][0].size(1); + int32_t num_states = states[0].size(); // number of states per layer + + // [batch][layer][state] + std::vector>> buf(batch_size); + for (auto &layer : buf) { + layer.resize(num_layers); + for (auto &s : layer) { + s.reserve(num_states); + } + } + + for (int32_t layer = 0; layer != num_layers; ++layer) { + const std::vector &layer_state = states[layer]; + for (int32_t n = 0; n != num_states; ++n) { + auto unstacked_state = torch::unbind(layer_state[n], /*dim*/ 1); + for (int32_t b = 0; b != batch_size; ++b) { + buf[b][layer].push_back(std::move(unstacked_state[b])); + } + } + } + + std::vector ans(batch_size); + for (int32_t b = 0; b != batch_size; ++b) { + ans[b] = StateToIValue(buf[b]); + } + + return ans; +} + +torch::IValue OnlineEmformerTransducerModel::GetEncoderInitStates( + int32_t /*unused=1*/) { + torch::IValue ivalue = encoder_.run_method("get_init_state", device_); + // Remove batch dimension. + // Note: This is for backward compatibility + return UnStackStates(ivalue)[0]; +} + +std::tuple +OnlineEmformerTransducerModel::RunEncoder( + const torch::Tensor &features, const torch::Tensor &features_length, + const torch::Tensor & /*num_processed_frames*/, torch::IValue states) { + InferenceMode no_grad; + + torch::IValue ivalue = encoder_.run_method("streaming_forward", features, + features_length, states); + auto tuple_ptr = ivalue.toTuple(); + torch::Tensor encoder_out = tuple_ptr->elements()[0].toTensor(); + + torch::Tensor encoder_out_length = tuple_ptr->elements()[1].toTensor(); + torch::IValue next_states = tuple_ptr->elements()[2]; + + return std::make_tuple(encoder_out, encoder_out_length, next_states); +} + +torch::Tensor OnlineEmformerTransducerModel::RunDecoder( + const torch::Tensor &decoder_input) { + InferenceMode no_grad; + return decoder_.run_method("forward", decoder_input, /*need_pad*/ false) + .toTensor(); +} + +torch::Tensor OnlineEmformerTransducerModel::RunJoiner( + const torch::Tensor &encoder_out, const torch::Tensor &decoder_out) { + InferenceMode no_grad; + return joiner_.run_method("forward", encoder_out, decoder_out).toTensor(); +} + +} // namespace sherpa diff --git a/sherpa/csrc/online-emformer-transducer-model.h b/sherpa/csrc/online-emformer-transducer-model.h new file mode 100644 index 000000000..a3433c7a4 --- /dev/null +++ b/sherpa/csrc/online-emformer-transducer-model.h @@ -0,0 +1,91 @@ +// sherpa/csrc/online-emformer-transducer-model.h +// +// Copyright (c) 2022 Xiaomi Corporation +#ifndef SHERPA_CSRC_ONLINE_EMFORMER_TRANSDUCER_MODEL_H_ +#define SHERPA_CSRC_ONLINE_EMFORMER_TRANSDUCER_MODEL_H_ + +#include +#include +#include + +#include "sherpa/csrc/online-transducer-model.h" + +namespace sherpa { +/** This class implements models from pruned_stateless_emformer_rnnt2 + * from icefall. + * + * See + * https://github.com/k2-fsa/icefall/blob/master/egs/librispeech/ASR/pruned_stateless_emformer_rnnt2/emformer.py + * for an instance. + * + * You can find the interface and implementation details of the + * encoder, decoder, and joiner network in the above Python code. + */ +class OnlineEmformerTransducerModel : public OnlineTransducerModel { + public: + /** Constructor. + * + * @param filename Path to the torchscript model. 
See + * https://github.com/k2-fsa/icefall/blob/master/egs/librispeech/ASR/pruned_stateless_emformer_rnnt2/export.py + * for how to export a model. + * @param device Move the model to this device on loading. + */ + explicit OnlineEmformerTransducerModel(const std::string &filename, + torch::Device device = torch::kCPU); + + torch::IValue StackStates( + const std::vector &states) const override; + + std::vector UnStackStates(torch::IValue states) const override; + + torch::IValue GetEncoderInitStates(int32_t unused = 1) override; + + std::tuple RunEncoder( + const torch::Tensor &features, const torch::Tensor &features_length, + const torch::Tensor &num_processed_frames, torch::IValue states) override; + + torch::Tensor RunDecoder(const torch::Tensor &decoder_input) override; + + torch::Tensor RunJoiner(const torch::Tensor &encoder_out, + const torch::Tensor &decoder_out) override; + + torch::Device Device() const override { return device_; } + + int32_t ContextSize() const override { return context_size_; } + + int32_t ChunkSize() const override { return chunk_size_; } + + int32_t ChunkShift() const override { return chunk_shift_; } + + // Non virtual methods that used by Python bindings. + + // See + // https://github.com/k2-fsa/icefall/blob/master/egs/librispeech/ASR/pruned_stateless_emformer_rnnt2/emformer.py#L293 + // for what state contains for details. + // + // state[i] contains state for the i-th layer. + // state[i][k] is either a 3-d tensor of shape (T, N, C) or + // a 2-d tensor of shape (C, N) + using State = std::vector>; + + torch::IValue StateToIValue(const State &s) const; + State StateFromIValue(torch::IValue ivalue) const; + + private: + torch::jit::Module model_; + + // The following modules are just aliases to modules in model_ + torch::jit::Module encoder_; + torch::jit::Module decoder_; + torch::jit::Module joiner_; + + torch::Device device_{"cpu"}; + + int32_t context_size_; + int32_t chunk_size_; + int32_t chunk_shift_; +}; + +} // namespace sherpa + +#endif // SHERPA_CSRC_ONLINE_EMFORMER_TRANSDUCER_MODEL_H_ diff --git a/sherpa/csrc/online-lstm-transducer-model.cc b/sherpa/csrc/online-lstm-transducer-model.cc new file mode 100644 index 000000000..5266117dd --- /dev/null +++ b/sherpa/csrc/online-lstm-transducer-model.cc @@ -0,0 +1,140 @@ +// sherpa/csrc/online-lstm-transducer-model.cc +// +// Copyright (c) 2022 Xiaomi Corporation + +#include "sherpa/csrc/online-lstm-transducer-model.h" + +#include +#include +#include +#include + +#include "sherpa/cpp_api/macros.h" + +namespace sherpa { + +OnlineLstmTransducerModel::OnlineLstmTransducerModel( + const std::string &encoder_filename, const std::string &decoder_filename, + const std::string &joiner_filename, torch::Device device /*=torch::kCPU*/) + : device_(device) { + encoder_ = torch::jit::load(encoder_filename, device); + encoder_.eval(); + + decoder_ = torch::jit::load(decoder_filename, device); + encoder_.eval(); + + joiner_ = torch::jit::load(joiner_filename, device); + joiner_.eval(); + + auto conv = decoder_.attr("conv").toModule(); + + context_size_ = + conv.hasattr("weight") ? conv.attr("weight").toTensor().size(2) : 1; + + // Use 5 here since the subsampling is ((len - 3) // 2 - 1) // 2. 
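// Illustrative arithmetic (values match the assignments just below): with
// chunk_shift_ = 4 and pad_length = 5 we feed chunk_size_ = 9 feature
// frames per call, and ((9 - 3) / 2 - 1) / 2 = 1, i.e. one encoder output
// frame for every 4 new input frames, consistent with a subsampling
// factor of 4.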
+ int32_t pad_length = 5; + + chunk_shift_ = 4; + chunk_size_ = chunk_shift_ + pad_length; +} + +torch::IValue OnlineLstmTransducerModel::StateToIValue(const State &s) const { + return torch::ivalue::Tuple::create(s.first, s.second); +} + +OnlineLstmTransducerModel::State OnlineLstmTransducerModel::StateFromIValue( + torch::IValue ivalue) const { + // ivalue is a tuple containing two tensors + auto tuple_ptr = ivalue.toTuple(); + + torch::Tensor hidden_states = tuple_ptr->elements()[0].toTensor(); + torch::Tensor cell_states = tuple_ptr->elements()[1].toTensor(); + + return {hidden_states, cell_states}; +} + +torch::IValue OnlineLstmTransducerModel::StackStates( + const std::vector &states) const { + auto n = static_cast(states.size()); + + std::vector hx; + std::vector cx; + + hx.reserve(n); + cx.reserve(n); + for (const auto &ivalue : states) { + auto s = StateFromIValue(ivalue); + hx.push_back(std::move(s.first)); + cx.push_back(std::move(s.second)); + } + + auto cat_hx = torch::cat(hx, /*dim*/ 1); + auto cat_cx = torch::cat(cx, /*dim*/ 1); + + return torch::ivalue::Tuple::create(cat_hx, cat_cx); +} + +std::vector OnlineLstmTransducerModel::UnStackStates( + torch::IValue ivalue) const { + auto states = StateFromIValue(ivalue); + + std::vector hx = states.first.unbind(/*dim*/ 1); + std::vector cx = states.second.unbind(/*dim*/ 1); + auto n = static_cast(hx.size()); + + std::vector ans(n); + for (int32_t i = 0; i != n; ++i) { + auto h = hx[i].unsqueeze(/*dim*/ 1); + auto c = cx[i].unsqueeze(/*dim*/ 1); + ans[i] = torch::ivalue::Tuple::create(h, c); + } + + return ans; +} + +torch::IValue OnlineLstmTransducerModel::GetEncoderInitStates( + int32_t batch_size /*=1*/) { + InferenceMode no_grad; + return encoder_.run_method("get_init_states", batch_size, device_); +} + +std::tuple +OnlineLstmTransducerModel::RunEncoder( + const torch::Tensor &features, const torch::Tensor &features_length, + const torch::Tensor & /*num_processed_frames*/, torch::IValue states) { + InferenceMode no_grad; + + // It returns [torch.Tensor, torch.Tensor, Pair[torch.Tensor, torch.Tensor] + // which are [encoder_out, encoder_out_len, states] + // + // We skip the second entry `encoder_out_len` since we assume the + // feature input is of fixed chunk size and there are no paddings. + // We can figure out `encoder_out_len` from `encoder_out`. 
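// For instance (an illustrative alternative, not what the code below does),
// with fixed-size chunks the length could also be reconstructed as
//   torch::full({encoder_out.size(0)}, encoder_out.size(1), torch::kInt)
// since every stream in the batch contributes the same number of valid
// frames.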
+ torch::IValue ivalue = + encoder_.run_method("forward", features, features_length, states); + auto tuple_ptr = ivalue.toTuple(); + torch::Tensor encoder_out = tuple_ptr->elements()[0].toTensor(); + + torch::Tensor encoder_out_length = tuple_ptr->elements()[1].toTensor(); + + auto next_states = tuple_ptr->elements()[2]; + + return std::make_tuple(encoder_out, encoder_out_length, next_states); +} + +torch::Tensor OnlineLstmTransducerModel::RunDecoder( + const torch::Tensor &decoder_input) { + InferenceMode no_grad; + return decoder_ + .run_method("forward", decoder_input, + /*need_pad*/ false) + .toTensor(); +} + +torch::Tensor OnlineLstmTransducerModel::RunJoiner( + const torch::Tensor &encoder_out, const torch::Tensor &decoder_out) { + InferenceMode no_grad; + return joiner_.run_method("forward", encoder_out, decoder_out).toTensor(); +} + +} // namespace sherpa diff --git a/sherpa/csrc/online-lstm-transducer-model.h b/sherpa/csrc/online-lstm-transducer-model.h new file mode 100644 index 000000000..bfe56b91f --- /dev/null +++ b/sherpa/csrc/online-lstm-transducer-model.h @@ -0,0 +1,95 @@ +// sherpa/csrc/online-lstm-transducer-model.h +// +// Copyright (c) 2022 Xiaomi Corporation +#ifndef SHERPA_CSRC_ONLINE_LSTM_TRANSDUCER_MODEL_H_ +#define SHERPA_CSRC_ONLINE_LSTM_TRANSDUCER_MODEL_H_ +#include +#include +#include +#include + +#include "sherpa/csrc/online-transducer-model.h" + +namespace sherpa { +/** This class implements models from lstm_transducer_stateless{,2,3} + * from icefall. + * + * See + * https://github.com/k2-fsa/icefall/blob/master/egs/librispeech/ASR/lstm_transducer_stateless/lstm.py + * for an instance. + * + * You can find the interface and implementation details of the + * encoder, decoder, and joiner network in the above Python code. + */ +class OnlineLstmTransducerModel : public OnlineTransducerModel { + public: + /** Constructor. + * + * @param filename Path to the torchscript model. See + * https://github.com/k2-fsa/icefall/blob/master/egs/librispeech/ASR/lstm_transducer_stateless/export.py + * for how to export a model. + * @param device Move the model to this device on loading. + */ + explicit OnlineLstmTransducerModel(const std::string &encoder_filename, + const std::string &decoder_filename, + const std::string &joiner_filename, + torch::Device device = torch::kCPU); + + torch::IValue StackStates( + const std::vector &states) const override; + + std::vector UnStackStates(torch::IValue states) const override; + + torch::IValue GetEncoderInitStates(int32_t batch_size = 1) override; + + std::tuple RunEncoder( + const torch::Tensor &features, const torch::Tensor &features_length, + const torch::Tensor &num_processed_frames, torch::IValue states) override; + + torch::Tensor RunDecoder(const torch::Tensor &decoder_input) override; + + torch::Tensor RunJoiner(const torch::Tensor &encoder_out, + const torch::Tensor &decoder_out) override; + + torch::Device Device() const override { return device_; } + + int32_t ContextSize() const override { return context_size_; } + + int32_t ChunkSize() const override { return chunk_size_; } + + int32_t ChunkShift() const override { return chunk_shift_; } + + // Non virtual methods that used by Python bindings. + + // See + // https://github.com/k2-fsa/icefall/blob/master/egs/librispeech/ASR/lstm_transducer_stateless/lstm.py#L257 + // for what state contains for details. 
+ // + // State is a tuple containing: + // - hx: (num_layers, batch_size, proj_size) + // - cx: (num_layers, batch_size, hidden_size) + // See icefall/egs/librispeech/ASR/lstm_transducer_stateless/lstm.py + // for details + using State = std::pair; + + torch::IValue StateToIValue(const State &s) const; + State StateFromIValue(torch::IValue ivalue) const; + + private: + torch::jit::Module model_; + + // The following modules are just aliases to modules in model_ + torch::jit::Module encoder_; + torch::jit::Module decoder_; + torch::jit::Module joiner_; + + torch::Device device_{"cpu"}; + + int32_t context_size_; + int32_t chunk_size_; + int32_t chunk_shift_; +}; + +} // namespace sherpa + +#endif // SHERPA_CSRC_ONLINE_LSTM_TRANSDUCER_MODEL_H_ diff --git a/sherpa/csrc/online-stream.cc b/sherpa/csrc/online-stream.cc new file mode 100644 index 000000000..05320390f --- /dev/null +++ b/sherpa/csrc/online-stream.cc @@ -0,0 +1,206 @@ +/** + * Copyright 2022 Xiaomi Corporation (authors: Fangjun Kuang) + * + * See LICENSE for clarification regarding multiple authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "sherpa/cpp_api/online-stream.h" + +#include +#include // NOLINT +#include +#include + +#include "kaldifeat/csrc/online-feature.h" +#include "sherpa/cpp_api/endpoint.h" +#include "sherpa/csrc/context-graph.h" +#include "sherpa/csrc/hypothesis.h" +#include "sherpa/csrc/log.h" +#include "sherpa/csrc/online-transducer-decoder.h" +#include "sherpa/csrc/resample.h" + +namespace sherpa { + +class OnlineStream::OnlineStreamImpl { + public: + explicit OnlineStreamImpl(const FeatureConfig &feat_config, + ContextGraphPtr context_graph /*=nullptr*/) + : opts_(feat_config.fbank_opts), + feat_config_(feat_config), + context_graph_(context_graph) { + fbank_ = std::make_unique(opts_); + } + + void AcceptWaveform(int32_t sampling_rate, torch::Tensor waveform) { + std::lock_guard lock(feat_mutex_); + + if (!feat_config_.normalize_samples) { + waveform.mul_(32767); + } + + if (resampler_) { + if (sampling_rate != resampler_->GetInputSamplingRate()) { + SHERPA_LOG(FATAL) << "You changed the input sampling rate!! 
Expected: " + << resampler_->GetInputSamplingRate() + << ", given: " << static_cast(sampling_rate); + exit(-1); + } + + waveform = resampler_->Resample(waveform, false); + fbank_->AcceptWaveform(opts_.frame_opts.samp_freq, waveform); + return; + } + + if (sampling_rate != opts_.frame_opts.samp_freq) { + SHERPA_LOG(INFO) << "Creating a resampler:\n" + << " in_sample_rate: " << sampling_rate << "\n" + << " output_sample_rate: " + << static_cast(opts_.frame_opts.samp_freq); + + float min_freq = + std::min(sampling_rate, opts_.frame_opts.samp_freq); + float lowpass_cutoff = 0.99 * 0.5 * min_freq; + + int32_t lowpass_filter_width = 6; + resampler_ = std::make_unique( + sampling_rate, opts_.frame_opts.samp_freq, lowpass_cutoff, + lowpass_filter_width); + + waveform = resampler_->Resample(waveform, false); + fbank_->AcceptWaveform(opts_.frame_opts.samp_freq, waveform); + return; + } + + fbank_->AcceptWaveform(sampling_rate, waveform); + } + + int32_t NumFramesReady() const { + std::lock_guard lock(feat_mutex_); + return fbank_->NumFramesReady(); + } + + bool IsLastFrame(int32_t frame) const { + std::lock_guard lock(feat_mutex_); + return fbank_->IsLastFrame(frame); + } + + void InputFinished() { + std::lock_guard lock(feat_mutex_); + fbank_->InputFinished(); + } + + torch::Tensor GetFrame(int32_t frame) { + std::lock_guard lock(feat_mutex_); + return fbank_->GetFrame(frame); + } + + torch::IValue GetState() const { return state_; } + + void SetState(torch::IValue state) { state_ = std::move(state); } + + const ContextGraphPtr &GetContextGraph() { return context_graph_; } + + void SetResult(const OnlineTransducerDecoderResult &r) { r_ = r; } + + const OnlineTransducerDecoderResult &GetResult() const { return r_; } + + int32_t &GetNumProcessedFrames() { return num_processed_frames_; } + + torch::Tensor &GetDecoderOut() { return decoder_out_; } + + int32_t &GetNumTrailingBlankFrames() { return num_trailing_blank_frames_; } + + int32_t &GetWavSegment() { return segment_; } + + int32_t &GetStartFrame() { return start_frame_; } + + private: + kaldifeat::FbankOptions opts_; + std::unique_ptr fbank_; + FeatureConfig feat_config_; + mutable std::mutex feat_mutex_; + + torch::IValue state_; + std::vector hyps_; + Hypotheses hypotheses_; + torch::Tensor decoder_out_; + int32_t num_processed_frames_ = 0; // before subsampling + int32_t num_trailing_blank_frames_ = 0; // after subsampling + /// ID of this segment + int32_t segment_ = 0; + + /// For contextual-biasing + ContextGraphPtr context_graph_; + + /// Starting frame of this segment. 
+ int32_t start_frame_ = 0; + OnlineTransducerDecoderResult r_; + std::unique_ptr resampler_; +}; + +OnlineStream::OnlineStream(const FeatureConfig &feat_config, + ContextGraphPtr context_graph) + : impl_(std::make_unique(feat_config, context_graph)) {} + +OnlineStream::~OnlineStream() = default; + +void OnlineStream::AcceptWaveform(int32_t sampling_rate, + torch::Tensor waveform) { + impl_->AcceptWaveform(sampling_rate, waveform); +} + +int32_t OnlineStream::NumFramesReady() const { return impl_->NumFramesReady(); } + +bool OnlineStream::IsLastFrame(int32_t frame) const { + return impl_->IsLastFrame(frame); +} + +void OnlineStream::InputFinished() { impl_->InputFinished(); } + +torch::Tensor OnlineStream::GetFrame(int32_t frame) { + return impl_->GetFrame(frame); +} + +torch::IValue OnlineStream::GetState() const { return impl_->GetState(); } + +void OnlineStream::SetState(torch::IValue state) { impl_->SetState(state); } + +const ContextGraphPtr &OnlineStream::GetContextGraph() const { + return impl_->GetContextGraph(); +} + +int32_t &OnlineStream::GetNumProcessedFrames() { + return impl_->GetNumProcessedFrames(); +} + +torch::Tensor &OnlineStream::GetDecoderOut() { return impl_->GetDecoderOut(); } + +int32_t &OnlineStream::GetNumTrailingBlankFrames() { + return impl_->GetNumTrailingBlankFrames(); +} + +int32_t &OnlineStream::GetWavSegment() { return impl_->GetWavSegment(); } + +int32_t &OnlineStream::GetStartFrame() { return impl_->GetStartFrame(); } + +void OnlineStream::SetResult(const OnlineTransducerDecoderResult &r) { + impl_->SetResult(r); +} + +const OnlineTransducerDecoderResult &OnlineStream::GetResult() const { + return impl_->GetResult(); +} + +} // namespace sherpa diff --git a/sherpa/csrc/online-transducer-decoder.h b/sherpa/csrc/online-transducer-decoder.h new file mode 100644 index 000000000..370738ff3 --- /dev/null +++ b/sherpa/csrc/online-transducer-decoder.h @@ -0,0 +1,84 @@ +// sherpa/csrc/online-transducer-decoder.h +// +// Copyright (c) 2022 Xiaomi Corporation + +#ifndef SHERPA_CSRC_ONLINE_TRANSDUCER_DECODER_H_ +#define SHERPA_CSRC_ONLINE_TRANSDUCER_DECODER_H_ + +#include + +#include "k2/torch_api.h" +#include "sherpa/cpp_api/online-stream.h" +#include "sherpa/csrc/hypothesis.h" +#include "torch/script.h" + +namespace sherpa { + +struct OnlineTransducerDecoderResult { + /// Number of frames we have decoded so far + int32_t frame_offset = 0; + + /// number of trailing blank frames decoded so far + int32_t num_trailing_blanks = 0; + + /// The decoded token IDs so far + std::vector tokens; + + /// timestamps[i] contains the output frame index where tokens[i] is decoded. + std::vector timestamps; + + // used only for modified_beam_search + Hypotheses hyps; + + // used only for fast_beam_search + k2::RnntStreamPtr rnnt_stream; + + // Before subsampling. Used only for fast_beam_search + int32_t num_processed_frames = 0; +}; + +class OnlineTransducerDecoder { + public: + virtual ~OnlineTransducerDecoder() = default; + + /* Return an empty result. + * + * To simplify the decoding code, we add `context_size` blanks + * to the beginning of the decoding result, which will be + * stripped by calling `StripPrecedingBlanks()`. + */ + virtual OnlineTransducerDecoderResult GetEmptyResult() = 0; + + /** Strip blanks added by `GetEmptyResult()`. */ + virtual void StripLeadingBlanks(OnlineTransducerDecoderResult * /*r*/) {} + + /* Finalize the context graph searching, it will subtract the bonus of + * partial matching hypothesis. 
+ * + * Used only in modified_beam_search and when context_graph is given. + */ + virtual void FinalizeResult(OnlineStream * /*s*/, + OnlineTransducerDecoderResult * /*r*/) {} + + /** Run transducer beam search given the output from the encoder model. + * + * @param encoder_out A 3-D tensor of shape (N, T, joiner_dim) + * + * @note This is no need to pass encoder_out_length here since for the + * online decoding case, each utterance has the same number of frames + * and there are no paddings. + * + * @return Return a vector of size `N` containing the decoded results. + */ + virtual void Decode(torch::Tensor encoder_out, + std::vector *result) = 0; + + virtual void Decode(torch::Tensor encoder_out, OnlineStream **ss, + int32_t num_streams, + std::vector *result) { + SHERPA_LOG(FATAL) << "This interface is for ModifiedBeamSearchDecoder."; + } +}; +} // namespace sherpa + +#endif // SHERPA_CSRC_ONLINE_TRANSDUCER_DECODER_H_ diff --git a/sherpa/csrc/online-transducer-fast-beam-search-decoder.cc b/sherpa/csrc/online-transducer-fast-beam-search-decoder.cc new file mode 100644 index 000000000..5482ffcbc --- /dev/null +++ b/sherpa/csrc/online-transducer-fast-beam-search-decoder.cc @@ -0,0 +1,139 @@ +// sherpa/csrc/online-transducer-fast-beam-search-decoder.cc +// +// Copyright (c) 2022 Xiaomi Corporation + +#include "sherpa/csrc/online-transducer-fast-beam-search-decoder.h" + +#include + +#include "k2/torch_api.h" +#include "sherpa/csrc/online-transducer-decoder.h" + +namespace sherpa { + +OnlineTransducerFastBeamSearchDecoder::OnlineTransducerFastBeamSearchDecoder( + OnlineTransducerModel *model, const FastBeamSearchConfig &config) + : model_(model), config_(config), vocab_size_(model->VocabSize()) { + if (config.lg.empty()) { + // Use a trivial graph + decoding_graph_ = k2::GetTrivialGraph(vocab_size_ - 1, model_->Device()); + } else { + decoding_graph_ = k2::LoadFsaClass(config.lg, model_->Device()); + k2::ScaleTensorAttribute(decoding_graph_, config.ngram_lm_scale, "scores"); + } +} + +OnlineTransducerDecoderResult +OnlineTransducerFastBeamSearchDecoder::GetEmptyResult() { + OnlineTransducerDecoderResult r; + r.rnnt_stream = k2::CreateRnntStream(decoding_graph_); + return r; +} + +void OnlineTransducerFastBeamSearchDecoder::Decode( + torch::Tensor encoder_out, + std::vector *results) { + TORCH_CHECK(encoder_out.dim() == 3, encoder_out.dim(), " vs ", 3); + + TORCH_CHECK(encoder_out.size(0) == static_cast(results->size()), + encoder_out.size(0), " vs ", results->size()); + + auto device = model_->Device(); + int32_t context_size = model_->ContextSize(); + + std::vector stream_vec; + std::vector num_processed_frames_vec; + + stream_vec.reserve(results->size()); + num_processed_frames_vec.reserve(results->size()); + + for (auto &r : *results) { + stream_vec.push_back(r.rnnt_stream); + + // number of frames before subsampling + num_processed_frames_vec.push_back(r.num_processed_frames); + } + + torch::Tensor num_processed_frames = + torch::tensor(num_processed_frames_vec, torch::kInt); + + k2::RnntStreamsPtr streams = + k2::CreateRnntStreams(stream_vec, vocab_size_, context_size, config_.beam, + config_.max_contexts, config_.max_states); + + int32_t N = encoder_out.size(0); + int32_t T = encoder_out.size(1); + k2::RaggedShapePtr shape; + torch::Tensor contexts; + for (int32_t t = 0; t != T; ++t) { + std::tie(shape, contexts) = k2::GetRnntContexts(streams); + contexts = contexts.to(torch::kLong); + // contexts.shape: (num_hyps, context_size) + + auto decoder_out = 
model_->RunDecoder(contexts).squeeze(1); + // decoder_out.shape: (num_hyps, joiner_dim) + + auto cur_encoder_out = encoder_out.index({torch::indexing::Slice(), t}); + // cur_encoder_out has shape (N, joiner_dim) + + auto index = k2::RowIds(shape, 1).to(torch::kLong).to(device); + cur_encoder_out = cur_encoder_out.index_select(/*dim*/ 0, /*index*/ index); + // cur_encoder_out has shape (num_hyps, joiner_dim) + + auto logits = model_->RunJoiner(cur_encoder_out, decoder_out); + // logits.shape: (num_hyps, vocab_size) + + auto log_probs = logits.log_softmax(-1); + k2::AdvanceRnntStreams(streams, log_probs); + } // for (int32_t t = 0; t != T; ++t) + + k2::TerminateAndFlushRnntStreams(streams); + + // TODO(fangjun): This assumes the subsampling factor is 4 + num_processed_frames = (num_processed_frames / 4).to(torch::kInt) + T; + + std::vector processed_frames_vec( + num_processed_frames.data_ptr(), + num_processed_frames.data_ptr() + num_processed_frames.numel()); + + auto lattice = + k2::FormatOutput(streams, processed_frames_vec, config_.allow_partial); + + lattice = k2::ShortestPath(lattice); + + // Get tokens and timestamps from the lattice + auto labels = k2::GetTensorAttr(lattice, "labels").cpu().contiguous(); + auto acc = labels.accessor(); + + for (auto &r : *results) { + r.tokens.clear(); + r.timestamps.clear(); + r.num_trailing_blanks = 0; + } + OnlineTransducerDecoderResult *p = results->data(); + + for (int32_t i = 0, t = 0; i != labels.numel(); ++i) { + int32_t token = acc[i]; + + if (token == -1) { + // end of this utterance. + t = 0; + ++p; + + continue; + } + + if (token == 0) { + ++t; + ++p->num_trailing_blanks; + continue; + } + + p->num_trailing_blanks = 0; + p->tokens.push_back(token); + p->timestamps.push_back(t); + ++t; + } // for (int32_t i = 0, t = 0; i != labels.numel(); ++i) +} + +} // namespace sherpa diff --git a/sherpa/csrc/online-transducer-fast-beam-search-decoder.h b/sherpa/csrc/online-transducer-fast-beam-search-decoder.h new file mode 100644 index 000000000..c9eb41830 --- /dev/null +++ b/sherpa/csrc/online-transducer-fast-beam-search-decoder.h @@ -0,0 +1,40 @@ +// sherpa/csrc/online-transducer-fast-beam-search-decoder.h +// +// Copyright (c) 2022 Xiaomi Corporation +#ifndef SHERPA_CSRC_ONLINE_TRANSDUCER_FAST_BEAM_SEARCH_DECODER_H_ +#define SHERPA_CSRC_ONLINE_TRANSDUCER_FAST_BEAM_SEARCH_DECODER_H_ + +#include + +#include "k2/torch_api.h" +#include "sherpa/cpp_api/fast-beam-search-config.h" +#include "sherpa/csrc/online-transducer-decoder.h" +#include "sherpa/csrc/online-transducer-model.h" + +namespace sherpa { + +class OnlineTransducerFastBeamSearchDecoder : public OnlineTransducerDecoder { + public: + /** + * @param config + */ + OnlineTransducerFastBeamSearchDecoder(OnlineTransducerModel *model, + const FastBeamSearchConfig &config); + + /* Return an empty result. 
*/ + OnlineTransducerDecoderResult GetEmptyResult() override; + + void Decode(torch::Tensor encoder_out, + std::vector *result) override; + + private: + OnlineTransducerModel *model_; // Not owned + k2::FsaClassPtr decoding_graph_; + + FastBeamSearchConfig config_; + int32_t vocab_size_; +}; + +} // namespace sherpa + +#endif // SHERPA_CSRC_ONLINE_TRANSDUCER_FAST_BEAM_SEARCH_DECODER_H_ diff --git a/sherpa/csrc/online-transducer-greedy-search-decoder.cc b/sherpa/csrc/online-transducer-greedy-search-decoder.cc new file mode 100644 index 000000000..a274bbd9e --- /dev/null +++ b/sherpa/csrc/online-transducer-greedy-search-decoder.cc @@ -0,0 +1,107 @@ +// sherpa/csrc/online-transducer-greedy-search-decoder.cc +// +// Copyright (c) 2022 Xiaomi Corporation + +#include "sherpa/csrc/online-transducer-greedy-search-decoder.h" + +#include +#include + +namespace sherpa { + +static void BuildDecoderInput( + const std::vector &r, + torch::Tensor *decoder_input) { + int32_t batch_size = decoder_input->size(0); + int32_t context_size = decoder_input->size(1); + int64_t *p = decoder_input->data_ptr(); + for (int32_t i = 0; i != batch_size; ++i) { + auto start = r[i].tokens.end() - context_size; + auto end = r[i].tokens.end(); + std::copy(start, end, p); + p += context_size; + } +} + +OnlineTransducerDecoderResult +OnlineTransducerGreedySearchDecoder::GetEmptyResult() { + int32_t context_size = model_->ContextSize(); + int32_t blank_id = 0; // always 0 + OnlineTransducerDecoderResult r; + r.tokens.resize(context_size, -1); + r.tokens.back() = blank_id; + + return r; +} + +void OnlineTransducerGreedySearchDecoder::StripLeadingBlanks( + OnlineTransducerDecoderResult *r) { + int32_t context_size = model_->ContextSize(); + + auto start = r->tokens.begin() + context_size; + auto end = r->tokens.end(); + + r->tokens = std::vector(start, end); +} + +void OnlineTransducerGreedySearchDecoder::Decode( + torch::Tensor encoder_out, + std::vector *results) { + TORCH_CHECK(encoder_out.dim() == 3, encoder_out.dim(), " vs ", 3); + + TORCH_CHECK(encoder_out.size(0) == static_cast(results->size()), + encoder_out.size(0), " vs ", results->size()); + + auto device = model_->Device(); + int32_t blank_id = 0; // always 0 + int32_t context_size = model_->ContextSize(); + + int32_t N = encoder_out.size(0); + int32_t T = encoder_out.size(1); + + auto decoder_input = torch::empty( + {N, context_size}, torch::dtype(torch::kLong) + .memory_format(torch::MemoryFormat::Contiguous)); + BuildDecoderInput(*results, &decoder_input); + + auto decoder_out = model_->RunDecoder(decoder_input.to(device)).squeeze(1); + // decoder_out has shape (N, joiner_dim) + + for (int32_t t = 0; t != T; ++t) { + auto cur_encoder_out = encoder_out.index({torch::indexing::Slice(), t}); + // cur_encoder_out has shape (N, joiner_dim) + + auto logits = model_->RunJoiner(cur_encoder_out, decoder_out); + // logits has shape (N, vocab_size) + + auto max_indices = logits.argmax(/*dim*/ -1).cpu(); + auto max_indices_accessor = max_indices.accessor(); + bool emitted = false; + for (int32_t n = 0; n != N; ++n) { + auto index = max_indices_accessor[n]; + auto &r = (*results)[n]; + if (index != blank_id) { + emitted = true; + + r.tokens.push_back(index); + r.timestamps.push_back(t + r.frame_offset); + r.num_trailing_blanks = 0; + } else { + ++r.num_trailing_blanks; + } + } + + if (emitted) { + BuildDecoderInput(*results, &decoder_input); + decoder_out = model_->RunDecoder(decoder_input.to(device)).squeeze(1); + // decoder_out has shape (N, joiner_dim) + } + } // for 
(int32_t t = 0; t != T; ++t) + + // Update frame_offset + for (auto &r : *results) { + r.frame_offset += T; + } +} + +} // namespace sherpa diff --git a/sherpa/csrc/online-transducer-greedy-search-decoder.h b/sherpa/csrc/online-transducer-greedy-search-decoder.h new file mode 100644 index 000000000..5a8715449 --- /dev/null +++ b/sherpa/csrc/online-transducer-greedy-search-decoder.h @@ -0,0 +1,32 @@ +// sherpa/csrc/online-transducer-greedy-search-decoder.h +// +// Copyright (c) 2022 Xiaomi Corporation +#ifndef SHERPA_CSRC_ONLINE_TRANSDUCER_GREEDY_SEARCH_DECODER_H_ +#define SHERPA_CSRC_ONLINE_TRANSDUCER_GREEDY_SEARCH_DECODER_H_ + +#include + +#include "sherpa/csrc/online-transducer-decoder.h" +#include "sherpa/csrc/online-transducer-model.h" + +namespace sherpa { + +class OnlineTransducerGreedySearchDecoder : public OnlineTransducerDecoder { + public: + explicit OnlineTransducerGreedySearchDecoder(OnlineTransducerModel *model) + : model_(model) {} + + OnlineTransducerDecoderResult GetEmptyResult() override; + + void StripLeadingBlanks(OnlineTransducerDecoderResult *r) override; + + void Decode(torch::Tensor encoder_out, + std::vector *result) override; + + private: + OnlineTransducerModel *model_; // Not owned +}; + +} // namespace sherpa + +#endif // SHERPA_CSRC_ONLINE_TRANSDUCER_GREEDY_SEARCH_DECODER_H_ diff --git a/sherpa/csrc/online-transducer-model.h b/sherpa/csrc/online-transducer-model.h new file mode 100644 index 000000000..7e93915e6 --- /dev/null +++ b/sherpa/csrc/online-transducer-model.h @@ -0,0 +1,159 @@ +// sherpa/csrc/online-transducer-model.h +// +// Copyright (c) 2022 Xiaomi Corporation +#ifndef SHERPA_CSRC_ONLINE_TRANSDUCER_MODEL_H_ +#define SHERPA_CSRC_ONLINE_TRANSDUCER_MODEL_H_ + +#include +#include + +#include "torch/script.h" + +namespace sherpa { + +class OnlineTransducerModel { + public: + virtual ~OnlineTransducerModel() = default; + + /** Stack a list of individual states into a batch. + * + * It is the inverse operation of `UnStackStates`. + * + * @param states states[i] contains the state for the i-th utterance. + * @return Return a single value representing the batched state. + */ + virtual torch::IValue StackStates( + const std::vector &states) const = 0; + + /** Unstack a batch state into a list of individual states. + * + * It is the inverse operation of `StackStates`. + * + * @param states A batched state. + * @return ans[i] contains the state for the i-th utterance. + */ + virtual std::vector UnStackStates( + torch::IValue states) const = 0; + + /** Get the initial encoder states. + * + * @param unused A placeholder. Some models, e.g., ConvEmformer uses it, will + * other models won't use it. + * @return Return the initial encoder state. + */ + virtual torch::IValue GetEncoderInitStates(int32_t unused = 1) = 0; + + /** Run the encoder. + * + * @param features A tensor of shape (N, T, C). + * @param features_length A tensor of shape (N,) containing the number + * of valid frames in `features` before padding. + * @param num_processed_frames Number of processed frames so far before + * subsampling. + * @param states Encoder state of the previous chunk. + * + * @return Return a tuple containing: + * - encoder_out, a tensor of shape (N, T', encoder_out_dim) + * - encoder_out_lens, a tensor of shape (N,) + * - next_states Encoder state for the next chunk. + */ + virtual std::tuple RunEncoder( + const torch::Tensor &features, const torch::Tensor &features_length, + const torch::Tensor &num_processed_frames, torch::IValue states) = 0; + + /** Run the decoder network. 
+ * + * Caution: We assume there are no recurrent connections in the decoder and + * the decoder is stateless. See + * https://github.com/k2-fsa/icefall/blob/master/egs/librispeech/ASR/pruned_transducer_stateless2/decoder.py + * for an example + * + * @param decoder_input It is usually of shape (N, context_size) + * @return Return a tensor of shape (N, 1, decoder_dim). + */ + virtual torch::Tensor RunDecoder(const torch::Tensor &decoder_input) = 0; + + /** Run the joint network. + * + * @param encoder_out Output of the encoder network. A tensor of shape + * (N, encoder_dim). + * @param decoder_out Output of the decoder network. A tensor of shape + * (N, decoder_dim). + * @return Return a tensor of shape (N, vocab_size). In icefall, the last + * last layer of the joint network is `nn.Linear`, + * not `nn.LogSoftmax`. + */ + virtual torch::Tensor RunJoiner(const torch::Tensor &encoder_out, + const torch::Tensor &decoder_out) = 0; + + /** Return the device where computation takes place. + * + * Note: We don't support moving the model to a different device + * after construction. + */ + virtual torch::Device Device() const = 0; + + /** If we are using a stateless decoder and if it contains a + * Conv1D, this function returns the kernel size of the convolution layer. + */ + virtual int32_t ContextSize() const = 0; + + /** We send this number of feature frames to the encoder at a time. */ + virtual int32_t ChunkSize() const = 0; + + /** Number of input frames to discard after each call to RunEncoder. + * + * For instance, if we have 30 frames, chunk_size=8, chunk_shift=6. + * + * In the first call of RunEncoder, we use frames 0~7 since chunk_size is 8. + * Then we discard frame 0~5 since chunk_shift is 6. + * In the second call of RunEncoder, we use frames 6~13; and then we discard + * frames 6~11. + * In the third call of RunEncoder, we use frames 12~19; and then we discard + * frames 12~16. 
+ * + * Note: ChunkSize() - ChunkShift() == right context size + */ + virtual int32_t ChunkShift() const = 0; + + int32_t VocabSize() const { return vocab_size_; } + + int32_t SubsamplingFactor() const { return 4; } + + void WarmUp(torch::Tensor features, torch::Tensor features_length) { + torch::IValue states = GetEncoderInitStates(); + states = StackStates({states}); + torch::Tensor num_processed_frames = torch::zeros_like(features_length); + + torch::Tensor encoder_out; + torch::Tensor encoder_out_length; + torch::IValue next_states; + + std::tie(encoder_out, encoder_out_length, next_states) = + RunEncoder(features, features_length, num_processed_frames, states); + // encoder_out.shape: (N, T, joiner_dim) + // + auto cur_encoder_out = encoder_out.index({torch::indexing::Slice(), 0}); + // cur_encoder_out.shape (N, joiner_dim) + + torch::Tensor decoder_input = + torch::zeros({features_length.size(0), ContextSize()}, torch::kLong) + .to(Device()); + // decoder_input.shape (N, context_size) + + auto decoder_out = RunDecoder(decoder_input).squeeze(1); + // decoder_out.shape (N, joiner_dim) + + auto logits = RunJoiner(cur_encoder_out, decoder_out); + // logits.shape (N, vocab_size) + + vocab_size_ = logits.size(-1); + } + + private: + int32_t vocab_size_ = -1; +}; + +} // namespace sherpa + +#endif // SHERPA_CSRC_ONLINE_TRANSDUCER_MODEL_H_ diff --git a/sherpa/csrc/online-transducer-modified-beam-search-decoder.cc b/sherpa/csrc/online-transducer-modified-beam-search-decoder.cc new file mode 100644 index 000000000..59b14f0ef --- /dev/null +++ b/sherpa/csrc/online-transducer-modified-beam-search-decoder.cc @@ -0,0 +1,243 @@ +// sherpa/csrc/online-transducer-modified-beam-search-decoder.cc +// +// Copyright (c) 2022 Xiaomi Corporation +#include "sherpa/csrc/online-transducer-modified-beam-search-decoder.h" + +#include +#include + +#include "k2/torch_api.h" + +namespace sherpa { + +static torch::Tensor FloorDivide(torch::Tensor a, int32_t b) { +#if SHERPA_TORCH_VERSION_MAJOR > 1 || \ + (SHERPA_TORCH_VERSION_MAJOR == 1 && SHERPA_TORCH_VERSION_MINOR > 7) + return torch::div(a, b, /*rounding_mode*/ "trunc"); +#else + return torch::floor_divide(a, b); +#endif +} + +static torch::Tensor BuildDecoderInput(const std::vector &hyps, + int32_t context_size) { + int32_t num_hyps = hyps.size(); + torch::Tensor decoder_input = + torch::empty({num_hyps, context_size}, + torch::dtype(torch::kLong) + .memory_format(torch::MemoryFormat::Contiguous)); + + int64_t *p = decoder_input.data_ptr(); + for (const auto &h : hyps) { + auto start = h.ys.end() - context_size; + auto end = h.ys.end(); + + std::copy(start, end, p); + p += context_size; + } + + return decoder_input; +} + +/** Return a ragged shape with axes [utt][num_hyps]. + * + * @param hyps hyps.size() == batch_size. Each entry contains the active + * hypotheses of an utterance. + * @return Return a ragged shape with 2 axes [utt][num_hyps]. Note that the + * shape is on CPU. 
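 *
 * Illustrative example (hypothetical sizes): for hyps = {h0, h1, h2} with
 * h0.Size() == 2, h1.Size() == 4 and h2.Size() == 3, the exclusive sum
 * below yields row_splits = [0, 2, 6, 9], so the returned shape has 9 rows
 * in total and k2::RowIds(shape, 1) maps each of the 9 hypotheses back to
 * its utterance index.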
+ */ +static k2::RaggedShapePtr GetHypsShape(const std::vector &hyps) { + int32_t num_utt = hyps.size(); + torch::Tensor row_splits = torch::empty( + {num_utt + 1}, + torch::dtype(torch::kInt).memory_format(torch::MemoryFormat::Contiguous)); + auto row_splits_acc = row_splits.accessor(); + for (int32_t i = 0; i != num_utt; ++i) { + row_splits_acc[i] = hyps[i].Size(); + } + + k2::ExclusiveSum(row_splits, &row_splits); + + return k2::RaggedShape2(row_splits, torch::Tensor(), row_splits_acc[num_utt]); +} + +OnlineTransducerDecoderResult +OnlineTransducerModifiedBeamSearchDecoder::GetEmptyResult() { + int32_t context_size = model_->ContextSize(); + int32_t blank_id = 0; // always 0 + // + std::vector blanks(context_size, -1); + blanks.back() = blank_id; + + Hypotheses blank_hyp({{blanks, 0}}); + + OnlineTransducerDecoderResult r; + r.hyps = std::move(blank_hyp); + + return r; +} + +void OnlineTransducerModifiedBeamSearchDecoder::StripLeadingBlanks( + OnlineTransducerDecoderResult *r) { + int32_t context_size = model_->ContextSize(); + auto hyp = r->hyps.GetMostProbable(true); + + auto start = hyp.ys.begin() + context_size; + auto end = hyp.ys.end(); + + r->tokens = std::vector(start, end); + r->timestamps = std::move(hyp.timestamps); + r->num_trailing_blanks = hyp.num_trailing_blanks; +} + +void OnlineTransducerModifiedBeamSearchDecoder::FinalizeResult( + OnlineStream *s, OnlineTransducerDecoderResult *r) { + if (nullptr != s->GetContextGraph()) { + for (auto iter = r->hyps.begin(); iter != r->hyps.end(); ++iter) { + auto context_res = + s->GetContextGraph()->Finalize(iter->second.context_state); + iter->second.log_prob += context_res.first; + iter->second.context_state = context_res.second; + } + } +} + +void OnlineTransducerModifiedBeamSearchDecoder::Decode( + torch::Tensor encoder_out, + std::vector *results) { + Decode(encoder_out, nullptr, 0, results); +} + +void OnlineTransducerModifiedBeamSearchDecoder::Decode( + torch::Tensor encoder_out, OnlineStream **ss, int32_t num_streams, + std::vector *results) { + TORCH_CHECK(encoder_out.dim() == 3, encoder_out.dim(), " vs ", 3); + + TORCH_CHECK(encoder_out.size(0) == static_cast(results->size()), + encoder_out.size(0), " vs ", results->size()); + + auto device = model_->Device(); + int32_t blank_id = 0; // always 0 + int32_t context_size = model_->ContextSize(); + + int32_t N = encoder_out.size(0); + int32_t T = encoder_out.size(1); + + if (ss) { + SHERPA_CHECK_EQ(N, num_streams); + } + + std::vector cur; + cur.reserve(N); + + for (auto &r : *results) { + cur.push_back(std::move(r.hyps)); + } + + std::vector prev; + + for (int32_t t = 0; t != T; ++t) { + auto cur_encoder_out = encoder_out.index({torch::indexing::Slice(), t}); + // cur_encoder_out has shape (N, joiner_dim) + + // Due to merging paths with identical token sequences, + // not all utterances have "num_active_paths" paths. 
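// Hypothetical illustration: with num_active_paths_ = 4, an utterance whose
// two top extensions ended up with identical token sequences keeps only the
// merged hypothesis, so it contributes 3 (not 4) entries to hyps_shape.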
+ auto hyps_shape = GetHypsShape(cur); + int32_t num_hyps = k2::TotSize(hyps_shape, 1); + + prev.clear(); + prev.reserve(num_hyps); + for (auto &hyps : cur) { + for (auto &h : hyps) { + prev.push_back(std::move(h.second)); + } + } + cur.clear(); + cur.reserve(N); + + auto ys_log_probs = torch::empty({num_hyps, 1}, torch::kFloat); + + auto ys_log_probs_acc = ys_log_probs.accessor(); + for (int32_t k = 0; k != num_hyps; ++k) { + ys_log_probs_acc[k][0] = prev[k].log_prob; + } + + auto decoder_input = BuildDecoderInput(prev, context_size).to(device); + auto decoder_out = model_->RunDecoder(decoder_input).squeeze(1); + // decoder_out is of shape (num_hyps, joiner_dim) + + auto index = k2::RowIds(hyps_shape, 1).to(torch::kLong).to(device); + cur_encoder_out = cur_encoder_out.index_select(/*dim*/ 0, /*index*/ index); + // cur_encoder_out is of shape (num_hyps, joiner_dim) + + auto logits = model_->RunJoiner(cur_encoder_out, decoder_out); + // logits has shape (num_hyps, vocab_size) + + auto log_probs = (logits / temperature_).log_softmax(-1).cpu(); + + log_probs.add_(ys_log_probs); + + int32_t vocab_size = log_probs.size(1); + log_probs = log_probs.reshape(-1); + auto row_splits = k2::RowSplits(hyps_shape, 1); + auto row_splits_acc = row_splits.accessor(); + + for (int32_t k = 0; k != N; ++k) { + int32_t frame_offset = (*results)[k].frame_offset; + + int32_t start = row_splits_acc[k]; + int32_t end = row_splits_acc[k + 1]; + + torch::Tensor values, indexes; + std::tie(values, indexes) = + log_probs.slice(/*dim*/ 0, start * vocab_size, end * vocab_size) + .topk(/*k*/ num_active_paths_, /*dim*/ 0, + /*largest*/ true, /*sorted*/ true); + + auto topk_hyp_indexes = FloorDivide(indexes, vocab_size); + auto topk_token_indexes = torch::remainder(indexes, vocab_size); + + auto values_acc = values.accessor(); + auto topk_hyp_indexes_acc = topk_hyp_indexes.accessor(); + auto topk_token_indexes_acc = topk_token_indexes.accessor(); + + Hypotheses hyps; + for (int32_t j = 0; j != values.numel(); ++j) { + int32_t hyp_idx = topk_hyp_indexes_acc[j]; + Hypothesis new_hyp = prev[start + hyp_idx]; // note: hyp_idx is 0 based + + int32_t new_token = topk_token_indexes_acc[j]; + + float context_score = 0; + auto context_state = new_hyp.context_state; + + if (new_token != blank_id) { + new_hyp.ys.push_back(new_token); + new_hyp.timestamps.push_back(t + frame_offset); + new_hyp.num_trailing_blanks = 0; + if (ss != nullptr && ss[k]->GetContextGraph() != nullptr) { + auto context_res = ss[k]->GetContextGraph()->ForwardOneStep( + context_state, new_token); + context_score = context_res.first; + new_hyp.context_state = context_res.second; + } + } else { + new_hyp.num_trailing_blanks += 1; + } + + // We already added log_prob of the path to log_probs before, so + // we use values_acc[j] here directly. 
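// Worked example with made-up numbers: if the path's log_prob was -1.2 and
// the chosen token's log-softmax score at this frame is -0.3, then the
// log_probs.add_(ys_log_probs) call above already produced -1.5, so
// values_acc[j] == -1.5 and only the optional context bonus is added here.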
+ new_hyp.log_prob = values_acc[j] + context_score; + hyps.Add(std::move(new_hyp)); + } + cur.push_back(std::move(hyps)); + } // for (int32_t k = 0; k != N; ++k) + } // for (int32_t t = 0; t != T; ++t) + + for (int32_t i = 0; i != N; ++i) { + (*results)[i].hyps = std::move(cur[i]); + (*results)[i].frame_offset += T; + } +} + +} // namespace sherpa diff --git a/sherpa/csrc/online-transducer-modified-beam-search-decoder.h b/sherpa/csrc/online-transducer-modified-beam-search-decoder.h new file mode 100644 index 000000000..3c6da975e --- /dev/null +++ b/sherpa/csrc/online-transducer-modified-beam-search-decoder.h @@ -0,0 +1,43 @@ +// sherpa/csrc/online-transducer-modified-beam-search-decoder.h +// +// Copyright (c) 2022 Xiaomi Corporation +#ifndef SHERPA_CSRC_ONLINE_TRANSDUCER_MODIFIED_BEAM_SEARCH_DECODER_H_ +#define SHERPA_CSRC_ONLINE_TRANSDUCER_MODIFIED_BEAM_SEARCH_DECODER_H_ + +#include + +#include "sherpa/csrc/online-transducer-decoder.h" +#include "sherpa/csrc/online-transducer-model.h" + +namespace sherpa { + +class OnlineTransducerModifiedBeamSearchDecoder + : public OnlineTransducerDecoder { + public: + explicit OnlineTransducerModifiedBeamSearchDecoder( + OnlineTransducerModel *model, int32_t num_active_paths, float temperature) + : model_(model), + num_active_paths_(num_active_paths), + temperature_(temperature) {} + + OnlineTransducerDecoderResult GetEmptyResult() override; + + void StripLeadingBlanks(OnlineTransducerDecoderResult *r) override; + + void FinalizeResult(OnlineStream *s, + OnlineTransducerDecoderResult *r) override; + + void Decode(torch::Tensor encoder_out, + std::vector *result) override; + + void Decode(torch::Tensor encoder_out, OnlineStream **ss, int32_t num_streams, + std::vector *result) override; + + private: + OnlineTransducerModel *model_; // Not owned + int32_t num_active_paths_; + float temperature_ = 1.0; +}; + +} // namespace sherpa +#endif // SHERPA_CSRC_ONLINE_TRANSDUCER_MODIFIED_BEAM_SEARCH_DECODER_H_ diff --git a/sherpa/csrc/online-zipformer-transducer-model.cc b/sherpa/csrc/online-zipformer-transducer-model.cc new file mode 100644 index 000000000..2cbf971bd --- /dev/null +++ b/sherpa/csrc/online-zipformer-transducer-model.cc @@ -0,0 +1,276 @@ +// sherpa/csrc/online-zipformer-transducer-model.cc +// +// Copyright (c) 2022 Xiaomi Corporation + +#include "sherpa/csrc/online-zipformer-transducer-model.h" + +#include +#include +#include +#include + +#include "sherpa/cpp_api/macros.h" + +namespace sherpa { + +OnlineZipformerTransducerModel::OnlineZipformerTransducerModel( + const std::string &encoder_filename, const std::string &decoder_filename, + const std::string &joiner_filename, torch::Device device /*=torch::kCPU*/) + : device_(device) { + encoder_ = torch::jit::load(encoder_filename, device); + encoder_.eval(); + + decoder_ = torch::jit::load(decoder_filename, device); + encoder_.eval(); + + joiner_ = torch::jit::load(joiner_filename, device); + joiner_.eval(); + + auto conv = decoder_.attr("conv").toModule(); + + context_size_ = + conv.hasattr("weight") ? conv.attr("weight").toTensor().size(2) : 1; + + // Use 7 here since the subsampling is ((len - 7) // 2 + 1) // 2. 
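// Illustrative arithmetic (decode_chunk_size = 16 is only an example value;
// the real one is read from the encoder module below): chunk_shift_ =
// 16 * 2 = 32, chunk_size_ = 32 + 7 = 39, and ((39 - 7) / 2 + 1) / 2 = 8
// encoder output frames per chunk, i.e. 32 / 4, again a subsampling
// factor of 4.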
+ int32_t pad_length = 7; + chunk_shift_ = encoder_.attr("decode_chunk_size").toInt() * 2; + chunk_size_ = chunk_shift_ + pad_length; + + from_torch_jit_trace_ = true; +} + +OnlineZipformerTransducerModel::OnlineZipformerTransducerModel( + const std::string &filename, torch::Device device /*= torch::kCPU*/) + : device_(device) { + model_ = torch::jit::load(filename, device); + model_.eval(); + + encoder_ = model_.attr("encoder").toModule(); + decoder_ = model_.attr("decoder").toModule(); + joiner_ = model_.attr("joiner").toModule(); + + auto conv = decoder_.attr("conv").toModule(); + + context_size_ = + conv.hasattr("weight") ? conv.attr("weight").toTensor().size(2) : 1; + + // Use 7 here since the subsampling is ((len - 7) // 2 + 1) // 2. + int32_t pad_length = 7; + + chunk_shift_ = encoder_.attr("decode_chunk_size").toInt() * 2; + chunk_size_ = chunk_shift_ + pad_length; + + from_torch_jit_trace_ = false; +} + +torch::IValue OnlineZipformerTransducerModel::StackStates( + const std::vector &states) const { + int32_t batch_size = states.size(); + + // mod_states.size() == num_elements == 7 * num_encoders + // mod_states[i].size() == batch_size + std::vector> mod_states; + int32_t num_elements = 0; + + for (auto &s : states) { + torch::List s_list = + c10::impl::toTypedList(s.toList()); + + num_elements = s_list.size(); + if (mod_states.empty()) { + mod_states.resize(num_elements); + } + + for (int32_t i = 0; i != num_elements; ++i) { + mod_states[i].push_back(s_list[i]); + } + } + + int32_t num_encoders = num_elements / 7; + std::vector stacked_states(num_elements); + + for (int32_t i = 0; i != num_encoders; ++i) { + // cached_len: (num_layers, batch_size) + stacked_states[i] = torch::cat(mod_states[i], /*dim*/ 1); + } + + for (int32_t i = num_encoders; i != 2 * num_encoders; ++i) { + // cached_avg: (num_layers, batch_size, D) + stacked_states[i] = torch::cat(mod_states[i], /*dim*/ 1); + } + + for (int32_t i = 2 * num_encoders; i != 3 * num_encoders; ++i) { + // cached_key: (num_layers, left_context_size, batch_size, D) + stacked_states[i] = torch::cat(mod_states[i], /*dim*/ 2); + } + + for (int32_t i = 3 * num_encoders; i != 4 * num_encoders; ++i) { + // cached_val: (num_layers, left_context_size, batch_size, D) + stacked_states[i] = torch::cat(mod_states[i], /*dim*/ 2); + } + + for (int32_t i = 4 * num_encoders; i != 5 * num_encoders; ++i) { + // cached_val2: (num_layers, left_context_size, batch_size, D) + stacked_states[i] = torch::cat(mod_states[i], /*dim*/ 2); + } + + for (int32_t i = 5 * num_encoders; i != 6 * num_encoders; ++i) { + // cached_conv1: (num_layers, batch_size, D, kernel-1) + stacked_states[i] = torch::cat(mod_states[i], /*dim*/ 1); + } + + for (int32_t i = 6 * num_encoders; i != 7 * num_encoders; ++i) { + // cached_conv2: (num_layers, batch_size, D, kernel-1) + stacked_states[i] = torch::cat(mod_states[i], /*dim*/ 1); + } + + return stacked_states; +} + +std::vector OnlineZipformerTransducerModel::UnStackStates( + torch::IValue ivalue) const { + // ivalue is a list + auto list_ptr = ivalue.toList(); + int32_t num_elements = list_ptr.size(); + + // states.size() == num_elements = 7 * num_encoders + std::vector states; + states.reserve(num_elements); + for (int32_t i = 0; i != num_elements; ++i) { + states.emplace_back(list_ptr.get(i).toTensor()); + } + + int32_t num_encoders = num_elements / 7; + int32_t batch_size = states[0].size(1); + + // unstacked_states.size() == batch_size + // unstacked_states[n].size() == num_elements + std::vector> 
unstacked_states(batch_size); + + for (int32_t i = 0; i != num_encoders; ++i) { + // cached_len: (num_layers, batch_size) + std::vector cached_len = + torch::chunk(states[i], /*chunks*/ batch_size, /*dim*/ 1); + for (int32_t n = 0; n != batch_size; ++n) { + unstacked_states[n].push_back(cached_len[n]); + } + } + + for (int32_t i = num_encoders; i != 2 * num_encoders; ++i) { + // cached_avg: (num_layers, batch_size, D) + std::vector cached_avg = + torch::chunk(states[i], /*chunks*/ batch_size, /*dim*/ 1); + for (int32_t n = 0; n != batch_size; ++n) { + unstacked_states[n].push_back(cached_avg[n]); + } + } + + for (int32_t i = 2 * num_encoders; i != 3 * num_encoders; ++i) { + // cached_key: (num_layers, left_context_size, batch_size, D) + std::vector cached_key = + torch::chunk(states[i], /*chunks*/ batch_size, /*dim*/ 2); + for (int32_t n = 0; n != batch_size; ++n) { + unstacked_states[n].push_back(cached_key[n]); + } + } + + for (int32_t i = 3 * num_encoders; i != 4 * num_encoders; ++i) { + // cached_val: (num_layers, left_context_size, batch_size, D) + std::vector cached_val = + torch::chunk(states[i], /*chunks*/ batch_size, /*dim*/ 2); + for (int32_t n = 0; n != batch_size; ++n) { + unstacked_states[n].push_back(cached_val[n]); + } + } + + for (int32_t i = 4 * num_encoders; i != 5 * num_encoders; ++i) { + // cached_val2: (num_layers, left_context_size, batch_size, D) + std::vector cached_val2 = + torch::chunk(states[i], /*chunks*/ batch_size, /*dim*/ 2); + for (int32_t n = 0; n != batch_size; ++n) { + unstacked_states[n].push_back(cached_val2[n]); + } + } + + for (int32_t i = 5 * num_encoders; i != 6 * num_encoders; ++i) { + // cached_conv1: (num_layers, batch_size, D, kernel-1) + std::vector cached_conv1 = + torch::chunk(states[i], /*chunks*/ batch_size, /*dim*/ 1); + for (int32_t n = 0; n != batch_size; ++n) { + unstacked_states[n].push_back(cached_conv1[n]); + } + } + + for (int32_t i = 6 * num_encoders; i != 7 * num_encoders; ++i) { + // cached_conv2: (num_layers, batch_size, D, kernel-1) + std::vector cached_conv2 = + torch::chunk(states[i], /*chunks*/ batch_size, /*dim*/ 1); + for (int32_t n = 0; n != batch_size; ++n) { + unstacked_states[n].push_back(cached_conv2[n]); + } + } + + std::vector ans(batch_size); + for (int32_t n = 0; n != batch_size; ++n) { + // unstacked_states[n] is std::vector + ans[n] = unstacked_states[n]; + } + + return ans; +} + +torch::IValue OnlineZipformerTransducerModel::GetEncoderInitStates( + int32_t batch_size /*=1*/) { + InferenceMode no_grad; + return encoder_.run_method("get_init_state", device_); +} + +std::tuple +OnlineZipformerTransducerModel::RunEncoder( + const torch::Tensor &features, const torch::Tensor &features_length, + const torch::Tensor &num_processed_frames, torch::IValue states) { + InferenceMode no_grad; + + // It returns [torch.Tensor, torch.Tensor, Pair[torch.Tensor, torch.Tensor] + // which are [encoder_out, encoder_out_len, states] + // + // We skip the second entry `encoder_out_len` since we assume the + // feature input is of fixed chunk size and there are no paddings. + // We can figure out `encoder_out_len` from `encoder_out`. 
+ torch::List s_list = + c10::impl::toTypedList(states.toList()); + torch::IValue ivalue = + encoder_.run_method("forward", features, features_length, states); + auto tuple_ptr = ivalue.toTuple(); + torch::Tensor encoder_out = tuple_ptr->elements()[0].toTensor(); + + torch::Tensor encoder_out_length = tuple_ptr->elements()[1].toTensor(); + + auto next_states = tuple_ptr->elements()[2]; + + return std::make_tuple(encoder_out, encoder_out_length, next_states); +} + +torch::Tensor OnlineZipformerTransducerModel::RunDecoder( + const torch::Tensor &decoder_input) { + InferenceMode no_grad; + if (from_torch_jit_trace_) { + return decoder_ + .run_method("forward", decoder_input, + /*need_pad*/ torch::tensor({0}).to(torch::kBool)) + .toTensor(); + } else { + return decoder_ + .run_method("forward", decoder_input, + /*need_pad*/ false) + .toTensor(); + } +} + +torch::Tensor OnlineZipformerTransducerModel::RunJoiner( + const torch::Tensor &encoder_out, const torch::Tensor &decoder_out) { + InferenceMode no_grad; + return joiner_.run_method("forward", encoder_out, decoder_out).toTensor(); +} + +} // namespace sherpa diff --git a/sherpa/csrc/online-zipformer-transducer-model.h b/sherpa/csrc/online-zipformer-transducer-model.h new file mode 100644 index 000000000..91370f9ff --- /dev/null +++ b/sherpa/csrc/online-zipformer-transducer-model.h @@ -0,0 +1,85 @@ +// sherpa/csrc/online-zipformer-transducer-model.h +// +// Copyright (c) 2022 Xiaomi Corporation +#ifndef SHERPA_CSRC_ONLINE_ZIPFORMER_TRANSDUCER_MODEL_H_ +#define SHERPA_CSRC_ONLINE_ZIPFORMER_TRANSDUCER_MODEL_H_ +#include +#include +#include +#include + +#include "sherpa/csrc/online-transducer-model.h" + +namespace sherpa { +/** This class implements models from pruned_transducer_stateless7_streaming + * from icefall. + * + * See + * https://github.com/k2-fsa/icefall/blob/master/egs/librispeech/ASR/pruned_transducer_stateless7_streaming/zipformer.py + * for an instance. + * + * You can find the interface and implementation details of the + * encoder, decoder, and joiner network in the above Python code. + */ +class OnlineZipformerTransducerModel : public OnlineTransducerModel { + public: + /** Constructor. + * + * @param filename Path to the torchscript model. See + * https://github.com/k2-fsa/icefall/blob/master/egs/librispeech/ASR/pruned_transducer_stateless7_streaming/jit_trace_export.py + * for how to export a model. + * @param decode_chunk_size Number of frames before subsampling + * @param device Move the model to this device on loading. 
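 *
 * Minimal usage sketch (the file names below are placeholders for the
 * exported jit-trace files, not files shipped with sherpa):
 *
 *   OnlineZipformerTransducerModel model("encoder_jit_trace.pt",
 *                                        "decoder_jit_trace.pt",
 *                                        "joiner_jit_trace.pt",
 *                                        torch::kCPU);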
+ */ + OnlineZipformerTransducerModel(const std::string &encoder_filename, + const std::string &decoder_filename, + const std::string &joiner_filename, + torch::Device device = torch::kCPU); + + explicit OnlineZipformerTransducerModel(const std::string &filename, + torch::Device device = torch::kCPU); + + torch::IValue StackStates( + const std::vector &states) const override; + + std::vector UnStackStates(torch::IValue states) const override; + + torch::IValue GetEncoderInitStates(int32_t batch_size = 1) override; + + std::tuple RunEncoder( + const torch::Tensor &features, const torch::Tensor &features_length, + const torch::Tensor &num_processed_frames, torch::IValue states) override; + + torch::Tensor RunDecoder(const torch::Tensor &decoder_input) override; + + torch::Tensor RunJoiner(const torch::Tensor &encoder_out, + const torch::Tensor &decoder_out) override; + + torch::Device Device() const override { return device_; } + + int32_t ContextSize() const override { return context_size_; } + + int32_t ChunkSize() const override { return chunk_size_; } + + int32_t ChunkShift() const override { return chunk_shift_; } + + private: + torch::jit::Module model_; + + // The following modules are just aliases to modules in model_ + torch::jit::Module encoder_; + torch::jit::Module decoder_; + torch::jit::Module joiner_; + + torch::Device device_{"cpu"}; + + int32_t context_size_; + int32_t chunk_size_; + int32_t chunk_shift_; + // true if the model is from torch.jit.trace() + bool from_torch_jit_trace_; +}; + +} // namespace sherpa + +#endif // SHERPA_CSRC_ONLINE_ZIPFORMER_TRANSDUCER_MODEL_H_ diff --git a/sherpa/csrc/online-zipformer2-transducer-model.cc b/sherpa/csrc/online-zipformer2-transducer-model.cc new file mode 100644 index 000000000..72e04c93a --- /dev/null +++ b/sherpa/csrc/online-zipformer2-transducer-model.cc @@ -0,0 +1,206 @@ +// sherpa/csrc/online-zipformer2-transducer-model.cc +// +// Copyright (c) 2023 Xiaomi Corporation +#include "sherpa/csrc/online-zipformer2-transducer-model.h" + +#include +#include +#include +#include +#include + +#include "sherpa/cpp_api/macros.h" + +namespace sherpa { + +OnlineZipformer2TransducerModel::OnlineZipformer2TransducerModel( + const std::string &filename, torch::Device device /*= torch::kCPU*/) + : device_(device) { + model_ = torch::jit::load(filename, device); + model_.eval(); + + encoder_ = model_.attr("encoder").toModule(); + decoder_ = model_.attr("decoder").toModule(); + joiner_ = model_.attr("joiner").toModule(); + + auto conv = decoder_.attr("conv").toModule(); + + context_size_ = + conv.hasattr("weight") ? 
conv.attr("weight").toTensor().size(2) : 1; + + int32_t pad_length = encoder_.attr("pad_length").toInt(); + + chunk_shift_ = encoder_.attr("chunk_size").toInt() * 2; + chunk_size_ = chunk_shift_ + pad_length; +} + +torch::IValue OnlineZipformer2TransducerModel::StackStates( + const std::vector &_states) const { + InferenceMode no_grad; + + std::vector> states; + states.reserve(_states.size()); + for (const auto &s : _states) { + states.push_back(c10::impl::toTypedList(s.toList())); + } + + std::vector stacked_states; + stacked_states.reserve(states[0].size()); + + int32_t batch_size = static_cast(states.size()); + int32_t num_layers = (static_cast(states[0].size()) - 2) / 6; + + std::vector buf(batch_size); + + std::array batch_dim = {1, 1, 1, 1, 0, 0}; + + for (int32_t i = 0; i != num_layers; ++i) { + // each layer has 6 states + int32_t offset = i * 6; + + for (int32_t s = 0; s != 6; ++s) { + for (int32_t b = 0; b != batch_size; ++b) { + buf[b] = states[b][offset + s]; + } + + stacked_states.push_back(torch::cat(buf, /*dim*/ batch_dim[s])); + } + } + + // for the last two tensors + std::vector buf1(batch_size); + for (int32_t b = 0; b != batch_size; ++b) { + buf[b] = states[b][states[0].size() - 2]; + buf1[b] = states[b][states[0].size() - 1]; + } + + stacked_states.push_back(torch::cat(buf, /*dim*/ 0)); + stacked_states.push_back(torch::cat(buf1, /*dim*/ 0)); + + return stacked_states; +} + +std::vector OnlineZipformer2TransducerModel::UnStackStates( + torch::IValue ivalue) const { + InferenceMode no_grad; + // ivalue is a list + auto list_ptr = ivalue.toList(); + int32_t num_elements = list_ptr.size(); + + std::vector states; + states.reserve(num_elements); + for (int32_t i = 0; i != num_elements; ++i) { + states.emplace_back(list_ptr.get(i).toTensor()); + } + + int32_t num_layers = (states.size() - 2) / 6; + int32_t batch_size = states[0].size(1); + + std::vector> unstacked_states(batch_size); + for (auto &s : unstacked_states) { + s.reserve(states.size()); + } + + std::array batch_dim = {1, 1, 1, 1, 0, 0}; + + for (int32_t i = 0; i != num_layers; ++i) { + int32_t offset = 6 * i; + + for (int32_t s = 0; s != 6; ++s) { + std::vector ss = torch::chunk( + states[offset + s], /*chunks*/ batch_size, /*dim*/ batch_dim[s]); + + for (int32_t b = 0; b != batch_size; ++b) { + unstacked_states[b].push_back(std::move(ss[b])); + } + } + } + + // for the last two tensors + auto ss = + torch::chunk(states[states.size() - 2], /*chunk*/ batch_size, /*dim*/ 0); + for (int32_t b = 0; b != batch_size; ++b) { + unstacked_states[b].push_back(std::move(ss[b])); + } + + ss = torch::chunk(states[states.size() - 1], /*chunk*/ batch_size, /*dim*/ 0); + for (int32_t b = 0; b != batch_size; ++b) { + unstacked_states[b].push_back(std::move(ss[b])); + } + + std::vector ans(batch_size); + for (int32_t n = 0; n != batch_size; ++n) { + // unstacked_states[n] is std::vector + ans[n] = std::move(unstacked_states[n]); + } + + return ans; +} + +torch::IValue OnlineZipformer2TransducerModel::GetEncoderInitStates( + int32_t batch_size /*=1*/) { + InferenceMode no_grad; + auto states = encoder_.run_method("get_init_states", batch_size, device_); + /* states is a list of tensors. States of all layers are concatednated into + a single list. 
+ State of each layer has 6 tensors: + + - s0: (x, batch_size, x) + - s1: (1, batch_size, x, x) + - s2: (x, batch_size, x) + - s3: (x, batch_size, x) + - s4: (batch_size, x, x) + - s5: (batch_size, x, x) + + + In addition, + - states[-1}, a 4-D tensor of shape (batch_size, x, x, x) + - states[-2], a 1-D tensor of shape (batch_size, ) + + If you are curious about the format of the states, please have a look at + - + https://github.com/k2-fsa/icefall/blob/master/egs/librispeech/ASR/zipformer/export.py#L363 + https://github.com/k2-fsa/icefall/blob/master/egs/librispeech/ASR/zipformer/streaming_decode.py#L220 + */ + return states; +} + +std::tuple +OnlineZipformer2TransducerModel::RunEncoder( + const torch::Tensor &features, const torch::Tensor &features_length, + const torch::Tensor &num_processed_frames, torch::IValue states) { + InferenceMode no_grad; + + torch::List s_list = + c10::impl::toTypedList(states.toList()); + torch::IValue ivalue = + encoder_.run_method("forward", features, features_length, states); + + auto tuple_ptr = ivalue.toTuple(); + torch::Tensor encoder_out = tuple_ptr->elements()[0].toTensor(); + + torch::Tensor encoder_out_length = tuple_ptr->elements()[1].toTensor(); + + auto next_states = tuple_ptr->elements()[2]; + + return std::make_tuple(encoder_out, encoder_out_length, next_states); +} + +torch::Tensor OnlineZipformer2TransducerModel::RunDecoder( + const torch::Tensor &decoder_input) { + InferenceMode no_grad; + return decoder_ + .run_method("forward", decoder_input, + /*need_pad*/ false) + .toTensor(); +} + +torch::Tensor OnlineZipformer2TransducerModel::RunJoiner( + const torch::Tensor &encoder_out, const torch::Tensor &decoder_out) { + InferenceMode no_grad; + return joiner_ + .run_method("forward", encoder_out, decoder_out, /*project_input*/ true) + .toTensor(); +} + +} // namespace sherpa diff --git a/sherpa/csrc/online-zipformer2-transducer-model.h b/sherpa/csrc/online-zipformer2-transducer-model.h new file mode 100644 index 000000000..05fc8ef82 --- /dev/null +++ b/sherpa/csrc/online-zipformer2-transducer-model.h @@ -0,0 +1,71 @@ +// sherpa/csrc/online-zipformer2-transducer-model.h +// +// Copyright (c) 2023 Xiaomi Corporation +#ifndef SHERPA_CSRC_ONLINE_ZIPFORMER2_TRANSDUCER_MODEL_H_ +#define SHERPA_CSRC_ONLINE_ZIPFORMER2_TRANSDUCER_MODEL_H_ + +#include +#include +#include +#include + +#include "sherpa/csrc/online-transducer-model.h" + +namespace sherpa { + +/** This class implements models from zipformer with causal=True from icefall. + * + * See + * https://github.com/k2-fsa/icefall/blob/master/egs/librispeech/ASR/zipformer/zipformer.py + * for an instance. + * + * You can find the interface and implementation details of the + * encoder, decoder, and joiner network in the above Python code. 
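 * A rough streaming-usage sketch (hypothetical driver code, not part of this
 * class):
 *
 * \code{.cc}
 *   std::vector<torch::IValue> stream_states;  // one entry per active stream
 *   // ... fill stream_states from the individual streams ...
 *   torch::IValue stacked = model.StackStates(stream_states);
 *   auto [encoder_out, encoder_out_len, next_states] = model.RunEncoder(
 *       features, features_length, num_processed_frames, stacked);
 *   auto unstacked = model.UnStackStates(next_states);  // one per stream
 * \endcode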
+ */ +class OnlineZipformer2TransducerModel : public OnlineTransducerModel { + public: + explicit OnlineZipformer2TransducerModel(const std::string &filename, + torch::Device device = torch::kCPU); + + torch::IValue StackStates( + const std::vector &states) const override; + + std::vector UnStackStates(torch::IValue states) const override; + + torch::IValue GetEncoderInitStates(int32_t batch_size = 1) override; + + std::tuple RunEncoder( + const torch::Tensor &features, const torch::Tensor &features_length, + const torch::Tensor &num_processed_frames, torch::IValue states) override; + + torch::Tensor RunDecoder(const torch::Tensor &decoder_input) override; + + torch::Tensor RunJoiner(const torch::Tensor &encoder_out, + const torch::Tensor &decoder_out) override; + + torch::Device Device() const override { return device_; } + + int32_t ContextSize() const override { return context_size_; } + + int32_t ChunkSize() const override { return chunk_size_; } + + int32_t ChunkShift() const override { return chunk_shift_; } + + private: + torch::jit::Module model_; + + // The following modules are just aliases to modules in model_ + torch::jit::Module encoder_; + torch::jit::Module decoder_; + torch::jit::Module joiner_; + + torch::Device device_{"cpu"}; + + int32_t context_size_; + int32_t chunk_size_; + int32_t chunk_shift_; +}; + +} // namespace sherpa + +#endif // SHERPA_CSRC_ONLINE_ZIPFORMER2_TRANSDUCER_MODEL_H_ diff --git a/sherpa/csrc/parse_options.cc b/sherpa/csrc/parse-options.cc similarity index 85% rename from sherpa/csrc/parse_options.cc rename to sherpa/csrc/parse-options.cc index 952f7d017..c47fd624f 100644 --- a/sherpa/csrc/parse_options.cc +++ b/sherpa/csrc/parse-options.cc @@ -22,11 +22,12 @@ // This file is copied and modified from kaldi/src/util/parse-options.cu -#include "sherpa/csrc/parse_options.h" +#include "sherpa/cpp_api/parse-options.h" #include #include +#include #include #include #include @@ -35,133 +36,10 @@ #include #include "sherpa/csrc/log.h" - -#ifdef _MSC_VER -#define SHERPA_STRTOLL(cur_cstr, end_cstr) _strtoi64(cur_cstr, end_cstr, 10); -#else -#define SHERPA_STRTOLL(cur_cstr, end_cstr) strtoll(cur_cstr, end_cstr, 10); -#endif +#include "sherpa/csrc/text-utils.h" namespace sherpa { -/// Converts a string into an integer via strtoll and returns false if there was -/// any kind of problem (i.e. the string was not an integer or contained extra -/// non-whitespace junk, or the integer was too large to fit into the type it is -/// being converted into). Only sets *out if everything was OK and it returns -/// true. 
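These conversion helpers appear to move into sherpa/csrc/text-utils.h (the new
include above); assuming the contract described here is unchanged, usage looks
like:

    int32_t v = 0;
    ConvertStringToInteger("123", &v);         // true,  v == 123
    ConvertStringToInteger("123abc", &v);      // false, v left untouched
    ConvertStringToInteger("3000000000", &v);  // false, does not fit int32_t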
-template -bool ConvertStringToInteger(const std::string &str, Int *out) { - // copied from kaldi/src/util/text-util.h - static_assert(std::is_integral::value, ""); - const char *this_str = str.c_str(); - char *end = nullptr; - errno = 0; - int64_t i = SHERPA_STRTOLL(this_str, &end); - if (end != this_str) { - while (isspace(*end)) ++end; - } - if (end == this_str || *end != '\0' || errno != 0) return false; - Int iInt = static_cast(i); - if (static_cast(iInt) != i || - (i < 0 && !std::numeric_limits::is_signed)) { - return false; - } - *out = iInt; - return true; -} - -// copied from kaldi/src/util/text-util.cc -template -class NumberIstream { - public: - explicit NumberIstream(std::istream &i) : in_(i) {} - - NumberIstream &operator>>(T &x) { - if (!in_.good()) return *this; - in_ >> x; - if (!in_.fail() && RemainderIsOnlySpaces()) return *this; - return ParseOnFail(&x); - } - - private: - std::istream &in_; - - bool RemainderIsOnlySpaces() { - if (in_.tellg() != std::istream::pos_type(-1)) { - std::string rem; - in_ >> rem; - - if (rem.find_first_not_of(' ') != std::string::npos) { - // there is not only spaces - return false; - } - } - - in_.clear(); - return true; - } - - NumberIstream &ParseOnFail(T *x) { - std::string str; - in_.clear(); - in_.seekg(0); - // If the stream is broken even before trying - // to read from it or if there are many tokens, - // it's pointless to try. - if (!(in_ >> str) || !RemainderIsOnlySpaces()) { - in_.setstate(std::ios_base::failbit); - return *this; - } - - std::unordered_map inf_nan_map; - // we'll keep just uppercase values. - inf_nan_map["INF"] = std::numeric_limits::infinity(); - inf_nan_map["+INF"] = std::numeric_limits::infinity(); - inf_nan_map["-INF"] = -std::numeric_limits::infinity(); - inf_nan_map["INFINITY"] = std::numeric_limits::infinity(); - inf_nan_map["+INFINITY"] = std::numeric_limits::infinity(); - inf_nan_map["-INFINITY"] = -std::numeric_limits::infinity(); - inf_nan_map["NAN"] = std::numeric_limits::quiet_NaN(); - inf_nan_map["+NAN"] = std::numeric_limits::quiet_NaN(); - inf_nan_map["-NAN"] = -std::numeric_limits::quiet_NaN(); - // MSVC - inf_nan_map["1.#INF"] = std::numeric_limits::infinity(); - inf_nan_map["-1.#INF"] = -std::numeric_limits::infinity(); - inf_nan_map["1.#QNAN"] = std::numeric_limits::quiet_NaN(); - inf_nan_map["-1.#QNAN"] = -std::numeric_limits::quiet_NaN(); - - std::transform(str.begin(), str.end(), str.begin(), ::toupper); - - if (inf_nan_map.find(str) != inf_nan_map.end()) { - *x = inf_nan_map[str]; - } else { - in_.setstate(std::ios_base::failbit); - } - - return *this; - } -}; - -/// ConvertStringToReal converts a string into either float or double -/// and returns false if there was any kind of problem (i.e. the string -/// was not a floating point number or contained extra non-whitespace junk). -/// Be careful- this function will successfully read inf's or nan's. -template -bool ConvertStringToReal(const std::string &str, T *out) { - std::istringstream iss(str); - - NumberIstream i(iss); - - i >> *out; - - if (iss.fail()) { - // Number conversion failed. 
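For the floating-point variant the documented leniency about inf/nan matters in
practice; assuming the relocated version keeps the same behaviour:

    float f = 0.0f;
    ConvertStringToReal("2.5", &f);     // true,  f == 2.5f
    ConvertStringToReal("-inf", &f);    // true,  f == -infinity
    ConvertStringToReal("2.5 km", &f);  // false, trailing junk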
- return false; - } - - return true; -} - ParseOptions::ParseOptions(const std::string &prefix, ParseOptions *po) : print_args_(false), help_(false), usage_(""), argc_(0), argv_(nullptr) { if (po != nullptr && po->other_parser_ != nullptr) { @@ -701,14 +579,14 @@ bool ParseOptions::ToBool(std::string str) const { } int32_t ParseOptions::ToInt(const std::string &str) const { - int32_t ret; + int32_t ret = 0; if (!ConvertStringToInteger(str, &ret)) SHERPA_LOG(FATAL) << "Invalid integer option \"" << str << "\""; return ret; } uint32_t ParseOptions::ToUint(const std::string &str) const { - uint32_t ret; + uint32_t ret = 0; if (!ConvertStringToInteger(str, &ret)) SHERPA_LOG(FATAL) << "Invalid integer option \"" << str << "\""; return ret; diff --git a/sherpa/csrc/resample.cc b/sherpa/csrc/resample.cc new file mode 100644 index 000000000..71577d148 --- /dev/null +++ b/sherpa/csrc/resample.cc @@ -0,0 +1,315 @@ +/** + * Copyright 2013 Pegah Ghahremani + * 2014 IMSL, PKU-HKUST (author: Wei Shi) + * 2014 Yanqing Sun, Junjie Wang + * 2014 Johns Hopkins University (author: Daniel Povey) + * Copyright 2023 Xiaomi Corporation (authors: Fangjun Kuang) + * + * See LICENSE for clarification regarding multiple authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +// this file is copied and modified from +// kaldi/src/feat/resample.cc + +#include "sherpa/csrc/resample.h" + +#include +#include +#include + +#include +#include + +#ifndef M_2PI +#define M_2PI 6.283185307179586476925286766559005 +#endif + +#ifndef M_PI +#define M_PI 3.1415926535897932384626433832795 +#endif + +namespace sherpa { + +template +I Gcd(I m, I n) { + // this function is copied from kaldi/src/base/kaldi-math.h + if (m == 0 || n == 0) { + if (m == 0 && n == 0) { // gcd not defined, as all integers are divisors. + fprintf(stderr, "Undefined GCD since m = 0, n = 0."); + exit(-1); + } + return (m == 0 ? (n > 0 ? n : -n) : (m > 0 ? m : -m)); + // return absolute value of whichever is nonzero + } + // could use compile-time assertion + // but involves messing with complex template stuff. + static_assert(std::is_integral::value, ""); + while (1) { + m %= n; + if (m == 0) return (n > 0 ? n : -n); + n %= m; + if (n == 0) return (m > 0 ? m : -m); + } +} + +/// Returns the least common multiple of two integers. Will +/// crash unless the inputs are positive. 
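/// For example, Lcm(3, 4) == 12 and Lcm(48000, 16000) == 48000; the latter is
/// what GetNumOutputSamples() below uses as the tick frequency when
/// resampling 48 kHz audio to 16 kHz.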
+template +I Lcm(I m, I n) { + // This function is copied from kaldi/src/base/kaldi-math.h + assert(m > 0 && n > 0); + I gcd = Gcd(m, n); + return gcd * (m / gcd) * (n / gcd); +} + +static float DotProduct(const float *a, const float *b, int32_t n) { + float sum = 0; + for (int32_t i = 0; i != n; ++i) { + sum += a[i] * b[i]; + } + return sum; +} + +LinearResample::LinearResample(int32_t samp_rate_in_hz, + int32_t samp_rate_out_hz, float filter_cutoff_hz, + int32_t num_zeros) + : samp_rate_in_(samp_rate_in_hz), + samp_rate_out_(samp_rate_out_hz), + filter_cutoff_(filter_cutoff_hz), + num_zeros_(num_zeros) { + assert(samp_rate_in_hz > 0.0 && samp_rate_out_hz > 0.0 && + filter_cutoff_hz > 0.0 && filter_cutoff_hz * 2 <= samp_rate_in_hz && + filter_cutoff_hz * 2 <= samp_rate_out_hz && num_zeros > 0); + + // base_freq is the frequency of the repeating unit, which is the gcd + // of the input frequencies. + int32_t base_freq = Gcd(samp_rate_in_, samp_rate_out_); + input_samples_in_unit_ = samp_rate_in_ / base_freq; + output_samples_in_unit_ = samp_rate_out_ / base_freq; + + SetIndexesAndWeights(); + Reset(); +} + +void LinearResample::SetIndexesAndWeights() { + first_index_.resize(output_samples_in_unit_); + weights_.resize(output_samples_in_unit_); + + double window_width = num_zeros_ / (2.0 * filter_cutoff_); + + for (int32_t i = 0; i < output_samples_in_unit_; i++) { + double output_t = i / static_cast(samp_rate_out_); + double min_t = output_t - window_width, max_t = output_t + window_width; + // we do ceil on the min and floor on the max, because if we did it + // the other way around we would unnecessarily include indexes just + // outside the window, with zero coefficients. It's possible + // if the arguments to the ceil and floor expressions are integers + // (e.g. if filter_cutoff_ has an exact ratio with the sample rates), + // that we unnecessarily include something with a zero coefficient, + // but this is only a slight efficiency issue. + int32_t min_input_index = ceil(min_t * samp_rate_in_), + max_input_index = floor(max_t * samp_rate_in_), + num_indices = max_input_index - min_input_index + 1; + first_index_[i] = min_input_index; + weights_[i].resize(num_indices); + for (int32_t j = 0; j < num_indices; j++) { + int32_t input_index = min_input_index + j; + double input_t = input_index / static_cast(samp_rate_in_), + delta_t = input_t - output_t; + // sign of delta_t doesn't matter. + weights_[i][j] = FilterFunc(delta_t) / samp_rate_in_; + } + } +} + +/** Here, t is a time in seconds representing an offset from + the center of the windowed filter function, and FilterFunction(t) + returns the windowed filter function, described + in the header as h(t) = f(t)g(t), evaluated at t. 
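   Written out with f_c = filter_cutoff_ and N = num_zeros_, the function
   computed below is

     h(t) = [sin(2*pi*f_c*t) / (pi*t)] * 0.5*(1 + cos(2*pi*f_c*t / N))  for 0 < |t| < N / (2*f_c)
     h(0) = 2*f_c
     h(t) = 0                                                           otherwise,

   i.e. a sinc low-pass filter multiplied by a raised-cosine (Hanning) window.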
+*/ +float LinearResample::FilterFunc(float t) const { + float window, // raised-cosine (Hanning) window of width + // num_zeros_/2*filter_cutoff_ + filter; // sinc filter function + if (fabs(t) < num_zeros_ / (2.0 * filter_cutoff_)) + window = 0.5 * (1 + cos(M_2PI * filter_cutoff_ / num_zeros_ * t)); + else + window = 0.0; // outside support of window function + if (t != 0) + filter = sin(M_2PI * filter_cutoff_ * t) / (M_PI * t); + else + filter = 2 * filter_cutoff_; // limit of the function at t = 0 + return filter * window; +} + +void LinearResample::Reset() { + input_sample_offset_ = 0; + output_sample_offset_ = 0; + input_remainder_.resize(0); +} + +torch::Tensor LinearResample::Resample(torch::Tensor input_tensor, bool flush) { + const float *input = input_tensor.data_ptr(); + int32_t input_dim = input_tensor.numel(); + + int64_t tot_input_samp = input_sample_offset_ + input_dim, + tot_output_samp = GetNumOutputSamples(tot_input_samp, flush); + + assert(tot_output_samp >= output_sample_offset_); + + torch::Tensor output = + torch::zeros({tot_output_samp - output_sample_offset_}, torch::kFloat); + float *p_output = output.data_ptr(); + + // samp_out is the index into the total output signal, not just the part + // of it we are producing here. + for (int64_t samp_out = output_sample_offset_; samp_out < tot_output_samp; + samp_out++) { + int64_t first_samp_in; + int32_t samp_out_wrapped; + GetIndexes(samp_out, &first_samp_in, &samp_out_wrapped); + const std::vector &weights = weights_[samp_out_wrapped]; + // first_input_index is the first index into "input" that we have a weight + // for. + int32_t first_input_index = + static_cast(first_samp_in - input_sample_offset_); + float this_output; + if (first_input_index >= 0 && + first_input_index + static_cast(weights.size()) <= input_dim) { + this_output = + DotProduct(input + first_input_index, weights.data(), weights.size()); + } else { // Handle edge cases. + this_output = 0.0; + for (int32_t i = 0; i < static_cast(weights.size()); i++) { + float weight = weights[i]; + int32_t input_index = first_input_index + i; + if (input_index < 0 && + static_cast(input_remainder_.size()) + input_index >= 0) { + this_output += + weight * input_remainder_[input_remainder_.size() + input_index]; + } else if (input_index >= 0 && input_index < input_dim) { + this_output += weight * input[input_index]; + } else if (input_index >= input_dim) { + // We're past the end of the input and are adding zero; should only + // happen if the user specified flush == true, or else we would not + // be trying to output this sample. + assert(flush); + } + } + } + int32_t output_index = + static_cast(samp_out - output_sample_offset_); + p_output[output_index] = this_output; + } + + if (flush) { + Reset(); // Reset the internal state. + } else { + SetRemainder(input, input_dim); + input_sample_offset_ = tot_input_samp; + output_sample_offset_ = tot_output_samp; + } + + return output; +} + +int64_t LinearResample::GetNumOutputSamples(int64_t input_num_samp, + bool flush) const { + // For exact computation, we measure time in "ticks" of 1.0 / tick_freq, + // where tick_freq is the least common multiple of samp_rate_in_ and + // samp_rate_out_. + int32_t tick_freq = Lcm(samp_rate_in_, samp_rate_out_); + int32_t ticks_per_input_period = tick_freq / samp_rate_in_; + + // work out the number of ticks in the time interval + // [ 0, input_num_samp/samp_rate_in_ ). 
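  // As a concrete example, assuming samp_rate_in_ = 16000 and
  // samp_rate_out_ = 8000: tick_freq = 16000, ticks_per_input_period = 1 and
  // ticks_per_output_period = 2, so 100 input samples span 100 ticks and,
  // with flush == true, yield 100 / 2 = 50 output samples (last_output_samp
  // is decremented from 50 to 49 because the interval is half-open, giving
  // num_output_samp = 50).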
+ int64_t interval_length_in_ticks = input_num_samp * ticks_per_input_period; + if (!flush) { + float window_width = num_zeros_ / (2.0 * filter_cutoff_); + // To count the window-width in ticks we take the floor. This + // is because since we're looking for the largest integer num-out-samp + // that fits in the interval, which is open on the right, a reduction + // in interval length of less than a tick will never make a difference. + // For example, the largest integer in the interval [ 0, 2 ) and the + // largest integer in the interval [ 0, 2 - 0.9 ) are the same (both one). + // So when we're subtracting the window-width we can ignore the fractional + // part. + int32_t window_width_ticks = floor(window_width * tick_freq); + // The time-period of the output that we can sample gets reduced + // by the window-width (which is actually the distance from the + // center to the edge of the windowing function) if we're not + // "flushing the output". + interval_length_in_ticks -= window_width_ticks; + } + if (interval_length_in_ticks <= 0) return 0; + + int32_t ticks_per_output_period = tick_freq / samp_rate_out_; + // Get the last output-sample in the closed interval, i.e. replacing [ ) with + // [ ]. Note: integer division rounds down. See + // http://en.wikipedia.org/wiki/Interval_(mathematics) for an explanation of + // the notation. + int64_t last_output_samp = interval_length_in_ticks / ticks_per_output_period; + // We need the last output-sample in the open interval, so if it takes us to + // the end of the interval exactly, subtract one. + if (last_output_samp * ticks_per_output_period == interval_length_in_ticks) + last_output_samp--; + + // First output-sample index is zero, so the number of output samples + // is the last output-sample plus one. + int64_t num_output_samp = last_output_samp + 1; + return num_output_samp; +} + +// inline +void LinearResample::GetIndexes(int64_t samp_out, int64_t *first_samp_in, + int32_t *samp_out_wrapped) const { + // A unit is the smallest nonzero amount of time that is an exact + // multiple of the input and output sample periods. The unit index + // is the answer to "which numbered unit we are in". + int64_t unit_index = samp_out / output_samples_in_unit_; + // samp_out_wrapped is equal to samp_out % output_samples_in_unit_ + *samp_out_wrapped = + static_cast(samp_out - unit_index * output_samples_in_unit_); + *first_samp_in = + first_index_[*samp_out_wrapped] + unit_index * input_samples_in_unit_; +} + +void LinearResample::SetRemainder(const float *input, int32_t input_dim) { + std::vector old_remainder(input_remainder_); + // max_remainder_needed is the width of the filter from side to side, + // measured in input samples. you might think it should be half that, + // but you have to consider that you might be wanting to output samples + // that are "in the past" relative to the beginning of the latest + // input... anyway, storing more remainder than needed is not harmful. + int32_t max_remainder_needed = + ceil(samp_rate_in_ * num_zeros_ / filter_cutoff_); + input_remainder_.resize(max_remainder_needed); + for (int32_t index = -static_cast(input_remainder_.size()); + index < 0; index++) { + // we interpret "index" as an offset from the end of "input" and + // from the end of input_remainder_. 
+ int32_t input_index = index + input_dim; + if (input_index >= 0) { + input_remainder_[index + static_cast(input_remainder_.size())] = + input[input_index]; + } else if (input_index + static_cast(old_remainder.size()) >= 0) { + input_remainder_[index + static_cast(input_remainder_.size())] = + old_remainder[input_index + + static_cast(old_remainder.size())]; + // else leave it at zero. + } + } +} + +} // namespace sherpa diff --git a/sherpa/csrc/resample.h b/sherpa/csrc/resample.h new file mode 100644 index 000000000..3d60417d3 --- /dev/null +++ b/sherpa/csrc/resample.h @@ -0,0 +1,147 @@ +/** + * Copyright 2013 Pegah Ghahremani + * 2014 IMSL, PKU-HKUST (author: Wei Shi) + * 2014 Yanqing Sun, Junjie Wang + * 2014 Johns Hopkins University (author: Daniel Povey) + * Copyright 2023 Xiaomi Corporation (authors: Fangjun Kuang) + * + * See LICENSE for clarification regarding multiple authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +// this file is copied and modified from +// kaldi/src/feat/resample.h +#ifndef SHERPA_CSRC_RESAMPLE_H_ +#define SHERPA_CSRC_RESAMPLE_H_ + +#include +#include + +#include "torch/script.h" + +namespace sherpa { + +/* + We require that the input and output sampling rate be specified as + integers, as this is an easy way to specify that their ratio be rational. +*/ + +class LinearResample { + public: + /// Constructor. We make the input and output sample rates integers, because + /// we are going to need to find a common divisor. This should just remind + /// you that they need to be integers. The filter cutoff needs to be less + /// than samp_rate_in_hz/2 and less than samp_rate_out_hz/2. num_zeros + /// controls the sharpness of the filter, more == sharper but less efficient. + /// We suggest around 4 to 10 for normal use. + LinearResample(int32_t samp_rate_in_hz, int32_t samp_rate_out_hz, + float filter_cutoff_hz, int32_t num_zeros); + + /// Calling the function Reset() resets the state of the object prior to + /// processing a new signal; it is only necessary if you have called + /// Resample(x, x_size, false, y) for some signal, leading to a remainder of + /// the signal being called, but then abandon processing the signal before + /// calling Resample(x, x_size, true, y) for the last piece. Call it + /// unnecessarily between signals will not do any harm. + void Reset(); + + /// This function does the resampling. If you call it with flush == true and + /// you have never called it with flush == false, it just resamples the input + /// signal (it resizes the output to a suitable number of samples). + /// + /// You can also use this function to process a signal a piece at a time. + /// suppose you break it into piece1, piece2, ... pieceN. 
You can call + /// \code{.cc} + /// Resample(piece1, piece1_size, false, &output1); + /// Resample(piece2, piece2_size, false, &output2); + /// Resample(piece3, piece3_size, true, &output3); + /// \endcode + /// If you call it with flush == false, it won't output the last few samples + /// but will remember them, so that if you later give it a second piece of + /// the input signal it can process it correctly. + /// If your most recent call to the object was with flush == false, it will + /// have internal state; you can remove this by calling Reset(). + /// Empty input is acceptable. + /// + /// Note: We will resize output on return + torch::Tensor Resample(torch::Tensor input_tensor, bool flush); + + //// Return the input and output sampling rates (for checks, for example) + int32_t GetInputSamplingRate() const { return samp_rate_in_; } + int32_t GetOutputSamplingRate() const { return samp_rate_out_; } + + private: + void SetIndexesAndWeights(); + + float FilterFunc(float) const; + + /// This function outputs the number of output samples we will output + /// for a signal with "input_num_samp" input samples. If flush == true, + /// we return the largest n such that + /// (n/samp_rate_out_) is in the interval [ 0, input_num_samp/samp_rate_in_ ), + /// and note that the interval is half-open. If flush == false, + /// define window_width as num_zeros / (2.0 * filter_cutoff_); + /// we return the largest n such that (n/samp_rate_out_) is in the interval + /// [ 0, input_num_samp/samp_rate_in_ - window_width ). + int64_t GetNumOutputSamples(int64_t input_num_samp, bool flush) const; + + /// Given an output-sample index, this function outputs to *first_samp_in the + /// first input-sample index that we have a weight on (may be negative), + /// and to *samp_out_wrapped the index into weights_ where we can get the + /// corresponding weights on the input. + inline void GetIndexes(int64_t samp_out, int64_t *first_samp_in, + int32_t *samp_out_wrapped) const; + + void SetRemainder(const float *input, int32_t input_dim); + + private: + // The following variables are provided by the user. + int32_t samp_rate_in_; + int32_t samp_rate_out_; + float filter_cutoff_; + int32_t num_zeros_; + + int32_t input_samples_in_unit_; ///< The number of input samples in the + ///< smallest repeating unit: num_samp_in_ = + ///< samp_rate_in_hz / Gcd(samp_rate_in_hz, + ///< samp_rate_out_hz) + + int32_t output_samples_in_unit_; ///< The number of output samples in the + ///< smallest repeating unit: num_samp_out_ + ///< = samp_rate_out_hz / + ///< Gcd(samp_rate_in_hz, samp_rate_out_hz) + + /// The first input-sample index that we sum over, for this output-sample + /// index. May be negative; any truncation at the beginning is handled + /// separately. This is just for the first few output samples, but we can + /// extrapolate the correct input-sample index for arbitrary output samples. + std::vector first_index_; + + /// Weights on the input samples, for this output-sample index. + std::vector> weights_; + + // the following variables keep track of where we are in a particular signal, + // if it is being provided over multiple calls to Resample(). + + int64_t input_sample_offset_; ///< The number of input samples we have + ///< already received for this signal + ///< (including anything in remainder_) + int64_t output_sample_offset_; ///< The number of samples we have already + ///< output for this signal. + std::vector input_remainder_; ///< A small trailing part of the + ///< previously seen input signal. 
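  // Streaming usage with the torch-based Resample() above, shown only as an
  // illustrative sketch (the sample rates, cutoff and num_zeros here are
  // example values, not defaults of this class):
  //
  //   LinearResample r(/*samp_rate_in_hz=*/48000, /*samp_rate_out_hz=*/16000,
  //                    /*filter_cutoff_hz=*/7800, /*num_zeros=*/6);
  //   torch::Tensor o1 = r.Resample(chunk1, /*flush=*/false);
  //   torch::Tensor o2 = r.Resample(chunk2, /*flush=*/false);
  //   torch::Tensor tail = r.Resample(last_chunk, /*flush=*/true);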
+}; + +} // namespace sherpa + +#endif // SHERPA_CSRC_RESAMPLE_H_ diff --git a/sherpa/csrc/rnnt_beam_search.cc b/sherpa/csrc/rnnt_beam_search.cc deleted file mode 100644 index 561154b81..000000000 --- a/sherpa/csrc/rnnt_beam_search.cc +++ /dev/null @@ -1,542 +0,0 @@ -/** - * Copyright (c) 2022 Xiaomi Corporation (authors: Fangjun Kuang) - * - * See LICENSE for clarification regarding multiple authors - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "sherpa/csrc/rnnt_beam_search.h" - -#include -#include -#include - -#include "k2/torch_api.h" -#include "sherpa/csrc/hypothesis.h" -#include "sherpa/csrc/rnnt_conformer_model.h" -#include "sherpa/csrc/rnnt_emformer_model.h" -#include "sherpa/csrc/rnnt_model.h" -#include "torch/all.h" - -namespace sherpa { - -static inline torch::Tensor FloorDivide(torch::Tensor a, int32_t b) { -#if SHERPA_TORCH_VERSION_MAJOR > 1 || \ - (SHERPA_TORCH_VERSION_MAJOR == 1 && SHERPA_TORCH_VERSION_MINOR > 7) - return torch::div(a, b, /*rounding_mode*/ "trunc"); -#else - return torch::floor_divide(a, b); -#endif -} - -/** - * Construct the decoder input from the current hypothesis. - * - * @param hyps A list-of-list of token IDs containing the current decoding - * results. Its length is `batch_size` - * @param decoder_input A 2-D tensor of shape (batch_size, context_size). - */ -static void BuildDecoderInput(const std::vector> &hyps, - torch::Tensor *decoder_input) { - int32_t batch_size = decoder_input->size(0); - int32_t context_size = decoder_input->size(1); - int64_t *p = decoder_input->data_ptr(); - for (int32_t i = 0; i != batch_size; ++i) { - auto start = hyps[i].end() - context_size; - auto end = hyps[i].end(); - std::copy(start, end, p); - p += context_size; - } -} - -static torch::Tensor BuildDecoderInput(const std::vector &hyps, - int32_t context_size) { - int32_t num_hyps = hyps.size(); - torch::Tensor decoder_input = - torch::empty({num_hyps, context_size}, - torch::dtype(torch::kLong) - .memory_format(torch::MemoryFormat::Contiguous)); - - int64_t *p = decoder_input.data_ptr(); - for (const auto &h : hyps) { - auto start = h.ys.end() - context_size; - auto end = h.ys.end(); - - std::copy(start, end, p); - p += context_size; - } - - return decoder_input; -} - -/** Return a ragged shape with axes [utt][num_hyps]. - * - * @param hyps hyps.size() == batch_size. Each entry contains the active - * hypotheses of an utterance. - * @return Return a ragged shape with 2 axes [utt][num_hyps]. Note that the - * shape is on CPU. 
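 * For example, if three utterances currently hold 2, 3 and 1 active
 * hypotheses, the returned shape has row_splits [0, 2, 5, 6] and a total
 * size of 6 on axis 1.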
- */ -static k2::RaggedShapePtr GetHypsShape(const std::vector &hyps) { - int32_t num_utt = hyps.size(); - torch::Tensor row_splits = torch::empty( - {num_utt + 1}, - torch::dtype(torch::kInt).memory_format(torch::MemoryFormat::Contiguous)); - auto row_splits_acc = row_splits.accessor(); - for (int32_t i = 0; i != num_utt; ++i) { - row_splits_acc[i] = hyps[i].Size(); - } - - k2::ExclusiveSum(row_splits, &row_splits); - - return k2::RaggedShape2(row_splits, torch::Tensor(), row_splits_acc[num_utt]); -} - -std::vector> GreedySearch( - RnntModel &model, // NOLINT - torch::Tensor encoder_out, torch::Tensor encoder_out_length) { - TORCH_CHECK(encoder_out.dim() == 3, "encoder_out.dim() is ", - encoder_out.dim(), "Expected value is 3"); - TORCH_CHECK(encoder_out.scalar_type() == torch::kFloat, - "encoder_out.scalar_type() is ", encoder_out.scalar_type()); - - TORCH_CHECK(encoder_out_length.dim() == 1, "encoder_out_length.dim() is", - encoder_out_length.dim()); - TORCH_CHECK(encoder_out_length.scalar_type() == torch::kLong, - "encoder_out_length.scalar_type() is ", - encoder_out_length.scalar_type()); - - TORCH_CHECK(encoder_out_length.device().is_cpu()); - - torch::Device device = model.Device(); - encoder_out = encoder_out.to(device); - - torch::nn::utils::rnn::PackedSequence packed_seq = - torch::nn::utils::rnn::pack_padded_sequence(encoder_out, - encoder_out_length, - /*batch_first*/ true, - /*enforce_sorted*/ false); - - auto projected_encoder_out = model.ForwardEncoderProj(packed_seq.data()); - - int32_t blank_id = model.BlankId(); - int32_t unk_id = model.UnkId(); - int32_t context_size = model.ContextSize(); - - int32_t batch_size = encoder_out_length.size(0); - - std::vector blanks(context_size, blank_id); - std::vector> hyps(batch_size, blanks); - - auto decoder_input = - torch::full({batch_size, context_size}, blank_id, - torch::dtype(torch::kLong) - .memory_format(torch::MemoryFormat::Contiguous)); - auto decoder_out = model.ForwardDecoder(decoder_input.to(device)); - decoder_out = model.ForwardDecoderProj(decoder_out); - // decoder_out's shape is (batch_size, 1, joiner_dim) - - using torch::indexing::Slice; - auto batch_sizes_accessor = packed_seq.batch_sizes().accessor(); - int32_t num_batches = packed_seq.batch_sizes().numel(); - int32_t offset = 0; - for (int32_t i = 0; i != num_batches; ++i) { - int32_t cur_batch_size = batch_sizes_accessor[i]; - int32_t start = offset; - int32_t end = start + cur_batch_size; - auto cur_encoder_out = projected_encoder_out.index({Slice(start, end)}); - offset = end; - - cur_encoder_out = cur_encoder_out.unsqueeze(1).unsqueeze(1); - // Now cur_encoder_out's shape is (cur_batch_size, 1, 1, joiner_dim) - if (cur_batch_size < decoder_out.size(0)) { - decoder_out = decoder_out.index({Slice(0, cur_batch_size)}); - } - - auto logits = - model.ForwardJoiner(cur_encoder_out, decoder_out.unsqueeze(1)); - // logits' shape is (cur_batch_size, 1, 1, vocab_size) - - logits = logits.squeeze(1).squeeze(1); - auto max_indices = logits.argmax(/*dim*/ -1).cpu(); - auto max_indices_accessor = max_indices.accessor(); - bool emitted = false; - for (int32_t k = 0; k != cur_batch_size; ++k) { - auto index = max_indices_accessor[k]; - if (index != blank_id && index != unk_id) { - emitted = true; - hyps[k].push_back(index); - } - } - - if (emitted) { - if (cur_batch_size < decoder_input.size(0)) { - decoder_input = decoder_input.index({Slice(0, cur_batch_size)}); - } - BuildDecoderInput(hyps, &decoder_input); - decoder_out = model.ForwardDecoder(decoder_input.to(device)); - 
decoder_out = model.ForwardDecoderProj(decoder_out); - } - } - - auto unsorted_indices = packed_seq.unsorted_indices().cpu(); - auto unsorted_indices_accessor = unsorted_indices.accessor(); - - std::vector> ans(batch_size); - - for (int32_t i = 0; i != batch_size; ++i) { - torch::ArrayRef arr(hyps[unsorted_indices_accessor[i]]); - ans[i] = arr.slice(context_size).vec(); - } - - return ans; -} - -torch::Tensor StreamingGreedySearch( - RnntModel &model, // NOLINT - torch::Tensor encoder_out, torch::Tensor decoder_out, - std::vector> *hyps, - std::vector *num_trailing_blank_frames) { - TORCH_CHECK(encoder_out.dim() == 3, encoder_out.dim(), " vs ", 3); - TORCH_CHECK(decoder_out.dim() == 2, decoder_out.dim(), " vs ", 2); - - TORCH_CHECK(encoder_out.size(0) == decoder_out.size(0), encoder_out.size(0), - " vs ", decoder_out.size(0)); - - TORCH_CHECK(encoder_out.size(0) == hyps->size(), encoder_out.size(0), " vs ", - hyps->size()); - - TORCH_CHECK(hyps->size() == num_trailing_blank_frames->size(), hyps->size(), - " vs ", num_trailing_blank_frames->size()); - - auto device = model.Device(); - int32_t blank_id = model.BlankId(); - int32_t unk_id = model.UnkId(); - int32_t context_size = model.ContextSize(); - - int32_t N = encoder_out.size(0); - int32_t T = encoder_out.size(1); - - auto decoder_input = - torch::full({N, context_size}, blank_id, - torch::dtype(torch::kLong) - .memory_format(torch::MemoryFormat::Contiguous)); - - encoder_out = model.ForwardEncoderProj(encoder_out); - - for (int32_t t = 0; t != T; ++t) { - auto cur_encoder_out = encoder_out.index({torch::indexing::Slice(), t}); - - auto logits = model.ForwardJoiner(cur_encoder_out, decoder_out); - auto max_indices = logits.argmax(/*dim*/ -1).cpu(); - auto max_indices_accessor = max_indices.accessor(); - bool emitted = false; - for (int32_t n = 0; n != N; ++n) { - auto index = max_indices_accessor[n]; - if (index != blank_id && index != unk_id) { - emitted = true; - (*hyps)[n].push_back(index); - (*num_trailing_blank_frames)[n] = 0; - } else { - (*num_trailing_blank_frames)[n] += 1; - } - } - - if (emitted) { - BuildDecoderInput(*hyps, &decoder_input); - decoder_out = model.ForwardDecoder(decoder_input.to(device)).squeeze(1); - decoder_out = model.ForwardDecoderProj(decoder_out); - } - } - return decoder_out; -} - -std::vector> ModifiedBeamSearch( - RnntModel &model, // NOLINT - torch::Tensor encoder_out, torch::Tensor encoder_out_length, - int32_t num_active_paths /*=4*/) { - TORCH_CHECK(encoder_out.dim() == 3, "encoder_out.dim() is ", - encoder_out.dim(), "Expected value is 3"); - TORCH_CHECK(encoder_out.scalar_type() == torch::kFloat, - "encoder_out.scalar_type() is ", encoder_out.scalar_type()); - - TORCH_CHECK(encoder_out_length.dim() == 1, "encoder_out_length.dim() is", - encoder_out_length.dim()); - TORCH_CHECK(encoder_out_length.scalar_type() == torch::kLong, - "encoder_out_length.scalar_type() is ", - encoder_out_length.scalar_type()); - - TORCH_CHECK(encoder_out_length.device().is_cpu()); - - torch::Device device = model.Device(); - encoder_out = encoder_out.to(device); - - torch::nn::utils::rnn::PackedSequence packed_seq = - torch::nn::utils::rnn::pack_padded_sequence(encoder_out, - encoder_out_length, - /*batch_first*/ true, - /*enforce_sorted*/ false); - - auto projected_encoder_out = model.ForwardEncoderProj(packed_seq.data()); - - int32_t blank_id = model.BlankId(); - int32_t unk_id = model.UnkId(); - int32_t context_size = model.ContextSize(); - - int32_t batch_size = encoder_out_length.size(0); - - std::vector 
blanks(context_size, blank_id); - Hypotheses blank_hyp({{blanks, 0}}); - - std::deque finalized; - std::vector cur(batch_size, blank_hyp); - std::vector prev; - - using torch::indexing::Slice; - auto batch_sizes_acc = packed_seq.batch_sizes().accessor(); - int32_t num_batches = packed_seq.batch_sizes().numel(); - int32_t offset = 0; - - for (int32_t i = 0; i != num_batches; ++i) { - int32_t cur_batch_size = batch_sizes_acc[i]; - int32_t start = offset; - int32_t end = start + cur_batch_size; - auto cur_encoder_out = projected_encoder_out.index({Slice(start, end)}); - offset = end; - - cur_encoder_out = cur_encoder_out.unsqueeze(1).unsqueeze(1); - // Now cur_encoder_out's shape is (cur_batch_size, 1, 1, joiner_dim) - - if (cur_batch_size < cur.size()) { - for (int32_t k = static_cast(cur.size()) - 1; - k >= cur_batch_size; --k) { - finalized.push_front(std::move(cur[k])); - } - cur.erase(cur.begin() + cur_batch_size, cur.end()); - } - - // Due to merging paths with identical token sequences, - // not all utterances have "num_active_paths" paths. - auto hyps_shape = GetHypsShape(cur); - int32_t num_hyps = k2::TotSize(hyps_shape, 1); - - prev.clear(); - prev.reserve(num_hyps); - for (auto &hyps : cur) { - for (auto &h : hyps) { - prev.push_back(std::move(h.second)); - } - } - cur.clear(); - cur.reserve(cur_batch_size); - - auto ys_log_probs = torch::empty({num_hyps, 1}, torch::kFloat); - - auto ys_log_probs_acc = ys_log_probs.accessor(); - for (int32_t k = 0; k != prev.size(); ++k) { - ys_log_probs_acc[k][0] = prev[k].log_prob; - } - - auto decoder_input = BuildDecoderInput(prev, context_size).to(device); - - auto decoder_out = model.ForwardDecoder(decoder_input); - decoder_out = model.ForwardDecoderProj(decoder_out); - // decoder_out is of shape (num_hyps, 1, joiner_dim) - - auto index = k2::RowIds(hyps_shape, 1).to(torch::kLong).to(device); - - cur_encoder_out = cur_encoder_out.index_select(/*dim*/ 0, /*index*/ index); - // cur_encoder_out is of shape (num_hyps, 1, 1, joiner_dim) - - auto logits = - model.ForwardJoiner(cur_encoder_out, decoder_out.unsqueeze(1)); - - // logits' shape is (num_hyps, 1, 1, vocab_size) - logits = logits.squeeze(1).squeeze(1); - // now logits' shape is (num_hyps, vocab_size) - - auto log_probs = logits.log_softmax(-1).cpu(); - - log_probs.add_(ys_log_probs); - - int32_t vocab_size = log_probs.size(1); - log_probs = log_probs.reshape(-1); - auto row_splits = k2::RowSplits(hyps_shape, 1); - auto row_splits_acc = row_splits.accessor(); - - for (int32_t k = 0; k != cur_batch_size; ++k) { - int32_t start = row_splits_acc[k]; - int32_t end = row_splits_acc[k + 1]; - - torch::Tensor values, indexes; - std::tie(values, indexes) = - log_probs.slice(/*dim*/ 0, start * vocab_size, end * vocab_size) - .topk(/*k*/ num_active_paths, /*dim*/ 0, - /*largest*/ true, /*sorted*/ true); - - auto topk_hyp_indexes = FloorDivide(indexes, vocab_size); - auto topk_token_indexes = torch::remainder(indexes, vocab_size); - - auto values_acc = values.accessor(); - auto topk_hyp_indexes_acc = topk_hyp_indexes.accessor(); - auto topk_token_indexes_acc = topk_token_indexes.accessor(); - - Hypotheses hyps; - for (int32_t j = 0; j != values.numel(); ++j) { - int32_t hyp_idx = topk_hyp_indexes_acc[j]; - Hypothesis new_hyp = prev[start + hyp_idx]; // note: hyp_idx is 0 based - - int32_t new_token = topk_token_indexes_acc[j]; - if (new_token != blank_id && new_token != unk_id) { - new_hyp.ys.push_back(new_token); - } - - // We already added log_prob of the path to log_probs before, so - // we use 
values_acc[j] here directly. - new_hyp.log_prob = values_acc[j]; - hyps.Add(std::move(new_hyp)); - } - cur.push_back(std::move(hyps)); - } - } - - for (auto &h : finalized) { - cur.push_back(std::move(h)); - } - - auto unsorted_indices = packed_seq.unsorted_indices().cpu(); - auto unsorted_indices_accessor = unsorted_indices.accessor(); - - std::vector> ans(batch_size); - for (int32_t i = 0; i != batch_size; ++i) { - Hypothesis hyp = cur[unsorted_indices_accessor[i]].GetMostProbable(true); - torch::ArrayRef arr(hyp.ys); - ans[i] = arr.slice(context_size).vec(); - } - - return ans; -} - -std::vector StreamingModifiedBeamSearch( - RnntModel &model, // NOLINT - torch::Tensor encoder_out, std::vector in_hyps, - int32_t num_active_paths /*= 4*/) { - TORCH_CHECK(encoder_out.dim() == 3, encoder_out.dim(), " vs ", 3); - - auto device = model.Device(); - int32_t blank_id = model.BlankId(); - int32_t unk_id = model.UnkId(); - int32_t context_size = model.ContextSize(); - - int32_t N = encoder_out.size(0); - int32_t T = encoder_out.size(1); - - encoder_out = model.ForwardEncoderProj(encoder_out); - - std::vector cur = std::move(in_hyps); - std::vector prev; - - for (int32_t t = 0; t != T; ++t) { - auto cur_encoder_out = encoder_out.index({torch::indexing::Slice(), t}); - cur_encoder_out = cur_encoder_out.unsqueeze(1).unsqueeze(1); - // Now cur_encoder_out's shape is (N, 1, 1, joiner_dim) - - // Due to merging paths with identical token sequences, - // not all utterances have "num_active_paths" paths. - auto hyps_shape = GetHypsShape(cur); - int32_t num_hyps = k2::TotSize(hyps_shape, 1); - - prev.clear(); - prev.reserve(num_hyps); - for (auto &hyps : cur) { - for (auto &h : hyps) { - prev.push_back(std::move(h.second)); - } - } - cur.clear(); - cur.reserve(N); - - auto ys_log_probs = torch::empty({num_hyps, 1}, torch::kFloat); - - auto ys_log_probs_acc = ys_log_probs.accessor(); - for (int32_t k = 0; k != num_hyps; ++k) { - ys_log_probs_acc[k][0] = prev[k].log_prob; - } - - auto decoder_input = BuildDecoderInput(prev, context_size).to(device); - auto decoder_out = model.ForwardDecoder(decoder_input); - decoder_out = model.ForwardDecoderProj(decoder_out); - // decoder_out is of shape (num_hyps, 1, joiner_dim) - - auto index = k2::RowIds(hyps_shape, 1).to(torch::kLong).to(device); - cur_encoder_out = cur_encoder_out.index_select(/*dim*/ 0, /*index*/ index); - // cur_encoder_out is of shape (num_hyps, 1, 1, joiner_dim) - - auto logits = - model.ForwardJoiner(cur_encoder_out, decoder_out.unsqueeze(1)); - // logits' shape is (num_hyps, 1, 1, vocab_size) - logits = logits.squeeze(1).squeeze(1); - // now logits' shape is (num_hyps, vocab_size) - - auto log_probs = logits.log_softmax(-1).cpu(); - - log_probs.add_(ys_log_probs); - - int32_t vocab_size = log_probs.size(1); - log_probs = log_probs.reshape(-1); - auto row_splits = k2::RowSplits(hyps_shape, 1); - auto row_splits_acc = row_splits.accessor(); - - for (int32_t k = 0; k != N; ++k) { - int32_t start = row_splits_acc[k]; - int32_t end = row_splits_acc[k + 1]; - - torch::Tensor values, indexes; - std::tie(values, indexes) = - log_probs.slice(/*dim*/ 0, start * vocab_size, end * vocab_size) - .topk(/*k*/ num_active_paths, /*dim*/ 0, - /*largest*/ true, /*sorted*/ true); - - auto topk_hyp_indexes = FloorDivide(indexes, vocab_size); - auto topk_token_indexes = torch::remainder(indexes, vocab_size); - - auto values_acc = values.accessor(); - auto topk_hyp_indexes_acc = topk_hyp_indexes.accessor(); - auto topk_token_indexes_acc = 
topk_token_indexes.accessor(); - - Hypotheses hyps; - for (int32_t j = 0; j != values.numel(); ++j) { - int32_t hyp_idx = topk_hyp_indexes_acc[j]; - Hypothesis new_hyp = prev[start + hyp_idx]; // note: hyp_idx is 0 based - - int32_t new_token = topk_token_indexes_acc[j]; - if (new_token != blank_id && new_token != unk_id) { - new_hyp.ys.push_back(new_token); - new_hyp.num_trailing_blanks = 0; - } else { - new_hyp.num_trailing_blanks += 1; - } - - // We already added log_prob of the path to log_probs before, so - // we use values_acc[j] here directly. - new_hyp.log_prob = values_acc[j]; - hyps.Add(std::move(new_hyp)); - } - cur.push_back(std::move(hyps)); - } // for (int32_t k = 0; k != N; ++k) - } // for (int32_t t = 0; t != T; ++t) - - return cur; -} - -} // namespace sherpa diff --git a/sherpa/csrc/rnnt_beam_search.h b/sherpa/csrc/rnnt_beam_search.h deleted file mode 100644 index 160a16dfc..000000000 --- a/sherpa/csrc/rnnt_beam_search.h +++ /dev/null @@ -1,118 +0,0 @@ -/** - * Copyright (c) 2022 Xiaomi Corporation (authors: Fangjun Kuang) - * - * See LICENSE for clarification regarding multiple authors - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#ifndef SHERPA_CSRC_RNNT_BEAM_SEARCH_H_ -#define SHERPA_CSRC_RNNT_BEAM_SEARCH_H_ - -#include - -#include "sherpa/csrc/hypothesis.h" -#include "sherpa/csrc/rnnt_conformer_model.h" -#include "sherpa/csrc/rnnt_emformer_model.h" -#include "sherpa/csrc/rnnt_model.h" - -namespace sherpa { - -/** RNN-T greedy search decoding by limiting the max symbol per frame to one. - * - * @param model The RNN-T model. - * - * @param encoder_out Output from the encoder network. Its shape is - * (batch_size, T, encoder_out_dim) and its dtype is - * torch::kFloat. It should be on the same device as `model`. - * - * @param encoder_out_lens A 1-D tensor containing the valid frames before - * padding in `encoder_out`. Its dtype is torch.kLong - * and its shape is (batch_size,). Also, it must be - * on CPU. - * - * @return Return A list-of-list of token IDs containing the decoded results. - * The returned vector has size `batch_size` and each entry contains the - * decoded results for the corresponding input in encoder_out. - */ -std::vector> GreedySearch( - RnntModel &model, // NOLINT - torch::Tensor encoder_out, torch::Tensor encoder_out_length); - -/** Greedy search for streaming recognition. - * - * @param model The stateless RNN-T Emformer model. - * @param encoder_out A 3-D tensor of shape (N, T, C). It should be on the same - * device as `model`. - * @param decoder_out A 2-D tensor of shape (N, C). It should be on the same - * device as `model`. - * @param hyps The decoded tokens. Note: It is modified in-place. - * @param num_trailing_blank_frames Number of trailing blank frames. It is - * updated in-place. - * - * @return Return the decoder output for the next chunk. 
- */ -torch::Tensor StreamingGreedySearch( - RnntModel &model, // NOLINT - torch::Tensor encoder_out, torch::Tensor decoder_out, - std::vector> *hyps, - std::vector *num_trailing_blank_frames); - -/** RNN-T modified beam search for offline recognition. - * - * By modified we mean that the maximum symbol per frame is limited to 1. - * - * @param model The RNN-T model. - * @param encoder_out Output from the encoder network. Its shape is - * (batch_size, T, encoder_out_dim) and its dtype is - * torch::kFloat. It should be on the same device as `model`. - * - * @param encoder_out_lens A 1-D tensor containing the valid frames before - * padding in `encoder_out`. Its dtype is torch.kLong - * and its shape is (batch_size,). Also, it must be - * on CPU. - * - * @param num_active_paths Number of active paths for each utterance. - * Note: Due to merging paths with identical token - * sequences, the actual number of active paths for - * each utterance may be smaller than this value. - * - * @return Return A list-of-list of token IDs containing the decoded results. - * The returned vector has size `batch_size` and each entry contains the - * decoded results for the corresponding input in encoder_out. - */ -std::vector> ModifiedBeamSearch( - RnntModel &model, // NOLINT - torch::Tensor encoder_out, torch::Tensor encoder_out_length, - int32_t num_active_paths = 4); - -/** Modified beam search for streaming recognition. - * - * @param model The stateless RNN-T Emformer model. - * @param encoder_out A 3-D tensor of shape (N, T, C). It should be on the same - * device as `model`. - * @param hyps The decoded results from the previous chunk. - * @param num_active_paths Number of active paths for each utterance. - * Note: Due to merging paths with identical token - * sequences, the actual number of active paths for - * each utterance may be smaller than this value. - * - * @return Return the decoded results for the next chunk. - */ -std::vector StreamingModifiedBeamSearch( - RnntModel &model, // NOLINT - torch::Tensor encoder_out, std::vector hyps, - int32_t num_active_paths = 4); - -} // namespace sherpa - -#endif // SHERPA_CSRC_RNNT_BEAM_SEARCH_H_ diff --git a/sherpa/csrc/rnnt_conformer_model.cc b/sherpa/csrc/rnnt_conformer_model.cc deleted file mode 100644 index f4e01e504..000000000 --- a/sherpa/csrc/rnnt_conformer_model.cc +++ /dev/null @@ -1,127 +0,0 @@ -/** - * Copyright (c) 2022 Xiaomi Corporation (authors: Fangjun Kuang) - * - * See LICENSE for clarification regarding multiple authors - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -#include "sherpa/csrc/rnnt_conformer_model.h" - -#include - -namespace sherpa { - -RnntConformerModel::RnntConformerModel(const std::string &filename, - torch::Device device /*=torch::kCPU*/, - bool optimize_for_inference /*=false*/) - : device_(device) { - model_ = torch::jit::load(filename, device); - model_.eval(); -#if SHERPA_TORCH_VERSION_MAJOR > 1 || \ - (SHERPA_TORCH_VERSION_MAJOR == 1 && SHERPA_TORCH_VERSION_MINOR >= 10) - // torch::jit::optimize_for_inference is available only in torch>=1.10 - if (optimize_for_inference) { - model_ = torch::jit::optimize_for_inference(model_); - } -#endif - - encoder_ = model_.attr("encoder").toModule(); - decoder_ = model_.attr("decoder").toModule(); - joiner_ = model_.attr("joiner").toModule(); - - encoder_proj_ = joiner_.attr("encoder_proj").toModule(); - decoder_proj_ = joiner_.attr("decoder_proj").toModule(); - - blank_id_ = decoder_.attr("blank_id").toInt(); - vocab_size_ = decoder_.attr("vocab_size").toInt(); - - unk_id_ = blank_id_; - if (decoder_.hasattr("unk_id")) { - unk_id_ = decoder_.attr("unk_id").toInt(); - } - - context_size_ = decoder_.attr("context_size").toInt(); -} - -std::pair RnntConformerModel::ForwardEncoder( - const torch::Tensor &features, const torch::Tensor &features_length) { - auto outputs = model_.attr("encoder") - .toModule() - .run_method("forward", features, features_length) - .toTuple(); - - auto encoder_out = outputs->elements()[0].toTensor(); - auto encoder_out_length = outputs->elements()[1].toTensor(); - - return {encoder_out, encoder_out_length}; -} - -RnntConformerModel::State RnntConformerModel::GetEncoderInitStates( - int32_t left_context) { - torch::IValue ivalue = - encoder_.run_method("get_init_state", left_context, device_); - torch::List list = ivalue.toList(); - - RnntConformerModel::State states = {list.get(0).toTensor(), - list.get(1).toTensor()}; - return states; -} - -std::tuple -RnntConformerModel::StreamingForwardEncoder( - const torch::Tensor &features, const torch::Tensor &features_length, - const RnntConformerModel::State &states, - const torch::Tensor &processed_frames, int32_t left_context, - int32_t right_context) { - auto outputs = - encoder_ - .run_method("streaming_forward", features, features_length, states, - processed_frames, left_context, right_context) - .toTuple(); - auto encoder_out = outputs->elements()[0].toTensor(); - auto encoder_out_length = outputs->elements()[1].toTensor(); - - torch::List list = outputs->elements()[2].toList(); - - RnntConformerModel::State next_states = {list.get(0).toTensor(), - list.get(1).toTensor()}; - - return {encoder_out, encoder_out_length, next_states}; -} - -torch::Tensor RnntConformerModel::ForwardDecoder( - const torch::Tensor &decoder_input) { - return decoder_.run_method("forward", decoder_input, /*need_pad*/ false) - .toTensor(); -} - -torch::Tensor RnntConformerModel::ForwardJoiner( - const torch::Tensor &projected_encoder_out, - const torch::Tensor &projected_decoder_out) { - return joiner_ - .run_method("forward", projected_encoder_out, projected_decoder_out, - /*project_input*/ false) - .toTensor(); -} - -torch::Tensor RnntConformerModel::ForwardEncoderProj( - const torch::Tensor &encoder_out) { - return encoder_proj_.run_method("forward", encoder_out).toTensor(); -} - -torch::Tensor RnntConformerModel::ForwardDecoderProj( - const torch::Tensor &decoder_out) { - return decoder_proj_.run_method("forward", decoder_out).toTensor(); -} - -} // namespace sherpa diff --git a/sherpa/csrc/rnnt_conformer_model.h 
b/sherpa/csrc/rnnt_conformer_model.h deleted file mode 100644 index af36fb4e2..000000000 --- a/sherpa/csrc/rnnt_conformer_model.h +++ /dev/null @@ -1,151 +0,0 @@ -/** - * Copyright (c) 2022 Xiaomi Corporation (authors: Fangjun Kuang, - * Wei kang) - * - * See LICENSE for clarification regarding multiple authors - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#ifndef SHERPA_CSRC_RNNT_CONFORMER_MODEL_H_ -#define SHERPA_CSRC_RNNT_CONFORMER_MODEL_H_ - -#include -#include -#include -#include - -#include "sherpa/csrc/rnnt_model.h" -#include "torch/script.h" - -namespace sherpa { - -/** It wraps a torch script model, which is from - * pruned_transducer_stateless2/model.py or - * pruned_transducer_stateless4/model.py within icefall. - */ -class RnntConformerModel : public RnntModel { - public: - /** - * @param filename Path name of the torch script model. - * @param device The model will be moved to this device - * @param optimize_for_inference true to invoke - * torch::jit::optimize_for_inference(). - */ - explicit RnntConformerModel(const std::string &filename, - torch::Device device = torch::kCPU, - bool optimize_for_inference = false); - - ~RnntConformerModel() override = default; - - using State = std::vector; - - State GetEncoderInitStates(int32_t left_context); - - torch::Device Device() const override { return device_; } - - int32_t BlankId() const override { return blank_id_; } - int32_t UnkId() const override { return unk_id_; } - int32_t ContextSize() const override { return context_size_; } - int32_t VocabSize() const override { return vocab_size_; } - // Hard code the subsampling_factor to 4 here since the subsampling - // method uses ((len - 1) // 2 - 1) // 2) - int32_t SubSamplingFactor() const { return 4; } - - /** Run the encoder network. - * - * @param features A 3-D tensor of shape (N, T, C). - * @param features_length A 1-D tensor of shape (N,) containing the number of - * valid frames in `features`. - * @return Return a pair containing two tensors: - * - encoder_out, a 3-D tensor of shape (N, T, C) - * - encoder_out_length, a 1-D tensor of shape (N,) containing the - * number of valid frames in `encoder_out`. - */ - std::pair ForwardEncoder( - const torch::Tensor &features, const torch::Tensor &features_length); - - /** Run the encoder network in streaming mode. - * - * @param features A 3-D tensor of shape (N, T, C). - * @param features_length A 1-D tensor of shape (N,) containing the number of - * valid frames in `features`. - * @param states A list of tensors containing the decode caches of previous - * frames. It is almost transparent to users, initially this - * comes from the return value of `GetEncoderInitStates`, then it - * will be updated after finishing each chunk. - * @param processed_lengths How many frames have processed until now. - * @param left_context How many previous frames can be seen for current - * chunk. - * @param right_context How many future frames can be seen for current - * chunk. 
- * @return Return a pair containing two tensors: - * - encoder_out, a 3-D tensor of shape (N, T, C) - * - encoder_out_length, a 1-D tensor of shape (N,) containing the - * number of valid frames in `encoder_out`. - */ - std::tuple StreamingForwardEncoder( - const torch::Tensor &features, const torch::Tensor &features_length, - const State &states, const torch::Tensor &processed_frames, - int32_t left_context, int32_t right_context); - - /** Run the decoder network. - * - * @param decoder_input A 2-D tensor of shape (N, U). - * @return Return a tensor of shape (N, U, decoder_dim) - */ - torch::Tensor ForwardDecoder(const torch::Tensor &decoder_input) override; - - /** Run the joiner network. - * - * @param projected_encoder_out A 3-D tensor of shape (N, T, C). - * @param projected_decoder_out A 3-D tensor of shape (N, U, C). - * @return Return a tensor of shape (N, T, U, vocab_size) - */ - torch::Tensor ForwardJoiner( - const torch::Tensor &projected_encoder_out, - const torch::Tensor &projected_decoder_out) override; - - /** Run the joiner.encoder_proj network. - * - * @param encoder_out The output from the encoder, which is of shape (N,T,C). - * @return Return a tensor of shape (N, T, joiner_dim). - */ - torch::Tensor ForwardEncoderProj(const torch::Tensor &encoder_out) override; - - /** Run the joiner.decoder_proj network. - * - * @param decoder_out The output from the encoder, which is of shape (N,T,C). - * @return Return a tensor of shape (N, T, joiner_dim). - */ - torch::Tensor ForwardDecoderProj(const torch::Tensor &decoder_out) override; - - private: - torch::jit::Module model_; - - // The following modules are just aliases to modules in model_ - torch::jit::Module encoder_; - torch::jit::Module decoder_; - torch::jit::Module joiner_; - torch::jit::Module encoder_proj_; - torch::jit::Module decoder_proj_; - - torch::Device device_; - int32_t blank_id_; - int32_t unk_id_; - int32_t context_size_; - int32_t vocab_size_; -}; - -} // namespace sherpa - -#endif // SHERPA_CSRC_RNNT_CONFORMER_MODEL_H_ diff --git a/sherpa/csrc/rnnt_conv_emformer_model.cc b/sherpa/csrc/rnnt_conv_emformer_model.cc deleted file mode 100644 index 3a9eef9f9..000000000 --- a/sherpa/csrc/rnnt_conv_emformer_model.cc +++ /dev/null @@ -1,155 +0,0 @@ -/** - * Copyright (c) 2022 Xiaomi Corporation (authors: Fangjun Kuang) - * - * See LICENSE for clarification regarding multiple authors - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -#include "sherpa/csrc/rnnt_conv_emformer_model.h" - -#include - -namespace sherpa { - -RnntConvEmformerModel::RnntConvEmformerModel( - const std::string &filename, torch::Device device /*=torch::kCPU*/, - bool optimize_for_inference /*=false*/) - : device_(device) { - model_ = torch::jit::load(filename, device); - model_.eval(); - -#if SHERPA_TORCH_VERSION_MAJOR > 1 || \ - (SHERPA_TORCH_VERSION_MAJOR == 1 && SHERPA_TORCH_VERSION_MINOR >= 10) - // torch::jit::optimize_for_inference is available only in torch>=1.10 - if (optimize_for_inference) { - model_ = torch::jit::optimize_for_inference(model_); - } -#endif - - encoder_ = model_.attr("encoder").toModule(); - decoder_ = model_.attr("decoder").toModule(); - joiner_ = model_.attr("joiner").toModule(); - - encoder_proj_ = joiner_.attr("encoder_proj").toModule(); - decoder_proj_ = joiner_.attr("decoder_proj").toModule(); - - blank_id_ = decoder_.attr("blank_id").toInt(); - vocab_size_ = decoder_.attr("vocab_size").toInt(); - - unk_id_ = blank_id_; - if (decoder_.hasattr("unk_id")) { - unk_id_ = decoder_.attr("unk_id").toInt(); - } - - context_size_ = decoder_.attr("context_size").toInt(); - chunk_length_ = encoder_.attr("chunk_length").toInt(); - right_context_length_ = encoder_.attr("right_context_length").toInt(); - // Add 2 here since we will drop the first and last frame after subsampling; - // Add 3 here since the subsampling is ((len - 1) // 2 - 1) // 2. - pad_length_ = right_context_length_ + - 2 * encoder_.attr("subsampling_factor").toInt() + 3; -} - -std::tuple -RnntConvEmformerModel::StreamingForwardEncoder( - const torch::Tensor &features, const torch::Tensor &features_length, - const torch::Tensor &num_processed_frames, State states) { - // It contains [torch.Tensor, torch.Tensor, List[List[torch.Tensor]] - // which are [encoder_out, encoder_out_len, states] - // - // We skip the second entry `encoder_out_len` since we assume the - // feature input are of fixed chunk size and there are no paddings. - // We can figure out `encoder_out_len` from `encoder_out`. 
- auto states_tuple = torch::ivalue::Tuple::create(states.first, states.second); - torch::IValue ivalue = encoder_.run_method( - "infer", features, features_length, num_processed_frames, states_tuple); - auto tuple_ptr = ivalue.toTuple(); - torch::Tensor encoder_out = tuple_ptr->elements()[0].toTensor(); - - torch::Tensor encoder_out_length = tuple_ptr->elements()[1].toTensor(); - - auto tuple_ptr_states = tuple_ptr->elements()[2].toTuple(); - torch::List list_attn = - tuple_ptr_states->elements()[0].toList(); - torch::List list_conv = - tuple_ptr_states->elements()[1].toList(); - - int32_t num_layers = list_attn.size(); - - std::vector> next_state_attn; - next_state_attn.reserve(num_layers); - for (int32_t i = 0; i != num_layers; ++i) { - next_state_attn.emplace_back( - c10::impl::toTypedList(list_attn.get(i).toList()).vec()); - } - - std::vector next_state_conv; - next_state_conv.reserve(num_layers); - for (int32_t i = 0; i != num_layers; ++i) { - next_state_conv.emplace_back(list_conv.get(i).toTensor()); - } - - State next_states = {next_state_attn, next_state_conv}; - - return {encoder_out, encoder_out_length, next_states}; -} - -RnntConvEmformerModel::State RnntConvEmformerModel::GetEncoderInitStates() { - torch::IValue ivalue = encoder_.run_method("init_states", device_); - auto tuple_ptr = ivalue.toTuple(); - torch::List list_attn = tuple_ptr->elements()[0].toList(); - torch::List list_conv = tuple_ptr->elements()[1].toList(); - - int32_t num_layers = list_attn.size(); - - std::vector> state_attn; - state_attn.reserve(num_layers); - for (int32_t i = 0; i != num_layers; ++i) { - state_attn.emplace_back( - c10::impl::toTypedList(list_attn.get(i).toList()).vec()); - } - - std::vector state_conv; - state_conv.reserve(num_layers); - for (int32_t i = 0; i != num_layers; ++i) { - state_conv.emplace_back(list_conv.get(i).toTensor()); - } - - return {state_attn, state_conv}; -} - -torch::Tensor RnntConvEmformerModel::ForwardDecoder( - const torch::Tensor &decoder_input) { - return decoder_.run_method("forward", decoder_input, /*need_pad*/ false) - .toTensor(); -} - -torch::Tensor RnntConvEmformerModel::ForwardJoiner( - const torch::Tensor &projected_encoder_out, - const torch::Tensor &projected_decoder_out) { - return joiner_ - .run_method("forward", projected_encoder_out, projected_decoder_out, - /*project_input*/ false) - .toTensor(); -} - -torch::Tensor RnntConvEmformerModel::ForwardEncoderProj( - const torch::Tensor &encoder_out) { - return encoder_proj_.run_method("forward", encoder_out).toTensor(); -} - -torch::Tensor RnntConvEmformerModel::ForwardDecoderProj( - const torch::Tensor &decoder_out) { - return decoder_proj_.run_method("forward", decoder_out).toTensor(); -} -} // namespace sherpa diff --git a/sherpa/csrc/rnnt_conv_emformer_model.h b/sherpa/csrc/rnnt_conv_emformer_model.h deleted file mode 100644 index db2191502..000000000 --- a/sherpa/csrc/rnnt_conv_emformer_model.h +++ /dev/null @@ -1,118 +0,0 @@ -/** - * Copyright (c) 2022 Xiaomi Corporation (authors: Fangjun Kuang) - * - * See LICENSE for clarification regarding multiple authors - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#ifndef SHERPA_CSRC_RNNT_CONV_EMFORMER_MODEL_H_ -#define SHERPA_CSRC_RNNT_CONV_EMFORMER_MODEL_H_ - -#include -#include -#include -#include - -#include "sherpa/csrc/rnnt_model.h" -#include "torch/script.h" - -namespace sherpa { - -/** It wraps a torch script model, which is from - * pruned_stateless_emformer_rnnt2/model.py within icefall. - */ -class RnntConvEmformerModel : public RnntModel { - public: - /** - * @param filename Path name of the torch script model. - * @param device The model will be moved to this device - * @param optimize_for_inference true to invoke - * torch::jit::optimize_for_inference(). - */ - explicit RnntConvEmformerModel(const std::string &filename, - torch::Device device = torch::kCPU, - bool optimize_for_inference = false); - - ~RnntConvEmformerModel() override = default; - - using State = std::pair>, - std::vector>; - - std::tuple StreamingForwardEncoder( - const torch::Tensor &features, const torch::Tensor &features_length, - const torch::Tensor &num_processed_frames, State states); - - State GetEncoderInitStates(); - - /** Run the decoder network. - * - * @param decoder_input A 2-D tensor of shape (N, U). - * @return Return a tensor of shape (N, U, decoder_dim) - */ - torch::Tensor ForwardDecoder(const torch::Tensor &decoder_input) override; - - /** Run the joiner network. - * - * @param encoder_out A 2-D tensor of shape (N, C). - * @param decoder_out A 2-D tensor of shape (N, C). - * @return Return a tensor of shape (N, vocab_size) - */ - torch::Tensor ForwardJoiner( - const torch::Tensor &projected_encoder_out, - const torch::Tensor &projected_decoder_out) override; - /** Run the joiner.encoder_proj network. - * - * @param encoder_out The output from the encoder, which is of shape (N,T,C). - * @return Return a tensor of shape (N, T, joiner_dim). - */ - torch::Tensor ForwardEncoderProj(const torch::Tensor &encoder_out) override; - - /** Run the joiner.decoder_proj network. - * - * @param decoder_out The output from the encoder, which is of shape (N,T,C). - * @return Return a tensor of shape (N, T, joiner_dim). 
- */ - torch::Tensor ForwardDecoderProj(const torch::Tensor &decoder_out) override; - - torch::Device Device() const override { return device_; } - int32_t BlankId() const override { return blank_id_; } - int32_t UnkId() const override { return unk_id_; } - int32_t ContextSize() const override { return context_size_; } - int32_t VocabSize() const override { return vocab_size_; } - int32_t ChunkLength() const { return chunk_length_; } - int32_t RightContextLength() const { return right_context_length_; } - int32_t PadLength() const { return pad_length_; } - - private: - torch::jit::Module model_; - - // The following modules are just aliases to modules in model_ - torch::jit::Module encoder_; - torch::jit::Module decoder_; - torch::jit::Module joiner_; - torch::jit::Module encoder_proj_; - torch::jit::Module decoder_proj_; - - torch::Device device_; - int32_t blank_id_; - int32_t vocab_size_; - int32_t unk_id_; - int32_t context_size_; - int32_t chunk_length_; - int32_t right_context_length_; - int32_t pad_length_; -}; - -} // namespace sherpa - -#endif // SHERPA_CSRC_RNNT_CONV_EMFORMER_MODEL_H_ diff --git a/sherpa/csrc/rnnt_emformer_model.cc b/sherpa/csrc/rnnt_emformer_model.cc deleted file mode 100644 index d2df20907..000000000 --- a/sherpa/csrc/rnnt_emformer_model.cc +++ /dev/null @@ -1,110 +0,0 @@ -/** - * Copyright (c) 2022 Xiaomi Corporation (authors: Fangjun Kuang) - * - * See LICENSE for clarification regarding multiple authors - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -#include "sherpa/csrc/rnnt_emformer_model.h" - -#include - -namespace sherpa { - -RnntEmformerModel::RnntEmformerModel(const std::string &filename, - torch::Device device /*=torch::kCPU*/, - bool optimize_for_inference /*=false*/) - : device_(device) { - model_ = torch::jit::load(filename, device); - model_.eval(); - -#if SHERPA_TORCH_VERSION_MAJOR > 1 || \ - (SHERPA_TORCH_VERSION_MAJOR == 1 && SHERPA_TORCH_VERSION_MINOR >= 10) - // torch::jit::optimize_for_inference is available only in torch>=1.10 - if (optimize_for_inference) { - model_ = torch::jit::optimize_for_inference(model_); - } -#endif - - encoder_ = model_.attr("encoder").toModule(); - decoder_ = model_.attr("decoder").toModule(); - joiner_ = model_.attr("joiner").toModule(); - - blank_id_ = decoder_.attr("blank_id").toInt(); - vocab_size_ = decoder_.attr("vocab_size").toInt(); - - unk_id_ = blank_id_; - if (decoder_.hasattr("unk_id")) { - unk_id_ = decoder_.attr("unk_id").toInt(); - } - - context_size_ = decoder_.attr("context_size").toInt(); - segment_length_ = encoder_.attr("segment_length").toInt(); - right_context_length_ = encoder_.attr("right_context_length").toInt(); -} - -std::tuple -RnntEmformerModel::StreamingForwardEncoder( - const torch::Tensor &features, const torch::Tensor &features_length, - torch::optional states /*= torch::nullopt*/) { - // It contains [torch.Tensor, torch.Tensor, List[List[torch.Tensor]] - // which are [encoder_out, encoder_out_len, states] - // - // We skip the second entry `encoder_out_len` since we assume the - // feature input are of fixed chunk size and there are no paddings. - // We can figure out `encoder_out_len` from `encoder_out`. - torch::IValue ivalue = encoder_.run_method("streaming_forward", features, - features_length, states); - auto tuple_ptr = ivalue.toTuple(); - torch::Tensor encoder_out = tuple_ptr->elements()[0].toTensor(); - torch::Tensor encoder_out_length = tuple_ptr->elements()[1].toTensor(); - - torch::List list = tuple_ptr->elements()[2].toList(); - int32_t num_layers = list.size(); - - std::vector> next_states; - next_states.reserve(num_layers); - - for (int32_t i = 0; i != num_layers; ++i) { - next_states.emplace_back( - c10::impl::toTypedList(list.get(i).toList()).vec()); - } - - return {encoder_out, encoder_out_length, next_states}; -} - -RnntEmformerModel::State RnntEmformerModel::GetEncoderInitStates() { - torch::IValue ivalue = encoder_.run_method("get_init_state", device_); - torch::List list = ivalue.toList(); - int32_t num_layers = list.size(); - State states; - states.reserve(num_layers); - for (int32_t i = 0; i != num_layers; ++i) { - states.emplace_back( - c10::impl::toTypedList(list.get(i).toList()).vec()); - } - return states; -} - -torch::Tensor RnntEmformerModel::ForwardDecoder( - const torch::Tensor &decoder_input) { - return decoder_.run_method("forward", decoder_input, /*need_pad*/ false) - .toTensor(); -} - -torch::Tensor RnntEmformerModel::ForwardJoiner( - const torch::Tensor &encoder_out, const torch::Tensor &decoder_out) { - return joiner_.run_method("forward", encoder_out, decoder_out).toTensor(); -} - -} // namespace sherpa diff --git a/sherpa/csrc/rnnt_emformer_model.h b/sherpa/csrc/rnnt_emformer_model.h deleted file mode 100644 index ebf096083..000000000 --- a/sherpa/csrc/rnnt_emformer_model.h +++ /dev/null @@ -1,99 +0,0 @@ -/** - * Copyright (c) 2022 Xiaomi Corporation (authors: Fangjun Kuang) - * - * See LICENSE for clarification regarding multiple authors - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * 
you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#ifndef SHERPA_CSRC_RNNT_EMFORMER_MODEL_H_ -#define SHERPA_CSRC_RNNT_EMFORMER_MODEL_H_ - -#include -#include -#include -#include - -#include "sherpa/csrc/rnnt_model.h" -#include "torch/script.h" - -namespace sherpa { - -/** It wraps a torch script model, which is from - * pruned_stateless_emformer_rnnt2/model.py within icefall. - */ -class RnntEmformerModel : public RnntModel { - public: - /** - * @param filename Path name of the torch script model. - * @param device The model will be moved to this device - * @param optimize_for_inference true to invoke - * torch::jit::optimize_for_inference(). - */ - explicit RnntEmformerModel(const std::string &filename, - torch::Device device = torch::kCPU, - bool optimize_for_inference = false); - - ~RnntEmformerModel() override = default; - - using State = std::vector>; - - std::tuple StreamingForwardEncoder( - const torch::Tensor &features, const torch::Tensor &features_length, - torch::optional states = torch::nullopt); - - State GetEncoderInitStates(); - - /** Run the decoder network. - * - * @param decoder_input A 2-D tensor of shape (N, U). - * @return Return a tensor of shape (N, U, decoder_dim) - */ - torch::Tensor ForwardDecoder(const torch::Tensor &decoder_input) override; - - /** Run the joiner network. - * - * @param encoder_out A 2-D tensor of shape (N, C). - * @param decoder_out A 2-D tensor of shape (N, C). - * @return Return a tensor of shape (N, vocab_size) - */ - torch::Tensor ForwardJoiner(const torch::Tensor &encoder_out, - const torch::Tensor &decoder_out) override; - - torch::Device Device() const override { return device_; } - int32_t BlankId() const override { return blank_id_; } - int32_t UnkId() const override { return unk_id_; } - int32_t ContextSize() const override { return context_size_; } - int32_t VocabSize() const override { return vocab_size_; } - int32_t SegmentLength() const { return segment_length_; } - int32_t RightContextLength() const { return right_context_length_; } - - private: - torch::jit::Module model_; - - // The following modules are just aliases to modules in model_ - torch::jit::Module encoder_; - torch::jit::Module decoder_; - torch::jit::Module joiner_; - - torch::Device device_; - int32_t blank_id_; - int32_t unk_id_; - int32_t vocab_size_; - int32_t context_size_; - int32_t segment_length_; - int32_t right_context_length_; -}; - -} // namespace sherpa - -#endif // SHERPA_CSRC_RNNT_EMFORMER_MODEL_H_ diff --git a/sherpa/csrc/rnnt_model.h b/sherpa/csrc/rnnt_model.h deleted file mode 100644 index 659ec1ec7..000000000 --- a/sherpa/csrc/rnnt_model.h +++ /dev/null @@ -1,85 +0,0 @@ -/** - * Copyright (c) 2022 Xiaomi Corporation (authors: Wei Kang) - * - * See LICENSE for clarification regarding multiple authors - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#ifndef SHERPA_CSRC_RNNT_MODEL_H_ -#define SHERPA_CSRC_RNNT_MODEL_H_ - -#include -#include - -#include "torch/script.h" - -namespace sherpa { - -/** The base class of stateless transducer model, it has an encoder, decoder - * and joiner, and the decoder is stateless. - * See the code in pruned_transducer_statelessX/model.py in icefall - * repo for for more details. - */ - -class RnntModel { - public: - virtual ~RnntModel() = default; - - virtual torch::Device Device() const = 0; - - virtual int32_t BlankId() const = 0; - virtual int32_t UnkId() const = 0; - virtual int32_t ContextSize() const = 0; - virtual int32_t VocabSize() const = 0; - - int32_t SubsamplingFactor() const { return 4; } - - /** Run the decoder network. - * - * @param decoder_input A 2-D tensor of shape (N, U). - * @return Return a tensor of shape (N, U, decoder_dim) - */ - virtual torch::Tensor ForwardDecoder(const torch::Tensor &decoder_input) = 0; - - /** Run the joiner network. - * - * @param projected_encoder_out A 2-D tensor of shape (N, C). - * @param projected_decoder_out A 2-D tensor of shape (N, C). - * @return Return a tensor of shape (N, vocab_size) - */ - virtual torch::Tensor ForwardJoiner( - const torch::Tensor &projected_encoder_out, - const torch::Tensor &projected_decoder_out) = 0; - - /** Run the joiner.encoder_proj network. - * - * @param encoder_out The output from the encoder, which is of shape (N,T,C). - * @return Return a tensor of shape (N, T, joiner_dim). - */ - virtual torch::Tensor ForwardEncoderProj(const torch::Tensor &encoder_out) { - return encoder_out; - } - - /** Run the joiner.decoder_proj network. - * - * @param decoder_out The output from the encoder, which is of shape (N,T,C). - * @return Return a tensor of shape (N, T, joiner_dim). - */ - virtual torch::Tensor ForwardDecoderProj(const torch::Tensor &decoder_out) { - return decoder_out; - } -}; - -} // namespace sherpa - -#endif // SHERPA_CSRC_RNNT_MODEL_H_ diff --git a/sherpa/csrc/sherpa-compute-speaker-similarity.cc b/sherpa/csrc/sherpa-compute-speaker-similarity.cc new file mode 100644 index 000000000..4a0f35bc4 --- /dev/null +++ b/sherpa/csrc/sherpa-compute-speaker-similarity.cc @@ -0,0 +1,93 @@ +// sherpa/csrc/sherpa-compute-speaker-similarity.cc +// +// Copyright (c) 2025 Xiaomi Corporation + +#include // NOLINT +#include + +#include "sherpa/cpp_api/parse-options.h" +#include "sherpa/csrc/fbank-features.h" +#include "sherpa/csrc/speaker-embedding-extractor.h" + +int32_t main(int32_t argc, char *argv[]) { + const char *kUsageMessage = R"usage( +This program uses a speaker embedding model to compute +similarity between two wave files. 
+ +sherpa-compute-speaker-similarity \ + --model=/path/to/model.pt \ + ./foo.wav \ + ./bar.wav \ +)usage"; + + int32_t num_threads = 1; + sherpa::ParseOptions po(kUsageMessage); + sherpa::SpeakerEmbeddingExtractorConfig config; + config.Register(&po); + po.Register("num-threads", &num_threads, "Number of threads for PyTorch"); + po.Read(argc, argv); + + if (po.NumArgs() != 2) { + std::cerr << "Please provide only 2 test waves\n"; + exit(-1); + } + + std::cerr << config.ToString() << "\n"; + if (!config.Validate()) { + std::cerr << "Please check your config\n"; + return -1; + } + + int32_t sr = 16000; + sherpa::SpeakerEmbeddingExtractor extractor(config); + + const auto begin = std::chrono::steady_clock::now(); + + torch::Tensor samples1 = sherpa::ReadWave(po.GetArg(1), sr).first; + + auto stream1 = extractor.CreateStream(); + stream1->AcceptSamples(samples1.data_ptr(), samples1.numel()); + + torch::Tensor samples2 = sherpa::ReadWave(po.GetArg(2), sr).first; + + auto stream2 = extractor.CreateStream(); + stream2->AcceptSamples(samples2.data_ptr(), samples2.numel()); + + torch::Tensor embedding1; + torch::Tensor embedding2; + if (false) { + embedding1 = extractor.Compute(stream1.get()).squeeze(0); + embedding2 = extractor.Compute(stream2.get()).squeeze(0); + } else { + std::vector ss{stream1.get(), stream2.get()}; + auto embeddings = extractor.Compute(ss.data(), ss.size()); + + embedding1 = embeddings.index({0}); + embedding2 = embeddings.index({1}); + } + + auto score = + torch::nn::functional::cosine_similarity( + embedding1, embedding2, + torch::nn::functional::CosineSimilarityFuncOptions{}.dim(0).eps(1e-6)) + .item() + .toFloat(); + + const auto end = std::chrono::steady_clock::now(); + + const float elapsed_seconds = + std::chrono::duration_cast(end - begin) + .count() / + 1000.; + float duration = (samples1.size(0) + samples2.size(0)) / 16000.0f; + const float rtf = elapsed_seconds / duration; + + std::cout << "score: " << score << "\n"; + + fprintf(stderr, "Elapsed seconds: %.3f\n", elapsed_seconds); + fprintf(stderr, "Audio duration: %.3f s\n", duration); + fprintf(stderr, "Real time factor (RTF): %.3f/%.3f = %.3f\n", elapsed_seconds, + duration, rtf); + + return 0; +} diff --git a/sherpa/csrc/sherpa-vad.cc b/sherpa/csrc/sherpa-vad.cc new file mode 100644 index 000000000..38b963f9d --- /dev/null +++ b/sherpa/csrc/sherpa-vad.cc @@ -0,0 +1,70 @@ +// sherpa/csrc/sherpa-vad.cc +// +// Copyright (c) 2025 Xiaomi Corporation + +#include // NOLINT +#include + +#include "sherpa/cpp_api/parse-options.h" +#include "sherpa/csrc/fbank-features.h" +#include "sherpa/csrc/voice-activity-detector.h" +#include "torch/torch.h" + +int32_t main(int32_t argc, char *argv[]) { + const char *kUsageMessage = R"usage( +This program uses a VAD model to add timestamps to an audio file. +Usage: + +sherpa-vad \ + --silero-vad-model=/path/to/model.pt \ + --vad-use-gpu=false \ + --num-threads=1 \ + ./foo.wav + +)usage"; + + int32_t num_threads = 1; + sherpa::ParseOptions po(kUsageMessage); + sherpa::VoiceActivityDetectorConfig config; + config.Register(&po); + po.Register("num-threads", &num_threads, "Number of threads for PyTorch"); + po.Read(argc, argv); + + if (po.NumArgs() != 1) { + std::cerr << "Please provide only 1 test wave\n"; + exit(-1); + } + + std::cerr << config.ToString() << "\n"; + config.Validate(); + + torch::set_num_threads(num_threads); + torch::set_num_interop_threads(num_threads); + + sherpa::VoiceActivityDetector vad(config); + + torch::Tensor samples = sherpa::ReadWave(po.GetArg(1),
16000).first; + + const auto begin = std::chrono::steady_clock::now(); + + auto segments = vad.Process(samples); + + const auto end = std::chrono::steady_clock::now(); + + const float elapsed_seconds = + std::chrono::duration_cast(end - begin) + .count() / + 1000.; + float duration = samples.size(0) / 16000.0f; + + const float rtf = elapsed_seconds / duration; + for (const auto &s : segments) { + fprintf(stderr, "%.3f -- %.3f\n", s.start, s.end); + } + + fprintf(stderr, "Number of threads: %d\n", num_threads); + fprintf(stderr, "Elapsed seconds: %.3f\n", elapsed_seconds); + fprintf(stderr, "Audio duration: %.3f s\n", duration); + fprintf(stderr, "Real time factor (RTF): %.3f/%.3f = %.3f\n", elapsed_seconds, + duration, rtf); +} diff --git a/sherpa/csrc/silero-vad-model-config.cc b/sherpa/csrc/silero-vad-model-config.cc new file mode 100644 index 000000000..93c7c0840 --- /dev/null +++ b/sherpa/csrc/silero-vad-model-config.cc @@ -0,0 +1,92 @@ +// sherpa/csrc/silero-vad-model-config.cc +// +// Copyright (c) 2025 Xiaomi Corporation + +#include "sherpa/csrc/silero-vad-model-config.h" + +#include + +#include "sherpa/csrc/file-utils.h" +#include "sherpa/csrc/macros.h" + +namespace sherpa { + +void SileroVadModelConfig::Register(ParseOptions *po) { + po->Register("silero-vad-model", &model, "Path to silero VAD model."); + + po->Register("silero-vad-threshold", &threshold, + "Speech threshold. Silero VAD outputs speech probabilities for " + "each audio chunk, probabilities ABOVE this value are " + "considered as SPEECH. It is better to tune this parameter for " + "each dataset separately, but lazy " + "0.5 is pretty good for most datasets."); + + po->Register( + "silero-vad-min-silence-duration", &min_silence_duration, + "In seconds. In the end of each speech chunk wait for " + "--silero-vad-min-silence-duration seconds before separating it"); + + po->Register("silero-vad-min-speech-duration", &min_speech_duration, + "In seconds. In the end of each silence chunk wait for " + "--silero-vad-min-speech-duration seconds before separating it"); +} + +bool SileroVadModelConfig::Validate() const { + if (model.empty()) { + SHERPA_LOGE("Please provide --silero-vad-model"); + return false; + } + + if (!FileExists(model)) { + SHERPA_LOGE("Silero vad model file '%s' does not exist", model.c_str()); + return false; + } + + if (threshold < 0.01) { + SHERPA_LOGE( + "Please use a larger value for --silero-vad-threshold. Given: %f", + threshold); + return false; + } + + if (threshold >= 1) { + SHERPA_LOGE( + "Please use a smaller value for --silero-vad-threshold. Given: %f", + threshold); + return false; + } + + if (min_silence_duration <= 0) { + SHERPA_LOGE( + "Please use a larger value for --silero-vad-min-silence-duration. " + "Given: " + "%f", + min_silence_duration); + return false; + } + + if (min_speech_duration <= 0) { + SHERPA_LOGE( + "Please use a larger value for --silero-vad-min-speech-duration. 
" + "Given: " + "%f", + min_speech_duration); + return false; + } + + return true; +} + +std::string SileroVadModelConfig::ToString() const { + std::ostringstream os; + + os << "SileroVadModelConfig("; + os << "model=\"" << model << "\", "; + os << "threshold=" << threshold << ", "; + os << "min_silence_duration=" << min_silence_duration << ", "; + os << "min_speech_duration=" << min_speech_duration << ")"; + + return os.str(); +} + +} // namespace sherpa diff --git a/sherpa/csrc/silero-vad-model-config.h b/sherpa/csrc/silero-vad-model-config.h new file mode 100644 index 000000000..3217e2b2d --- /dev/null +++ b/sherpa/csrc/silero-vad-model-config.h @@ -0,0 +1,43 @@ +// sherpa/csrc/silero-vad-model-config.h +// +// Copyright (c) 2025 Xiaomi Corporation +#ifndef SHERPA_CSRC_SILERO_VAD_MODEL_CONFIG_H_ +#define SHERPA_CSRC_SILERO_VAD_MODEL_CONFIG_H_ + +#include + +#include "sherpa/cpp_api/parse-options.h" + +namespace sherpa { + +struct SileroVadModelConfig { + std::string model; + + // threshold to classify a segment as speech + // + // If the predicted probability of a segment is larger than this + // value, then it is classified as speech. + float threshold = 0.5; + + float min_silence_duration = 0.5; // in seconds + + float min_speech_duration = 0.25; // in seconds + + SileroVadModelConfig() = default; + SileroVadModelConfig(const std::string &model, float threshold, + float min_silence_duration, float min_speech_duration) + : model(model), + threshold(threshold), + min_silence_duration(min_silence_duration), + min_speech_duration(min_speech_duration) {} + + void Register(ParseOptions *po); + + bool Validate() const; + + std::string ToString() const; +}; + +} // namespace sherpa + +#endif // SHERPA_CSRC_SILERO_VAD_MODEL_CONFIG_H_ diff --git a/sherpa/csrc/silero-vad-model.cc b/sherpa/csrc/silero-vad-model.cc new file mode 100644 index 000000000..0aaa8c3c6 --- /dev/null +++ b/sherpa/csrc/silero-vad-model.cc @@ -0,0 +1,67 @@ +// sherpa/csrc/silero-vad-model.cc +// +// Copyright (c) 2025 Xiaomi Corporation +#include "sherpa/csrc/silero-vad-model.h" + +#include "sherpa/csrc/macros.h" +namespace sherpa { + +class SileroVadModel::Impl { + public: + explicit Impl(const VadModelConfig &config) : config_(config) { + torch::jit::ExtraFilesMap meta_data{ + {"version", {}}, + {"model_type", {}}, + }; + + if (config.use_gpu) { + device_ = torch::Device{torch::kCUDA}; + } + + model_ = torch::jit::load(config.silero_vad.model, device_, meta_data); + + model_.eval(); + + if (meta_data.at("model_type") != "silero_vad") { + SHERPA_LOGE("Expect model_type 'silero_vad'. Given: '%s'\n", + meta_data.at("model_type").c_str()); + SHERPA_EXIT(-1); + } + + if (meta_data.at("version") != "4") { + SHERPA_LOGE("It supports only silero_vad v4. 
Given version: '%s'\n", + meta_data.at("version").c_str()); + SHERPA_EXIT(-1); + } + } + + torch::Device Device() const { return device_; } + + torch::Tensor Run(torch::Tensor samples) { + torch::Tensor sample_rate = torch::tensor( + {config_.sample_rate}, torch::dtype(torch::kInt).device(device_)); + + int32_t window_size = 512; + return model_ + .run_method("audio_forward", samples, config_.sample_rate, window_size) + .toTensor(); + } + + private: + torch::jit::Module model_; + torch::Device device_{torch::kCPU}; + VadModelConfig config_; +}; + +SileroVadModel::SileroVadModel(const VadModelConfig &config) + : impl_(std::make_unique(config)) {} + +SileroVadModel::~SileroVadModel() = default; + +torch::Device SileroVadModel::Device() const { return impl_->Device(); } + +torch::Tensor SileroVadModel::Run(torch::Tensor samples) const { + return impl_->Run(samples); +} + +} // namespace sherpa diff --git a/sherpa/csrc/silero-vad-model.h b/sherpa/csrc/silero-vad-model.h new file mode 100644 index 000000000..ce835516b --- /dev/null +++ b/sherpa/csrc/silero-vad-model.h @@ -0,0 +1,35 @@ +// sherpa/csrc/silero-vad-model.h +// +// Copyright (c) 2025 Xiaomi Corporation +#ifndef SHERPA_CSRC_SILERO_VAD_MODEL_H_ +#define SHERPA_CSRC_SILERO_VAD_MODEL_H_ + +#include + +#include "sherpa/csrc/vad-model-config.h" +#include "torch/script.h" + +namespace sherpa { + +class SileroVadModel { + public: + explicit SileroVadModel(const VadModelConfig &config); + + ~SileroVadModel(); + + torch::Device Device() const; + + /** + * @param samples A 2-D tensor of shape (batch_size, num_samples) + * @returns Return a 2-D tensor of shape (batch_size, num_frames) + */ + torch::Tensor Run(torch::Tensor samples) const; + + private: + class Impl; + std::unique_ptr impl_; +}; + +} // namespace sherpa + +#endif // SHERPA_CSRC_SILERO_VAD_MODEL_H_ diff --git a/sherpa/csrc/speaker-embedding-extractor-general-impl.h b/sherpa/csrc/speaker-embedding-extractor-general-impl.h new file mode 100644 index 000000000..7e63a7ead --- /dev/null +++ b/sherpa/csrc/speaker-embedding-extractor-general-impl.h @@ -0,0 +1,98 @@ +// sherpa/csrc/speaker-embedding-extractor-general-impl.h +// +// Copyright (c) 2025 Xiaomi Corporation + +#ifndef SHERPA_CSRC_SPEAKER_EMBEDDING_EXTRACTOR_GENERAL_IMPL_H_ +#define SHERPA_CSRC_SPEAKER_EMBEDDING_EXTRACTOR_GENERAL_IMPL_H_ +#include +#include +#include +#include + +#include "sherpa/cpp_api/feature-config.h" +#include "sherpa/cpp_api/macros.h" +#include "sherpa/cpp_api/offline-stream.h" +#include "sherpa/csrc/speaker-embedding-extractor-impl.h" +#include "sherpa/csrc/speaker-embedding-extractor-model.h" + +namespace sherpa { + +class SpeakerEmbeddingExtractorGeneralImpl + : public SpeakerEmbeddingExtractorImpl { + public: + explicit SpeakerEmbeddingExtractorGeneralImpl( + const SpeakerEmbeddingExtractorConfig &config) + : model_(config) { + // TODO(fangjun): make it configurable + feat_config_.fbank_opts.frame_opts.dither = 0; + feat_config_.fbank_opts.frame_opts.snip_edges = true; + feat_config_.fbank_opts.frame_opts.samp_freq = 16000; + feat_config_.fbank_opts.mel_opts.num_bins = 80; + feat_config_.normalize_samples = true; + + fbank_ = std::make_unique(feat_config_.fbank_opts); + + WarmUp(); + } + + int32_t Dim() const override { return model_.GetModelMetadata().output_dim; } + + std::unique_ptr CreateStream() const override { + return std::make_unique(fbank_.get(), feat_config_); + } + + torch::Tensor Compute(OfflineStream *s) const override { + InferenceMode no_grad; + auto features = s->GetFeatures(); +
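// The mean subtraction below removes the per-dimension average over the time
// axis (dim 0), i.e. utterance-level (cepstral) mean normalization of the
// fbank features; this appears to correspond to the "global-mean"
// feature_normalize_type described in
// speaker-embedding-extractor-model-meta-data.h.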
features -= features.mean(0, true); + features = features.unsqueeze(0); + auto device = model_.Device(); + return model_.Compute(features.to(device)); + } + + torch::Tensor Compute(OfflineStream **ss, int32_t n) const override { + InferenceMode no_grad; + if (n == 1) { + return Compute(ss[0]); + } + + std::vector features_vec(n); + for (int32_t i = 0; i != n; ++i) { + auto f = ss[i]->GetFeatures(); + f -= f.mean(0, true); + features_vec[i] = f; + } + + auto device = model_.Device(); + + auto features = + torch::nn::utils::rnn::pad_sequence(features_vec, true, 0).to(device); + + return model_.Compute(features); + } + + private: + void WarmUp() { + InferenceMode no_grad; + SHERPA_LOG(INFO) << "WarmUp begins"; + auto s = CreateStream(); + float sample_rate = fbank_->GetFrameOptions().samp_freq; + std::vector samples(2 * sample_rate, 0); + s->AcceptSamples(samples.data(), samples.size()); + + auto embedding = Compute(s.get()); + + model_.GetModelMetadata().output_dim = embedding.size(1); + + SHERPA_LOG(INFO) << "WarmUp ended"; + } + + private: + SpeakerEmbeddingExtractorModel model_; + std::unique_ptr fbank_; + FeatureConfig feat_config_; +}; + +} // namespace sherpa + +#endif // SHERPA_CSRC_SPEAKER_EMBEDDING_EXTRACTOR_GENERAL_IMPL_H_ diff --git a/sherpa/csrc/speaker-embedding-extractor-impl.cc b/sherpa/csrc/speaker-embedding-extractor-impl.cc new file mode 100644 index 000000000..05405da9f --- /dev/null +++ b/sherpa/csrc/speaker-embedding-extractor-impl.cc @@ -0,0 +1,17 @@ +// sherpa/csrc/speaker-embedding-extractor-impl.cc +// +// Copyright (c) 2025 Xiaomi Corporation +#include "sherpa/csrc/speaker-embedding-extractor-impl.h" + +#include "sherpa/csrc/speaker-embedding-extractor-general-impl.h" + +namespace sherpa { + +std::unique_ptr +SpeakerEmbeddingExtractorImpl::Create( + const SpeakerEmbeddingExtractorConfig &config) { + // supports only 3-d speaker for now + return std::make_unique(config); +} + +} // namespace sherpa diff --git a/sherpa/csrc/speaker-embedding-extractor-impl.h b/sherpa/csrc/speaker-embedding-extractor-impl.h new file mode 100644 index 000000000..adc285ee4 --- /dev/null +++ b/sherpa/csrc/speaker-embedding-extractor-impl.h @@ -0,0 +1,34 @@ +// sherpa/csrc/speaker-embedding-extractor-impl.h +// +// Copyright (c) 2025 Xiaomi Corporation + +#ifndef SHERPA_CSRC_SPEAKER_EMBEDDING_EXTRACTOR_IMPL_H_ +#define SHERPA_CSRC_SPEAKER_EMBEDDING_EXTRACTOR_IMPL_H_ + +#include +#include +#include + +#include "sherpa/csrc/speaker-embedding-extractor.h" + +namespace sherpa { + +class SpeakerEmbeddingExtractorImpl { + public: + virtual ~SpeakerEmbeddingExtractorImpl() = default; + + static std::unique_ptr Create( + const SpeakerEmbeddingExtractorConfig &config); + + virtual int32_t Dim() const = 0; + + virtual std::unique_ptr CreateStream() const = 0; + + virtual torch::Tensor Compute(OfflineStream *s) const = 0; + + virtual torch::Tensor Compute(OfflineStream **s, int32_t n) const = 0; +}; + +} // namespace sherpa + +#endif // SHERPA_CSRC_SPEAKER_EMBEDDING_EXTRACTOR_IMPL_H_ diff --git a/sherpa/csrc/speaker-embedding-extractor-model-meta-data.h b/sherpa/csrc/speaker-embedding-extractor-model-meta-data.h new file mode 100644 index 000000000..962bb68cb --- /dev/null +++ b/sherpa/csrc/speaker-embedding-extractor-model-meta-data.h @@ -0,0 +1,28 @@ +// sherpa/csrc/speaker-embedding-extractor-model-meta-data.h +// +// Copyright (c) 2025 Xiaomi Corporation +#ifndef SHERPA_CSRC_SPEAKER_EMBEDDING_EXTRACTOR_MODEL_META_DATA_H_ +#define SHERPA_CSRC_SPEAKER_EMBEDDING_EXTRACTOR_MODEL_META_DATA_H_ + 
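// Note: output_dim is not read from the torchscript file in this patch; it is
// filled in at runtime by SpeakerEmbeddingExtractorGeneralImpl::WarmUp(),
// which runs the model on dummy audio and records the embedding size.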
+#include +#include + +namespace sherpa { + +struct SpeakerEmbeddingExtractorModelMetaData { + int32_t output_dim = 0; + int32_t sample_rate = 0; + + // for wespeaker models, it is 0; + // for 3d-speaker models, it is 1 + int32_t normalize_samples = 1; + + // Chinese, English, etc. + std::string language; + + // for 3d-speaker, it is global-mean + std::string feature_normalize_type; +}; + +} // namespace sherpa +#endif // SHERPA_CSRC_SPEAKER_EMBEDDING_EXTRACTOR_MODEL_META_DATA_H_ diff --git a/sherpa/csrc/speaker-embedding-extractor-model.cc b/sherpa/csrc/speaker-embedding-extractor-model.cc new file mode 100644 index 000000000..0793dd5e4 --- /dev/null +++ b/sherpa/csrc/speaker-embedding-extractor-model.cc @@ -0,0 +1,87 @@ +// sherpa/csrc/speaker-embedding-extractor-model.cc +// +// Copyright (c) 2025 Xiaomi Corporation + +#include "sherpa/csrc/speaker-embedding-extractor-model.h" + +#include +#include +#include + +#include "sherpa/csrc/macros.h" +#include "sherpa/csrc/speaker-embedding-extractor-model-meta-data.h" + +namespace sherpa { + +class SpeakerEmbeddingExtractorModel::Impl { + public: + explicit Impl(const SpeakerEmbeddingExtractorConfig &config) + : config_(config) { + torch::jit::ExtraFilesMap meta_data{ + {"version", {}}, + {"model_type", {}}, + }; + + if (config.use_gpu) { + device_ = torch::Device{torch::kCUDA}; + } + + model_ = torch::jit::load(config.model, device_, meta_data); + + model_.eval(); + + if (meta_data.at("model_type") != "3d-speaker") { + SHERPA_LOGE("Expect model_type '3d-speaker'. Given: '%s'\n", + meta_data.at("model_type").c_str()); + SHERPA_EXIT(-1); + } + } + + torch::Tensor Compute(torch::Tensor x) { + return model_.run_method("forward", x).toTensor(); + } + + SpeakerEmbeddingExtractorModelMetaData &GetModelMetadata() { + return meta_data_; + } + + const SpeakerEmbeddingExtractorModelMetaData &GetModelMetadata() const { + return meta_data_; + } + + torch::Device Device() const { return device_; } + + private: + SpeakerEmbeddingExtractorConfig config_; + + torch::jit::Module model_; + torch::Device device_{torch::kCPU}; + + SpeakerEmbeddingExtractorModelMetaData meta_data_; +}; + +SpeakerEmbeddingExtractorModel::SpeakerEmbeddingExtractorModel( + const SpeakerEmbeddingExtractorConfig &config) + : impl_(std::make_unique(config)) {} + +SpeakerEmbeddingExtractorModel::~SpeakerEmbeddingExtractorModel() = default; + +SpeakerEmbeddingExtractorModelMetaData & +SpeakerEmbeddingExtractorModel::GetModelMetadata() { + return impl_->GetModelMetadata(); +} + +torch::Tensor SpeakerEmbeddingExtractorModel::Compute(torch::Tensor x) const { + return impl_->Compute(x); +} + +const SpeakerEmbeddingExtractorModelMetaData & +SpeakerEmbeddingExtractorModel::GetModelMetadata() const { + return impl_->GetModelMetadata(); +} + +torch::Device SpeakerEmbeddingExtractorModel::Device() const { + return impl_->Device(); +} + +} // namespace sherpa diff --git a/sherpa/csrc/speaker-embedding-extractor-model.h b/sherpa/csrc/speaker-embedding-extractor-model.h new file mode 100644 index 000000000..3efd30fc5 --- /dev/null +++ b/sherpa/csrc/speaker-embedding-extractor-model.h @@ -0,0 +1,40 @@ +// sherpa/csrc/speaker-embedding-extractor-model.h +// +// Copyright (c) 2025 Xiaomi Corporation +#ifndef SHERPA_CSRC_SPEAKER_EMBEDDING_EXTRACTOR_MODEL_H_ +#define SHERPA_CSRC_SPEAKER_EMBEDDING_EXTRACTOR_MODEL_H_ + +#include + +#include "sherpa/csrc/speaker-embedding-extractor-model-meta-data.h" +#include "sherpa/csrc/speaker-embedding-extractor.h" +#include "torch/script.h" + +namespace sherpa { + 
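// A minimal usage sketch for this wrapper (illustrative only; the model path
// below is a placeholder and the feature shape is an assumption based on the
// 80-dim fbank setup in speaker-embedding-extractor-general-impl.h):
//
//   sherpa::SpeakerEmbeddingExtractorConfig config;
//   config.model = "/path/to/3d-speaker-cpu_jit.pt";  // hypothetical path
//   sherpa::SpeakerEmbeddingExtractorModel model(config);
//   // x: (N, T, C) float32 fbank features, e.g. N=1 utterance,
//   // T=200 frames, C=80 mel bins
//   torch::Tensor x = torch::zeros({1, 200, 80});
//   torch::Tensor embedding = model.Compute(x.to(model.Device()));  // (N, C)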
+class SpeakerEmbeddingExtractorModel { + public: + explicit SpeakerEmbeddingExtractorModel( + const SpeakerEmbeddingExtractorConfig &config); + + ~SpeakerEmbeddingExtractorModel(); + + SpeakerEmbeddingExtractorModelMetaData &GetModelMetadata(); + const SpeakerEmbeddingExtractorModelMetaData &GetModelMetadata() const; + + /** + * @param x A float32 tensor of shape (N, T, C) + * @return A float32 tensor of shape (N, C) + */ + torch::Tensor Compute(torch::Tensor x) const; + + torch::Device Device() const; + + private: + class Impl; + std::unique_ptr impl_; +}; + +} // namespace sherpa + +#endif // SHERPA_CSRC_SPEAKER_EMBEDDING_EXTRACTOR_MODEL_H_ diff --git a/sherpa/csrc/speaker-embedding-extractor.cc b/sherpa/csrc/speaker-embedding-extractor.cc new file mode 100644 index 000000000..9df3b66d6 --- /dev/null +++ b/sherpa/csrc/speaker-embedding-extractor.cc @@ -0,0 +1,73 @@ +// sherpa/csrc/speaker-embedding-extractor.cc +// +// Copyright (c) 2025 Xiaomi Corporation + +#include "sherpa/csrc/speaker-embedding-extractor.h" + +#include + +#include "sherpa/cpp_api/macros.h" +#include "sherpa/csrc/file-utils.h" +#include "sherpa/csrc/macros.h" +#include "sherpa/csrc/speaker-embedding-extractor-impl.h" + +namespace sherpa { + +void SpeakerEmbeddingExtractorConfig::Register(ParseOptions *po) { + po->Register("model", &model, "Path to the speaker embedding model."); + po->Register("debug", &debug, + "true to print model information while loading it."); + + po->Register("use_gpu", &use_gpu, "true to gpu."); +} + +bool SpeakerEmbeddingExtractorConfig::Validate() const { + if (model.empty()) { + SHERPA_LOGE("Please provide a speaker embedding extractor model"); + return false; + } + + if (!FileExists(model)) { + SHERPA_LOGE("speaker embedding extractor model: '%s' does not exist", + model.c_str()); + return false; + } + + return true; +} + +std::string SpeakerEmbeddingExtractorConfig::ToString() const { + std::ostringstream os; + + os << "SpeakerEmbeddingExtractorConfig("; + os << "model=\"" << model << "\", "; + os << "debug=" << (debug ? "True" : "False") << ", "; + os << "use_gpu=" << (use_gpu ? 
"True" : "False") << ")"; + + return os.str(); +} + +SpeakerEmbeddingExtractor::SpeakerEmbeddingExtractor( + const SpeakerEmbeddingExtractorConfig &config) + : impl_(SpeakerEmbeddingExtractorImpl::Create(config)) {} + +SpeakerEmbeddingExtractor::~SpeakerEmbeddingExtractor() = default; + +int32_t SpeakerEmbeddingExtractor::Dim() const { return impl_->Dim(); } + +std::unique_ptr SpeakerEmbeddingExtractor::CreateStream() const { + return impl_->CreateStream(); +} + +torch::Tensor SpeakerEmbeddingExtractor::Compute(OfflineStream *s) const { + InferenceMode no_grad; + return impl_->Compute(s); +} + +torch::Tensor SpeakerEmbeddingExtractor::Compute(OfflineStream **ss, + int32_t n) const { + InferenceMode no_grad; + return impl_->Compute(ss, n); +} + +} // namespace sherpa diff --git a/sherpa/csrc/speaker-embedding-extractor.h b/sherpa/csrc/speaker-embedding-extractor.h new file mode 100644 index 000000000..cf9d6f691 --- /dev/null +++ b/sherpa/csrc/speaker-embedding-extractor.h @@ -0,0 +1,65 @@ +// sherpa/csrc/speaker-embedding-extractor.h +// +// Copyright (c) 2025 Xiaomi Corporation + +#ifndef SHERPA_CSRC_SPEAKER_EMBEDDING_EXTRACTOR_H_ +#define SHERPA_CSRC_SPEAKER_EMBEDDING_EXTRACTOR_H_ + +#include +#include +#include + +#include "sherpa/cpp_api/offline-stream.h" +#include "sherpa/cpp_api/parse-options.h" + +namespace sherpa { + +struct SpeakerEmbeddingExtractorConfig { + std::string model; + bool use_gpu = false; + bool debug = false; + + SpeakerEmbeddingExtractorConfig() = default; + SpeakerEmbeddingExtractorConfig(const std::string &model, bool use_gpu, + bool debug) + : model(model), use_gpu(use_gpu), debug(debug) {} + + void Register(ParseOptions *po); + bool Validate() const; + std::string ToString() const; +}; + +class SpeakerEmbeddingExtractorImpl; + +class SpeakerEmbeddingExtractor { + public: + explicit SpeakerEmbeddingExtractor( + const SpeakerEmbeddingExtractorConfig &config); + + template + SpeakerEmbeddingExtractor(Manager *mgr, + const SpeakerEmbeddingExtractorConfig &config); + + ~SpeakerEmbeddingExtractor(); + + // Return the dimension of the embedding + int32_t Dim() const; + + // Create a stream to accept audio samples and compute features + std::unique_ptr CreateStream() const; + + // Compute the speaker embedding from the available unprocessed features + // of the given stream + // + // You have to ensure IsReady(s) returns true before you call this method. + torch::Tensor Compute(OfflineStream *s) const; + + torch::Tensor Compute(OfflineStream **ss, int32_t n) const; + + private: + std::unique_ptr impl_; +}; + +} // namespace sherpa + +#endif // SHERPA_CSRC_SPEAKER_EMBEDDING_EXTRACTOR_H_ diff --git a/sherpa/csrc/symbol-table.cc b/sherpa/csrc/symbol-table.cc new file mode 100644 index 000000000..c8d885288 --- /dev/null +++ b/sherpa/csrc/symbol-table.cc @@ -0,0 +1,96 @@ +/** + * Copyright 2022 Xiaomi Corporation (authors: Fangjun Kuang) + * + * See LICENSE for clarification regarding multiple authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "sherpa/csrc/symbol-table.h" + +#include +#include + +#include "sherpa/csrc/base64-decode.h" +#include "sherpa/csrc/log.h" + +namespace sherpa { + +SymbolTable::SymbolTable(const std::string &filename) { + std::ifstream is(filename); + std::string sym; + int32_t id; + while (is >> sym >> id) { + if (sym.size() >= 3) { + // For BPE-based models, we replace ▁ with a space + // Unicode 9601, hex 0x2581, utf8 0xe29681 + const uint8_t *p = reinterpret_cast(sym.c_str()); + if (p[0] == 0xe2 && p[1] == 0x96 && p[2] == 0x81) { + sym = sym.replace(0, 3, " "); + } + } + + SHERPA_CHECK(!sym.empty()); + SHERPA_CHECK_EQ(sym2id_.count(sym), 0) << "Duplicated symbol: " << sym; + SHERPA_CHECK_EQ(id2sym_.count(id), 0) << "Duplicated ID: " << id; + + sym2id_.insert({sym, id}); + id2sym_.insert({id, sym}); + } + SHERPA_CHECK(is.eof()); +} + +std::string SymbolTable::ToString() const { + std::ostringstream os; + char sep = ' '; + for (const auto &p : sym2id_) { + os << p.first << sep << p.second << "\n"; + } + return os.str(); +} + +const std::string &SymbolTable::operator[](int32_t id) const { + return id2sym_.at(id); +} + +int32_t SymbolTable::operator[](const std::string &sym) const { + return sym2id_.at(sym); +} + +bool SymbolTable::Contains(int32_t id) const { return id2sym_.count(id) != 0; } + +bool SymbolTable::Contains(const std::string &sym) const { + return sym2id_.count(sym) != 0; +} + +std::ostream &operator<<(std::ostream &os, const SymbolTable &symbol_table) { + return os << symbol_table.ToString(); +} + +void SymbolTable::Replace(int32_t id, const std::string &new_sym, + const std::string &old_sym) { + sym2id_.erase(old_sym); + + id2sym_.at(id) = new_sym; + sym2id_[new_sym] = id; +} + +void SymbolTable::ApplyBase64Decode() { + sym2id_.clear(); + for (auto &p : id2sym_) { + p.second = Base64Decode(p.second); + sym2id_[p.second] = p.first; + } +} + +} // namespace sherpa diff --git a/sherpa/csrc/symbol-table.h b/sherpa/csrc/symbol-table.h new file mode 100644 index 000000000..484ddf5d3 --- /dev/null +++ b/sherpa/csrc/symbol-table.h @@ -0,0 +1,69 @@ +/** + * Copyright 2022 Xiaomi Corporation (authors: Fangjun Kuang) + * + * See LICENSE for clarification regarding multiple authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef SHERPA_CSRC_SYMBOL_TABLE_H_ +#define SHERPA_CSRC_SYMBOL_TABLE_H_ + +#include +#include + +namespace sherpa { + +/// It manages mapping between symbols and integer IDs. +class SymbolTable { + public: + SymbolTable() = default; + /// Construct a symbol table from a file. + /// Each line in the file contains two fields: + /// + /// sym ID + /// + /// Fields are separated by space(s). + explicit SymbolTable(const std::string &filename); + + /// Return a string representation of this symbol table + std::string ToString() const; + + /// Return the symbol corresponding to the given ID. + const std::string &operator[](int32_t id) const; + /// Return the ID corresponding to the given symbol. 
+ int32_t operator[](const std::string &sym) const; + + // self[id] = sym + void Replace(int32_t id, const std::string &new_sym, + const std::string &old_sym); + + /// Return true if there is a symbol with the given ID. + bool Contains(int32_t id) const; + + /// Return true if there is a given symbol in the symbol table. + bool Contains(const std::string &sym) const; + + // for tokens.txt from Whisper + void ApplyBase64Decode(); + + private: + std::unordered_map sym2id_; + std::unordered_map id2sym_; +}; + +std::ostream &operator<<(std::ostream &os, const SymbolTable &symbol_table); + +} // namespace sherpa + +#endif // SHERPA_CSRC_SYMBOL_TABLE_H_ diff --git a/sherpa/csrc/test-byte-util.cc b/sherpa/csrc/test-byte-util.cc new file mode 100644 index 000000000..4458278eb --- /dev/null +++ b/sherpa/csrc/test-byte-util.cc @@ -0,0 +1,60 @@ +/** + * Copyright 2023 Xiaomi Corporation (authors: Wei Kang) + * + * See LICENSE for clarification regarding multiple authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include "gtest/gtest.h" +#include "sherpa/csrc/byte_util.h" +#include "sherpa/csrc/log.h" + +namespace sherpa { + +TEST(ByteUtil, TestBasic) { + auto bu = GetByteUtil(); + std::string str = "Hello world"; + SHERPA_CHECK_EQ(bu->Decode(bu->Encode(str)), str); + + str = "世界人民大团结万岁"; + SHERPA_CHECK_EQ(bu->Decode(bu->Encode(str)), str); + + str = "美国 America vs China 中国 123 go!!!"; + SHERPA_CHECK_EQ(bu->Decode(bu->Encode(str)), str); +} + +TEST(ByteUtil, TestInvalidBytes) { + auto bu = GetByteUtil(); + std::string str = "ƍĩĴƎĩŗƋţŅƋ⁇Şœƌľţ"; + SHERPA_CHECK_EQ(bu->Decode(str), "我爱你中国"); + + str = "ƍĩĴĩŗƋţŅƋŞœƌľţ"; // drop one byte in 爱 + SHERPA_CHECK_EQ(bu->Decode(str), "我你中国"); + + str = "ƍĩƎĩŗƋţŅƋŞœƌľţ"; // drop one byte in 我 + SHERPA_CHECK_EQ(bu->Decode(str), "爱你中国"); + + str = "ƍĩĴƎĩŗƋţŅƋŞœƌţ"; // drop one byte in 国 + SHERPA_CHECK_EQ(bu->Decode(str), "我爱你中"); + + str = "ƍĩĴƎĩŗƋţŅƋœƌľ"; // drop one byte in 中 and 国 + SHERPA_CHECK_EQ(bu->Decode(str), "我爱你"); + + str = "ƍĩĴƎĩŗƋţŅƋlœƌoľve"; // replace one byte in 中 and 国 with l o + SHERPA_CHECK_EQ(bu->Decode(str), "我爱你love"); +} + +} // namespace sherpa diff --git a/sherpa/csrc/test-context-graph.cc b/sherpa/csrc/test-context-graph.cc new file mode 100644 index 000000000..00b18903e --- /dev/null +++ b/sherpa/csrc/test-context-graph.cc @@ -0,0 +1,57 @@ +/** + * Copyright 2023 Xiaomi Corporation (authors: Wei Kang) + * + * See LICENSE for clarification regarding multiple authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include + +#include "gtest/gtest.h" +#include "sherpa/csrc/context-graph.h" + +namespace sherpa { + +TEST(ContextGraph, TestBasic) { + std::vector contexts_str( + {"S", "HE", "SHE", "SHELL", "HIS", "HERS", "HELLO", "THIS", "THEM"}); + std::vector> contexts; + for (size_t i = 0; i < contexts_str.size(); ++i) { + contexts.emplace_back(contexts_str[i].begin(), contexts_str[i].end()); + } + auto context_graph = ContextGraph(contexts, 1); + + auto queries = std::map{ + {"HEHERSHE", 14}, {"HERSHE", 12}, {"HISHE", 9}, + {"SHED", 6}, {"SHELF", 6}, {"HELL", 2}, + {"HELLO", 7}, {"DHRHISQ", 4}, {"THEN", 2}}; + + for (const auto &iter : queries) { + float total_scores = 0; + auto state = context_graph.Root(); + for (auto q : iter.first) { + auto res = context_graph.ForwardOneStep(state, q); + total_scores += res.first; + state = res.second; + } + auto res = context_graph.Finalize(state); + EXPECT_EQ(res.second->token, -1); + total_scores += res.first; + EXPECT_EQ(total_scores, iter.second); + } +} + +} // namespace sherpa diff --git a/sherpa/csrc/test-data/test-offline-conformer-transducer-model.py b/sherpa/csrc/test-data/test-offline-conformer-transducer-model.py new file mode 100755 index 000000000..9546fbb35 --- /dev/null +++ b/sherpa/csrc/test-data/test-offline-conformer-transducer-model.py @@ -0,0 +1,89 @@ +#!/usr/bin/env python3 +# Copyright (c) 2022 Xiaomi Corporation +# flake8: noqa + +""" +This file generates test data for ../test-offline-conformer-transducer-model.cc + +Usage: + +cd /path/to/sherpa + +GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13 +cd icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13 +git lfs pull --include "exp/cpu_jit.pt" +cd .. 
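+# Next: build the C++ test binary, generate test-data.pt with this script,
+# and run the test.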
+ +mkdir build +cd build +make -j test-offline-conformer-transducer-model + +python3 ../sherpa/csrc/test-data/test-offline-conformer-transducer-model.py \ + --nn-model ../icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/exp/cpu_jit.pt + +./bin/test-offline-conformer-transducer-model \ + ../icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/exp/cpu_jit.pt \ + ./test-data.pt +""" + +import argparse +from pathlib import Path + +import torch + +# You can download the model from +# https://huggingface.co/csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/blob/main/exp/cpu_jit.pt + + +def get_args(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--nn-model", + type=str, + required=True, + help="Path to the torchscript model", + ) + return parser.parse_args() + + +def main(): + args = get_args() + filename = Path(args.nn_model) + if not filename.is_file(): + raise ValueError(f"{filename} does not exist") + model = torch.jit.load(filename) + features = torch.rand(2, 20, 80) + features_length = torch.tensor([12, 20], dtype=torch.int64) + + encoder_out, encoder_out_length = model.encoder(features, features_length) + encoder_out = model.joiner.encoder_proj(encoder_out) + + decoder_input = torch.tensor([[1, 5], [3, 9]], dtype=torch.int64) + decoder_out = model.decoder(decoder_input, need_pad=False) + decoder_out = model.joiner.decoder_proj(decoder_out) + + joiner_out = model.joiner( + encoder_out[:, 0:1, :].unsqueeze(1), + decoder_out.unsqueeze(1), + project_input=False, + ) + + print(encoder_out.shape) # (2, 4, 512) + print(encoder_out_length) # [2, 4] + print(decoder_out.shape) # (2, 1, 512) + print(joiner_out.shape) # (2, 1, 1, 500) + data = { + "features": features, + "features_length": features_length, + "encoder_out": encoder_out, + "encoder_out_length": encoder_out_length, + "decoder_input": decoder_input, + "decoder_out": decoder_out, + "joiner_out": joiner_out, + } + torch.save(data, "test-data.pt") + + +if __name__ == "__main__": + torch.manual_seed(20221106) + main() diff --git a/sherpa/csrc/test-data/test-online-conv-emformer-transducer-model.py b/sherpa/csrc/test-data/test-online-conv-emformer-transducer-model.py new file mode 100755 index 000000000..4dc6bf530 --- /dev/null +++ b/sherpa/csrc/test-data/test-online-conv-emformer-transducer-model.py @@ -0,0 +1,178 @@ +#!/usr/bin/env python3 +# Copyright (c) 2022 Xiaomi Corporation +# flake8: noqa + +""" +This file generates test data for ../test-online-conv-emformer-transducer-model.cc + +Usage: + +cd /path/to/sherpa + +GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/Zengwei/icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05 +cd icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05 +git lfs pull --include "exp/cpu-jit-epoch-30-avg-10-torch-1.10.0.pt" +cd .. 
+ +mkdir build +cd build +make -j test-online-conv-emformer-transducer-model + +python3 ../sherpa/csrc/test-data/test-online-conv-emformer-transducer-model.py \ + --nn-model ../icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/cpu-jit-epoch-30-avg-10-torch-1.10.0.pt + +./bin/test-online-conv-emformer-transducer-model \ + ../icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/cpu-jit-epoch-30-avg-10-torch-1.10.0.pt + ./test-data.pt +""" + +import argparse +from pathlib import Path +from typing import List, Tuple + +import torch + + +# copied from +# https://github.com/k2-fsa/icefall/blob/master/egs/librispeech/ASR/conv_emformer_transducer_stateless2/emformer.py#L92 +def stack_states( + state_list: List[Tuple[List[List[torch.Tensor]], List[torch.Tensor]]] +) -> Tuple[List[List[torch.Tensor]], List[torch.Tensor]]: + """Stack list of emformer states that correspond to separate utterances + into a single emformer state so that it can be used as an input for + emformer when those utterances are formed into a batch. + + Note: + It is the inverse of :func:`unstack_states`. + + Args: + state_list: + Each element in state_list corresponding to the internal state + of the emformer model for a single utterance. + ``states[i]`` is a tuple of 2 elements of i-th utterance. + ``states[i][0]`` is the attention caches of i-th utterance. + ``states[i][1]`` is the convolution caches of i-th utterance. + ``len(states[i][0])`` and ``len(states[i][1])`` both eqaul to number of layers. # noqa + + Returns: + A new state corresponding to a batch of utterances. + See the input argument of :func:`unstack_states` for the meaning + of the returned tensor. + """ + batch_size = len(state_list) + + attn_caches = [] + for layer in state_list[0][0]: + if batch_size > 1: + # Note: We will stack attn_caches[layer][s][] later to get attn_caches[layer][s] # noqa + attn_caches.append([[s] for s in layer]) + else: + attn_caches.append([s.unsqueeze(1) for s in layer]) + for b, states in enumerate(state_list[1:], 1): + for li, layer in enumerate(states[0]): + for si, s in enumerate(layer): + attn_caches[li][si].append(s) + if b == batch_size - 1: + attn_caches[li][si] = torch.stack( + attn_caches[li][si], dim=1 + ) + + conv_caches = [] + for layer in state_list[0][1]: + if batch_size > 1: + # Note: We will stack conv_caches[layer][] later to get conv_caches[layer] # noqa + conv_caches.append([layer]) + else: + conv_caches.append(layer.unsqueeze(0)) + for b, states in enumerate(state_list[1:], 1): + for li, layer in enumerate(states[1]): + conv_caches[li].append(layer) + if b == batch_size - 1: + conv_caches[li] = torch.stack(conv_caches[li], dim=0) + + return [attn_caches, conv_caches] + + +# You can download the model from +# https://huggingface.co/Zengwei/icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/cpu-jit-epoch-30-avg-10-torch-1.10.0.pt + + +def get_args(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--nn-model", + type=str, + required=True, + help="Path to the torchscript model", + ) + return parser.parse_args() + + +def main(): + args = get_args() + filename = Path(args.nn_model) + if not filename.is_file(): + raise ValueError(f"{filename} does not exist") + model = torch.jit.load(filename) + chunk_length = model.encoder.chunk_length + right_context_length = model.encoder.right_context_length + + pad_length = right_context_length + 2 * 4 + 3 + + print("chunk_length:", chunk_length) # 32 + print("right_context_length:", 
right_context_length) # 8 + print("pad_length:", pad_length) # 19 + + chunk_size = chunk_length + pad_length + chunk_shift = chunk_length + + features = torch.rand(2, chunk_size, 80) + features_length = torch.tensor([chunk_size, chunk_size], dtype=torch.int64) + init_state = model.encoder.init_states() + init_states = stack_states([init_state, init_state]) + + num_processed_frames = torch.tensor([0, 0], dtype=torch.int32) + + _, _, state = model.encoder.infer( + features, features_length, num_processed_frames, init_states + ) + num_processed_frames += chunk_shift + encoder_out, encoder_out_length, next_state = model.encoder.infer( + features, features_length, num_processed_frames, state + ) + encoder_out = model.joiner.encoder_proj(encoder_out) + + decoder_input = torch.tensor([[1, 5], [3, 9]], dtype=torch.int64) + decoder_out = model.decoder(decoder_input, need_pad=False) + decoder_out = model.joiner.decoder_proj(decoder_out) + + joiner_out = model.joiner( + encoder_out[:, 0:1, :].unsqueeze(1), + decoder_out.unsqueeze(1), + project_input=False, + ) + + print(encoder_out.shape) # (2, 4, 512) + print(encoder_out_length) # [2, 4] + print(decoder_out.shape) # (2, 1, 512) + print(joiner_out.shape) # (2, 1, 1, 500) + data = { + "features": features, + "features_length": features_length, + "encoder_out": encoder_out, + "encoder_out_length": encoder_out_length, + "decoder_input": decoder_input, + "decoder_out": decoder_out, + "joiner_out": joiner_out, + "state": state, + "next_state": next_state, + "num_processed_frames": num_processed_frames, + "chunk_size": chunk_size, + "chunk_shift": chunk_shift, + } + torch.save(data, "test-data.pt") + + +if __name__ == "__main__": + torch.manual_seed(20221107) + main() diff --git a/sherpa/csrc/test_hypothesis.cc b/sherpa/csrc/test-hypothesis.cc similarity index 100% rename from sherpa/csrc/test_hypothesis.cc rename to sherpa/csrc/test-hypothesis.cc diff --git a/sherpa/csrc/test_log.cc b/sherpa/csrc/test-log.cc similarity index 100% rename from sherpa/csrc/test_log.cc rename to sherpa/csrc/test-log.cc diff --git a/sherpa/csrc/test-offline-conformer-transducer-model.cc b/sherpa/csrc/test-offline-conformer-transducer-model.cc new file mode 100644 index 000000000..5559346d7 --- /dev/null +++ b/sherpa/csrc/test-offline-conformer-transducer-model.cc @@ -0,0 +1,82 @@ +// sherpa/csrc/test-offline-conformer-transducer-model.cc +// +// Copyright (c) 2022 Xiaomi Corporation + +#include "sherpa/csrc/offline-conformer-transducer-model.h" +#include "torch/csrc/jit/serialization/pickle.h" +#include "torch/script.h" + +// see https://github.com/pytorch/pytorch/issues/20356#issuecomment-1061667333 +static std::vector ReadData(const std::string &filename) { + std::ifstream is(filename, std::ios::binary); + std::vector ans((std::istreambuf_iterator(is)), + (std::istreambuf_iterator())); + + return ans; +} + +static void AssertAllClose(torch::Tensor a, torch::Tensor b) { + if (!torch::allclose(a, b)) { + std::cerr << "Failed! max abs: " << (a - b).abs().max().item() + << "\n"; + exit(-1); + } +} + +// Please see ./test-data/test-offline-conformer-transducer-model.py +// for how to generate the test data. 
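+//
+// Rough outline of the test below:
+//   1. Load the torchscript transducer model passed as argv[1].
+//   2. Load the tensor dict saved by the Python script from argv[2].
+//   3. Re-run the encoder, decoder and joiner on the saved inputs and check
+//      with AssertAllClose() that the outputs match the saved outputs.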
+int main(int argc, char *argv[]) { + if (argc != 3) { + fprintf(stderr, + "Usage: ./bin/test-offline-conformer-transducer-model cpu_jit.pt " + "test_data.pt\n"); + return -1; + } + + std::string nn_model = argv[1]; + torch::IValue ivalue = torch::jit::pickle_load(ReadData(argv[2])); + + if (!ivalue.isGenericDict()) { + fprintf(stderr, "Expect a dict.\n"); + return -1; + } + + auto model = + std::make_unique(nn_model); + + torch::Dict dict = ivalue.toGenericDict(); + + torch::Tensor features = dict.at("features").toTensor(); + torch::Tensor features_length = dict.at("features_length").toTensor(); + + torch::Tensor encoder_out = dict.at("encoder_out").toTensor(); + torch::Tensor encoder_out_length = dict.at("encoder_out_length").toTensor(); + + torch::Tensor decoder_input = dict.at("decoder_input").toTensor(); + torch::Tensor decoder_out = dict.at("decoder_out").toTensor(); + + torch::Tensor joiner_out = dict.at("joiner_out").toTensor(); + + torch::Tensor encoder_out2; + torch::Tensor encoder_out2_length; + std::tie(encoder_out2, encoder_out2_length) = + model->RunEncoder(features, features_length); + + AssertAllClose(encoder_out, encoder_out2); + AssertAllClose(encoder_out_length, encoder_out2_length); + + torch::Tensor decoder_out2 = model->RunDecoder(decoder_input); + AssertAllClose(decoder_out, decoder_out2); + + // see https://pytorch.org/cppdocs/notes/tensor_indexing.html + using namespace torch::indexing; // NOLINT + + torch::Tensor joiner_out2 = model->RunJoiner( + encoder_out2.index({Slice(), Slice(0, 1), Slice()}).unsqueeze(1), + decoder_out2.unsqueeze(1)); + + AssertAllClose(joiner_out, joiner_out2); + fprintf(stderr, "%s Passed!\n", __FILE__); + + return 0; +} diff --git a/sherpa/csrc/test-online-conv-emformer-transducer-model.cc b/sherpa/csrc/test-online-conv-emformer-transducer-model.cc new file mode 100644 index 000000000..82b42d5cb --- /dev/null +++ b/sherpa/csrc/test-online-conv-emformer-transducer-model.cc @@ -0,0 +1,105 @@ +// sherpa/csrc/test-online-conv-emformer-transducer-model.cc +// +// Copyright (c) 2022 Xiaomi Corporation + +#include "sherpa/csrc/online-conv-emformer-transducer-model.h" +#include "sherpa/csrc/online-transducer-model.h" + +// see https://github.com/pytorch/pytorch/issues/20356#issuecomment-1061667333 +static std::vector ReadData(const std::string &filename) { + std::ifstream is(filename, std::ios::binary); + std::vector ans((std::istreambuf_iterator(is)), + (std::istreambuf_iterator())); + + return ans; +} + +static void AssertAllClose(torch::Tensor a, torch::Tensor b) { + if (!torch::allclose(a, b, 1e-5, 1e-5)) { + std::cerr << "Failed! max abs: " << (a - b).abs().max().item() + << "\n"; + exit(-1); + } +} + +static void AssertEqual(int32_t a, int32_t b) { + if (a != b) { + std::cerr << "Failed! a: " << a << " vs b: " << b << "\n"; + exit(-1); + } +} + +// Please see ./test-data/test-online-conv-emformer-transducer-model.py +// for how to generate the test data. 
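+//
+// This test mirrors the offline conformer-transducer test above, but it
+// additionally verifies ChunkSize()/ChunkShift() against the saved values and
+// runs the encoder twice with stacked streaming states before comparing the
+// encoder, decoder and joiner outputs with the saved tensors.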
+int main(int argc, char *argv[]) { + if (argc != 3) { + fprintf(stderr, + "Usage: ./bin/test-offline-conformer-transducer-model cpu_jit.pt " + "test_data.pt\n"); + return -1; + } + + std::string nn_model = argv[1]; + torch::IValue ivalue = torch::jit::pickle_load(ReadData(argv[2])); + + if (!ivalue.isGenericDict()) { + fprintf(stderr, "Expect a dict.\n"); + return -1; + } + + auto model = + std::make_unique(nn_model); + + torch::Dict dict = ivalue.toGenericDict(); + + torch::Tensor features = dict.at("features").toTensor(); + torch::Tensor features_length = dict.at("features_length").toTensor(); + + torch::Tensor encoder_out = dict.at("encoder_out").toTensor(); + torch::Tensor encoder_out_length = dict.at("encoder_out_length").toTensor(); + + torch::Tensor decoder_input = dict.at("decoder_input").toTensor(); + torch::Tensor decoder_out = dict.at("decoder_out").toTensor(); + + torch::Tensor joiner_out = dict.at("joiner_out").toTensor(); + int32_t chunk_size = dict.at("chunk_size").toInt(); + int32_t chunk_shift = dict.at("chunk_shift").toInt(); + AssertEqual(chunk_size, model->ChunkSize()); + AssertEqual(chunk_shift, model->ChunkShift()); + + torch::Tensor num_processed_frames = torch::zeros({2}, torch::kInt32); + auto s1 = model->GetEncoderInitStates(); + auto s2 = model->GetEncoderInitStates(); + auto s = model->StackStates({s1, s2}); + + torch::Tensor encoder_out2; + torch::Tensor encoder_out2_length; + torch::IValue state; + std::tie(encoder_out2, encoder_out2_length, state) = + model->RunEncoder(features, features_length, num_processed_frames, s); + + num_processed_frames += chunk_shift; + std::tie(encoder_out2, encoder_out2_length, state) = + model->RunEncoder(features, features_length, num_processed_frames, state); + + AssertAllClose(num_processed_frames, + dict.at("num_processed_frames").toTensor()); + + AssertAllClose(encoder_out, encoder_out2); + AssertAllClose(encoder_out_length, encoder_out2_length); + + torch::Tensor decoder_out2 = model->RunDecoder(decoder_input); + AssertAllClose(decoder_out, decoder_out2); + + // see https://pytorch.org/cppdocs/notes/tensor_indexing.html + using namespace torch::indexing; // NOLINT + + torch::Tensor joiner_out2 = model->RunJoiner( + encoder_out2.index({Slice(), Slice(0, 1), Slice()}).unsqueeze(1), + decoder_out2.unsqueeze(1)); + + AssertAllClose(joiner_out, joiner_out2); + fprintf(stderr, "%s Passed!\n", __FILE__); + + return 0; +} diff --git a/sherpa/csrc/test-online-stream.cc b/sherpa/csrc/test-online-stream.cc new file mode 100644 index 000000000..be33736d6 --- /dev/null +++ b/sherpa/csrc/test-online-stream.cc @@ -0,0 +1,53 @@ +/** + * Copyright 2022 Xiaomi Corporation (authors: Fangjun Kuang) + * + * See LICENSE for clarification regarding multiple authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +#include "gtest/gtest.h" +#include "sherpa/cpp_api/feature-config.h" +#include "sherpa/cpp_api/online-stream.h" +#include "sherpa/csrc/log.h" + +namespace sherpa { + +TEST(OnlineStream, Test) { + float sampling_rate = 16000; + int32_t feature_dim = 80; + int32_t max_feature_vectors = 10; + FeatureConfig feat_config; + feat_config.fbank_opts.mel_opts.num_bins = feature_dim; + + OnlineStream s(feat_config); + EXPECT_EQ(s.NumFramesReady(), 0); + auto a = torch::rand({500}, torch::kFloat); + s.AcceptWaveform(sampling_rate, a); + + EXPECT_EQ(s.NumFramesReady(), 1); + auto frame = s.GetFrame(0); + EXPECT_EQ(frame.dim(), 2); + EXPECT_EQ(frame.size(0), 1); + EXPECT_EQ(frame.size(1), feature_dim); + + EXPECT_FALSE(s.IsLastFrame(0)); + s.InputFinished(); + + EXPECT_EQ(s.NumFramesReady(), 1); + EXPECT_TRUE(s.IsLastFrame(0)); +} + +} // namespace sherpa diff --git a/sherpa/csrc/test_parse_options.cc b/sherpa/csrc/test-parse-options.cc similarity index 99% rename from sherpa/csrc/test_parse_options.cc rename to sherpa/csrc/test-parse-options.cc index 120657b4d..c61265a35 100644 --- a/sherpa/csrc/test_parse_options.cc +++ b/sherpa/csrc/test-parse-options.cc @@ -19,7 +19,7 @@ #include #include "gtest/gtest.h" -#include "sherpa/csrc/parse_options.h" +#include "sherpa/cpp_api/parse-options.h" namespace sherpa { diff --git a/sherpa/csrc/text-utils.cc b/sherpa/csrc/text-utils.cc new file mode 100644 index 000000000..dc9f4af14 --- /dev/null +++ b/sherpa/csrc/text-utils.cc @@ -0,0 +1,349 @@ +// sherpa/csrc/text-utils.cc +// +// Copyright 2009-2011 Saarland University; Microsoft Corporation +// Copyright 2023-2025 Xiaomi Corporation + +#include "sherpa/csrc/text-utils.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "sherpa/csrc/macros.h" + +// This file is copied/modified from +// https://github.com/kaldi-asr/kaldi/blob/master/src/util/text-utils.cc + +namespace sherpa { + +// copied from kaldi/src/util/text-util.cc +template +class NumberIstream { + public: + explicit NumberIstream(std::istream &i) : in_(i) {} + + NumberIstream &operator>>(T &x) { + if (!in_.good()) return *this; + in_ >> x; + if (!in_.fail() && RemainderIsOnlySpaces()) return *this; + return ParseOnFail(&x); + } + + private: + std::istream &in_; + + bool RemainderIsOnlySpaces() { + if (in_.tellg() != std::istream::pos_type(-1)) { + std::string rem; + in_ >> rem; + + if (rem.find_first_not_of(' ') != std::string::npos) { + // there is not only spaces + return false; + } + } + + in_.clear(); + return true; + } + + NumberIstream &ParseOnFail(T *x) { + std::string str; + in_.clear(); + in_.seekg(0); + // If the stream is broken even before trying + // to read from it or if there are many tokens, + // it's pointless to try. + if (!(in_ >> str) || !RemainderIsOnlySpaces()) { + in_.setstate(std::ios_base::failbit); + return *this; + } + + std::unordered_map inf_nan_map; + // we'll keep just uppercase values. 
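+    // The input is upper-cased before the lookup below, so e.g. "inf",
+    // "-Infinity" and "nan" are all recognized.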
+ inf_nan_map["INF"] = std::numeric_limits::infinity(); + inf_nan_map["+INF"] = std::numeric_limits::infinity(); + inf_nan_map["-INF"] = -std::numeric_limits::infinity(); + inf_nan_map["INFINITY"] = std::numeric_limits::infinity(); + inf_nan_map["+INFINITY"] = std::numeric_limits::infinity(); + inf_nan_map["-INFINITY"] = -std::numeric_limits::infinity(); + inf_nan_map["NAN"] = std::numeric_limits::quiet_NaN(); + inf_nan_map["+NAN"] = std::numeric_limits::quiet_NaN(); + inf_nan_map["-NAN"] = -std::numeric_limits::quiet_NaN(); + // MSVC + inf_nan_map["1.#INF"] = std::numeric_limits::infinity(); + inf_nan_map["-1.#INF"] = -std::numeric_limits::infinity(); + inf_nan_map["1.#QNAN"] = std::numeric_limits::quiet_NaN(); + inf_nan_map["-1.#QNAN"] = -std::numeric_limits::quiet_NaN(); + + std::transform(str.begin(), str.end(), str.begin(), ::toupper); + + if (inf_nan_map.find(str) != inf_nan_map.end()) { + *x = inf_nan_map[str]; + } else { + in_.setstate(std::ios_base::failbit); + } + + return *this; + } +}; + +/// ConvertStringToReal converts a string into either float or double +/// and returns false if there was any kind of problem (i.e. the string +/// was not a floating point number or contained extra non-whitespace junk). +/// Be careful- this function will successfully read inf's or nan's. +template +bool ConvertStringToReal(const std::string &str, T *out) { + std::istringstream iss(str); + + NumberIstream i(iss); + + i >> *out; + + if (iss.fail()) { + // Number conversion failed. + return false; + } + + return true; +} + +template bool ConvertStringToReal(const std::string &str, float *out); + +template bool ConvertStringToReal(const std::string &str, double *out); + +void SplitStringToVector(const std::string &full, const char *delim, + bool omit_empty_strings, + std::vector *out) { + size_t start = 0, found = 0, end = full.size(); + out->clear(); + while (found != std::string::npos) { + found = full.find_first_of(delim, start); + // start != end condition is for when the delimiter is at the end + if (!omit_empty_strings || (found != start && start != end)) + out->push_back(full.substr(start, found - start)); + start = found + 1; + } +} + +template +bool SplitStringToFloats(const std::string &full, const char *delim, + bool omit_empty_strings, // typically false + std::vector *out) { + assert(out != nullptr); + if (*(full.c_str()) == '\0') { + out->clear(); + return true; + } + std::vector split; + SplitStringToVector(full, delim, omit_empty_strings, &split); + out->resize(split.size()); + for (size_t i = 0; i < split.size(); ++i) { + // assume atof never fails + F f = 0; + if (!ConvertStringToReal(split[i], &f)) return false; + (*out)[i] = f; + } + return true; +} + +// Instantiate the template above for float and double. 
+template bool SplitStringToFloats(const std::string &full, const char *delim, + bool omit_empty_strings, + std::vector *out); +template bool SplitStringToFloats(const std::string &full, const char *delim, + bool omit_empty_strings, + std::vector *out); + +static bool IsPunct(char c) { return c != '\'' && std::ispunct(c); } +static bool IsGermanUmlaut(const std::string &word) { + // ä 0xC3 0xA4 + // ö 0xC3 0xB6 + // ü 0xC3 0xBC + // Ä 0xC3 0x84 + // Ö 0xC3 0x96 + // Ü 0xC3 0x9C + // ß 0xC3 0x9F + + if (word.size() != 2 || static_cast(word[0]) != 0xc3) { + return false; + } + + auto c = static_cast(word[1]); + if (c == 0xa4 || c == 0xb6 || c == 0xbc || c == 0x84 || c == 0x96 || + c == 0x9c || c == 0x9f) { + return true; + } + + return false; +} + +// see https://www.tandem.net/blog/spanish-accents +// https://www.compart.com/en/unicode/U+00DC +static bool IsSpanishDiacritic(const std::string &word) { + // á 0xC3 0xA1 + // é 0xC3 0xA9 + // í 0xC3 0xAD + // ó 0xC3 0xB3 + // ú 0xC3 0xBA + // ü 0xC3 0xBC + // ñ 0xC3 0xB1 + // + // uppercase + // + // Á 0xC3 0x81 + // É 0xC3 0x89 + // Í 0xC3 0x8D + // Ó 0xC3 0x93 + // Ú 0xC3 0x9A + // Ü 0xC3 0x9C + // Ñ 0xC3 0x91 + + if (word.size() != 2 || static_cast(word[0]) != 0xc3) { + return false; + } + + auto c = static_cast(word[1]); + if (c == 0xa1 || c == 0xa9 || c == 0xad || c == 0xb3 || c == 0xba || + c == 0xbc || c == 0xb1 || c == 0x81 || c == 0x89 || c == 0x8d || + c == 0x93 || c == 0x9a || c == 0x9c || c == 0x91) { + return true; + } + + return false; +} + +// see https://www.busuu.com/en/french/accent-marks +static bool IsFrenchDiacritic(const std::string &word) { + // acute accent + // é 0xC3 0xA9 + // + // grave accent + // à 0xC3 0xA0 + // è 0xC3 0xA8 + // ù 0xC3 0xB9 + // + // cedilla + // ç 0xC3 0xA7 + // + // circumflex + // â 0xC3 0xA2 + // ê 0xC3 0xAA + // î 0xC3 0xAE + // ô 0xC3 0xB4 + // û 0xC3 0xBB + // + // trema + // ë 0xC3 0xAB + // ï 0xC3 0xAF + // ü 0xC3 0xBC + // + // É 0xC3 0x89 + // + // À 0xC3 0x80 + // È 0xC3 0x88 + // Ù 0xC3 0x99 + // Ç 0xC3 0x87 + //  0xC3 0x82 + // Ê 0xC3 0x8A + // Î 0xC3 0x8E + // Ô 0xC3 0x94 + // Û 0xC3 0x9B + // Ë 0xC3 0x8B + // Ï 0xC3 0x8F + // Ü 0xC3 0x9C + + if (word.size() != 2 || static_cast(word[0]) != 0xc3) { + return false; + } + + auto c = static_cast(word[1]); + if (c == 0xa9 || c == 0xa0 || c == 0xa8 || c == 0xb9 || c == 0xa7 || + c == 0xa2 || c == 0xaa || c == 0xae || c == 0xb4 || c == 0xbb || + c == 0xab || c == 0xaf || c == 0xbc || c == 0x89 || c == 0x80 || + c == 0x88 || c == 0x99 || c == 0x87 || c == 0x82 || c == 0x8a || + c == 0x8e || c == 0x94 || c == 0x9b || c == 0x8b || c == 0x8f || + c == 0x9c) { + return true; + } + return false; +} + +static bool IsSpecial(const std::string &w) { + bool ans = IsGermanUmlaut(w) || IsSpanishDiacritic(w) || IsFrenchDiacritic(w); + + // for french d’impossible + // ’ 0xE2 0x80 0x99 + bool ans2 = false; + if (w.size() == 3) { + auto c0 = static_cast(w[0]); + auto c1 = static_cast(w[1]); + auto c2 = static_cast(w[2]); + if (c0 == 0xe2 && c1 == 0x80 && c2 == 0x99) { + ans2 = true; + } + } + + return ans || ans2; +} + +static std::vector MergeCharactersIntoWords( + const std::vector &words) { + std::vector ans; + + int32_t n = static_cast(words.size()); + int32_t i = 0; + int32_t prev = -1; + + while (i < n) { + const auto &w = words[i]; + if (w.size() >= 3 || (w.size() == 2 && !IsSpecial(w)) || + (w.size() == 1 && (IsPunct(w[0]) || std::isspace(w[0])))) { + if (prev != -1) { + std::string t; + for (; prev < i; ++prev) { + t.append(words[prev]); + } + 
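+        // Flush the word accumulated so far from single-byte pieces and
+        // two-byte special characters (see the branch below that sets `prev`).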
prev = -1; + ans.push_back(std::move(t)); + } + + if (!std::isspace(w[0])) { + ans.push_back(w); + } + ++i; + continue; + } + + // e.g., öffnen + if (w.size() == 1 || (w.size() == 2 && IsSpecial(w))) { + if (prev == -1) { + prev = i; + } + ++i; + continue; + } + + SHERPA_LOGE("Ignore %s", w.c_str()); + ++i; + } + + if (prev != -1) { + std::string t; + for (; prev < i; ++prev) { + t.append(words[prev]); + } + ans.push_back(std::move(t)); + } + + return ans; +} + +} // namespace sherpa diff --git a/sherpa/csrc/text-utils.h b/sherpa/csrc/text-utils.h new file mode 100644 index 000000000..7619df2d6 --- /dev/null +++ b/sherpa/csrc/text-utils.h @@ -0,0 +1,123 @@ +// sherpa/csrc/text-utils.h +// +// Copyright 2009-2011 Saarland University; Microsoft Corporation +// Copyright 2023-2025 Xiaomi Corporation +#ifndef SHERPA_CSRC_TEXT_UTILS_H_ +#define SHERPA_CSRC_TEXT_UTILS_H_ +#include +#include + +#include +#include +#include +#include + +#ifdef _MSC_VER +#define SHERPA_STRTOLL(cur_cstr, end_cstr) _strtoi64(cur_cstr, end_cstr, 10); +#else +#define SHERPA_STRTOLL(cur_cstr, end_cstr) strtoll(cur_cstr, end_cstr, 10); +#endif + +// This file is copied/modified from +// https://github.com/kaldi-asr/kaldi/blob/master/src/util/text-utils.h + +namespace sherpa { + +/// Converts a string into an integer via strtoll and returns false if there was +/// any kind of problem (i.e. the string was not an integer or contained extra +/// non-whitespace junk, or the integer was too large to fit into the type it is +/// being converted into). Only sets *out if everything was OK and it returns +/// true. +template +bool ConvertStringToInteger(const std::string &str, Int *out) { + // copied from kaldi/src/util/text-util.h + static_assert(std::is_integral::value, ""); + const char *this_str = str.c_str(); + char *end = nullptr; + errno = 0; + int64_t i = SHERPA_STRTOLL(this_str, &end); + if (end != this_str) { + while (isspace(*end)) ++end; + } + if (end == this_str || *end != '\0' || errno != 0) return false; + Int iInt = static_cast(i); + if (static_cast(iInt) != i || + (i < 0 && !std::numeric_limits::is_signed)) { + return false; + } + *out = iInt; + return true; +} + +/// Split a string using any of the single character delimiters. +/// If omit_empty_strings == true, the output will contain any +/// nonempty strings after splitting on any of the +/// characters in the delimiter. If omit_empty_strings == false, +/// the output will contain n+1 strings if there are n characters +/// in the set "delim" within the input string. In this case +/// the empty string is split to a single empty string. +void SplitStringToVector(const std::string &full, const char *delim, + bool omit_empty_strings, + std::vector *out); + +/** + \brief Split a string (e.g. 1:2:3) into a vector of integers. + + \param [in] delim String containing a list of characters, any of which + is allowed as a delimiter. + \param [in] omit_empty_strings If true, empty strings between delimiters are + allowed and will not produce an output integer; if false, + instances of characters in 'delim' that are consecutive or + at the start or end of the string would be an error. + You'll normally want this to be true if 'delim' consists + of spaces, and false otherwise. + \param [out] out The output list of integers. +*/ +template +bool SplitStringToIntegers(const std::string &full, const char *delim, + bool omit_empty_strings, // typically false [but + // should probably be true + // if "delim" is spaces]. 
+ std::vector *out) { + static_assert(std::is_integral::value, ""); + if (*(full.c_str()) == '\0') { + out->clear(); + return true; + } + std::vector split; + SplitStringToVector(full, delim, omit_empty_strings, &split); + out->resize(split.size()); + for (size_t i = 0; i < split.size(); i++) { + const char *this_str = split[i].c_str(); + char *end = NULL; + int64_t j = 0; + j = SHERPA_STRTOLL(this_str, &end); + if (end == this_str || *end != '\0') { + out->clear(); + return false; + } else { + I jI = static_cast(j); + if (static_cast(jI) != j) { + // output type cannot fit this integer. + out->clear(); + return false; + } + (*out)[i] = jI; + } + } + return true; +} + +// This is defined for F = float and double. +template +bool SplitStringToFloats(const std::string &full, const char *delim, + bool omit_empty_strings, // typically false + std::vector *out); + +// This is defined for F = float and double. +template +bool ConvertStringToReal(const std::string &str, T *out); + +} // namespace sherpa + +#endif // SHERPA_CSRC_TEXT_UTILS_H_ diff --git a/sherpa/csrc/vad-model-config.cc b/sherpa/csrc/vad-model-config.cc new file mode 100644 index 000000000..60ed555b9 --- /dev/null +++ b/sherpa/csrc/vad-model-config.cc @@ -0,0 +1,38 @@ +// sherpa/csrc/vad-model-config.cc +// +// Copyright (c) 2023 Xiaomi Corporation + +#include "sherpa/csrc/vad-model-config.h" + +#include +#include + +namespace sherpa { + +void VadModelConfig::Register(ParseOptions *po) { + silero_vad.Register(po); + + po->Register("vad-sample-rate", &sample_rate, + "Sample rate expected by the VAD model"); + + po->Register("vad-use-gpu", &use_gpu, "true to use GPU"); + + po->Register("vad-debug", &debug, + "true to display debug information when loading vad models"); +} + +bool VadModelConfig::Validate() const { return silero_vad.Validate(); } + +std::string VadModelConfig::ToString() const { + std::ostringstream os; + + os << "VadModelConfig("; + os << "silero_vad=" << silero_vad.ToString() << ", "; + os << "sample_rate=" << sample_rate << ", "; + os << "use_gpu=\"" << (use_gpu ? "True" : "False") << "\", "; + os << "debug=" << (debug ? 
"True" : "False") << ")"; + + return os.str(); +} + +} // namespace sherpa diff --git a/sherpa/csrc/vad-model-config.h b/sherpa/csrc/vad-model-config.h new file mode 100644 index 000000000..e5cb4016f --- /dev/null +++ b/sherpa/csrc/vad-model-config.h @@ -0,0 +1,40 @@ +// sherpa/csrc/vad-model-config.h +// +// Copyright (c) 2025 Xiaomi Corporation +#ifndef SHERPA_CSRC_VAD_MODEL_CONFIG_H_ +#define SHERPA_CSRC_VAD_MODEL_CONFIG_H_ + +#include + +#include "sherpa/cpp_api/parse-options.h" +#include "sherpa/csrc/silero-vad-model-config.h" + +namespace sherpa { + +struct VadModelConfig { + SileroVadModelConfig silero_vad; + + int32_t sample_rate = 16000; + bool use_gpu = false; + + // true to show debug information when loading models + bool debug = false; + + VadModelConfig() = default; + + VadModelConfig(const SileroVadModelConfig &silero_vad, int32_t sample_rate, + bool use_gpu, bool debug) + : silero_vad(silero_vad), + sample_rate(sample_rate), + use_gpu(use_gpu), + debug(debug) {} + + void Register(ParseOptions *po); + bool Validate() const; + + std::string ToString() const; +}; + +} // namespace sherpa + +#endif // SHERPA_CSRC_VAD_MODEL_CONFIG_H_ diff --git a/sherpa/csrc/voice-activity-detector-impl.cc b/sherpa/csrc/voice-activity-detector-impl.cc new file mode 100644 index 000000000..c7b95e8b1 --- /dev/null +++ b/sherpa/csrc/voice-activity-detector-impl.cc @@ -0,0 +1,18 @@ +// sherpa/csrc/voice-activity-detector-impl.cc +// +// Copyright (c) 2025 Xiaomi Corporation + +#include "sherpa/csrc/voice-activity-detector-impl.h" + +#include + +#include "sherpa/csrc/voice-activity-detector-silero-vad-impl.h" + +namespace sherpa { + +std::unique_ptr VoiceActivityDetectorImpl::Create( + const VoiceActivityDetectorConfig &config) { + return std::make_unique(config); +} + +} // namespace sherpa diff --git a/sherpa/csrc/voice-activity-detector-impl.h b/sherpa/csrc/voice-activity-detector-impl.h new file mode 100644 index 000000000..6572ddf6c --- /dev/null +++ b/sherpa/csrc/voice-activity-detector-impl.h @@ -0,0 +1,28 @@ +// sherpa/csrc/voice-activity-detector-impl.h +// +// Copyright (c) 2025 Xiaomi Corporation +#ifndef SHERPA_CSRC_VOICE_ACTIVITY_DETECTOR_IMPL_H_ +#define SHERPA_CSRC_VOICE_ACTIVITY_DETECTOR_IMPL_H_ +#include +#include + +#include "sherpa/csrc/voice-activity-detector.h" +#include "torch/script.h" + +namespace sherpa { + +class VoiceActivityDetectorImpl { + public: + static std::unique_ptr Create( + const VoiceActivityDetectorConfig &config); + + virtual ~VoiceActivityDetectorImpl() = default; + + virtual const VoiceActivityDetectorConfig &GetConfig() const = 0; + + virtual std::vector Process(torch::Tensor samples) = 0; +}; + +} // namespace sherpa + +#endif // SHERPA_CSRC_VOICE_ACTIVITY_DETECTOR_IMPL_H_ diff --git a/sherpa/csrc/voice-activity-detector-silero-vad-impl.h b/sherpa/csrc/voice-activity-detector-silero-vad-impl.h new file mode 100644 index 000000000..4f6f09117 --- /dev/null +++ b/sherpa/csrc/voice-activity-detector-silero-vad-impl.h @@ -0,0 +1,208 @@ +// sherpa/csrc/voice-activity-detector-silero-vad-impl.h +// +// Copyright (c) 2025 Xiaomi Corporation +#ifndef SHERPA_CSRC_VOICE_ACTIVITY_DETECTOR_SILERO_VAD_IMPL_H_ +#define SHERPA_CSRC_VOICE_ACTIVITY_DETECTOR_SILERO_VAD_IMPL_H_ + +#include +#include +#include +#include + +#include "sherpa/csrc/macros.h" +#include "sherpa/csrc/silero-vad-model.h" +#include "sherpa/csrc/voice-activity-detector-impl.h" +#include "torch/torch.h" + +namespace sherpa { + +std::vector MergeSegments(std::vector segments) { + std::vector ans; + 
ans.reserve(segments.size()); + if (segments.empty()) { + return ans; + } + + ans.push_back(std::move(segments[0])); + for (int32_t i = 1; i < static_cast(segments.size()); ++i) { + if (ans.back().end + 0.1 >= segments[i].start) { + ans.back().end = segments[i].end; + } else { + ans.push_back(std::move(segments[i])); + } + } + + return ans; +} + +static std::vector ProcessSegment( + const float *p, int32_t n, const VoiceActivityDetectorConfig &config, + int32_t offset, std::vector *out) { + std::vector ans; + float threshold = config.model.silero_vad.threshold; + int32_t temp_start = 0; + int32_t temp_end = 0; + bool triggered = false; + + int32_t window_size = 512; + float sr = 16000.0f; + + float left_shift = 2 * window_size / sr + 0.15; + float right_shift = 2 * window_size / sr; + + int32_t min_speech_samples = config.model.silero_vad.min_speech_duration * + config.model.sample_rate / 512; + + int32_t min_silence_samples = config.model.silero_vad.min_silence_duration * + config.model.sample_rate / 512; + + for (int32_t i = 0; i < n; ++i) { + float prob = p[i]; + + if (prob > threshold && temp_end != 0) { + temp_end = 0; + } + + if (prob > threshold && temp_start == 0) { + // start speaking, but we require that it must satisfy + // min_speech_duration + temp_start = i; + continue; + } + + if (prob > threshold && temp_start != 0 && !triggered) { + if (i - temp_start < min_speech_samples) { + continue; + } + triggered = true; + continue; + } + + if ((prob < threshold) && !triggered) { + // silence + temp_start = 0; + temp_end = 0; + continue; + } + + if ((prob > threshold - 0.15) && triggered) { + // speaking + continue; + } + + if ((prob > threshold) && !triggered) { + // start speaking + triggered = true; + + continue; + } + + if ((prob < threshold) && triggered) { + // stop speaking + if (temp_end == 0) { + temp_end = i; + } + + if (i - temp_end < min_silence_samples) { + // continue speaking + continue; + } + // stopped speaking + + float start_time = (temp_start + offset) * window_size / sr - left_shift; + float end_time = (i + offset) * window_size / sr + right_shift; + + start_time = std::max(start_time, 0.0f); + + out->push_back({start_time, end_time}); + + temp_start = 0; + temp_end = 0; + triggered = false; + } + } // for (int32_t i = 0; i < n; ++i) + + if (triggered) { + float start_time = (temp_start + offset) * window_size / sr - left_shift; + float end_time = (n - 1 + offset) * window_size / sr + right_shift; + + start_time = std::max(start_time, 0.0f); + + out->push_back({start_time, end_time}); + } + return ans; +} + +class VoiceActivityDetectorSileroVadImpl : public VoiceActivityDetectorImpl { + public: + explicit VoiceActivityDetectorSileroVadImpl( + const VoiceActivityDetectorConfig &config) + : config_(config), + model_(std::make_unique(config.model)) {} + + const VoiceActivityDetectorConfig &GetConfig() const override { + return config_; + } + + std::vector Process(torch::Tensor samples) override { + if (samples.dim() != 1) { + SHERPA_LOGE("Expect 1-d tensor. 
Given: %d", + static_cast(samples.dim())); + SHERPA_EXIT(-1); + } + + int32_t segment_size = config_.model.sample_rate * config_.segment_size; + + int32_t num_samples = samples.size(0); + float audio_duration = num_samples / 16000.0; + + bool need_pad = + (num_samples > segment_size) && (num_samples % segment_size != 0); + + if (need_pad) { + int32_t padding = segment_size - num_samples % segment_size; + samples = torch::nn::functional::pad( + samples, torch::nn::functional::PadFuncOptions({0, padding}) + .mode(torch::kConstant) + .value(0)); + } + + int32_t num_batches = need_pad ? samples.size(0) / segment_size : 1; + + if (need_pad) { + samples = + samples.as_strided({num_batches, segment_size}, {segment_size, 1}); + } else { + samples = samples.reshape({1, -1}); + } + + auto device = model_->Device(); + torch::Tensor probs = model_->Run(samples.to(device)).cpu(); + // probs (batch_size, num_frames) + int32_t num_frames = probs.size(1); + + std::vector segments; + + for (int32_t i = 0; i < num_batches; ++i) { + const float *p = probs.data_ptr() + i * num_frames; + ProcessSegment(p, num_frames, config_, i * num_frames, &segments); + } + + segments = MergeSegments(std::move(segments)); + + for (auto &s : segments) { + s.start = std::min(s.start, audio_duration); + s.end = std::min(s.end, audio_duration); + } + + return segments; + } + + private: + VoiceActivityDetectorConfig config_; + std::unique_ptr model_; +}; + +} // namespace sherpa + +#endif // SHERPA_CSRC_VOICE_ACTIVITY_DETECTOR_SILERO_VAD_IMPL_H_ diff --git a/sherpa/csrc/voice-activity-detector.cc b/sherpa/csrc/voice-activity-detector.cc new file mode 100644 index 000000000..bc65d93b6 --- /dev/null +++ b/sherpa/csrc/voice-activity-detector.cc @@ -0,0 +1,65 @@ +// sherpa/csrc/voice-activity-detector.cc +// +// Copyright (c) 2025 Xiaomi Corporation + +#include "sherpa/csrc/voice-activity-detector.h" + +#include +#include + +#include "sherpa/csrc/macros.h" +#include "sherpa/csrc/voice-activity-detector-impl.h" + +namespace sherpa { + +void VoiceActivityDetectorConfig::Register(ParseOptions *po) { + model.Register(po); + + po->Register("vad-segment-size", &segment_size, + "In seconds. 
Split input audio into segments and process them " + "in a batch"); + + po->Register("vad-batch-size", &batch_size, "Batch size"); +} + +bool VoiceActivityDetectorConfig::Validate() const { + if (segment_size < 0) { + SHERPA_LOGE("--vad-segment-size='%.3f' is less than 0", segment_size); + return false; + } + + if (batch_size < 1) { + SHERPA_LOGE("--vad-batch-size='%d' is less than 1", batch_size); + return false; + } + + return model.Validate(); +} + +std::string VoiceActivityDetectorConfig::ToString() const { + std::ostringstream os; + + os << "VoiceActivityDetectorConfig("; + os << "model=" << model.ToString() << ", "; + os << "segment_size=" << segment_size << ", "; + os << "batch_size=" << batch_size << ")"; + + return os.str(); +} + +VoiceActivityDetector::VoiceActivityDetector( + const VoiceActivityDetectorConfig &config) + : impl_(VoiceActivityDetectorImpl::Create(config)) {} + +VoiceActivityDetector::~VoiceActivityDetector() = default; + +const VoiceActivityDetectorConfig &VoiceActivityDetector::GetConfig() const { + return impl_->GetConfig(); +} + +std::vector<SpeechSegment> VoiceActivityDetector::Process( + torch::Tensor samples) const { + return impl_->Process(samples); +} + +} // namespace sherpa diff --git a/sherpa/csrc/voice-activity-detector.h b/sherpa/csrc/voice-activity-detector.h new file mode 100644 index 000000000..23a8074d0 --- /dev/null +++ b/sherpa/csrc/voice-activity-detector.h @@ -0,0 +1,58 @@ +// sherpa/csrc/voice-activity-detector.h +// +// Copyright (c) 2025 Xiaomi Corporation +#ifndef SHERPA_CSRC_VOICE_ACTIVITY_DETECTOR_H_ +#define SHERPA_CSRC_VOICE_ACTIVITY_DETECTOR_H_ + +#include <memory> +#include <string> +#include <vector> + +#include "sherpa/cpp_api/parse-options.h" +#include "sherpa/csrc/vad-model-config.h" +#include "torch/script.h" + +namespace sherpa { + +struct SpeechSegment { + float start; // seconds + float end; // seconds +}; + +struct VoiceActivityDetectorConfig { + VadModelConfig model; + float segment_size = 10; // seconds + int32_t batch_size = 2; + + VoiceActivityDetectorConfig() = default; + VoiceActivityDetectorConfig(const VadModelConfig &model, float segment_size, + int32_t batch_size) + : model(model), segment_size(segment_size), batch_size(batch_size) {} + + void Register(ParseOptions *po); + bool Validate() const; + + std::string ToString() const; +}; + +class VoiceActivityDetectorImpl; + +class VoiceActivityDetector { + public: + explicit VoiceActivityDetector(const VoiceActivityDetectorConfig &config); + ~VoiceActivityDetector(); + + const VoiceActivityDetectorConfig &GetConfig() const; + + /* + * @param samples 1-D float32 tensor.
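+   * @return Detected speech segments (start and end are in seconds).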
+ */ + std::vector Process(torch::Tensor samples) const; + + private: + std::unique_ptr impl_; +}; + +} // namespace sherpa + +#endif // SHERPA_CSRC_VOICE_ACTIVITY_DETECTOR_H_ diff --git a/sherpa/python/csrc/CMakeLists.txt b/sherpa/python/csrc/CMakeLists.txt index e6a6fdd03..d29e978c8 100644 --- a/sherpa/python/csrc/CMakeLists.txt +++ b/sherpa/python/csrc/CMakeLists.txt @@ -2,13 +2,24 @@ add_definitions(-DTORCH_API_INCLUDE_EXTENSION_H) # Please sort files alphabetically pybind11_add_module(_sherpa - hypothesis.cc - rnnt_beam_search.cc - rnnt_conformer_model.cc - rnnt_conv_emformer_model.cc - rnnt_emformer_model.cc - rnnt_model.cc + endpoint.cc + fast-beam-search-config.cc + feature-config.cc + offline-ctc-model.cc + offline-model-config.cc + offline-recognizer.cc + offline-sense-voice-model-config.cc + offline-stream.cc + offline-whisper-model-config.cc + online-recognizer.cc + online-stream.cc + resample.cc sherpa.cc + silero-vad-model-config.cc + speaker-embedding-extractor.cc + vad-model-config.cc + voice-activity-detector-config.cc + voice-activity-detector.cc ) if(APPLE) @@ -27,11 +38,11 @@ endif() target_link_libraries(_sherpa PRIVATE sherpa_core) +target_link_libraries(_sherpa PRIVATE sherpa_cpp_api) target_include_directories(_sherpa PRIVATE ${CMAKE_BINARY_DIR}) if(UNIX AND NOT APPLE) target_link_libraries(_sherpa PRIVATE ${TORCH_DIR}/lib/libtorch_python.so) - target_link_libraries(_sherpa PRIVATE ${PYTHON_LIBRARY}) elseif(WIN32) target_link_libraries(_sherpa PRIVATE ${TORCH_DIR}/lib/torch_python.lib) target_link_libraries(_sherpa PRIVATE ${PYTHON_LIBRARIES}) diff --git a/sherpa/python/csrc/endpoint.cc b/sherpa/python/csrc/endpoint.cc new file mode 100644 index 000000000..b7e738aa2 --- /dev/null +++ b/sherpa/python/csrc/endpoint.cc @@ -0,0 +1,97 @@ +// sherpa/python/csrc/endpoint.cc +// +// Copyright (c) 2022 Xiaomi Corporation +#include "sherpa/python/csrc/endpoint.h" + +#include +#include + +#include "sherpa/cpp_api/endpoint.h" + +namespace sherpa { + +static constexpr const char *kEndpointRuleInitDoc = R"doc( +Constructor for EndpointRule. + +Args: + must_contain_nonsilence: + If True, for this endpointing rule to apply there must be nonsilence in the + best-path traceback. For decoding, a non-blank token is considered as + non-silence. + min_trailing_silence: + This endpointing rule requires duration of trailing silence (in seconds) to + be >= this value. + min_utterance_length: + This endpointing rule requires utterance-length (in seconds) to be >= this + value. +)doc"; + +static constexpr const char *kEndpointConfigInitDoc = R"doc( +If any rule in EndpointConfig is activated, it is said that an endpointing +is detected. + +Args: + rule1: + By default, it times out after 2.4 seconds of silence, even if + we decoded nothing. + rule2: + By default, it times out after 1.2 seconds of silence after decoding + something. + rule3: + By default, it times out after the utterance is 20 seconds long, regardless of + anything else. 
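+
+Example (a sketch; it assumes the Python bindings are imported as ``sherpa``)::
+
+    # make rule2 trigger after 0.8 seconds of trailing silence
+    config = sherpa.EndpointConfig(rule2=sherpa.EndpointRule(True, 0.8, 0))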
+)doc"; + +static void PybindEndpointRule(py::module &m) { // NOLINT + using PyClass = EndpointRule; + py::class_(m, "EndpointRule") + .def(py::init([](bool must_contain_nonsilence = true, + float min_trailing_silence = 2.0, + float min_utterance_length = + 0.0f) -> std::unique_ptr { + auto ans = std::make_unique(); + + ans->must_contain_nonsilence = must_contain_nonsilence; + ans->min_trailing_silence = min_trailing_silence; + ans->min_utterance_length = min_utterance_length; + + return ans; + }), + py::arg("must_contain_nonsilence") = true, + py::arg("min_trailing_silence") = 2.0, + py::arg("min_utterance_length") = 0.0f, kEndpointRuleInitDoc) + .def("__str__", &PyClass::ToString) + .def_readwrite("must_contain_nonsilence", + &PyClass::must_contain_nonsilence) + .def_readwrite("min_trailing_silence", &PyClass::min_trailing_silence) + .def_readwrite("min_utterance_length", &PyClass::min_utterance_length); +} + +static void PybindEndpointConfig(py::module &m) { // NOLINT + using PyClass = EndpointConfig; + py::class_(m, "EndpointConfig") + .def(py::init([](const EndpointRule &rule1, const EndpointRule &rule2, + const EndpointRule &rule3) -> std::unique_ptr { + auto ans = std::make_unique(); + ans->rule1 = rule1; + ans->rule2 = rule2; + ans->rule3 = rule3; + return ans; + }), + py::arg("rule1") = EndpointRule(false, 2.4, 0), + py::arg("rule2") = EndpointRule(true, 1.2, 0), + py::arg("rule3") = EndpointRule(false, 0, 20), + kEndpointConfigInitDoc) + .def("__str__", + [](const PyClass &self) -> std::string { return self.ToString(); }) + .def_readwrite("rule1", &PyClass::rule1) + .def_readwrite("rule2", &PyClass::rule2) + .def_readwrite("rule3", &PyClass::rule3); +} + +void PybindEndpoint(py::module &m) { // NOLINT + PybindEndpointRule(m); + PybindEndpointConfig(m); +} + +} // namespace sherpa diff --git a/sherpa/python/csrc/endpoint.h b/sherpa/python/csrc/endpoint.h new file mode 100644 index 000000000..670e548ec --- /dev/null +++ b/sherpa/python/csrc/endpoint.h @@ -0,0 +1,15 @@ +// sherpa/python/csrc/endpoint.h +// +// Copyright (c) 2022 Xiaomi Corporation +#ifndef SHERPA_PYTHON_CSRC_ENDPOINT_H_ +#define SHERPA_PYTHON_CSRC_ENDPOINT_H_ + +#include "sherpa/python/csrc/sherpa.h" + +namespace sherpa { + +void PybindEndpoint(py::module &m); // NOLINT + +} + +#endif // SHERPA_PYTHON_CSRC_ENDPOINT_H_ diff --git a/sherpa/python/csrc/fast-beam-search-config.cc b/sherpa/python/csrc/fast-beam-search-config.cc new file mode 100644 index 000000000..d0f410f26 --- /dev/null +++ b/sherpa/python/csrc/fast-beam-search-config.cc @@ -0,0 +1,52 @@ +// sherpa/python/csrc/fast-beam-search.cc +// +// Copyright (c) 2022 Xiaomi Corporation + +#include "sherpa/cpp_api/fast-beam-search-config.h" + +#include +#include + +#include "sherpa/python/csrc/fast-beam-search-config.h" + +namespace sherpa { + +static constexpr const char *kFastBeamSearchConfigInitDoc = R"doc( +TODO +)doc"; + +void PybindFastBeamSearch(py::module &m) { // NOLINT + using PyClass = FastBeamSearchConfig; + py::class_(m, "FastBeamSearchConfig") + .def(py::init([](const std::string &lg = "", float ngram_lm_scale = 0.01, + float beam = 20.0, int32_t max_states = 64, + int32_t max_contexts = 8, + bool allow_partial = + false) -> std::unique_ptr { + auto config = std::make_unique(); + + config->lg = lg; + config->ngram_lm_scale = ngram_lm_scale; + config->beam = beam; + config->max_states = max_states; + config->max_contexts = max_contexts; + config->allow_partial = allow_partial; + + return config; + }), + py::arg("lg") = "", py::arg("ngram_lm_scale") = 
0.01, + py::arg("beam") = 20.0, py::arg("max_states") = 64, + py::arg("max_contexts") = 8, py::arg("allow_partial") = false, + kFastBeamSearchConfigInitDoc) + .def_readwrite("lg", &PyClass::lg) + .def_readwrite("ngram_lm_scale", &PyClass::ngram_lm_scale) + .def_readwrite("beam", &PyClass::beam) + .def_readwrite("max_states", &PyClass::max_states) + .def_readwrite("max_contexts", &PyClass::max_contexts) + .def_readwrite("allow_partial", &PyClass::allow_partial) + .def("validate", &PyClass::Validate) + .def("__str__", + [](const PyClass &self) -> std::string { return self.ToString(); }); +} + +} // namespace sherpa diff --git a/sherpa/python/csrc/fast-beam-search-config.h b/sherpa/python/csrc/fast-beam-search-config.h new file mode 100644 index 000000000..a130b9785 --- /dev/null +++ b/sherpa/python/csrc/fast-beam-search-config.h @@ -0,0 +1,15 @@ +// sherpa/python/csrc/fast-beam-search.h +// +// Copyright (c) 2022 Xiaomi Corporation +#ifndef SHERPA_PYTHON_CSRC_FAST_BEAM_SEARCH_CONFIG_H_ +#define SHERPA_PYTHON_CSRC_FAST_BEAM_SEARCH_CONFIG_H_ + +#include "sherpa/python/csrc/sherpa.h" + +namespace sherpa { + +void PybindFastBeamSearch(py::module &m); // NOLINT + +} + +#endif // SHERPA_PYTHON_CSRC_FAST_BEAM_SEARCH_CONFIG_H_ diff --git a/sherpa/python/csrc/feature-config.cc b/sherpa/python/csrc/feature-config.cc new file mode 100644 index 000000000..767e949bd --- /dev/null +++ b/sherpa/python/csrc/feature-config.cc @@ -0,0 +1,60 @@ +// sherpa/python/csrc/feature-config.cc +// +// Copyright (c) 2022 Xiaomi Corporation + +#include "sherpa/python/csrc/feature-config.h" + +#include +#include + +#include "sherpa/cpp_api/feature-config.h" + +namespace sherpa { + +static constexpr const char *kFeatureConfigInitDoc = R"doc( +Constructor for FeatureConfig. + +Args: + fbank_opts: + Options for computing fbank features. + normalize_samples: + In sherpa, the input audio samples should always be normalized to the + range ``[-1, 1]``. If ``normalize_samples`` is ``False``, we will scale + the input audio samples by ``32767`` inside sherpa. If ``normalize_samples`` + is ``True``, we use input audio samples as they are. + nemo_normalize: + Used only for NeMo CTC models. Leave it to empty if no normalization + is used in NeMo. Current implemented method is "per_feature". 
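+
+A minimal usage sketch (assuming the bindings are imported as ``sherpa``)::
+
+    # input samples will be scaled by 32767 inside sherpa
+    feat_config = sherpa.FeatureConfig(normalize_samples=False)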
+)doc"; + +void PybindFeatureConfig(py::module &m) { // NOLINT + using PyClass = FeatureConfig; + py::class_(m, "FeatureConfig") + .def(py::init([](bool normalize_samples = true, + const std::string &nemo_normalize = + "") -> std::unique_ptr { + auto config = std::make_unique(); + + config->normalize_samples = normalize_samples; + config->nemo_normalize = nemo_normalize; + config->fbank_opts.frame_opts.dither = 0; + config->fbank_opts.mel_opts.num_bins = 80; + config->fbank_opts.mel_opts.high_freq = -400; + config->fbank_opts.frame_opts.remove_dc_offset = true; + config->fbank_opts.frame_opts.round_to_power_of_two = true; + config->fbank_opts.energy_floor = 1e-10; + config->fbank_opts.frame_opts.snip_edges = false; + config->fbank_opts.frame_opts.samp_freq = 16000; + + return config; + }), + py::arg("normalize_samples") = true, py::arg("nemo_normalize") = "", + kFeatureConfigInitDoc) + .def_readwrite("fbank_opts", &PyClass::fbank_opts) + .def_readwrite("normalize_samples", &PyClass::normalize_samples) + .def_readwrite("nemo_normalize", &PyClass::nemo_normalize) + .def("__str__", + [](const PyClass &self) -> std::string { return self.ToString(); }); +} + +} // namespace sherpa diff --git a/sherpa/python/csrc/feature-config.h b/sherpa/python/csrc/feature-config.h new file mode 100644 index 000000000..8841c9a43 --- /dev/null +++ b/sherpa/python/csrc/feature-config.h @@ -0,0 +1,15 @@ +// sherpa/python/csrc/feature-config.h +// +// Copyright (c) 2022 Xiaomi Corporation +#ifndef SHERPA_PYTHON_CSRC_FEATURE_CONFIG_H_ +#define SHERPA_PYTHON_CSRC_FEATURE_CONFIG_H_ + +#include "sherpa/python/csrc/sherpa.h" + +namespace sherpa { + +void PybindFeatureConfig(py::module &m); // NOLINT + +} + +#endif // SHERPA_PYTHON_CSRC_FEATURE_CONFIG_H_ diff --git a/sherpa/python/csrc/hypothesis.cc b/sherpa/python/csrc/hypothesis.cc deleted file mode 100644 index 75974defe..000000000 --- a/sherpa/python/csrc/hypothesis.cc +++ /dev/null @@ -1,64 +0,0 @@ -/** - * Copyright (c) 2022 Xiaomi Corporation (authors: Fangjun Kuang) - * - * See LICENSE for clarification regarding multiple authors - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "sherpa/python/csrc/hypothesis.h" - -#include -#include - -#include "sherpa/csrc/hypothesis.h" - -namespace sherpa { - -void PybindHypothesis(py::module &m) { // NOLINT - { - using PyClass = Hypothesis; - py::class_(m, "Hypothesis") - .def(py::init<>()) - .def(py::init &, double>(), py::arg("ys"), - py::arg("log_prob")) - - .def_property_readonly("key", &PyClass::Key) - .def_property_readonly( - "log_prob", - [](const PyClass &self) -> double { return self.log_prob; }) - .def_property_readonly( - "ys", - [](const PyClass &self) -> std::vector { return self.ys; }) - .def_property_readonly("num_trailing_blanks", - [](const PyClass &self) -> int32_t { - return self.num_trailing_blanks; - }) - .def("__str__", - [](const PyClass &self) -> std::string { return self.ToString(); }) - .def("__repr__", [](const PyClass &self) -> std::string { - return self.ToString(); - }); - } - - { - using PyClass = Hypotheses; - py::class_(m, "Hypotheses") - .def(py::init<>()) - .def(py::init>(), py::arg("hyps")) - .def("get_most_probable", &PyClass::GetMostProbable, - py::arg("length_norm"), py::call_guard()); - } -} - -} // namespace sherpa diff --git a/sherpa/python/csrc/hypothesis.h b/sherpa/python/csrc/hypothesis.h deleted file mode 100644 index c42762a4a..000000000 --- a/sherpa/python/csrc/hypothesis.h +++ /dev/null @@ -1,29 +0,0 @@ -/** - * Copyright (c) 2022 Xiaomi Corporation (authors: Fangjun Kuang) - * - * See LICENSE for clarification regarding multiple authors - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#ifndef SHERPA_PYTHON_CSRC_HYPOTHESIS_H_ -#define SHERPA_PYTHON_CSRC_HYPOTHESIS_H_ - -#include "sherpa/python/csrc/sherpa.h" - -namespace sherpa { - -void PybindHypothesis(py::module &m); // NOLINT - -} - -#endif // SHERPA_PYTHON_CSRC_HYPOTHESIS_H_ diff --git a/sherpa/python/csrc/offline-ctc-model.cc b/sherpa/python/csrc/offline-ctc-model.cc new file mode 100644 index 000000000..ae3b2b832 --- /dev/null +++ b/sherpa/python/csrc/offline-ctc-model.cc @@ -0,0 +1,87 @@ +// sherpa/python/csrc/offline-ctc-model.cc +// +// Copyright (c) 2022 Xiaomi Corporation + +#include "sherpa/python/csrc/offline-ctc-model.h" + +#include + +#include "sherpa/csrc/offline-ctc-model.h" + +namespace sherpa { + +static constexpr const char *kWarmUpDoc = R"doc( +Send some fake data to the model for computation. + +.. hint:: + + It is called when the model is first loaded into memory to + reduce the response time of the first client request. + +Args: + features: + It is usually a tensor of shape ``(N, T, C)`` containing the features. + But for ``wav2vec 2.0``, it should be a tensor of shape ``(N, num_samples)``. + + features_length: + It indicates number of valid frames or audio samples in ``features`` + before padding. Its shape is ``(N,)``. + +Returns: + Return ``None``. +)doc"; + +static constexpr const char *kForwardDoc = R"doc( +Run the forward method of the network. + +Args: + features: + It is usually a tensor of shape ``(N, T, C)`` containing the features. 
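The warm_up/forward contract documented above can be summarized with a small hedged sketch; `model` is assumed to be an already constructed `OfflineCtcModel` (its construction is not part of this file), and the helper name is hypothetical:

```python
import torch

def warm_up_ctc_model(model) -> None:
    """Hypothetical helper; `model` is an OfflineCtcModel from the binding above."""
    N, T, C = 2, 100, 80                         # batch size, frames, feature dim
    features = torch.rand(N, T, C)               # (N, T, C) fbank frames
    features_length = torch.tensor([T, T // 2])  # valid frames per utterance, shape (N,)

    model.warm_up(features, features_length)

    # forward() returns a (log_probs, log_probs_length) pair, see kForwardDoc above.
    log_probs, log_probs_length = model.forward(features, features_length)
    assert log_probs.shape[0] == N and log_probs.shape[2] == model.vocab_size
```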
+ But for ``wav2vec 2.0``, it should be a tensor of shape ``(N, num_samples)``. + + features_length: + It indicates number of valid frames or audio samples in ``features`` + before padding. Its shape is ``(N,)``. + +Returns: + Return a tuple containing two tensors: + + - ``log_probs``: Output of the log_softmax layer with shape ``(N, T, vocab_size)`` + + - ``log_probs_length``: A tensor of shape ``(N,)`` containing the valid number + of frames in ``log_probs`` +)doc"; + +void PybindOfflineCtcModel(py::module &m) { // NOLINT + using PyClass = OfflineCtcModel; + py::class_(m, "OfflineCtcModel") + // properties + .def_property_readonly("subsampling_factor", &PyClass::SubsamplingFactor) + .def_property_readonly("vocab_size", &PyClass::VocabSize) + .def_property_readonly("device", + [](const PyClass &self) -> py::object { + py::object ans = + py::module_::import("torch").attr("device"); + return ans(self.Device().str()); + }) + // methods + .def("warm_up", &PyClass::WarmUp, py::arg("features"), + py::arg("features_length"), py::call_guard(), + kWarmUpDoc) + .def( + "forward", + [](PyClass &self, torch::Tensor features, + torch::Tensor features_length) + -> std::pair { + torch::IValue ivalue = self.Forward(features, features_length); + + auto log_probs = self.GetLogSoftmaxOut(ivalue); + auto log_probs_length = self.GetLogSoftmaxOutLength(ivalue); + + return {log_probs, log_probs_length}; + }, + py::arg("features"), py::arg("features_length"), + py::call_guard(), kForwardDoc); +} + +} // namespace sherpa diff --git a/sherpa/python/csrc/offline-ctc-model.h b/sherpa/python/csrc/offline-ctc-model.h new file mode 100644 index 000000000..c83ecc631 --- /dev/null +++ b/sherpa/python/csrc/offline-ctc-model.h @@ -0,0 +1,15 @@ +// sherpa/python/csrc/offline-ctc-model.h +// +// Copyright (c) 2022 Xiaomi Corporation +#ifndef SHERPA_PYTHON_CSRC_OFFLINE_CTC_MODEL_H_ +#define SHERPA_PYTHON_CSRC_OFFLINE_CTC_MODEL_H_ + +#include "sherpa/python/csrc/sherpa.h" + +namespace sherpa { + +void PybindOfflineCtcModel(py::module &m); // NOLINT + +} + +#endif // SHERPA_PYTHON_CSRC_OFFLINE_CTC_MODEL_H_ diff --git a/sherpa/python/csrc/offline-model-config.cc b/sherpa/python/csrc/offline-model-config.cc new file mode 100644 index 000000000..c0e9c27b1 --- /dev/null +++ b/sherpa/python/csrc/offline-model-config.cc @@ -0,0 +1,38 @@ +// sherpa/python/csrc/offline-model-config.cc +// +// Copyright (c) 2025 Xiaomi Corporation + +#include "sherpa/python/csrc/offline-model-config.h" + +#include +#include + +#include "sherpa/csrc/offline-model-config.h" +#include "sherpa/python/csrc/offline-sense-voice-model-config.h" +#include "sherpa/python/csrc/offline-whisper-model-config.h" + +namespace sherpa { + +void PybindOfflineModelConfig(py::module *m) { + PybindOfflineSenseVoiceModelConfig(m); + PybindOfflineWhisperModelConfig(m); + + using PyClass = OfflineModelConfig; + py::class_(*m, "OfflineModelConfig") + .def(py::init(), + py::arg("sense_voice") = OfflineSenseVoiceModelConfig(), + py::arg("whisper") = OfflineWhisperModelConfig(), + py::arg("tokens") = "", py::arg("debug") = false, + py::arg("use_gpu") = false) + .def_readwrite("sense_voice", &PyClass::sense_voice) + .def_readwrite("whisper", &PyClass::whisper) + .def_readwrite("tokens", &PyClass::tokens) + .def_readwrite("debug", &PyClass::debug) + .def_readwrite("use_gpu", &PyClass::use_gpu) + .def("validate", &PyClass::Validate) + .def("__str__", &PyClass::ToString); +} + +} // namespace sherpa diff --git a/sherpa/python/csrc/offline-model-config.h 
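A hedged sketch of building the OfflineModelConfig bound above from Python; the file paths and the `language="auto"` value are placeholders, only the keyword names come from the binding:

```python
import sherpa

model_config = sherpa.OfflineModelConfig(
    sense_voice=sherpa.OfflineSenseVoiceModelConfig(
        model="./sense-voice/model.pt",   # placeholder path to a torchscript model
        language="auto",                  # placeholder language value
        use_itn=True,
    ),
    tokens="./sense-voice/tokens.txt",    # placeholder path
    debug=False,
    use_gpu=False,
)
model_config.validate()  # basic sanity checks on the configuration
print(model_config)
```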
b/sherpa/python/csrc/offline-model-config.h new file mode 100644 index 000000000..88a9591c2 --- /dev/null +++ b/sherpa/python/csrc/offline-model-config.h @@ -0,0 +1,16 @@ +// sherpa/python/csrc/offline-model-config.h +// +// Copyright (c) 2025 Xiaomi Corporation + +#ifndef SHERPA_PYTHON_CSRC_OFFLINE_MODEL_CONFIG_H_ +#define SHERPA_PYTHON_CSRC_OFFLINE_MODEL_CONFIG_H_ + +#include "sherpa/python/csrc/sherpa.h" + +namespace sherpa { + +void PybindOfflineModelConfig(py::module *m); + +} + +#endif  // SHERPA_PYTHON_CSRC_OFFLINE_MODEL_CONFIG_H_ diff --git a/sherpa/python/csrc/offline-recognizer.cc b/sherpa/python/csrc/offline-recognizer.cc new file mode 100644 index 000000000..b5a8ecaa0 --- /dev/null +++ b/sherpa/python/csrc/offline-recognizer.cc @@ -0,0 +1,238 @@ +// sherpa/python/csrc/offline-recognizer.cc +// +// Copyright (c) 2022 Xiaomi Corporation + +#include "sherpa/cpp_api/offline-recognizer.h" + +#include +#include +#include + +#include "sherpa/python/csrc/offline-model-config.h" +#include "sherpa/python/csrc/offline-recognizer.h" + +namespace sherpa { + +static constexpr const char *kOfflineCtcDecoderConfigInitDoc = R"doc( +Constructor for offline CTC decoder configuration. + +Args: + modified: + ``True`` to use a modified CTC topology. ``False`` to use a standard + CTC topology. Please visit + ``_ + for the difference between modified and standard CTC topology. + hlg: + Optional. If empty, we use an ``H`` for decoding, where ``H`` is a + CTC topology. + If not empty, it is the path to ``HLG.pt`` and we use an ``HLG`` graph + for decoding. Please refer to + ``_ + for how to build an ``HLG`` graph. + search_beam: + Decoding beam, e.g. 20. Smaller is faster, larger is more exact (less + pruning). This is the default value; it may be modified by + ``min_active_states`` and ``max_active_states``. + output_beam: + Beam to prune output, similar to lattice-beam in Kaldi. + Relative to the best path of output. + min_active_states: + Minimum number of FSA states that are allowed to be active on any given + frame for any given intersection/composition task. This is advisory, in + that it will try not to have fewer than this number active. Set it to zero + if there is no constraint. + max_active_states: + Maximum number of FSA states that are allowed to be active on any given + frame for any given intersection/composition task. This is advisory, in + that it will try not to exceed that but may not always succeed. You can use + a very large number if no constraint is needed. + lm_scale: + Used only when HLG is not empty. It specifies the scale for HLG.scores. +)doc"; + +static constexpr const char *kOfflineRecognizerConfigInitDoc = R"doc( +Constructor for the offline recognizer configuration. + +Args: + nn_model: + Path to the torchscript model. We support the following types of models: + + (1) CTC. Models from the following frameworks are supported: + + - icefall. It supports models from the ``conformer_ctc`` recipe. + - wenet. It supports all models trained using CTC from wenet. We discard + the transformer decoder branch and only use the transformer + encoder for CTC decoding. + - torchaudio. We support wav2vec 2.0 models from torchaudio. + - NeMo. We support EncDecCTCModelBPE from NeMo. + + (2) Transducer. Models from the following frameworks are supported: + + - icefall. It supports models from the ``pruned_transducer_statelessX`` + recipe. + + Please visit the following links for pre-trained CTC and transducer models: + + - ``_ + - ``_ + tokens: + Path to ``tokens.txt``.
Note: Different frameworks use different names + for this file. Basically, it is a text file, where each row contains two + columns separated by space(s). The first column is a symbol and the second + column is the corresponding integer ID of the symbol. The text file has + as many rows as the vocabulary size of the model. + use_gpu: + ``False`` to use CPU for neural network computation and decoding. + ``True`` to use GPU for neural network computation and decoding. + + .. note:: + + If ``use_gpu`` is ``True``, we always use ``GPU 0``. You can use + the environment variable ``CUDA_VISIBLE_DEVICES`` to control which + GPU is mapped to ``GPU 0``. + num_active_paths: + Used only for modified_beam_search in transducer decoding. It is ignored + if the passed ``nn_model`` is a CTC model. + context_score: + The bonus score for each token in context word/phrase. + Used only when decoding_method is modified_beam_search. + ctc_decoder_config: + Used only when the passed ``nn_model`` is a CTC model. It is ignored if + the passed ``nn_model`` is a transducer model. + feat_config: + It contains the configuration for offline fbank extractor. + fast_beam_search_config: + Used only for fast_beam_search in transducer decoding. It is ignored if + the passed ``nn_model`` is a CTC model. Also, if the decoding_method is + not ``fast_beam_search``, it is ignored. + decoding_method: + Used only when the passed ``nn_model`` is a transducer model. + Valid values are: ``greedy_search``, ``modified_beam_search``, and + ``fast_beam_search``. +)doc"; + +static void PybindOfflineCtcDecoderConfig(py::module &m) { // NOLINT + using PyClass = OfflineCtcDecoderConfig; + py::class_(m, "OfflineCtcDecoderConfig") + .def(py::init([](bool modified = true, const std::string &hlg = "", + float search_beam = 20, float output_beam = 8, + int32_t min_active_states = 20, + int32_t max_active_states = 10000, + float lm_scale = + 1.0f) -> std::unique_ptr { + auto ans = std::make_unique(); + + ans->modified = modified; + ans->hlg = hlg; + ans->lm_scale = lm_scale; + ans->output_beam = output_beam; + ans->search_beam = search_beam; + ans->output_beam = output_beam; + ans->min_active_states = min_active_states; + ans->max_active_states = max_active_states; + + return ans; + }), + py::arg("modified") = true, py::arg("hlg") = "", + py::arg("search_beam") = 20.0, py::arg("output_beam") = 8.0, + py::arg("min_active_states") = 20, + py::arg("max_active_states") = 10000, py::arg("lm_scale") = 1.0, + kOfflineCtcDecoderConfigInitDoc) + .def_readwrite("modified", &PyClass::modified) + .def_readwrite("hlg", &PyClass::hlg) + .def_readwrite("search_beam", &PyClass::search_beam) + .def_readwrite("output_beam", &PyClass::output_beam) + .def_readwrite("min_active_states", &PyClass::min_active_states) + .def_readwrite("max_active_states", &PyClass::max_active_states) + .def_readwrite("lm_scale", &PyClass::lm_scale) + .def("__str__", + [](const PyClass &self) -> std::string { return self.ToString(); }) + .def("validate", &PyClass::Validate); +} + +static void PybindOfflineRecognizerConfig(py::module &m) { // NOLINT + using PyClass = OfflineRecognizerConfig; + PybindOfflineModelConfig(&m); + + py::class_(m, "OfflineRecognizerConfig") + .def(py::init([](const OfflineModelConfig &model, + const std::string &nn_model, const std::string &tokens, + bool use_gpu = false, int32_t num_active_paths = 4, + float context_score = 1.5, bool use_bbpe = false, + float temperature = 1.0, + const OfflineCtcDecoderConfig &ctc_decoder_config = {}, + const FeatureConfig 
&feat_config = {}, + const FastBeamSearchConfig &fast_beam_search_config = {}, + const std::string &decoding_method = "greedy_search") + -> std::unique_ptr { + auto config = std::make_unique(); + + config->ctc_decoder_config = ctc_decoder_config; + config->feat_config = feat_config; + config->fast_beam_search_config = fast_beam_search_config; + config->model = model; + config->nn_model = nn_model; + config->tokens = tokens; + config->use_gpu = use_gpu; + config->decoding_method = decoding_method; + config->num_active_paths = num_active_paths; + config->context_score = context_score; + config->use_bbpe = use_bbpe; + config->temperature = temperature; + + return config; + }), + py::arg("model") = OfflineModelConfig{}, py::arg("nn_model") = "", + py::arg("tokens") = "", py::arg("use_gpu") = false, + py::arg("num_active_paths") = 4, py::arg("context_score") = 1.5, + py::arg("use_bbpe") = false, py::arg("temperature") = 1.0, + py::arg("ctc_decoder_config") = OfflineCtcDecoderConfig(), + py::arg("feat_config") = FeatureConfig(), + py::arg("fast_beam_search_config") = FastBeamSearchConfig(), + py::arg("decoding_method") = "greedy_search", + kOfflineRecognizerConfigInitDoc) + .def("__str__", + [](const PyClass &self) -> std::string { return self.ToString(); }) + .def_readwrite("ctc_decoder_config", &PyClass::ctc_decoder_config) + .def_readwrite("feat_config", &PyClass::feat_config) + .def_readwrite("fast_beam_search_config", + &PyClass::fast_beam_search_config) + .def_readwrite("nn_model", &PyClass::nn_model) + .def_readwrite("tokens", &PyClass::tokens) + .def_readwrite("use_gpu", &PyClass::use_gpu) + .def_readwrite("decoding_method", &PyClass::decoding_method) + .def_readwrite("num_active_paths", &PyClass::num_active_paths) + .def_readwrite("context_score", &PyClass::context_score) + .def_readwrite("use_bbpe", &PyClass::use_bbpe) + .def_readwrite("temperature", &PyClass::temperature) + .def("validate", &PyClass::Validate); +} + +void PybindOfflineRecognizer(py::module &m) { // NOLINT + PybindOfflineCtcDecoderConfig(m); + PybindOfflineRecognizerConfig(m); + + using PyClass = OfflineRecognizer; + py::class_(m, "OfflineRecognizer") + .def(py::init(), py::arg("config")) + .def( + "create_stream", [](PyClass &self) { return self.CreateStream(); }, + py::call_guard()) + .def( + "create_stream", + [](PyClass &self, + const std::vector> &contexts_list) { + return self.CreateStream(contexts_list); + }, + py::arg("contexts_list"), py::call_guard()) + .def("decode_stream", &PyClass::DecodeStream, py::arg("s"), + py::call_guard()) + .def( + "decode_streams", + [](PyClass &self, std::vector &ss) { + self.DecodeStreams(ss.data(), ss.size()); + }, + py::arg("ss"), py::call_guard()); +} + +} // namespace sherpa diff --git a/sherpa/python/csrc/offline-recognizer.h b/sherpa/python/csrc/offline-recognizer.h new file mode 100644 index 000000000..a70d26584 --- /dev/null +++ b/sherpa/python/csrc/offline-recognizer.h @@ -0,0 +1,15 @@ +// sherpa/python/csrc/offline-recognizer.h +// +// Copyright (c) 2022 Xiaomi Corporation +#ifndef SHERPA_PYTHON_CSRC_OFFLINE_RECOGNIZER_H_ +#define SHERPA_PYTHON_CSRC_OFFLINE_RECOGNIZER_H_ + +#include "sherpa/python/csrc/sherpa.h" + +namespace sherpa { + +void PybindOfflineRecognizer(py::module &m); // NOLINT + +} + +#endif // SHERPA_PYTHON_CSRC_OFFLINE_RECOGNIZER_H_ diff --git a/sherpa/python/csrc/offline-sense-voice-model-config.cc b/sherpa/python/csrc/offline-sense-voice-model-config.cc new file mode 100644 index 000000000..68c887bb5 --- /dev/null +++ 
b/sherpa/python/csrc/offline-sense-voice-model-config.cc @@ -0,0 +1,27 @@ +// sherpa/python/csrc/offline-sense-voice-model-config.cc +// +// Copyright (c) 2025 Xiaomi Corporation + +#include "sherpa/csrc/offline-sense-voice-model-config.h" + +#include +#include + +#include "sherpa/python/csrc/offline-sense-voice-model-config.h" + +namespace sherpa { + +void PybindOfflineSenseVoiceModelConfig(py::module *m) { + using PyClass = OfflineSenseVoiceModelConfig; + py::class_(*m, "OfflineSenseVoiceModelConfig") + .def(py::init<>()) + .def(py::init(), + py::arg("model"), py::arg("language"), py::arg("use_itn")) + .def_readwrite("model", &PyClass::model) + .def_readwrite("language", &PyClass::language) + .def_readwrite("use_itn", &PyClass::use_itn) + .def("validate", &PyClass::Validate) + .def("__str__", &PyClass::ToString); +} + +} // namespace sherpa diff --git a/sherpa/python/csrc/offline-sense-voice-model-config.h b/sherpa/python/csrc/offline-sense-voice-model-config.h new file mode 100644 index 000000000..1ed3dd905 --- /dev/null +++ b/sherpa/python/csrc/offline-sense-voice-model-config.h @@ -0,0 +1,16 @@ +// sherpa/python/csrc/offline-sense-voice-model-config.h +// +// Copyright (c) 2025 Xiaomi Corporation + +#ifndef SHERPA_PYTHON_CSRC_OFFLINE_SENSE_VOICE_MODEL_CONFIG_H_ +#define SHERPA_PYTHON_CSRC_OFFLINE_SENSE_VOICE_MODEL_CONFIG_H_ + +#include "sherpa/python/csrc/sherpa.h" + +namespace sherpa { + +void PybindOfflineSenseVoiceModelConfig(py::module *m); + +} + +#endif // SHERPA_PYTHON_CSRC_OFFLINE_SENSE_VOICE_MODEL_CONFIG_H_ diff --git a/sherpa/python/csrc/offline-stream.cc b/sherpa/python/csrc/offline-stream.cc new file mode 100644 index 000000000..86f3eb62e --- /dev/null +++ b/sherpa/python/csrc/offline-stream.cc @@ -0,0 +1,76 @@ +// sherpa/python/csrc/offline-stream.cc +// +// Copyright (c) 2022 Xiaomi Corporation + +#include "sherpa/cpp_api/offline-stream.h" + +#include + +#include "sherpa/python/csrc/offline-stream.h" +#include "torch/torch.h" + +namespace sherpa { + +static constexpr const char *kOfflineStreamAcceptSamplesVectorDoc = R"doc( +Accept samples from a list of floats. + +Args: + samples: + It contains audio samples normalized to the range ``[-1, 1].`` + Note: The sampling rate of the samples should match the one expected + by the feature extractor. +)doc"; + +static constexpr const char *kOfflineStreamAcceptSamplesTensorDoc = R"doc( +Accept samples from a 1-D float32 tensor . + +Args: + samples: + It contains audio samples normalized to the range ``[-1, 1].`` + Note: The sampling rate of the samples should match the one expected + by the feature extractor. 
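Tying the OfflineRecognizerConfig, OfflineRecognizer, and OfflineStream bindings above together, a usage sketch for offline decoding; the model, tokens, and wave paths are placeholders:

```python
import sherpa

config = sherpa.OfflineRecognizerConfig(
    nn_model="./exp/cpu_jit.pt",              # placeholder path to a torchscript model
    tokens="./data/lang_bpe_500/tokens.txt",  # placeholder path
    use_gpu=False,
    decoding_method="greedy_search",          # or "modified_beam_search" / "fast_beam_search"
    feat_config=sherpa.FeatureConfig(normalize_samples=True),
)

recognizer = sherpa.OfflineRecognizer(config)

# Decode a single file.
stream = recognizer.create_stream()
stream.accept_wave_file("./test_wavs/1089-134686-0001.wav")  # placeholder wave file
recognizer.decode_stream(stream)
print(stream.result.text)

# Or decode several streams in one batch.
streams = [recognizer.create_stream() for _ in range(2)]
streams[0].accept_wave_file("./a.wav")  # placeholder
streams[1].accept_wave_file("./b.wav")  # placeholder
recognizer.decode_streams(streams)
for s in streams:
    print(s.result.as_json_string())
```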
+)doc"; + +static void PybindOfflineRecognitionResult(py::module &m) { // NOLINT + using PyClass = OfflineRecognitionResult; + py::class_(m, "OfflineRecognitionResult") + .def_property_readonly("text", + [](const PyClass &self) { return self.text; }) + .def_property_readonly("tokens", + [](const PyClass &self) { return self.tokens; }) + .def_property_readonly( + "timestamps", [](const PyClass &self) { return self.timestamps; }) + .def("__str__", &PyClass::AsJsonString) + .def("as_json_string", &PyClass::AsJsonString); +} + +void PybindOfflineStream(py::module &m) { // NOLINT + PybindOfflineRecognitionResult(m); + using PyClass = OfflineStream; + + py::class_ stream(m, "OfflineStream"); + stream + .def("accept_wave_file", &PyClass::AcceptWaveFile, + py::call_guard(), py::arg("filename")) + .def( + "accept_samples", + [](PyClass &self, const std::vector &samples) { + self.AcceptSamples(samples.data(), samples.size()); + }, + py::arg("samples"), py::call_guard(), + kOfflineStreamAcceptSamplesVectorDoc) + .def( + "accept_samples", + [](PyClass &self, torch::Tensor samples) { + samples = samples.contiguous().cpu(); + self.AcceptSamples(samples.data_ptr(), samples.numel()); + }, + py::arg("samples"), py::call_guard(), + kOfflineStreamAcceptSamplesTensorDoc) + .def_property_readonly("result", &PyClass::GetResult); + + // alias + stream.attr("accept_waveform") = stream.attr("accept_samples"); +} + +} // namespace sherpa diff --git a/sherpa/python/csrc/offline-stream.h b/sherpa/python/csrc/offline-stream.h new file mode 100644 index 000000000..55a49350d --- /dev/null +++ b/sherpa/python/csrc/offline-stream.h @@ -0,0 +1,15 @@ +// sherpa/python/csrc/offline-stream.h +// +// Copyright (c) 2022 Xiaomi Corporation +#ifndef SHERPA_PYTHON_CSRC_OFFLINE_STREAM_H_ +#define SHERPA_PYTHON_CSRC_OFFLINE_STREAM_H_ + +#include "sherpa/python/csrc/sherpa.h" + +namespace sherpa { + +void PybindOfflineStream(py::module &m); // NOLINT + +} + +#endif // SHERPA_PYTHON_CSRC_OFFLINE_STREAM_H_ diff --git a/sherpa/python/csrc/offline-whisper-model-config.cc b/sherpa/python/csrc/offline-whisper-model-config.cc new file mode 100644 index 000000000..3cdbc7c1c --- /dev/null +++ b/sherpa/python/csrc/offline-whisper-model-config.cc @@ -0,0 +1,28 @@ +// sherpa/python/csrc/offline-whisper-model-config.cc +// +// Copyright (c) 2023 Xiaomi Corporation + +#include "sherpa/csrc/offline-whisper-model-config.h" + +#include +#include + +#include "sherpa/python/csrc/offline-whisper-model-config.h" + +namespace sherpa { + +void PybindOfflineWhisperModelConfig(py::module *m) { + using PyClass = OfflineWhisperModelConfig; + py::class_(*m, "OfflineWhisperModelConfig") + .def(py::init(), + py::arg("model") = "", py::arg("language") = "", + py::arg("task") = "transcribe") + .def_readwrite("model", &PyClass::model) + .def_readwrite("language", &PyClass::language) + .def_readwrite("task", &PyClass::task) + .def("validate", &PyClass::Validate) + .def("__str__", &PyClass::ToString); +} + +} // namespace sherpa diff --git a/sherpa/python/csrc/offline-whisper-model-config.h b/sherpa/python/csrc/offline-whisper-model-config.h new file mode 100644 index 000000000..942015633 --- /dev/null +++ b/sherpa/python/csrc/offline-whisper-model-config.h @@ -0,0 +1,15 @@ +// sherpa/python/csrc/offline-whisper-model-config.h +// +// Copyright (c) 2025 Xiaomi Corporation + +#ifndef SHERPA_PYTHON_CSRC_OFFLINE_WHISPER_MODEL_CONFIG_H_ +#define SHERPA_PYTHON_CSRC_OFFLINE_WHISPER_MODEL_CONFIG_H_ + +#include "sherpa/python/csrc/sherpa.h" + +namespace sherpa { + +void 
PybindOfflineWhisperModelConfig(py::module *m); +} + +#endif // SHERPA_PYTHON_CSRC_OFFLINE_WHISPER_MODEL_CONFIG_H_ diff --git a/sherpa/python/csrc/online-recognizer.cc b/sherpa/python/csrc/online-recognizer.cc new file mode 100644 index 000000000..53fb04a75 --- /dev/null +++ b/sherpa/python/csrc/online-recognizer.cc @@ -0,0 +1,123 @@ +// sherpa/python/csrc/online-recognizer.cc +// +// Copyright (c) 2022 Xiaomi Corporation +#include "sherpa/cpp_api/online-recognizer.h" + +#include +#include +#include + +#include "sherpa/python/csrc/online-recognizer.h" + +namespace sherpa { + +static void PybindOnlineRecognizerConfig(py::module &m) { // NOLINT + using PyClass = OnlineRecognizerConfig; + py::class_(m, "OnlineRecognizerConfig") + .def(py::init([](const std::string &nn_model, const std::string &tokens, + const std::string &encoder_model = {}, + const std::string &decoder_model = {}, + const std::string &joiner_model = {}, + bool use_gpu = false, bool use_endpoint = false, + const std::string &decoding_method = "greedy_search", + int32_t num_active_paths = 4, float context_score = 1.5, + int32_t left_context = 64, int32_t right_context = 0, + int32_t chunk_size = 16, bool use_bbpe = false, + float temperature = 1.0, + const FeatureConfig &feat_config = {}, + const EndpointConfig &endpoint_config = {}, + const FastBeamSearchConfig &fast_beam_search_config = {}) + -> std::unique_ptr { + auto ans = std::make_unique(); + + ans->feat_config = feat_config; + ans->endpoint_config = endpoint_config; + ans->fast_beam_search_config = fast_beam_search_config; + ans->nn_model = nn_model; + ans->tokens = tokens; + ans->encoder_model = encoder_model; + ans->decoder_model = decoder_model; + ans->joiner_model = joiner_model; + ans->use_gpu = use_gpu; + ans->use_endpoint = use_endpoint; + ans->decoding_method = decoding_method; + ans->num_active_paths = num_active_paths; + ans->context_score = context_score; + ans->left_context = left_context; + ans->right_context = right_context; + ans->chunk_size = chunk_size; + ans->use_bbpe = use_bbpe; + ans->temperature = temperature; + return ans; + }), + py::arg("nn_model"), py::arg("tokens"), + py::arg("encoder_model") = "", py::arg("decoder_model") = "", + py::arg("joiner_model") = "", py::arg("use_gpu") = false, + py::arg("use_endpoint") = false, + py::arg("decoding_method") = "greedy_search", + py::arg("num_active_paths") = 4, py::arg("context_score") = 1.5, + py::arg("left_context") = 64, py::arg("right_context") = 0, + py::arg("chunk_size") = 16, py::arg("use_bbpe") = false, + py::arg("temperature") = 1.0, + py::arg("feat_config") = FeatureConfig(), + py::arg("endpoint_config") = EndpointConfig(), + py::arg("fast_beam_search_config") = FastBeamSearchConfig()) + + .def_readwrite("feat_config", &PyClass::feat_config) + .def_readwrite("endpoint_config", &PyClass::endpoint_config) + .def_readwrite("fast_beam_search_config", + &PyClass::fast_beam_search_config) + .def_readwrite("nn_model", &PyClass::nn_model) + .def_readwrite("tokens", &PyClass::tokens) + .def_readwrite("encoder_model", &PyClass::encoder_model) + .def_readwrite("decoder_model", &PyClass::decoder_model) + .def_readwrite("joiner_model", &PyClass::joiner_model) + .def_readwrite("use_gpu", &PyClass::use_gpu) + .def_readwrite("use_endpoint", &PyClass::use_endpoint) + .def_readwrite("decoding_method", &PyClass::decoding_method) + .def_readwrite("num_active_paths", &PyClass::num_active_paths) + .def_readwrite("context_score", &PyClass::context_score) + .def_readwrite("left_context", &PyClass::left_context) + 
.def_readwrite("right_context", &PyClass::right_context) + .def_readwrite("chunk_size", &PyClass::chunk_size) + .def_readwrite("use_bbpe", &PyClass::use_bbpe) + .def_readwrite("temperature", &PyClass::temperature) + .def("validate", &PyClass::Validate) + .def("__str__", + [](const PyClass &self) -> std::string { return self.ToString(); }); +} + +void PybindOnlineRecognizer(py::module &m) { // NOLINT + PybindOnlineRecognizerConfig(m); + using PyClass = OnlineRecognizer; + py::class_(m, "OnlineRecognizer") + .def(py::init(), py::arg("config")) + .def( + "create_stream", [](PyClass &self) { return self.CreateStream(); }, + py::call_guard()) + .def( + "create_stream", + [](PyClass &self, + const std::vector> &contexts_list) { + return self.CreateStream(contexts_list); + }, + py::arg("contexts_list"), py::call_guard()) + .def("is_ready", &PyClass::IsReady, py::arg("s"), + py::call_guard()) + .def("is_endpoint", &PyClass::IsEndpoint, py::arg("s"), + py::call_guard()) + .def("decode_stream", &PyClass::DecodeStream, py::arg("s"), + py::call_guard()) + .def( + "decode_streams", + [](PyClass &self, std::vector &ss) { + self.DecodeStreams(ss.data(), ss.size()); + }, + py::arg("ss"), py::call_guard()) + .def("get_result", &PyClass::GetResult, py::arg("s"), + py::call_guard()) + .def_property_readonly("config", &PyClass::GetConfig, + py::call_guard()); +} + +} // namespace sherpa diff --git a/sherpa/python/csrc/online-recognizer.h b/sherpa/python/csrc/online-recognizer.h new file mode 100644 index 000000000..3e2b25282 --- /dev/null +++ b/sherpa/python/csrc/online-recognizer.h @@ -0,0 +1,15 @@ +// sherpa/python/csrc/online-recognizer.h +// +// Copyright (c) 2022 Xiaomi Corporation +#ifndef SHERPA_PYTHON_CSRC_ONLINE_RECOGNIZER_H_ +#define SHERPA_PYTHON_CSRC_ONLINE_RECOGNIZER_H_ + +#include "sherpa/python/csrc/sherpa.h" + +namespace sherpa { + +void PybindOnlineRecognizer(py::module &m); // NOLINT + +} + +#endif // SHERPA_PYTHON_CSRC_ONLINE_RECOGNIZER_H_ diff --git a/sherpa/python/csrc/online-stream.cc b/sherpa/python/csrc/online-stream.cc new file mode 100644 index 000000000..41ca433c3 --- /dev/null +++ b/sherpa/python/csrc/online-stream.cc @@ -0,0 +1,51 @@ +// sherpa/python/csrc/online-stream.cc +// +// Copyright (c) 2022 Xiaomi Corporation +#include "sherpa/cpp_api/online-stream.h" + +#include + +#include "sherpa/python/csrc/online-stream.h" +#include "torch/torch.h" + +namespace sherpa { + +static void PybindOnlineRecognitionResult(py::module &m) { // NOLINT + using PyClass = OnlineRecognitionResult; + py::class_(m, "OnlineRecognitionResult") + .def_property_readonly( + "text", [](const PyClass &self) { return self.text; }, + py::call_guard()) + .def_property_readonly( + "tokens", [](const PyClass &self) { return self.tokens; }, + py::call_guard()) + .def_property_readonly( + "timestamps", [](const PyClass &self) { return self.timestamps; }, + py::call_guard()) + .def_property_readonly( + "segment", [](const PyClass &self) { return self.segment; }, + py::call_guard()) + .def_property_readonly( + "start_time", [](const PyClass &self) { return self.start_time; }, + py::call_guard()) + .def_property_readonly( + "is_final", [](const PyClass &self) { return self.is_final; }, + py::call_guard()) + .def("__str__", &PyClass::AsJsonString, + py::call_guard()) + .def("as_json_string", &PyClass::AsJsonString, + py::call_guard()); +} + +void PybindOnlineStream(py::module &m) { // NOLINT + PybindOnlineRecognitionResult(m); + using PyClass = OnlineStream; + py::class_(m, "OnlineStream") + .def("accept_waveform", 
&PyClass::AcceptWaveform, + py::arg("sampling_rate"), py::arg("waveform"), + py::call_guard()) + .def("input_finished", &PyClass::InputFinished, + py::call_guard()); +} + +} // namespace sherpa diff --git a/sherpa/python/csrc/online-stream.h b/sherpa/python/csrc/online-stream.h new file mode 100644 index 000000000..95968337d --- /dev/null +++ b/sherpa/python/csrc/online-stream.h @@ -0,0 +1,15 @@ +// sherpa/python/csrc/online-stream.h +// +// Copyright (c) 2022 Xiaomi Corporation +#ifndef SHERPA_PYTHON_CSRC_ONLINE_STREAM_H_ +#define SHERPA_PYTHON_CSRC_ONLINE_STREAM_H_ + +#include "sherpa/python/csrc/sherpa.h" + +namespace sherpa { + +void PybindOnlineStream(py::module &m); // NOLINT + +} + +#endif // SHERPA_PYTHON_CSRC_ONLINE_STREAM_H_ diff --git a/sherpa/python/csrc/resample.cc b/sherpa/python/csrc/resample.cc new file mode 100644 index 000000000..8f6299b4b --- /dev/null +++ b/sherpa/python/csrc/resample.cc @@ -0,0 +1,36 @@ +// sherpa/python/csrc/resample.cc +// +// Copyright (c) 2022 Xiaomi Corporation +#include "sherpa/csrc/resample.h" + +#include +#include + +#include "sherpa/python/csrc/resample.h" +#include "sherpa/python/csrc/sherpa.h" +#include "torch/torch.h" + +namespace sherpa { + +void PybindResample(py::module &m) { // NOLINT + using PyClass = LinearResample; + py::class_(m, "LinearResample") + .def(py::init([](int32_t samp_rate_in_hz, int32_t samp_rate_out_hz) { + float min_freq = std::min(samp_rate_in_hz, samp_rate_out_hz); + float lowpass_cutoff = 0.99 * 0.5 * min_freq; + + int32_t lowpass_filter_width = 6; + return std::make_unique( + samp_rate_in_hz, samp_rate_out_hz, lowpass_cutoff, + lowpass_filter_width); + }), + py::arg("samp_rate_in_hz"), py::arg("samp_rate_out_hz")) + .def("reset", &PyClass::Reset) + .def("resample", &PyClass::Resample, py::arg("input"), py::arg("flush")) + .def_property_readonly("input_sample_rate", + &PyClass::GetInputSamplingRate) + .def_property_readonly("output_sample_rate", + &PyClass::GetOutputSamplingRate); +} + +} // namespace sherpa diff --git a/sherpa/python/csrc/resample.h b/sherpa/python/csrc/resample.h new file mode 100644 index 000000000..fb6898d62 --- /dev/null +++ b/sherpa/python/csrc/resample.h @@ -0,0 +1,15 @@ +// sherpa/python/csrc/resample.h +// +// Copyright (c) 2023 Xiaomi Corporation +#ifndef SHERPA_PYTHON_CSRC_RESAMPLE_H_ +#define SHERPA_PYTHON_CSRC_RESAMPLE_H_ + +#include "sherpa/python/csrc/sherpa.h" + +namespace sherpa { + +void PybindResample(py::module &m); // NOLINT + +} + +#endif // SHERPA_PYTHON_CSRC_RESAMPLE_H_ diff --git a/sherpa/python/csrc/rnnt_beam_search.cc b/sherpa/python/csrc/rnnt_beam_search.cc deleted file mode 100644 index a873a4dc3..000000000 --- a/sherpa/python/csrc/rnnt_beam_search.cc +++ /dev/null @@ -1,161 +0,0 @@ -/** - * Copyright (c) 2022 Xiaomi Corporation (authors: Fangjun Kuang) - * - * See LICENSE for clarification regarding multiple authors - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
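The LinearResample binding added above can be exercised as follows; this is a sketch that assumes `resample()` returns the resampled tensor, matching the C++ `Resample()` it wraps:

```python
import torch
import sherpa

# Convert 8 kHz audio to the 16 kHz expected by most of the pretrained models.
resampler = sherpa.LinearResample(samp_rate_in_hz=8000, samp_rate_out_hz=16000)

samples_8k = torch.rand(8000) * 2 - 1                      # 1 s of fake audio in [-1, 1]
samples_16k = resampler.resample(samples_8k, flush=True)   # flush=True: no more input follows

print(resampler.input_sample_rate, resampler.output_sample_rate, samples_16k.shape)

resampler.reset()  # reuse the same object for an unrelated utterance
```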
- */ -#include "sherpa/python/csrc/rnnt_beam_search.h" - -#include -#include -#include - -#include "sherpa/csrc/rnnt_beam_search.h" -#include "torch/torch.h" - -namespace sherpa { - -static constexpr const char *kGreedySearchDoc = R"doc( -RNN-T greedy search decoding by limiting the max symbol per frame to one. - -Note: - It is for offline decoding. See also :func:`streaming_greedy_search` which - is for streaming decoding. - -Args: - model: - The RNN-T model. It can be an instance of its subclass, such as - :class:`RnntConformerModel`. - encoder_out: - Output from the encoder network. Its shape is - ``(batch_size, T, encoder_out_dim)`` and its dtype is ``torch::kFloat``. - It should be on the same device as ``model``. - encoder_out_lens: - A 1-D tensor containing the valid frames before padding in ``encoder_out``. - Its dtype is ``torch.kLong`` and its shape is ``(batch_size,)``. Also, - it must be on CPU. -Returns: - Return A list-of-list of token IDs containing the decoded results. The - returned vector has size ``batch_size`` and each entry contains the - decoded results for the corresponding input in ``encoder_out``. -)doc"; - -static constexpr const char *kStreamingGreedySearchDoc = R"doc( -RNN-T greedy search for streaming recognition. - -Args: - model: - The RNN-T model. It can be an instance of its subclass, such as - :class:`RnntEmformerModel`. - encoder_out: - Output from the encoder network. Its shape is - ``(batch_size, T, encoder_out_dim)`` and its dtype is ``torch::kFloat``. - It should be on the same device as ``model``. - decoder_out: - Output from the decoder network. Its shape is - ``(batch_size, decoder_out_dim)`` and its dtype is ``torch::kFloat``. - It should be on the same device as ``model``. - hyps: - The decoded tokens from the previous chunk. - num_trailing_blank_frames: - Number of trailing blank frames decoded so far. - -Returns: - Return a tuple containing: - - The decoder output for the current chunk. - - The decoded tokens for the current chunk. -)doc"; - -static constexpr const char *kModifiedBeamSearchDoc = R"doc( -RNN-T modified beam search for offline recognition. - -By modified we mean that the maximum symbol per frame is limited to 1. - -Args: - model: - The RNN-T model. It can be an instance of its subclass, such as - :class:`RnntConformerModel`. - encoder_out: - Output from the encoder network. Its shape is - ``(batch_size, T, encoder_out_dim)`` and its dtype is ``torch::kFloat``. - It should be on the same device as ``model``. - encoder_out_lens: - A 1-D tensor containing the valid frames before padding in ``encoder_out``. - Its dtype is ``torch.kLong`` and its shape is ``(batch_size,)``. Also, - it must be on CPU. - num_active_paths - Number of active paths for each utterance. Note: Due to merging paths with - identical token sequences, the actual number of active path for each - utterance may be smaller than this value. -Returns: - Return A list-of-list of token IDs containing the decoded results. The - returned vector has size ``batch_size`` and each entry contains the - decoded results for the corresponding input in ``encoder_out``. -)doc"; - -static constexpr const char *kStreamingModifiedBeamSearchDoc = R"doc( -RNN-T modified beam search for streaming recognition. - -Args: - model: - The RNN-T model. It can be an instance of its subclass, such as - :class:`RnntConformerModel` and :class:`RnntConformerModel`. - encoder_out: - Output from the encoder network. Its shape is - ``(batch_size, T, encoder_out_dim)`` and its dtype is ``torch::kFloat``. 
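The streaming helpers deleted here are superseded by the OnlineRecognizer and OnlineStream bindings added above; a sketch of the replacement streaming loop follows. The model and tokens paths are placeholders, and `get_result()` is assumed to return the OnlineRecognitionResult bound above:

```python
import torch
import sherpa

config = sherpa.OnlineRecognizerConfig(
    nn_model="./exp/cpu_jit.pt",              # placeholder streaming transducer model
    tokens="./data/lang_bpe_500/tokens.txt",  # placeholder path
    use_gpu=False,
    use_endpoint=False,
    decoding_method="greedy_search",
)
recognizer = sherpa.OnlineRecognizer(config)

stream = recognizer.create_stream()

# Feed audio chunk by chunk; here we fake 5 seconds of 16 kHz audio.
samples = torch.rand(5 * 16000) * 2 - 1
for chunk in samples.split(1600):  # 0.1 s per chunk
    stream.accept_waveform(sampling_rate=16000, waveform=chunk)
    while recognizer.is_ready(stream):
        recognizer.decode_stream(stream)
    print(recognizer.get_result(stream).text, end="\r")

stream.input_finished()
while recognizer.is_ready(stream):
    recognizer.decode_stream(stream)
print(recognizer.get_result(stream).text)
```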
- It should be on the same device as ``model``. - hyps: - Decoded results from the previous chunk. - num_active_paths - Number of active paths for each utterance. Note: Due to merging paths with - identical token sequences, the actual number of active path for each - utterance may be smaller than this value. - -Returns: - Decoded results for the next chunk. -)doc"; - -void PybindRnntBeamSearch(py::module &m) { // NOLINT - m.def("greedy_search", &GreedySearch, py::arg("model"), - py::arg("encoder_out"), py::arg("encoder_out_length"), - py::call_guard(), kGreedySearchDoc); - - m.def( - "streaming_greedy_search", - [](RnntModel &model, torch::Tensor encoder_out, torch::Tensor decoder_out, - std::vector> &hyps, - std::vector &num_trailing_blank_frames) - -> std::tuple>, - std::vector> { - decoder_out = StreamingGreedySearch(model, encoder_out, decoder_out, - &hyps, &num_trailing_blank_frames); - return {decoder_out, hyps, num_trailing_blank_frames}; - }, - py::arg("model"), py::arg("encoder_out"), py::arg("decoder_out"), - py::arg("hyps"), py::arg("num_trailing_blank_frames"), - py::call_guard(), kStreamingGreedySearchDoc); - - m.def("modified_beam_search", &ModifiedBeamSearch, py::arg("model"), - py::arg("encoder_out"), py::arg("encoder_out_length"), - py::arg("num_active_paths") = 4, - py::call_guard(), kModifiedBeamSearchDoc); - - m.def("streaming_modified_beam_search", &StreamingModifiedBeamSearch, - py::arg("model"), py::arg("encoder_out"), py::arg("hyps"), - py::arg("num_active_paths") = 4, - py::call_guard(), - kStreamingModifiedBeamSearchDoc); -} - -} // namespace sherpa diff --git a/sherpa/python/csrc/rnnt_conformer_model.cc b/sherpa/python/csrc/rnnt_conformer_model.cc deleted file mode 100644 index 1c48a471c..000000000 --- a/sherpa/python/csrc/rnnt_conformer_model.cc +++ /dev/null @@ -1,56 +0,0 @@ -/** - * Copyright (c) 2022 Xiaomi Corporation (authors: Fangjun Kuang, - * Wei Kang) - * - * See LICENSE for clarification regarding multiple authors - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "sherpa/python/csrc/rnnt_conformer_model.h" - -#include -#include - -#include "sherpa/csrc/rnnt_conformer_model.h" -#include "sherpa/csrc/rnnt_model.h" -#include "torch/torch.h" - -namespace sherpa { - -void PybindRnntConformerModel(py::module &m) { // NOLINT - using PyClass = RnntConformerModel; - py::class_(m, "RnntConformerModel") - .def(py::init([](const std::string &filename, - py::object device = py::str("cpu"), - bool optimize_for_inference = - false) -> std::unique_ptr { - std::string device_str = - device.is_none() ? 
"cpu" : py::str(device); - return std::make_unique( - filename, torch::Device(device_str), optimize_for_inference); - }), - py::arg("filename"), py::arg("device") = py::str("cpu"), - py::arg("optimize_for_inference") = false) - .def("encoder", &PyClass::ForwardEncoder, py::arg("features"), - py::arg("features_length"), py::call_guard()) - .def("encoder_streaming_forward", &PyClass::StreamingForwardEncoder, - py::arg("features"), py::arg("features_length"), py::arg("states"), - py::arg("processed_frames"), py::arg("left_context"), - py::arg("right_context"), py::call_guard()) - .def("get_encoder_init_states", &PyClass::GetEncoderInitStates, - py::arg("left_context"), py::call_guard()) - .def_property_readonly("subsampling_factor", &PyClass::SubSamplingFactor); -} - -} // namespace sherpa diff --git a/sherpa/python/csrc/rnnt_conformer_model.h b/sherpa/python/csrc/rnnt_conformer_model.h deleted file mode 100644 index c812f7dd5..000000000 --- a/sherpa/python/csrc/rnnt_conformer_model.h +++ /dev/null @@ -1,30 +0,0 @@ -/** - * Copyright (c) 2022 Xiaomi Corporation (authors: Fangjun Kuang, - * Wei Kang) - * - * See LICENSE for clarification regarding multiple authors - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#ifndef SHERPA_PYTHON_CSRC_RNNT_CONFORMER_MODEL_H_ -#define SHERPA_PYTHON_CSRC_RNNT_CONFORMER_MODEL_H_ - -#include "sherpa/python/csrc/sherpa.h" - -namespace sherpa { - -void PybindRnntConformerModel(py::module &m); // NOLINT - -} // namespace sherpa - -#endif // SHERPA_PYTHON_CSRC_RNNT_CONFORMER_MODEL_H_ diff --git a/sherpa/python/csrc/rnnt_conv_emformer_model.cc b/sherpa/python/csrc/rnnt_conv_emformer_model.cc deleted file mode 100644 index 4fc101235..000000000 --- a/sherpa/python/csrc/rnnt_conv_emformer_model.cc +++ /dev/null @@ -1,57 +0,0 @@ -/** - * Copyright (c) 2022 Xiaomi Corporation (authors: Fangjun Kuang, - * Wei Kang) - * - * See LICENSE for clarification regarding multiple authors - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "sherpa/python/csrc/rnnt_conv_emformer_model.h" - -#include -#include - -#include "sherpa/csrc/rnnt_conv_emformer_model.h" -#include "sherpa/csrc/rnnt_model.h" -#include "torch/torch.h" - -namespace sherpa { - -void PybindRnntConvEmformerModel(py::module &m) { // NOLINT - using PyClass = RnntConvEmformerModel; - py::class_(m, "RnntConvEmformerModel") - .def(py::init([](const std::string &filename, - py::object device = py::str("cpu"), - bool optimize_for_inference = - false) -> std::unique_ptr { - std::string device_str = - device.is_none() ? "cpu" : py::str(device); - return std::make_unique( - filename, torch::Device(device_str), optimize_for_inference); - }), - py::arg("filename"), py::arg("device") = py::str("cpu"), - py::arg("optimize_for_inference") = false) - .def("encoder_streaming_forward", &PyClass::StreamingForwardEncoder, - py::arg("features"), py::arg("features_length"), - py::arg("num_processed_frames"), py::arg("states"), - py::call_guard()) - .def("get_encoder_init_states", &PyClass::GetEncoderInitStates, - py::call_guard()) - .def_property_readonly("chunk_length", &PyClass::ChunkLength) - .def_property_readonly("right_context_length", - &PyClass::RightContextLength) - .def_property_readonly("pad_length", &PyClass::PadLength); -} - -} // namespace sherpa diff --git a/sherpa/python/csrc/rnnt_conv_emformer_model.h b/sherpa/python/csrc/rnnt_conv_emformer_model.h deleted file mode 100644 index 9128d9b46..000000000 --- a/sherpa/python/csrc/rnnt_conv_emformer_model.h +++ /dev/null @@ -1,30 +0,0 @@ -/** - * Copyright (c) 2022 Xiaomi Corporation (authors: Fangjun Kuang, - * Wei Kang) - * - * See LICENSE for clarification regarding multiple authors - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#ifndef SHERPA_PYTHON_CSRC_RNNT_CONV_EMFORMER_MODEL_H_ -#define SHERPA_PYTHON_CSRC_RNNT_CONV_EMFORMER_MODEL_H_ - -#include "sherpa/python/csrc/sherpa.h" - -namespace sherpa { - -void PybindRnntConvEmformerModel(py::module &m); // NOLINT - -} // namespace sherpa - -#endif // SHERPA_PYTHON_CSRC_RNNT_CONV_EMFORMER_MODEL_H_ diff --git a/sherpa/python/csrc/rnnt_emformer_model.cc b/sherpa/python/csrc/rnnt_emformer_model.cc deleted file mode 100644 index b4099f169..000000000 --- a/sherpa/python/csrc/rnnt_emformer_model.cc +++ /dev/null @@ -1,57 +0,0 @@ -/** - * Copyright (c) 2022 Xiaomi Corporation (authors: Fangjun Kuang, - * Wei Kang) - * - * See LICENSE for clarification regarding multiple authors - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "sherpa/python/csrc/rnnt_emformer_model.h" - -#include -#include - -#include "sherpa/csrc/rnnt_emformer_model.h" -#include "sherpa/csrc/rnnt_model.h" -#include "torch/torch.h" - -namespace sherpa { - -void PybindRnntEmformerModel(py::module &m) { // NOLINT - using PyClass = RnntEmformerModel; - py::class_(m, "RnntEmformerModel") - .def(py::init([](const std::string &filename, - py::object device = py::str("cpu"), - bool optimize_for_inference = - false) -> std::unique_ptr { - std::string device_str = - device.is_none() ? "cpu" : py::str(device); - return std::make_unique( - filename, torch::Device(device_str), optimize_for_inference); - }), - py::arg("filename"), py::arg("device") = py::str("cpu"), - py::arg("optimize_for_inference") = false) - .def("encoder_streaming_forward", &PyClass::StreamingForwardEncoder, - py::arg("features"), py::arg("features_length"), - py::arg("states") = py::none(), - py::call_guard()) - .def("get_encoder_init_states", &PyClass::GetEncoderInitStates, - py::call_guard()) - .def_property_readonly("segment_length", &PyClass::SegmentLength) - .def_property_readonly("vocab_size", &PyClass::VocabSize) - .def_property_readonly("right_context_length", - &PyClass::RightContextLength); -} - -} // namespace sherpa diff --git a/sherpa/python/csrc/rnnt_emformer_model.h b/sherpa/python/csrc/rnnt_emformer_model.h deleted file mode 100644 index f2201f987..000000000 --- a/sherpa/python/csrc/rnnt_emformer_model.h +++ /dev/null @@ -1,30 +0,0 @@ -/** - * Copyright (c) 2022 Xiaomi Corporation (authors: Fangjun Kuang, - * Wei Kang) - * - * See LICENSE for clarification regarding multiple authors - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#ifndef SHERPA_PYTHON_CSRC_RNNT_EMFORMER_MODEL_H_ -#define SHERPA_PYTHON_CSRC_RNNT_EMFORMER_MODEL_H_ - -#include "sherpa/python/csrc/sherpa.h" - -namespace sherpa { - -void PybindRnntEmformerModel(py::module &m); // NOLINT - -} // namespace sherpa - -#endif // SHERPA_PYTHON_CSRC_RNNT_EMFORMER_MODEL_H_ diff --git a/sherpa/python/csrc/rnnt_model.cc b/sherpa/python/csrc/rnnt_model.cc deleted file mode 100644 index 19912d429..000000000 --- a/sherpa/python/csrc/rnnt_model.cc +++ /dev/null @@ -1,55 +0,0 @@ -/** - * Copyright (c) 2022 Xiaomi Corporation (authors: Fangjun Kuang, - * Wei Kang) - * - * See LICENSE for clarification regarding multiple authors - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "sherpa/python/csrc/rnnt_model.h" - -#include -#include - -#include "sherpa/csrc/rnnt_model.h" -#include "torch/torch.h" - -namespace sherpa { - -void PybindRnntModel(py::module &m) { // NOLINT - using PyClass = RnntModel; - py::class_(m, "RnntModel") - .def("decoder_forward", &PyClass::ForwardDecoder, - py::arg("decoder_input"), py::call_guard()) - .def("joiner_forward", &PyClass::ForwardJoiner, - py::arg("projected_encoder_out"), py::arg("projected_decoder_out"), - py::call_guard()) - .def("forward_decoder_proj", &PyClass::ForwardDecoderProj, - py::arg("decoder_out"), py::call_guard()) - .def("forward_encoder_proj", &PyClass::ForwardEncoderProj, - py::arg("encoder_out"), py::call_guard()) - .def_property_readonly("device", - [](const PyClass &self) -> py::object { - py::object ans = - py::module_::import("torch").attr("device"); - return ans(self.Device().str()); - }) - .def_property_readonly("blank_id", &PyClass::BlankId) - .def_property_readonly("unk_id", &PyClass::UnkId) - .def_property_readonly("vocab_size", &PyClass::VocabSize) - .def_property_readonly("context_size", &PyClass::ContextSize) - .def_property_readonly("subsampling_factor", &PyClass::SubsamplingFactor); -} - -} // namespace sherpa diff --git a/sherpa/python/csrc/rnnt_model.h b/sherpa/python/csrc/rnnt_model.h deleted file mode 100644 index 21907e155..000000000 --- a/sherpa/python/csrc/rnnt_model.h +++ /dev/null @@ -1,30 +0,0 @@ -/** - * Copyright (c) 2022 Xiaomi Corporation (authors: Fangjun Kuang, - * Wei Kang) - * - * See LICENSE for clarification regarding multiple authors - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -#ifndef SHERPA_PYTHON_CSRC_RNNT_MODEL_H_ -#define SHERPA_PYTHON_CSRC_RNNT_MODEL_H_ - -#include "sherpa/python/csrc/sherpa.h" - -namespace sherpa { - -void PybindRnntModel(py::module &m); // NOLINT - -} // namespace sherpa - -#endif // SHERPA_PYTHON_CSRC_RNNT_MODEL_H_ diff --git a/sherpa/python/csrc/sherpa.cc b/sherpa/python/csrc/sherpa.cc index 70aeb2bd8..4f083c52a 100644 --- a/sherpa/python/csrc/sherpa.cc +++ b/sherpa/python/csrc/sherpa.cc @@ -21,25 +21,40 @@ #include #include "sherpa/csrc/version.h" -#include "sherpa/python/csrc/hypothesis.h" -#include "sherpa/python/csrc/rnnt_beam_search.h" -#include "sherpa/python/csrc/rnnt_conformer_model.h" -#include "sherpa/python/csrc/rnnt_conv_emformer_model.h" -#include "sherpa/python/csrc/rnnt_emformer_model.h" -#include "sherpa/python/csrc/rnnt_model.h" +#include "sherpa/python/csrc/endpoint.h" +#include "sherpa/python/csrc/fast-beam-search-config.h" +#include "sherpa/python/csrc/feature-config.h" +#include "sherpa/python/csrc/offline-ctc-model.h" +#include "sherpa/python/csrc/offline-recognizer.h" +#include "sherpa/python/csrc/offline-stream.h" +#include "sherpa/python/csrc/online-recognizer.h" +#include "sherpa/python/csrc/online-stream.h" +#include "sherpa/python/csrc/resample.h" +#include "sherpa/python/csrc/speaker-embedding-extractor.h" +#include "sherpa/python/csrc/voice-activity-detector.h" namespace sherpa { PYBIND11_MODULE(_sherpa, m) { m.doc() = "pybind11 binding of sherpa"; m.attr("cxx_flags") = std::string(kCMakeCxxFlags); + auto torch = py::module::import("torch"); + auto kaldifeat = py::module::import("_kaldifeat"); + (void)kaldifeat.attr("FbankOptions"); - PybindHypothesis(m); - PybindRnntModel(m); - PybindRnntConformerModel(m); - PybindRnntConvEmformerModel(m); - PybindRnntEmformerModel(m); - PybindRnntBeamSearch(m); + PybindResample(m); + + PybindFeatureConfig(m); + PybindFastBeamSearch(m); + PybindOfflineCtcModel(m); + PybindOfflineStream(m); + PybindOfflineRecognizer(m); + PybindEndpoint(m); + PybindOnlineStream(m); + PybindOnlineRecognizer(m); + PybindVoiceActivityDetector(&m); + + PybindSpeakerEmbeddingExtractor(&m); } } // namespace sherpa diff --git a/sherpa/python/csrc/sherpa.h b/sherpa/python/csrc/sherpa.h index 74171da4a..3e9d923ca 100644 --- a/sherpa/python/csrc/sherpa.h +++ b/sherpa/python/csrc/sherpa.h @@ -20,6 +20,7 @@ #include "pybind11/pybind11.h" #include "pybind11/stl.h" +#include "torch/torch.h" namespace py = pybind11; diff --git a/sherpa/python/csrc/silero-vad-model-config.cc b/sherpa/python/csrc/silero-vad-model-config.cc new file mode 100644 index 000000000..1ab56edbe --- /dev/null +++ b/sherpa/python/csrc/silero-vad-model-config.cc @@ -0,0 +1,28 @@ +// sherpa/python/csrc/silero-vad-model-config.cc +// +// Copyright (c) 2025 Xiaomi Corporation + +#include "sherpa/python/csrc/silero-vad-model-config.h" + +#include + +#include "sherpa/csrc/silero-vad-model-config.h" + +namespace sherpa { + +void PybindSileroVadModelConfig(py::module *m) { + using PyClass = SileroVadModelConfig; + py::class_(*m, "SileroVadModelConfig") + .def(py::init(), + py::arg("model") = "", py::arg("threshold") = 0.5, + py::arg("min_silence_duration") = 0.5, + py::arg("min_speech_duration") = 0.25) + .def_readwrite("model", &PyClass::model) + .def_readwrite("threshold", &PyClass::threshold) + .def_readwrite("min_silence_duration", &PyClass::min_silence_duration) + .def_readwrite("min_speech_duration", &PyClass::min_speech_duration) + .def("validate", &PyClass::Validate) + .def("__str__", &PyClass::ToString); +} + +} // namespace 
sherpa diff --git a/sherpa/python/csrc/silero-vad-model-config.h b/sherpa/python/csrc/silero-vad-model-config.h new file mode 100644 index 000000000..a1e1bcc6c --- /dev/null +++ b/sherpa/python/csrc/silero-vad-model-config.h @@ -0,0 +1,15 @@ +// sherpa/python/csrc/silero-vad-model-config.h +// +// Copyright (c) 2025 Xiaomi Corporation +#ifndef SHERPA_PYTHON_CSRC_SILERO_VAD_MODEL_CONFIG_H_ +#define SHERPA_PYTHON_CSRC_SILERO_VAD_MODEL_CONFIG_H_ + +#include "sherpa/python/csrc/sherpa.h" + +namespace sherpa { + +void PybindSileroVadModelConfig(py::module *m); + +} + +#endif // SHERPA_PYTHON_CSRC_SILERO_VAD_MODEL_CONFIG_H_ diff --git a/sherpa/python/csrc/speaker-embedding-extractor.cc b/sherpa/python/csrc/speaker-embedding-extractor.cc new file mode 100644 index 000000000..515eae3ab --- /dev/null +++ b/sherpa/python/csrc/speaker-embedding-extractor.cc @@ -0,0 +1,49 @@ +// sherpa/python/csrc/speaker-embedding-extractor.cc +// +// Copyright (c) 2025 Xiaomi Corporation + +#include "sherpa/python/csrc/speaker-embedding-extractor.h" + +#include +#include + +#include "sherpa/csrc/speaker-embedding-extractor.h" + +namespace sherpa { + +static void PybindSpeakerEmbeddingExtractorConfig(py::module *m) { + using PyClass = SpeakerEmbeddingExtractorConfig; + py::class_(*m, "SpeakerEmbeddingExtractorConfig") + .def(py::init<>()) + .def(py::init(), py::arg("model"), + py::arg("use_gpu") = false, py::arg("debug") = false) + .def_readwrite("model", &PyClass::model) + .def_readwrite("use_gpu", &PyClass::use_gpu) + .def_readwrite("debug", &PyClass::debug) + .def("validate", &PyClass::Validate) + .def("__str__", &PyClass::ToString); +} + +void PybindSpeakerEmbeddingExtractor(py::module *m) { + PybindSpeakerEmbeddingExtractorConfig(m); + + using PyClass = SpeakerEmbeddingExtractor; + py::class_(*m, "SpeakerEmbeddingExtractor") + .def(py::init(), + py::arg("config"), py::call_guard()) + .def_property_readonly("dim", &PyClass::Dim) + .def("create_stream", &PyClass::CreateStream, + py::call_guard()) + .def( + "compute", + [](PyClass &self, OfflineStream *s) { return self.Compute(s); }, + py::arg("s"), py::call_guard()) + .def( + "compute", + [](PyClass &self, std::vector &ss) { + return self.Compute(ss.data(), ss.size()); + }, + py::arg("ss"), py::call_guard()); +} + +} // namespace sherpa diff --git a/sherpa/python/csrc/speaker-embedding-extractor.h b/sherpa/python/csrc/speaker-embedding-extractor.h new file mode 100644 index 000000000..d72123ed0 --- /dev/null +++ b/sherpa/python/csrc/speaker-embedding-extractor.h @@ -0,0 +1,16 @@ +// sherpa/python/csrc/speaker-embedding-extractor.h +// +// Copyright (c) 2025 Xiaomi Corporation + +#ifndef SHERPA_PYTHON_CSRC_SPEAKER_EMBEDDING_EXTRACTOR_H_ +#define SHERPA_PYTHON_CSRC_SPEAKER_EMBEDDING_EXTRACTOR_H_ + +#include "sherpa/python/csrc/sherpa.h" + +namespace sherpa { + +void PybindSpeakerEmbeddingExtractor(py::module *m); + +} + +#endif // SHERPA_PYTHON_CSRC_SPEAKER_EMBEDDING_EXTRACTOR_H_ diff --git a/sherpa/python/csrc/vad-model-config.cc b/sherpa/python/csrc/vad-model-config.cc new file mode 100644 index 000000000..acab78377 --- /dev/null +++ b/sherpa/python/csrc/vad-model-config.cc @@ -0,0 +1,27 @@ +// sherpa/python/csrc/vad-model-config.cc +// +// Copyright (c) 2025 Xiaomi Corporation +#include "sherpa/python/csrc/vad-model-config.h" + +#include "sherpa/csrc/vad-model-config.h" +#include "sherpa/python/csrc/silero-vad-model-config.h" + +namespace sherpa { + +void PybindVadModelConfig(py::module *m) { + PybindSileroVadModelConfig(m); + using PyClass = VadModelConfig; + + 
py::class_(*m, "VadModelConfig") + .def(py::init(), + py::arg("silero_vad"), py::arg("sample_rate") = 16000, + py::arg("use_gpu") = false, py::arg("debug") = false) + .def_readwrite("silero_vad", &PyClass::silero_vad) + .def_readwrite("sample_rate", &PyClass::sample_rate) + .def_readwrite("use_gpu", &PyClass::use_gpu) + .def_readwrite("debug", &PyClass::debug) + .def("validate", &PyClass::Validate) + .def("__str__", &PyClass::ToString); +} + +} // namespace sherpa diff --git a/sherpa/python/csrc/vad-model-config.h b/sherpa/python/csrc/vad-model-config.h new file mode 100644 index 000000000..467f7d614 --- /dev/null +++ b/sherpa/python/csrc/vad-model-config.h @@ -0,0 +1,15 @@ +// sherpa/python/csrc/vad-model-config.h +// +// Copyright (c) 2025 Xiaomi Corporation +#ifndef SHERPA_PYTHON_CSRC_VAD_MODEL_CONFIG_H_ +#define SHERPA_PYTHON_CSRC_VAD_MODEL_CONFIG_H_ + +#include "sherpa/python/csrc/sherpa.h" + +namespace sherpa { + +void PybindVadModelConfig(py::module *m); + +} + +#endif // SHERPA_PYTHON_CSRC_VAD_MODEL_CONFIG_H_ diff --git a/sherpa/python/csrc/voice-activity-detector-config.cc b/sherpa/python/csrc/voice-activity-detector-config.cc new file mode 100644 index 000000000..991ad4d45 --- /dev/null +++ b/sherpa/python/csrc/voice-activity-detector-config.cc @@ -0,0 +1,25 @@ +// sherpa/python/csrc/voice-activity-detector-config.cc +// +// Copyright (c) 2025 Xiaomi Corporation +#include "sherpa/python/csrc/voice-activity-detector-config.h" + +#include "sherpa/csrc/voice-activity-detector.h" +#include "sherpa/python/csrc/vad-model-config.h" + +namespace sherpa { + +void PybindVoiceActivityDetectorConfig(py::module *m) { + PybindVadModelConfig(m); + using PyClass = VoiceActivityDetectorConfig; + + py::class_(*m, "VoiceActivityDetectorConfig") + .def(py::init(), py::arg("model"), + py::arg("segment_size") = 10, py::arg("batch_size") = 2) + .def_readwrite("model", &PyClass::model) + .def_readwrite("segment_size", &PyClass::segment_size) + .def_readwrite("batch_size", &PyClass::batch_size) + .def("validate", &PyClass::Validate) + .def("__str__", &PyClass::ToString); +} + +} // namespace sherpa diff --git a/sherpa/python/csrc/voice-activity-detector-config.h b/sherpa/python/csrc/voice-activity-detector-config.h new file mode 100644 index 000000000..94f9ce74c --- /dev/null +++ b/sherpa/python/csrc/voice-activity-detector-config.h @@ -0,0 +1,15 @@ +// sherpa/python/csrc/voice-activity-detector-config.h +// +// Copyright (c) 2025 Xiaomi Corporation +#ifndef SHERPA_PYTHON_CSRC_VOICE_ACTIVITY_DETECTOR_CONFIG_H_ +#define SHERPA_PYTHON_CSRC_VOICE_ACTIVITY_DETECTOR_CONFIG_H_ + +#include "sherpa/python/csrc/sherpa.h" + +namespace sherpa { + +void PybindVoiceActivityDetectorConfig(py::module *m); + +} + +#endif // SHERPA_PYTHON_CSRC_VOICE_ACTIVITY_DETECTOR_CONFIG_H_ diff --git a/sherpa/python/csrc/voice-activity-detector.cc b/sherpa/python/csrc/voice-activity-detector.cc new file mode 100644 index 000000000..d65e9d7fa --- /dev/null +++ b/sherpa/python/csrc/voice-activity-detector.cc @@ -0,0 +1,42 @@ +// sherpa/python/csrc/voice-activity-detector.cc +// +// Copyright (c) 2025 Xiaomi Corporation +#include "sherpa/python/csrc/voice-activity-detector.h" + +#include + +#include "sherpa/csrc/voice-activity-detector.h" +#include "sherpa/python/csrc/voice-activity-detector-config.h" +#include "torch/torch.h" + +namespace sherpa { + +void PybindSpeechSegment(py::module *m) { + using PyClass = SpeechSegment; + py::class_(*m, "SpeechSegment") + .def_property_readonly("start", + [](const PyClass &self) { return 
self.start; }) + .def_property_readonly("end", + [](const PyClass &self) { return self.end; }) + .def("__str__", [](const PyClass &self) { + std::ostringstream os; + os << "SpeechSegment("; + os << std::fixed << std::setprecision(3) << self.start << ", "; + os << std::fixed << std::setprecision(3) << self.end << ")"; + return os.str(); + }); +} + +void PybindVoiceActivityDetector(py::module *m) { + PybindVoiceActivityDetectorConfig(m); + PybindSpeechSegment(m); + + using PyClass = VoiceActivityDetector; + py::class_(*m, "VoiceActivityDetector") + .def(py::init(), py::arg("config")) + .def_property_readonly("config", &PyClass::GetConfig) + .def("process", &PyClass::Process, py::arg("samples"), + py::call_guard()); +} + +} // namespace sherpa diff --git a/sherpa/python/csrc/voice-activity-detector.h b/sherpa/python/csrc/voice-activity-detector.h new file mode 100644 index 000000000..acf71334b --- /dev/null +++ b/sherpa/python/csrc/voice-activity-detector.h @@ -0,0 +1,15 @@ +// sherpa/python/csrc/voice-activity-detector.h +// +// Copyright (c) 2025 Xiaomi Corporation +#ifndef SHERPA_PYTHON_CSRC_VOICE_ACTIVITY_DETECTOR_H_ +#define SHERPA_PYTHON_CSRC_VOICE_ACTIVITY_DETECTOR_H_ + +#include "sherpa/python/csrc/sherpa.h" + +namespace sherpa { + +void PybindVoiceActivityDetector(py::module *m); + +} + +#endif // SHERPA_PYTHON_CSRC_VOICE_ACTIVITY_DETECTOR_H_ diff --git a/sherpa/python/sherpa/__init__.py b/sherpa/python/sherpa/__init__.py index 33b1081c7..13fb0c278 100644 --- a/sherpa/python/sherpa/__init__.py +++ b/sherpa/python/sherpa/__init__.py @@ -1,3 +1,4 @@ +import kaldifeat import torch from .torch_version import sherpa_torch_version @@ -9,33 +10,30 @@ ) from _sherpa import ( - Hypotheses, - Hypothesis, - RnntConformerModel, - RnntConvEmformerModel, - RnntEmformerModel, + EndpointConfig, + EndpointRule, + FastBeamSearchConfig, + FeatureConfig, + LinearResample, + OfflineCtcDecoderConfig, + OfflineModelConfig, + OfflineRecognizer, + OfflineRecognizerConfig, + OfflineSenseVoiceModelConfig, + OfflineStream, + OfflineWhisperModelConfig, + OnlineRecognitionResult, + OnlineRecognizer, + OnlineRecognizerConfig, + OnlineStream, + SileroVadModelConfig, + SpeakerEmbeddingExtractor, + SpeakerEmbeddingExtractorConfig, + VadModelConfig, + VoiceActivityDetector, + VoiceActivityDetectorConfig, cxx_flags, - greedy_search, - modified_beam_search, - streaming_greedy_search, - streaming_modified_beam_search, ) -from .decode import ( - VALID_FAST_BEAM_SEARCH_METHOD, - fast_beam_search_nbest, - fast_beam_search_nbest_LG, - fast_beam_search_one_best, -) -from .lexicon import Lexicon -from .nbest import Nbest -from .online_endpoint import ( - OnlineEndpointConfig, - add_online_endpoint_arguments, - endpoint_detected, -) -from .utils import ( - add_beam_search_arguments, - count_num_trailing_zeros, - get_texts_and_num_trailing_blanks, -) +from .http_server import HttpServer +from .utils import encode_contexts, setup_logger, str2bool diff --git a/sherpa/python/sherpa/__init__.pyi b/sherpa/python/sherpa/__init__.pyi new file mode 100644 index 000000000..57830b62d --- /dev/null +++ b/sherpa/python/sherpa/__init__.pyi @@ -0,0 +1,246 @@ +from dataclasses import dataclass +from typing import List, overload + +import kaldifeat +import torch + +class EndpointRule: + @overload + def __init__(self): ... + @overload + def __init__( + self, + must_contain_nonsilence=True, + min_trailing_silence=2.0, + min_utterance_length=0.0, + ): ... 
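A rough sketch of driving the new VAD API from Python, based on the bindings above and the exports added to __init__.py. The constructor arguments of SileroVadModelConfig do not appear in this hunk, so the model keyword below is an assumption; process() is assumed to accept a torch.Tensor of samples and to return a list of SpeechSegment whose start/end are in seconds.

import torch
import sherpa

vad_config = sherpa.VoiceActivityDetectorConfig(
    model=sherpa.VadModelConfig(
        # The silero_vad sub-config is assumed to take a path to the model file.
        silero_vad=sherpa.SileroVadModelConfig(model="/path/to/silero_vad.jit"),
        sample_rate=16000,
        use_gpu=False,
    ),
    segment_size=10,  # defaults taken from the binding above
    batch_size=2,
)
vad_config.validate()

vad = sherpa.VoiceActivityDetector(vad_config)
samples = torch.zeros(16000)  # one second of silence, just to illustrate the call
for segment in vad.process(samples):
    print(f"{segment.start:.3f} -- {segment.end:.3f}")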
+ + must_contain_nonsilence: bool + min_trailing_silence: float + min_utterance_length: float + +class EndpointConfig: + @overload + def __init__(self): ... + @overload + def __init__( + self, rule1=EndpointRule(), rule2=EndpointRule(), rule3=EndpointRule() + ): ... + + rule1: EndpointRule + rule2: EndpointRule + rule3: EndpointRule + +class FastBeamSearchConfig: + @overload + def __init__(self): ... + @overload + def __init__( + self, + lg="", + ngram_lm_scale=0.01, + beam=20.0, + max_states=64, + max_contexts=8, + allow_partial=False, + ): ... + + lg: str + ngram_lm_scale: float + beam: float + max_states: int + max_contexts: int + allow_partial: bool + +@dataclass +class FeatureConfig: + @overload + def __init__(self): ... + @overload + def __init__( + self, + fbank_opts=kaldifeat.FbankOptions(), + normalize_samples=True, + return_waveform=False, + nemo_normalize="", + ): ... + + fbank_opts: kaldifeat.FbankOptions + normalize_samples: bool + return_waveform: bool + nemo_normalize: str + +class Hypothesis: + timestamps: List + num_trailing_blanks: int + + @property + def key(self) -> str: ... + @property + def log_prob(self) -> float: ... + @property + def ys(self) -> List[int]: ... + +class Hypotheses: + def get_most_probable(self, length_norm: bool) -> Hypothesis: ... + +class LinearResample: + def reset(self) -> None: ... + def resample(self) -> torch.Tensor: ... + + input_sample_rate: int + output_sample_rate: int + +class OfflineCtcDecoderConfig: + @overload + def __init__(self): ... + @overload + def __init__( + self, + modified=True, + hlg="", + search_beam=20, + output_beam=8, + min_active_states=30, + max_active_states=10000, + lm_scale=1.0, + ): ... + + modified: bool + hlg: str + search_beam: float + output_beam: float + min_active_states: int + max_active_states: int + lm_scale: float + + def validate(self) -> None: ... + +class OfflineRecognizerConfig: + @overload + def __init__(self): ... + @overload + def __init__( + self, + nn_model, + tokens, + use_gpu=False, + decoding_method="greedy_search", + num_active_paths=4, + context_score=1.5, + ctc_decoder_config=OfflineCtcDecoderConfig(), + feat_config=FeatureConfig(), + fast_beam_search_config=FastBeamSearchConfig(), + ): ... + + ctc_decoder_config: OfflineCtcDecoderConfig + feat_config: FeatureConfig + fast_beam_search_config: FastBeamSearchConfig + nn_model: str + tokens: str + use_gpu: bool + decoding_method: str + num_active_paths: int + context_score: float + + def validate(self) -> None: ... + +class OfflineRecognitionResult: + text: str + tokens: List[str] + timestamps: List[float] + + def as_json_string(self) -> str: ... + +class OfflineStream: + def accept_wave_file(self, filename: str) -> None: ... + @overload + def accept_samples(self, samples: List[float]) -> None: ... + @overload + def accept_samples(self, samples: torch.Tensor) -> None: ... + @property + def result(self) -> OfflineRecognitionResult: ... + accept_waveform = accept_samples + +class OfflineRecognizer: + def __init__(self, config: OfflineRecognizerConfig) -> None: ... + @overload + def create_stream(self) -> OfflineStream: ... + @overload + def create_stream( + self, contexts_list: List[List[int]] + ) -> OfflineStream: ... + def decode_stream(self, s: OfflineStream) -> None: ... + def decode_streams(self, ss: List[OfflineStream]) -> None: ... + +class OnlineRecognizerConfig: + @overload + def __init__(self): ... 
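The stubs above spell out the offline recognition API. A short sketch of the intended call sequence, with placeholder paths; it mirrors the types declared above rather than any example shipped in this patch.

import sherpa

config = sherpa.OfflineRecognizerConfig(
    nn_model="/path/to/cpu_jit.pt",  # placeholder
    tokens="/path/to/tokens.txt",    # placeholder
    use_gpu=False,
    decoding_method="greedy_search",
)
config.validate()

recognizer = sherpa.OfflineRecognizer(config)

s = recognizer.create_stream()
s.accept_wave_file("/path/to/foo.wav")  # placeholder
recognizer.decode_stream(s)
print(s.result.text, s.result.timestamps)

# Several streams can also be decoded in one batch:
# streams = [recognizer.create_stream() for _ in range(2)]
# recognizer.decode_streams(streams)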
+ @overload + def __init__( + self, + nn_model, + tokens, + encoder_model="", + decoder_model="", + joiner_model="", + use_gpu=False, + use_endpoint=False, + decoding_method="greedy_search", + num_active_paths=4, + left_context=64, + right_context=0, + chunk_size=12, + feat_config=FeatureConfig(), + endpoint_config=EndpointConfig(), + fast_beam_search_config=FastBeamSearchConfig(), + ): ... + feat_config: FeatureConfig + endpoint_config: EndpointConfig + fast_beam_search_config: FastBeamSearchConfig + nn_model: str + tokens: str + encoder_model: str + decoder_model: str + joiner_model: str + use_gpu: bool + use_endpoint: bool + decoding_method: str + num_active_paths: int + left_context: int + right_context: int + chunk_size: int + + def validate(self) -> None: ... + +class OnlineRecognitionResult: + @property + def text(self) -> str: ... + @property + def tokens(self) -> List[str]: ... + @property + def timestamps(self) -> float: ... + @property + def segment(self) -> int: ... + @property + def start_time(self) -> float: ... + @property + def is_final(self) -> bool: ... + def as_json_string(self) -> str: ... + +class OnlineStream: + def accept_waveform( + self, sampling_rate: int, waveform: torch.Tensor + ) -> None: ... + def input_finished(self) -> None: ... + +class OnlineRecognizer: + def __init__(self, config: OnlineRecognizerConfig): ... + def create_stream(self) -> OnlineStream: ... + def is_ready(self, s: OnlineStream) -> bool: ... + def is_endpoint(self, s: OnlineStream) -> bool: ... + def decode_stream(self, s: OnlineStream) -> bool: ... + def decode_streams(self, ss: List[OnlineStream]) -> None: ... + def get_result(self, s: OnlineStream) -> OnlineRecognitionResult: ... + @property + def config(self) -> OnlineRecognizerConfig: ... diff --git a/sherpa/python/sherpa/decode.py b/sherpa/python/sherpa/decode.py deleted file mode 100644 index 782609c5f..000000000 --- a/sherpa/python/sherpa/decode.py +++ /dev/null @@ -1,383 +0,0 @@ -# Copyright 2022 Xiaomi Corp. (authors: Wei Kang) -# See LICENSE for clarification regarding multiple authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from typing import List, Tuple - -import k2 -import torch -from _sherpa import RnntModel - -from .nbest import Nbest -from .utils import get_texts_and_num_trailing_blanks - -VALID_FAST_BEAM_SEARCH_METHOD = [ - "fast_beam_search_nbest_LG", - "fast_beam_search_nbest", - "fast_beam_search", -] - - -def fast_beam_search_nbest_LG( - model: RnntModel, - encoder_out: torch.Tensor, - processed_lens: torch.Tensor, - rnnt_decoding_config: k2.RnntDecodingConfig, - rnnt_decoding_streams_list: List[k2.RnntDecodingStream], - num_paths: int, - nbest_scale: float = 0.5, - use_double_scores: bool = True, - temperature: float = 1.0, -) -> Tuple[List[List[int]], List[int]]: - """It limits the maximum number of symbols per frame to 1. 
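For the streaming API, a typical decode loop implied by the OnlineRecognizer stubs above might look like the following; the paths and the dummy waveform are placeholders, not taken from this patch.

import torch
import sherpa

config = sherpa.OnlineRecognizerConfig(
    nn_model="/path/to/streaming_cpu_jit.pt",  # placeholder
    tokens="/path/to/tokens.txt",              # placeholder
    use_endpoint=True,
)
recognizer = sherpa.OnlineRecognizer(config)

s = recognizer.create_stream()
# Feed audio as it arrives; a dummy 16 kHz chunk stands in for a microphone buffer.
s.accept_waveform(16000, torch.zeros(1600))
s.input_finished()  # signal that no more audio will follow

while recognizer.is_ready(s):
    recognizer.decode_stream(s)

result = recognizer.get_result(s)
print(result.text, result.is_final)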
- - The process to get the results is: - - (1) Use fast beam search to get a lattice - - (2) Select `num_paths` paths from the lattice using k2.random_paths() - - (3) Unique the selected paths - - (4) Intersect the selected paths with the lattice and compute the - shortest path from the intersection result - - (5) The path with the largest score is used as the decoding output. - - Args: - model: - An instance of `Transducer`. - encoder_out: - A tensor of shape (N, T, C) from the encoder. - processed_lens: - A 1-D tensor containing the valid frames before padding that have been - processed by encoder network until now. For offline recognition, it equals - to ``encoder_out_lens`` of encoder outputs. For online recognition, it is - the cumulative sum of ``encoder_out_lens`` of previous chunks (including - current chunk). Its dtype is `torch.kLong` and its shape is `(batch_size,)`. - rnnt_decoding_config: - The configuration of Fsa based RNN-T decoding, refer to - https://k2-fsa.github.io/k2/python_api/api.html#rnntdecodingconfig for more - details. - rnnt_decoding_streams_list: - A list containing the RnntDecodingStream for each sequences, its size is - ``encoder_out.size(0)``. It stores the decoding graph, internal decoding - states and partial results. - num_paths: - Number of paths to extract from the decoded lattice. - nbest_scale: - It's the scale applied to the lattice.scores. A smaller value - yields more unique paths. - use_double_scores: - True to use double precision for computation. False to use - single precision. - temperature: - Softmax temperature. - Returns: - Return a tuple containing: - - the decoded result - - number of trailing blanks - """ - - lattice = fast_beam_search( - model=model, - encoder_out=encoder_out, - processed_lens=processed_lens, - rnnt_decoding_config=rnnt_decoding_config, - rnnt_decoding_streams_list=rnnt_decoding_streams_list, - temperature=temperature, - ) - - nbest = Nbest.from_lattice( - lattice=lattice, - num_paths=num_paths, - use_double_scores=use_double_scores, - nbest_scale=nbest_scale, - ) - - # The following code is modified from nbest.intersect() - word_fsa = k2.invert(nbest.fsa) - if hasattr(lattice, "aux_labels"): - # delete token IDs as it is not needed - del word_fsa.aux_labels - word_fsa.scores.zero_() - word_fsa_with_epsilon_loops = k2.linear_fsa_with_self_loops(word_fsa) - path_to_utt_map = nbest.shape.row_ids(1) - - if hasattr(lattice, "aux_labels"): - # lattice has token IDs as labels and word IDs as aux_labels. 
- # inv_lattice has word IDs as labels and token IDs as aux_labels - inv_lattice = k2.invert(lattice) - inv_lattice = k2.arc_sort(inv_lattice) - else: - inv_lattice = k2.arc_sort(lattice) - - if inv_lattice.shape[0] == 1: - path_lattice = k2.intersect_device( - inv_lattice, - word_fsa_with_epsilon_loops, - b_to_a_map=torch.zeros_like(path_to_utt_map), - sorted_match_a=True, - ) - else: - path_lattice = k2.intersect_device( - inv_lattice, - word_fsa_with_epsilon_loops, - b_to_a_map=path_to_utt_map, - sorted_match_a=True, - ) - - # path_lattice has word IDs as labels and token IDs as aux_labels - path_lattice = k2.top_sort(k2.connect(path_lattice)) - tot_scores = path_lattice.get_tot_scores( - use_double_scores=use_double_scores, - log_semiring=True, # Note: we always use True - ) - # See https://github.com/k2-fsa/icefall/pull/420 for why - # we always use log_semiring=True - - ragged_tot_scores = k2.RaggedTensor(nbest.shape, tot_scores) - best_hyp_indexes = ragged_tot_scores.argmax() - best_path = k2.index_fsa(nbest.fsa, best_hyp_indexes) - - hyps, num_trailing_blanks = get_texts_and_num_trailing_blanks(best_path) - return hyps, num_trailing_blanks - - -def fast_beam_search_nbest( - model: RnntModel, - encoder_out: torch.Tensor, - processed_lens: torch.Tensor, - rnnt_decoding_config: k2.RnntDecodingConfig, - rnnt_decoding_streams_list: List[k2.RnntDecodingStream], - num_paths: int, - nbest_scale: float = 0.5, - use_double_scores: bool = True, - temperature: float = 1.0, -) -> Tuple[List[List[int]], List[int]]: - """It limits the maximum number of symbols per frame to 1. - - The process to get the results is: - - (1) Use fast beam search to get a lattice - - (2) Select `num_paths` paths from the lattice using k2.random_paths() - - (3) Unique the selected paths - - (4) Intersect the selected paths with the lattice and compute the - shortest path from the intersection result - - (5) The path with the largest score is used as the decoding output. - - Args: - model: - An instance of `Transducer`. - encoder_out: - A tensor of shape (N, T, C) from the encoder. - processed_lens: - A 1-D tensor containing the valid frames before padding that have been - processed by encoder network until now. For offline recognition, it equals - to ``encoder_out_lens`` of encoder outputs. For online recognition, it is - the cumulative sum of ``encoder_out_lens`` of previous chunks (including - current chunk). Its dtype is `torch.kLong` and its shape is `(batch_size,)`. - rnnt_decoding_config: - The configuration of Fsa based RNN-T decoding, refer to - https://k2-fsa.github.io/k2/python_api/api.html#rnntdecodingconfig for more - details. - rnnt_decoding_streams_list: - A list containing the RnntDecodingStream for each sequences, its size is - ``encoder_out.size(0)``. It stores the decoding graph, internal decoding - states and partial results. - num_paths: - Number of paths to extract from the decoded lattice. - nbest_scale: - It's the scale applied to the lattice.scores. A smaller value - yields more unique paths. - use_double_scores: - True to use double precision for computation. False to use - single precision. - temperature: - Softmax temperature. 
- Returns: - Return a tuple containing: - - the decoded result - - number of trailing blanks - """ - - lattice = fast_beam_search( - model=model, - encoder_out=encoder_out, - processed_lens=processed_lens, - rnnt_decoding_config=rnnt_decoding_config, - rnnt_decoding_streams_list=rnnt_decoding_streams_list, - temperature=temperature, - ) - - nbest = Nbest.from_lattice( - lattice=lattice, - num_paths=num_paths, - use_double_scores=use_double_scores, - nbest_scale=nbest_scale, - ) - - # at this point, nbest.fsa.scores are all zeros. - nbest = nbest.intersect(lattice) - # Now nbest.fsa.scores contains acoustic scores - - max_indexes = nbest.tot_scores().argmax() - - best_path = k2.index_fsa(nbest.fsa, max_indexes) - - hyps, num_trailing_blanks = get_texts_and_num_trailing_blanks(best_path) - return hyps, num_trailing_blanks - - -def fast_beam_search_one_best( - model: RnntModel, - encoder_out: torch.Tensor, - processed_lens: torch.Tensor, - rnnt_decoding_config: k2.RnntDecodingConfig, - rnnt_decoding_streams_list: List[k2.RnntDecodingStream], - temperature: float = 1.0, -) -> Tuple[List[List[int]], List[int]]: - """It limits the maximum number of symbols per frame to 1. - - A lattice is first obtained using fast beam search, and then - the shortest path within the lattice is used as the final output. - - Args: - model: - An instance of `Transducer`. - encoder_out: - A tensor of shape (N, T, C) from the encoder. - processed_lens: - A 1-D tensor containing the valid frames before padding that have been - processed by encoder network until now. For offline recognition, it equals - to ``encoder_out_lens`` of encoder outputs. For online recognition, it is - the cumulative sum of ``encoder_out_lens`` of previous chunks (including - current chunk). Its dtype is `torch.kLong` and its shape is `(batch_size,)`. - rnnt_decoding_config: - The configuration of Fsa based RNN-T decoding, refer to - https://k2-fsa.github.io/k2/python_api/api.html#rnntdecodingconfig for more - details. - rnnt_decoding_streams_list: - A list containing the RnntDecodingStream for each sequences, its size is - ``encoder_out.size(0)``. It stores the decoding graph, internal decoding - states and partial results. - temperature: - Softmax temperature. - Returns: - Return a tuple containing: - - the decoded result - - number of trailing blanks - """ - lattice = fast_beam_search( - model=model, - encoder_out=encoder_out, - processed_lens=processed_lens, - rnnt_decoding_config=rnnt_decoding_config, - rnnt_decoding_streams_list=rnnt_decoding_streams_list, - temperature=temperature, - ) - - best_path = one_best_decoding(lattice) - - hyps, num_trailing_blanks = get_texts_and_num_trailing_blanks(best_path) - return hyps, num_trailing_blanks - - -def fast_beam_search( - model: RnntModel, - encoder_out: torch.Tensor, - processed_lens: torch.Tensor, - rnnt_decoding_config: k2.RnntDecodingConfig, - rnnt_decoding_streams_list: List[k2.RnntDecodingStream], - temperature: float = 1.0, -) -> k2.Fsa: - """It limits the maximum number of symbols per frame to 1. - - Args: - model: - An instance of `Transducer`. - decoding_graph: - Decoding graph used for decoding, may be a TrivialGraph or a LG. - encoder_out: - A tensor of shape (N, T, C) from the encoder. - processed_lens: - A 1-D tensor containing the valid frames before padding that have been - processed by encoder network until now. For offline recognition, it equals - to ``encoder_out_lens`` of encoder outputs. 
For online recognition, it is - the cumulative sum of ``encoder_out_lens`` of previous chunks (including - current chunk). Its dtype is `torch.kLong` and its shape is `(batch_size,)`. - rnnt_decoding_config: - The configuration of Fsa based RNN-T decoding, refer to - https://k2-fsa.github.io/k2/python_api/api.html#rnntdecodingconfig for more - details. - rnnt_decoding_streams_list: - A list containing the RnntDecodingStream for each sequences, its size is - ``encoder_out.size(0)``. It stores the decoding graph, internal decoding - states and partial results. - temperature: - Softmax temperature. - Returns: - Return an FsaVec with axes [utt][state][arc] containing the decoded - lattice. Note: When the input graph is a TrivialGraph, the returned - lattice is actually an acceptor. - """ - assert encoder_out.ndim == 3 - - B, T, C = encoder_out.shape - - decoding_streams = k2.RnntDecodingStreams( - rnnt_decoding_streams_list, rnnt_decoding_config - ) - - encoder_out = model.forward_encoder_proj(encoder_out) - - for t in range(T): - # shape is a RaggedShape of shape (B, context) - # contexts is a Tensor of shape (shape.NumElements(), context_size) - shape, contexts = decoding_streams.get_contexts() - # `nn.Embedding()` in torch below v1.7.1 supports only torch.int64 - contexts = contexts.to(torch.int64) - # decoder_out is of shape (shape.NumElements(), 1, decoder_out_dim) - decoder_out = model.decoder_forward(contexts) - decoder_out = model.forward_decoder_proj(decoder_out).squeeze(1) - # current_encoder_out is of shape - # (shape.NumElements(), joiner_dim) - # fmt: off - current_encoder_out = torch.index_select( - encoder_out[:, t], 0, shape.row_ids(1).to(torch.int64) - ) - # fmt: on - logits = model.joiner_forward( - current_encoder_out, - decoder_out, - ) - log_probs = (logits / temperature).log_softmax(dim=-1) - decoding_streams.advance(log_probs) - decoding_streams.terminate_and_flush_to_streams() - lattice = decoding_streams.format_output(processed_lens.tolist()) - - return lattice - - -def one_best_decoding( - lattice: k2.Fsa, - use_double_scores: bool = True, -) -> k2.Fsa: - """Get the best path from a lattice. - - Args: - lattice: - The decoding lattice returned by :func:`get_lattice`. - use_double_scores: - True to use double precision floating point in the computation. - False to use single precision. - Return: - An FsaVec containing linear paths. - """ - best_path = k2.shortest_path(lattice, use_double_scores=use_double_scores) - return best_path diff --git a/sherpa/python/sherpa/http_server.py b/sherpa/python/sherpa/http_server.py new file mode 100644 index 000000000..b67154ff1 --- /dev/null +++ b/sherpa/python/sherpa/http_server.py @@ -0,0 +1,82 @@ +# Copyright 2022 Xiaomi Corp. (authors: Fangjun Kuang) +# +# See ../../../LICENSE for clarification regarding multiple authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from typing import Tuple + +# Please sort it alphabetically +_static_files = ( + ("/css/bootstrap.min.css", "text/css"), + ("/css/bootstrap.min.css.map", "text/css"), + ("/index.html", "text/html"), + ("/js/bootstrap.min.js", "application/javascript"), + ("/js/bootstrap.min.js.map", "application/javascript"), + ("/js/jquery-3.6.0.min.js", "application/javascript"), + ("/js/offline_record.js", "application/javascript"), + ("/js/offline_record.js", "application/javascript"), + ("/js/popper.min.js", "application/javascript"), + ("/js/popper.min.js.map", "application/javascript"), + ("/js/streaming_record.js", "application/javascript"), + ("/js/upload.js", "application/javascript"), + ("/k2-logo.png", "image/png"), + ("/nav-partial.html", "text/html"), + ("/offline_record.html", "text/html"), + ("/streaming_record.html", "text/html"), + ("/upload.html", "text/html"), +) + +_404_page = r""" + +Speech recognition with next-gen Kaldi +

+404 ERROR! Please re-check your URL

+ +""" + + +def read_file(root: str, name: str) -> str: + try: + with open(f"{root}/{name}") as f: + return f.read() + except: # noqa + with open(f"{root}/{name}", "rb") as f: + return f.read() + + +class HttpServer: + """ + A simple HTTP server that hosts only static files + """ + + def __init__(self, doc_root: str): + content = dict() + for f, mime_type in _static_files: + content[f] = (read_file(doc_root, f), mime_type) + self.content = content + + def process_request(self, f: str) -> Tuple[str, str, str]: + """ + Args: + f: + The filename to read. + Returns: + Return a tuple: + - a bool, True if the given file is found. False otherwise. + - a str, the content of the file if found. Otherwise, it + contains the content for the 404 page + - a str, the MIME type of the returned content + """ + if f in self.content: + return True, self.content[f][0], self.content[f][1] + else: + return False, _404_page, "text/html" diff --git a/sherpa/python/sherpa/lexicon.py b/sherpa/python/sherpa/lexicon.py deleted file mode 100644 index 80bd7c1ee..000000000 --- a/sherpa/python/sherpa/lexicon.py +++ /dev/null @@ -1,277 +0,0 @@ -# Copyright 2021 Xiaomi Corp. (authors: Fangjun Kuang) -# -# See ../../LICENSE for clarification regarding multiple authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -import logging -import re -import sys -from pathlib import Path -from typing import List, Tuple - -import k2 -import torch - - -def read_lexicon(filename: str) -> List[Tuple[str, List[str]]]: - """Read a lexicon from `filename`. - - Each line in the lexicon contains "word p1 p2 p3 ...". - That is, the first field is a word and the remaining - fields are tokens. Fields are separated by space(s). - - Args: - filename: - Path to the lexicon.txt - - Returns: - A list of tuples., e.g., [('w', ['p1', 'p2']), ('w1', ['p3, 'p4'])] - """ - ans = [] - - with open(filename, "r", encoding="utf-8") as f: - whitespace = re.compile("[ \t]+") - for line in f: - a = whitespace.split(line.strip(" \t\r\n")) - if len(a) == 0: - continue - - if len(a) < 2: - logging.info( - f"Found bad line {line} in lexicon file {filename}" - ) - logging.info( - "Every line is expected to contain at least 2 fields" - ) - sys.exit(1) - word = a[0] - if word == "": - logging.info( - f"Found bad line {line} in lexicon file {filename}" - ) - logging.info(" should not be a valid word") - sys.exit(1) - - tokens = a[1:] - ans.append((word, tokens)) - - return ans - - -def write_lexicon(filename: str, lexicon: List[Tuple[str, List[str]]]) -> None: - """Write a lexicon to a file. - - Args: - filename: - Path to the lexicon file to be generated. - lexicon: - It can be the return value of :func:`read_lexicon`. - """ - with open(filename, "w", encoding="utf-8") as f: - for word, tokens in lexicon: - f.write(f"{word} {' '.join(tokens)}\n") - - -def convert_lexicon_to_ragged( - filename: str, word_table: k2.SymbolTable, token_table: k2.SymbolTable -) -> k2.RaggedTensor: - """Read a lexicon and convert it to a ragged tensor. 
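The HttpServer class above serves only the static files listed in _static_files. A brief sketch of process_request (the doc root is a placeholder); note that, despite the Tuple[str, str, str] annotation, the first element of the returned tuple is a bool, as the docstring and the implementation show.

import sherpa

server = sherpa.HttpServer("/path/to/sherpa/web")  # placeholder doc root

found, content, mime_type = server.process_request("/index.html")
if found:
    print(mime_type)  # "text/html"
else:
    print(content)    # the built-in 404 page is returned for unknown paths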
- - The ragged tensor has two axes: [word][token]. - - Caution: - We assume that each word has a unique pronunciation. - - Args: - filename: - Filename of the lexicon. It has a format that can be read - by :func:`read_lexicon`. - word_table: - The word symbol table. - token_table: - The token symbol table. - Returns: - A k2 ragged tensor with two axes [word][token]. - """ - disambig_id = word_table["#0"] - # We reuse the same words.txt from the phone based lexicon - # so that we can share the same G.fst. Here, we have to - # exclude some words present only in the phone based lexicon. - excluded_words = ["", "!SIL", ""] - - # epsilon is not a word, but it occupies a position - # - row_splits = [0] - token_ids_list = [] - - lexicon_tmp = read_lexicon(filename) - lexicon = dict(lexicon_tmp) - if len(lexicon_tmp) != len(lexicon): - raise RuntimeError( - "It's assumed that each word has a unique pronunciation" - ) - - for i in range(disambig_id): - w = word_table[i] - if w in excluded_words: - row_splits.append(row_splits[-1]) - continue - tokens = lexicon[w] - token_ids = [token_table[k] for k in tokens] - - row_splits.append(row_splits[-1] + len(token_ids)) - token_ids_list.extend(token_ids) - - cached_tot_size = row_splits[-1] - row_splits = torch.tensor(row_splits, dtype=torch.int32) - - shape = k2.ragged.create_ragged_shape2( - row_splits, - None, - cached_tot_size, - ) - values = torch.tensor(token_ids_list, dtype=torch.int32) - - return k2.RaggedTensor(shape, values) - - -class Lexicon(object): - """Phone based lexicon.""" - - def __init__( - self, - lang_dir: Path, - disambig_pattern: str = re.compile(r"^#\d+$"), - ): - """ - Args: - lang_dir: - Path to the lang directory. It is expected to contain the following - files: - - tokens.txt - - words.txt - - L.pt - The above files are produced by the script `prepare.sh`. You - should have run that before running the training code. - disambig_pattern: - It contains the pattern for disambiguation symbols. - """ - lang_dir = Path(lang_dir) - self.token_table = k2.SymbolTable.from_file(lang_dir / "tokens.txt") - self.word_table = k2.SymbolTable.from_file(lang_dir / "words.txt") - - if (lang_dir / "Linv.pt").exists(): - logging.info(f"Loading pre-compiled {lang_dir}/Linv.pt") - L_inv = k2.Fsa.from_dict(torch.load(lang_dir / "Linv.pt")) - else: - logging.info("Converting L.pt to Linv.pt") - L = k2.Fsa.from_dict(torch.load(lang_dir / "L.pt")) - L_inv = k2.arc_sort(L.invert()) - torch.save(L_inv.as_dict(), lang_dir / "Linv.pt") - - # We save L_inv instead of L because it will be used to intersect with - # transcript FSAs, both of whose labels are word IDs. - self.L_inv = L_inv - self.disambig_pattern = disambig_pattern - - @property - def tokens(self) -> List[int]: - """Return a list of token IDs excluding those from - disambiguation symbols. - - Caution: - 0 is not a token ID so it is excluded from the return value. - """ - symbols = self.token_table.symbols - ans = [] - for s in symbols: - if not self.disambig_pattern.match(s): - ans.append(self.token_table[s]) - if 0 in ans: - ans.remove(0) - ans.sort() - return ans - - -class UniqLexicon(Lexicon): - def __init__( - self, - lang_dir: Path, - uniq_filename: str = "uniq_lexicon.txt", - disambig_pattern: str = re.compile(r"^#\d+$"), - ): - """ - Refer to the help information in Lexicon.__init__. - - uniq_filename: It is assumed to be inside the given `lang_dir`. - - Each word in the lexicon is assumed to have a unique pronunciation. 
- """ - lang_dir = Path(lang_dir) - super().__init__(lang_dir=lang_dir, disambig_pattern=disambig_pattern) - - self.ragged_lexicon = convert_lexicon_to_ragged( - filename=lang_dir / uniq_filename, - word_table=self.word_table, - token_table=self.token_table, - ) - # TODO: should we move it to a certain device ? - - def texts_to_token_ids( - self, texts: List[str], oov: str = "" - ) -> k2.RaggedTensor: - """ - Args: - texts: - A list of transcripts. Each transcript contains space(s) - separated words. An example texts is:: - - ['HELLO k2', 'HELLO icefall'] - oov: - The OOV word. If a word in `texts` is not in the lexicon, it is - replaced with `oov`. - Returns: - Return a ragged int tensor with 2 axes [utterance][token_id] - """ - oov_id = self.word_table[oov] - - word_ids_list = [] - for text in texts: - word_ids = [] - for word in text.split(): - if word in self.word_table: - word_ids.append(self.word_table[word]) - else: - word_ids.append(oov_id) - word_ids_list.append(word_ids) - ragged_indexes = k2.RaggedTensor(word_ids_list, dtype=torch.int32) - ans = self.ragged_lexicon.index(ragged_indexes) - ans = ans.remove_axis(ans.num_axes - 2) - return ans - - def words_to_token_ids(self, words: List[str]) -> k2.RaggedTensor: - """Convert a list of words to a ragged tensor containing token IDs. - - We assume there are no OOVs in "words". - """ - word_ids = [self.word_table[w] for w in words] - word_ids = torch.tensor(word_ids, dtype=torch.int32) - - ragged, _ = self.ragged_lexicon.index( - indexes=word_ids, - axis=0, - need_value_indexes=False, - ) - return ragged diff --git a/sherpa/python/sherpa/nbest.py b/sherpa/python/sherpa/nbest.py deleted file mode 100644 index b0f68c65a..000000000 --- a/sherpa/python/sherpa/nbest.py +++ /dev/null @@ -1,415 +0,0 @@ -# Copyright 2021 Xiaomi Corp. (authors: Fangjun Kuang) -# -# See ../../../../LICENSE for clarification regarding multiple authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -import k2 -import torch - - -def _intersect_device( - a_fsas: k2.Fsa, - b_fsas: k2.Fsa, - b_to_a_map: torch.Tensor, - sorted_match_a: bool, - batch_size: int = 50, -) -> k2.Fsa: - """This is a wrapper of k2.intersect_device and its purpose is to split - b_fsas into several batches and process each batch separately to avoid - CUDA OOM error. - - The arguments and return value of this function are the same as - :func:`k2.intersect_device`. 
- """ - num_fsas = b_fsas.shape[0] - if num_fsas <= batch_size: - return k2.intersect_device( - a_fsas, b_fsas, b_to_a_map=b_to_a_map, sorted_match_a=sorted_match_a - ) - - num_batches = (num_fsas + batch_size - 1) // batch_size - splits = [] - for i in range(num_batches): - start = i * batch_size - end = min(start + batch_size, num_fsas) - splits.append((start, end)) - - ans = [] - for start, end in splits: - indexes = torch.arange(start, end).to(b_to_a_map) - - fsas = k2.index_fsa(b_fsas, indexes) - b_to_a = k2.index_select(b_to_a_map, indexes) - path_lattice = k2.intersect_device( - a_fsas, fsas, b_to_a_map=b_to_a, sorted_match_a=sorted_match_a - ) - ans.append(path_lattice) - - return k2.cat(ans) - - -def get_lattice( - nnet_output: torch.Tensor, - decoding_graph: k2.Fsa, - supervision_segments: torch.Tensor, - search_beam: float, - output_beam: float, - min_active_states: int, - max_active_states: int, - subsampling_factor: int = 1, -) -> k2.Fsa: - """Get the decoding lattice from a decoding graph and neural - network output. - Args: - nnet_output: - It is the output of a neural model of shape `(N, T, C)`. - decoding_graph: - An Fsa, the decoding graph. It can be either an HLG - (see `compile_HLG.py`) or an H (see `k2.ctc_topo`). - supervision_segments: - A 2-D **CPU** tensor of dtype `torch.int32` with 3 columns. - Each row contains information for a supervision segment. Column 0 - is the `sequence_index` indicating which sequence this segment - comes from; column 1 specifies the `start_frame` of this segment - within the sequence; column 2 contains the `duration` of this - segment. - search_beam: - Decoding beam, e.g. 20. Smaller is faster, larger is more exact - (less pruning). This is the default value; it may be modified by - `min_active_states` and `max_active_states`. - output_beam: - Beam to prune output, similar to lattice-beam in Kaldi. Relative - to best path of output. - min_active_states: - Minimum number of FSA states that are allowed to be active on any given - frame for any given intersection/composition task. This is advisory, - in that it will try not to have fewer than this number active. - Set it to zero if there is no constraint. - max_active_states: - Maximum number of FSA states that are allowed to be active on any given - frame for any given intersection/composition task. This is advisory, - in that it will try not to exceed that but may not always succeed. - You can use a very large number if no constraint is needed. - subsampling_factor: - The subsampling factor of the model. - Returns: - An FsaVec containing the decoding result. It has axes [utt][state][arc]. - """ - dense_fsa_vec = k2.DenseFsaVec( - nnet_output, - supervision_segments, - allow_truncate=subsampling_factor - 1, - ) - - lattice = k2.intersect_dense_pruned( - decoding_graph, - dense_fsa_vec, - search_beam=search_beam, - output_beam=output_beam, - min_active_states=min_active_states, - max_active_states=max_active_states, - ) - - return lattice - - -class Nbest(object): - """ - An Nbest object contains two fields: - - (1) fsa. It is an FsaVec containing a vector of **linear** FSAs. - Its axes are [path][state][arc] - (2) shape. Its type is :class:`k2.RaggedShape`. - Its axes are [utt][path] - - The field `shape` has two axes [utt][path]. `shape.dim0` contains - the number of utterances, which is also the number of rows in the - supervision_segments. `shape.tot_size(1)` contains the number - of paths, which is also the number of FSAs in `fsa`. - - Caution: - Don't be confused by the name `Nbest`. 
The best in the name `Nbest` - has nothing to do with `best scores`. The important part is - `N` in `Nbest`, not `best`. - """ - - def __init__(self, fsa: k2.Fsa, shape: k2.RaggedShape) -> None: - """ - Args: - fsa: - An FsaVec with axes [path][state][arc]. It is expected to contain - a list of **linear** FSAs. - shape: - A ragged shape with two axes [utt][path]. - """ - assert len(fsa.shape) == 3, f"fsa.shape: {fsa.shape}" - assert shape.num_axes == 2, f"num_axes: {shape.num_axes}" - - if fsa.shape[0] != shape.tot_size(1): - raise ValueError( - f"{fsa.shape[0]} vs {shape.tot_size(1)}\n" - "Number of FSAs in `fsa` does not match the given shape" - ) - - self.fsa = fsa - self.shape = shape - - def __str__(self): - s = "Nbest(" - s += f"Number of utterances:{self.shape.dim0}, " - s += f"Number of Paths:{self.fsa.shape[0]})" - return s - - @staticmethod - def from_lattice( - lattice: k2.Fsa, - num_paths: int, - use_double_scores: bool = True, - nbest_scale: float = 0.5, - ) -> "Nbest": - """Construct an Nbest object by **sampling** `num_paths` from a lattice. - - Each sampled path is a linear FSA. - - We assume `lattice.labels` contains token IDs and `lattice.aux_labels` - contains word IDs. - - Args: - lattice: - An FsaVec with axes [utt][state][arc]. - num_paths: - Number of paths to **sample** from the lattice - using :func:`k2.random_paths`. - use_double_scores: - True to use double precision in :func:`k2.random_paths`. - False to use single precision. - scale: - Scale `lattice.score` before passing it to :func:`k2.random_paths`. - A smaller value leads to more unique paths at the risk of being not - to sample the path with the best score. - Returns: - Return an Nbest instance. - """ - saved_scores = lattice.scores.clone() - lattice.scores *= nbest_scale - # path is a ragged tensor with dtype torch.int32. - # It has three axes [utt][path][arc_pos] - path = k2.random_paths( - lattice, num_paths=num_paths, use_double_scores=use_double_scores - ) - lattice.scores = saved_scores - - # word_seq is a k2.RaggedTensor sharing the same shape as `path` - # but it contains word IDs. Note that it also contains 0s and -1s. - # The last entry in each sublist is -1. - # It axes is [utt][path][word_id] - if isinstance(lattice.aux_labels, torch.Tensor): - word_seq = k2.ragged.index(lattice.aux_labels, path) - else: - word_seq = lattice.aux_labels.index(path) - word_seq = word_seq.remove_axis(word_seq.num_axes - 2) - word_seq = word_seq.remove_values_leq(0) - - # Each utterance has `num_paths` paths but some of them transduces - # to the same word sequence, so we need to remove repeated word - # sequences within an utterance. After removing repeats, each utterance - # contains different number of paths - # - # `new2old` is a 1-D torch.Tensor mapping from the output path index - # to the input path index. - _, _, new2old = word_seq.unique( - need_num_repeats=False, need_new2old_indexes=True - ) - - # kept_path is a ragged tensor with dtype torch.int32. - # It has axes [utt][path][arc_pos] - kept_path, _ = path.index(new2old, axis=1, need_value_indexes=False) - - # utt_to_path_shape has axes [utt][path] - utt_to_path_shape = kept_path.shape.get_layer(0) - - # Remove the utterance axis. - # Now kept_path has only two axes [path][arc_pos] - kept_path = kept_path.remove_axis(0) - - # labels is a ragged tensor with 2 axes [path][token_id] - # Note that it contains -1s. 
- labels = k2.ragged.index(lattice.labels.contiguous(), kept_path) - - # Remove -1 from labels as we will use it to construct a linear FSA - labels = labels.remove_values_eq(-1) - - if isinstance(lattice.aux_labels, k2.RaggedTensor): - # lattice.aux_labels is a ragged tensor with dtype torch.int32. - # It has 2 axes [arc][word], so aux_labels is also a ragged tensor - # with 2 axes [arc][word] - aux_labels, _ = lattice.aux_labels.index( - indexes=kept_path.values, axis=0, need_value_indexes=False - ) - else: - assert isinstance(lattice.aux_labels, torch.Tensor) - aux_labels = k2.index_select(lattice.aux_labels, kept_path.values) - # aux_labels is a 1-D torch.Tensor. It also contains -1 and 0. - - fsa = k2.linear_fsa(labels) - fsa.aux_labels = aux_labels - # Caution: fsa.scores are all 0s. - # `fsa` has only one extra attribute: aux_labels. - return Nbest(fsa=fsa, shape=utt_to_path_shape) - - def intersect(self, lattice: k2.Fsa, use_double_scores=True) -> "Nbest": - """Intersect this Nbest object with a lattice, get 1-best - path from the resulting FsaVec, and return a new Nbest object. - - The purpose of this function is to attach scores to an Nbest. - - Args: - lattice: - An FsaVec with axes [utt][state][arc]. If it has `aux_labels`, then - we assume its `labels` are token IDs and `aux_labels` are word IDs. - If it has only `labels`, we assume its `labels` are word IDs. - use_double_scores: - True to use double precision when computing shortest path. - False to use single precision. - Returns: - Return a new Nbest. This new Nbest shares the same shape with `self`, - while its `fsa` is the 1-best path from intersecting `self.fsa` and - `lattice`. Also, its `fsa` has non-zero scores and inherits attributes - for `lattice`. - """ - # Note: We view each linear FSA as a word sequence - # and we use the passed lattice to give each word sequence a score. - # - # We are not viewing each linear FSAs as a token sequence. - # - # So we use k2.invert() here. - - # We use a word fsa to intersect with k2.invert(lattice) - word_fsa = k2.invert(self.fsa) - - if hasattr(lattice, "aux_labels"): - # delete token IDs as it is not needed - del word_fsa.aux_labels - - word_fsa.scores.zero_() - word_fsa_with_epsilon_loops = k2.linear_fsa_with_self_loops(word_fsa) - - path_to_utt_map = self.shape.row_ids(1) - - if hasattr(lattice, "aux_labels"): - # lattice has token IDs as labels and word IDs as aux_labels. - # inv_lattice has word IDs as labels and token IDs as aux_labels - inv_lattice = k2.invert(lattice) - inv_lattice = k2.arc_sort(inv_lattice) - else: - inv_lattice = k2.arc_sort(lattice) - - if inv_lattice.shape[0] == 1: - path_lattice = _intersect_device( - inv_lattice, - word_fsa_with_epsilon_loops, - b_to_a_map=torch.zeros_like(path_to_utt_map), - sorted_match_a=True, - ) - else: - path_lattice = _intersect_device( - inv_lattice, - word_fsa_with_epsilon_loops, - b_to_a_map=path_to_utt_map, - sorted_match_a=True, - ) - - # path_lattice has word IDs as labels and token IDs as aux_labels - path_lattice = k2.top_sort(k2.connect(path_lattice)) - - one_best = k2.shortest_path( - path_lattice, use_double_scores=use_double_scores - ) - - one_best = k2.invert(one_best) - # Now one_best has token IDs as labels and word IDs as aux_labels - - return Nbest(fsa=one_best, shape=self.shape) - - def compute_am_scores(self) -> k2.RaggedTensor: - """Compute AM scores of each linear FSA (i.e., each path within - an utterance). 
- - Hint: - `self.fsa.scores` contains two parts: acoustic scores (AM scores) - and n-gram language model scores (LM scores). - - Caution: - We require that ``self.fsa`` has an attribute ``lm_scores``. - - Returns: - Return a ragged tensor with 2 axes [utt][path_scores]. - Its dtype is torch.float64. - """ - scores_shape = self.fsa.arcs.shape().remove_axis(1) - # scores_shape has axes [path][arc] - am_scores = self.fsa.scores - self.fsa.lm_scores - ragged_am_scores = k2.RaggedTensor(scores_shape, am_scores.contiguous()) - tot_scores = ragged_am_scores.sum() - - return k2.RaggedTensor(self.shape, tot_scores) - - def compute_lm_scores(self) -> k2.RaggedTensor: - """Compute LM scores of each linear FSA (i.e., each path within - an utterance). - - Hint: - `self.fsa.scores` contains two parts: acoustic scores (AM scores) - and n-gram language model scores (LM scores). - - Caution: - We require that ``self.fsa`` has an attribute ``lm_scores``. - - Returns: - Return a ragged tensor with 2 axes [utt][path_scores]. - Its dtype is torch.float64. - """ - scores_shape = self.fsa.arcs.shape().remove_axis(1) - # scores_shape has axes [path][arc] - - ragged_lm_scores = k2.RaggedTensor( - scores_shape, self.fsa.lm_scores.contiguous() - ) - - tot_scores = ragged_lm_scores.sum() - - return k2.RaggedTensor(self.shape, tot_scores) - - def tot_scores(self) -> k2.RaggedTensor: - """Get total scores of FSAs in this Nbest. - - Note: - Since FSAs in Nbest are just linear FSAs, log-semiring - and tropical semiring produce the same total scores. - - Returns: - Return a ragged tensor with two axes [utt][path_scores]. - Its dtype is torch.float64. - """ - scores_shape = self.fsa.arcs.shape().remove_axis(1) - # scores_shape has axes [path][arc] - - ragged_scores = k2.RaggedTensor( - scores_shape, self.fsa.scores.contiguous() - ) - - tot_scores = ragged_scores.sum() - - return k2.RaggedTensor(self.shape, tot_scores) diff --git a/sherpa/python/sherpa/online_endpoint.py b/sherpa/python/sherpa/online_endpoint.py deleted file mode 100644 index 48ec05d10..000000000 --- a/sherpa/python/sherpa/online_endpoint.py +++ /dev/null @@ -1,210 +0,0 @@ -""" -This file implements endpoint detection using -https://github.com/kaldi-asr/kaldi/blob/master/src/online2/online-endpoint.h -as a reference -""" -import argparse -from dataclasses import dataclass - -from .utils import str2bool - - -@dataclass -class OnlineEndpointRule: - # If True, for this endpointing rule to apply there must - # be nonsilence in the best-path traceback. - # For RNN-T decoding, a non-blank token is considered as non-silence - must_contain_nonsilence: bool - - # This endpointing rule requires duration of trailing silence - # (in seconds) to be >= this value. - min_trailing_silence: float - - # This endpointing rule requires utterance-length (in seconds) - # to be >= this value. - min_utterance_length: float - - -@dataclass -class OnlineEndpointConfig: - # rule1 times out after 5 seconds of silence, even if we decoded nothing. - rule1: OnlineEndpointRule = OnlineEndpointRule( - must_contain_nonsilence=False, - min_trailing_silence=5.0, - min_utterance_length=0.0, - ) - - # rule2 times out after 2.0 seconds of silence after decoding something, - rule2: OnlineEndpointRule = OnlineEndpointRule( - must_contain_nonsilence=True, - min_trailing_silence=2.0, - min_utterance_length=0.0, - ) - # rule3 times out after the utterance is 20 seconds long, regardless of - # anything else. 
- rule3: OnlineEndpointRule = OnlineEndpointRule( - must_contain_nonsilence=False, - min_trailing_silence=0.0, - min_utterance_length=20.0, - ) - - @staticmethod - def from_args(args: dict) -> "OnlineEndpointConfig": - """ - Args: - args: - It contains the arguments parsed from - :func:`add_online_endpoint_arguments` - """ - rule1 = OnlineEndpointRule( - must_contain_nonsilence=args[ - "endpoint_rule1_must_contain_nonsilence" - ], - min_trailing_silence=args["endpoint_rule1_min_trailing_silence"], - min_utterance_length=args["endpoint_rule1_min_utterance_length"], - ) - - rule2 = OnlineEndpointRule( - must_contain_nonsilence=args[ - "endpoint_rule2_must_contain_nonsilence" - ], - min_trailing_silence=args["endpoint_rule2_min_trailing_silence"], - min_utterance_length=args["endpoint_rule2_min_utterance_length"], - ) - - rule3 = OnlineEndpointRule( - must_contain_nonsilence=args[ - "endpoint_rule3_must_contain_nonsilence" - ], - min_trailing_silence=args["endpoint_rule3_min_trailing_silence"], - min_utterance_length=args["endpoint_rule3_min_utterance_length"], - ) - - return OnlineEndpointConfig(rule1=rule1, rule2=rule2, rule3=rule3) - - -def _add_rule_arguments( - parser: argparse.ArgumentParser, - prefix: str, - rule: OnlineEndpointRule, -): - p = prefix.replace(".", "_") - - parser.add_argument( - f"--{prefix}.must-contain-nonsilence", - type=str2bool, - dest=f"{p}_must_contain_nonsilence", - default=rule.must_contain_nonsilence, - help="""If true, for this endpointing rule to apply there must be - nonsilence in the best-path traceback. For RNN-T decoding, a non-blank - token is considered as non-silence""", - ) - - parser.add_argument( - f"--{prefix}.min-trailing-silence", - type=float, - dest=f"{p}_min_trailing_silence", - default=rule.min_trailing_silence, - help="""This endpointing rule requires duration of trailing silence - (in seconds) to be >= this value.""", - ) - - parser.add_argument( - f"--{prefix}.min-utterance-length", - type=float, - dest=f"{p}_min_utterance_length", - default=rule.min_utterance_length, - help="""This endpointing rule requires utterance-length (in seconds) - to be >= this value.""", - ) - - -def add_online_endpoint_arguments(): - """Add command line arguments to configure online endpointing. - - It provides the following commandline arguments: - - --endpoint.rule1.must-contain-nonsilence - --endpoint.rule1.min_trailing_silence - --endpoint.rule1.min_utterance_length - - --endpoint.rule2.must-contain-nonsilence - --endpoint.rule2.min_trailing_silence - --endpoint.rule2.min_utterance_length - - --endpoint.rule3.must-contain-nonsilence - --endpoint.rule3.min_trailing_silence - --endpoint.rule3.min_utterance_length - - You can add more rules if there is a need. - """ - parser = argparse.ArgumentParser( - description="Parameters for online endpoint detection", - add_help=False, - ) - - config = OnlineEndpointConfig() - _add_rule_arguments(parser, prefix="endpoint.rule1", rule=config.rule1) - _add_rule_arguments(parser, prefix="endpoint.rule2", rule=config.rule2) - _add_rule_arguments(parser, prefix="endpoint.rule3", rule=config.rule3) - - return parser - - -def _rule_activated( - rule: OnlineEndpointRule, - trailing_silence: float, - utterance_length: float, -): - """ - Args: - rule: - The rule to be checked. - trailing_silence: - Trailing silence in seconds. - utterance_length: - Number of frames in seconds decoded so far. - Returns: - Return True if the given rule is activated; return False otherwise. 
- """ - contains_nonsilence = utterance_length > trailing_silence - - return ( - (contains_nonsilence or not rule.must_contain_nonsilence) - and (trailing_silence > rule.min_trailing_silence) - and (utterance_length > rule.min_utterance_length) - ) - - -def endpoint_detected( - config: OnlineEndpointConfig, - num_frames_decoded: int, - trailing_silence_frames: int, - frame_shift_in_seconds: float, -) -> bool: - """ - Args: - config: - The endpoint config to be checked. - num_frames_decoded: - Number of frames decoded so far. - trailing_silence_frames: - Number of trailing silence frames. - frame_shift_in_seconds: - Frame shift in seconds. - Returns: - Return True if any rule in `config` is activated; return False otherwise. - """ - utterance_length = num_frames_decoded * frame_shift_in_seconds - trailing_silence = trailing_silence_frames * frame_shift_in_seconds - - if _rule_activated(config.rule1, trailing_silence, utterance_length): - return True - - if _rule_activated(config.rule2, trailing_silence, utterance_length): - return True - - if _rule_activated(config.rule3, trailing_silence, utterance_length): - return True - - return False diff --git a/sherpa/python/sherpa/utils.py b/sherpa/python/sherpa/utils.py index d3a82ea54..b740084ea 100644 --- a/sherpa/python/sherpa/utils.py +++ b/sherpa/python/sherpa/utils.py @@ -1,8 +1,78 @@ import argparse +import logging +import re +from dataclasses import dataclass +from datetime import datetime from pathlib import Path -from typing import List, Tuple, Union +from typing import Dict, List, Optional, Union -import k2 +Pathlike = Union[str, Path] + + +def setup_logger( + log_filename: Pathlike, + log_level: str = "info", + use_console: bool = True, +) -> None: + """Setup log level. + + Args: + log_filename: + The filename to save the log. + log_level: + The log level to use, e.g., "debug", "info", "warning", "error", + "critical" + use_console: + True to also print logs to console. + """ + now = datetime.now() + date_time = now.strftime("%Y-%m-%d-%H-%M-%S") + formatter = ( + "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s" + ) + log_filename = f"{log_filename}-{date_time}.txt" + + Path(log_filename).parent.mkdir(parents=True, exist_ok=True) + + level = logging.ERROR + if log_level == "debug": + level = logging.DEBUG + elif log_level == "info": + level = logging.INFO + elif log_level == "warning": + level = logging.WARNING + elif log_level == "critical": + level = logging.CRITICAL + + logging.basicConfig( + filename=log_filename, + format=formatter, + level=level, + filemode="w", + ) + if use_console: + console = logging.StreamHandler() + console.setLevel(level) + console.setFormatter(logging.Formatter(formatter)) + logging.getLogger("").addHandler(console) + + +@dataclass +class FastBeamSearchResults: + # hyps[i] is the recognition results for the i-th utterance. + # It may contain either token IDs or word IDs depending on the actual + # decoding method. 
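setup_logger above appends a timestamp to the given filename and optionally mirrors the log to the console. A minimal sketch of how it might be called; the log directory is arbitrary.

import logging

from sherpa import setup_logger

# Writes to log/decode-<YYYY-MM-DD-HH-MM-SS>.txt (parent directories are created)
# and, since use_console defaults to True, also echoes each record to the console.
setup_logger("log/decode", log_level="info")
logging.info("decoding started")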
+ hyps: List[List[int]] + + # Number of trailing blank for each utterance in the batch + num_trailing_blanks: List[int] + + # Decoded token IDs for each utterance in the batch + tokens: List[List[int]] + + # timestamps[i][k] contains the frame number on which tokens[i][k] + # is decoded + timestamps: List[List[int]] def str2bool(v): @@ -24,156 +94,78 @@ def str2bool(v): raise argparse.ArgumentTypeError("Boolean value expected.") -def count_num_trailing_zeros(labels: List[int]): - """Return the number of trailing zeros in labels.""" - n = 0 - for v in reversed(labels): - if v != 0: - break - else: - n += 1 - return n - +def encode_contexts( + modeling_unit: str, + contexts: List[str], + sp: Optional["SentencePieceProcessor"] = None, # noqa + tokens_table: Optional[Dict[str, int]] = None, +) -> List[List[int]]: + """ + Encode the given contexts (a list of string) to a list of a list of token + ids. -def get_texts_and_num_trailing_blanks( - best_paths: k2.Fsa, -) -> Tuple[Union[List[List[int]], k2.RaggedTensor], List[int]]: - """Extract the texts (as word IDs) from the best-path FSAs. Args: - best_paths: - A k2.Fsa with best_paths.arcs.num_axes() == 3, i.e. - containing multiple FSAs, which is expected to be the result - of k2.shortest_path (otherwise the returned values won't - be meaningful). + modeling_unit: + The valid values are bpe, char, bpe+char. + Note: char here means characters in CJK languages, not English like + languages. + contexts: + The given contexts list (a list of string). + sp: + An instance of SentencePieceProcessor. + tokens_table: + The tokens_table containing the tokens and the corresponding ids. Returns: - Return a tuple containing - - a list of lists of int, containing the label sequences we decoded. - - number of trailing blank frames + Return the contexts_list, it is a list of a list of token ids. """ - if isinstance(best_paths.aux_labels, k2.RaggedTensor): - # remove 0's and -1's. - aux_labels = best_paths.aux_labels.remove_values_leq(0) - # TODO: change arcs.shape() to arcs.shape - aux_shape = best_paths.arcs.shape().compose(aux_labels.shape) - - # remove the states and arcs axes. - aux_shape = aux_shape.remove_axis(1) - aux_shape = aux_shape.remove_axis(1) - aux_labels = k2.RaggedTensor(aux_shape, aux_labels.values) + contexts_list = [] + if "bpe" in modeling_unit: + assert sp is not None + if "char" in modeling_unit: + assert tokens_table is not None + assert len(tokens_table) > 0, len(tokens_table) + + if "char" == modeling_unit: + for context in contexts: + assert " " not in context + ids = [ + tokens_table[txt] + if txt in tokens_table + else tokens_table[""] + for txt in context + ] + contexts_list.append(ids) + elif "bpe" == modeling_unit: + contexts_list = sp.encode(contexts, out_type=int) else: - # remove axis corresponding to states. - aux_shape = best_paths.arcs.shape().remove_axis(1) - aux_labels = k2.RaggedTensor(aux_shape, best_paths.aux_labels) - # remove 0's and -1's. 
- aux_labels = aux_labels.remove_values_leq(0) - - assert aux_labels.num_axes == 2 - - # Now count number of trailing frames - labels_shape = best_paths.arcs.shape().remove_axis(1) - labels_list = k2.RaggedTensor( - labels_shape, best_paths.labels.contiguous() - ).tolist() - - num_trailing_blanks = [] - for labels in labels_list: - # [:-1] to remove the last -1 - num_trailing_blanks.append(count_num_trailing_zeros(labels[:-1])) - - return aux_labels.tolist(), num_trailing_blanks - - -def add_beam_search_arguments(): - parser = argparse.ArgumentParser( - description="Parameters for beam search", add_help=False - ) - - parser.add_argument( - "--decoding-method", - type=str, - default="greedy_search", - help="""Decoding method to use. Possible values are: - greedy_search, fast_beam_search, fast_beam_search_nbest - """, - ) - - parser.add_argument( - "--beam", - type=float, - default=10.0, - help="""A floating point value to calculate the cutoff score during beam - search (i.e., `cutoff = max-score - beam`), which is the same as the - `beam` in Kaldi. - Used only when --decoding-method is fast_beam_search. - """, - ) - - parser.add_argument( - "--num-paths", - type=int, - default=200, - help="""Number of paths for nbest decoding. - Used only when the decoding method is fast_beam_search_nbest, - fast_beam_search_nbest_LG, and fast_beam_search_nbest_oracle""", - ) - - parser.add_argument( - "--num-active-paths", - type=int, - default=4, - help="""Used only when decoding_method is modified_beam_search. - It specifies number of active paths for each utterance. Due to - merging paths with identical token sequences, the actual number - may be less than "num_active_paths".""", - ) - - parser.add_argument( - "--nbest-scale", - type=float, - default=0.5, - help="""Scale applied to lattice scores when computing nbest paths. - Used only when the decoding method is fast_beam_search_nbest, - fast_beam_search_nbest_LG, and fast_beam_search_nbest_oracle""", - ) - - parser.add_argument( - "--temperature", - type=float, - default=1.0, - help="""Softmax temperature. - The output of the model is (logits / temperature).log_softmax(). - """, - ) - - parser.add_argument( - "--max-contexts", - type=int, - default=8, - help="""Used only when --decoding-method is fast_beam_search.""", - ) - - parser.add_argument( - "--max-states", - type=int, - default=32, - help="""Used only when --decoding-method is fast_beam_search.""", - ) - - parser.add_argument( - "--lang-dir", - type=Path, - default="data/lang_bpe_500", - help="The lang dir containing word table and LG graph", - ) - - parser.add_argument( - "--ngram-lm-scale", - type=float, - default=0.01, - help=""" - Used only when --decoding_method is fast_beam_search_nbest_LG. - It specifies the scale for n-gram LM scores. - """, - ) - - return parser + assert modeling_unit == "bpe+char", modeling_unit + + # CJK(China Japan Korea) unicode range is [U+4E00, U+9FFF], ref: + # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block) + pattern = re.compile(r"([\u4e00-\u9fff])") + for context in contexts: + # Example: + # txt = "你好 ITS'S OKAY 的" + # chars = ["你", "好", " ITS'S OKAY ", "的"] + chars = pattern.split(context.upper()) + mix_chars = [w for w in chars if len(w.strip()) > 0] + ids = [] + for ch_or_w in mix_chars: + # ch_or_w is a single CJK charater(i.e., "你"), do nothing. 
+ if pattern.fullmatch(ch_or_w) is not None: + ids.append( + tokens_table[ch_or_w] + if ch_or_w in tokens_table + else tokens_table[""] + ) + # ch_or_w contains non-CJK charaters(i.e., " IT'S OKAY "), + # encode ch_or_w using bpe_model. + else: + for p in sp.encode_as_pieces(ch_or_w): + ids.append( + tokens_table[p] + if p in tokens_table + else tokens_table[""] + ) + contexts_list.append(ids) + return contexts_list diff --git a/sherpa/python/test/CMakeLists.txt b/sherpa/python/test/CMakeLists.txt index fd93fc69c..2b0a58220 100644 --- a/sherpa/python/test/CMakeLists.txt +++ b/sherpa/python/test/CMakeLists.txt @@ -17,9 +17,12 @@ endfunction() # please sort the files in alphabetic order set(py_test_files - test_hypothesis.py - test_online_endpoint.py - test_utils.py + test_feature_config.py + test_offline_ctc_decoder_config.py + test_offline_recognizer.py + test_offline_recognizer_config.py + test_online_recognizer.py + test_online_recognizer_config.py ) foreach(source IN LISTS py_test_files) diff --git a/sherpa/python/test/test_feature_config.py b/sherpa/python/test/test_feature_config.py new file mode 100755 index 000000000..2bc0406e5 --- /dev/null +++ b/sherpa/python/test/test_feature_config.py @@ -0,0 +1,35 @@ +#!/usr/bin/env python3 +# To run this single test, use +# +# ctest --verbose -R test_feature_config_py + +import unittest + +import torch + +import sherpa + + +class TestFeatureConfig(unittest.TestCase): + def test_default_constructor(self): + config = sherpa.FeatureConfig() + print() + print(config) + assert config.normalize_samples is True + + def test_constructor(self): + config = sherpa.FeatureConfig(normalize_samples=False) + assert config.normalize_samples is False + + config.fbank_opts.mel_opts.num_bins = 80 + config.fbank_opts.device = "cuda:1" + + assert config.fbank_opts.mel_opts.num_bins == 80 + assert config.fbank_opts.device == torch.device("cuda", 1) + + print() + print(config) + + +if __name__ == "__main__": + unittest.main() diff --git a/sherpa/python/test/test_hypothesis.py b/sherpa/python/test/test_hypothesis.py deleted file mode 100755 index a2d3ba7e4..000000000 --- a/sherpa/python/test/test_hypothesis.py +++ /dev/null @@ -1,42 +0,0 @@ -#!/usr/bin/env python3 -# -# Copyright 2022 Xiaomi Corp. (authors: Fangjun Kuang) -# -# See LICENSE for clarification regarding multiple authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -# To run this single test, use -# -# ctest --verbose -R test_hypothesis_py - -import unittest - -import sherpa - - -class TestHypothesis(unittest.TestCase): - def test_hypothesis_default_constructor(self): - hyp = sherpa.Hypothesis() - assert hyp.ys == [], hyp.ys - assert hyp.log_prob == 0, hyp.log_prob - - def test_hypothesis_constructor(self): - hyp = sherpa.Hypothesis(ys=[1, 2, 3], log_prob=0.5) - assert hyp.ys == [1, 2, 3], hyp.ys - assert hyp.log_prob == 0.5, hyp.log_prob - assert hyp.key == "-".join(map(str, hyp.ys)) == "1-2-3" - - -if __name__ == "__main__": - unittest.main() diff --git a/sherpa/python/test/test_offline_ctc_decoder_config.py b/sherpa/python/test/test_offline_ctc_decoder_config.py new file mode 100755 index 000000000..5b5f7d71d --- /dev/null +++ b/sherpa/python/test/test_offline_ctc_decoder_config.py @@ -0,0 +1,41 @@ +#!/usr/bin/env python3 +# To run this single test, use +# +# ctest --verbose -R test_offline_ctc_decoder_config_py + +import unittest + +import sherpa + + +class TestOfflineCtcDecoderConfig(unittest.TestCase): + def test_default_constructor(self): + config = sherpa.OfflineCtcDecoderConfig() + print(config) + assert config.modified is True + assert config.hlg == "" + assert config.search_beam == 20 + assert config.output_beam == 8 + assert config.min_active_states == 20 + assert config.max_active_states == 10000 + + def test_constructor(self): + config = sherpa.OfflineCtcDecoderConfig( + modified=False, + hlg="a.pt", + search_beam=22, + output_beam=10, + min_active_states=10, + max_active_states=300, + ) + print(config) + assert config.modified is False + assert config.hlg == "a.pt" + assert config.search_beam == 22 + assert config.output_beam == 10 + assert config.min_active_states == 10 + assert config.max_active_states == 300 + + +if __name__ == "__main__": + unittest.main() diff --git a/sherpa/python/test/test_offline_recognizer.py b/sherpa/python/test/test_offline_recognizer.py new file mode 100755 index 000000000..e25f04853 --- /dev/null +++ b/sherpa/python/test/test_offline_recognizer.py @@ -0,0 +1,320 @@ +#!/usr/bin/env python3 +# flake8: noqa +# To run this single test, use +# +# ctest --verbose -R test_offline_recognizer_py + +import unittest +from pathlib import Path + +import sherpa + + +d = "/tmp/icefall-models" +# Please refer to +# https://k2-fsa.github.io/sherpa/cpp/pretrained_models/offline_ctc.html +# and +# https://k2-fsa.github.io/sherpa/cpp/pretrained_models/offline_transducer.html +# to download pre-trained models for testing +class TestOfflineRecognizer(unittest.TestCase): + def test_icefall_ctc_model(self): + nn_model = f"{d}/icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/exp/cpu_jit.pt" + tokens = f"{d}/icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/data/lang_bpe_500/tokens.txt" + wave1 = f"{d}/icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/test_wavs/1089-134686-0001.wav" + wave2 = f"{d}/icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/test_wavs/1221-135766-0001.wav" + + if not Path(nn_model).is_file(): + print("skipping test_icefall_ctc_model()") + return + + print() + print("test_icefall_ctc_model()") + + feat_config = sherpa.FeatureConfig() + + feat_config.fbank_opts.frame_opts.samp_freq = 16000 + feat_config.fbank_opts.mel_opts.num_bins = 80 + feat_config.fbank_opts.mel_opts.high_freq = -400 + feat_config.fbank_opts.frame_opts.dither = 0 + + config = sherpa.OfflineRecognizerConfig( + nn_model=nn_model, + tokens=tokens, + use_gpu=False, + feat_config=feat_config, + ) + + 
recognizer = sherpa.OfflineRecognizer(config) + + s1 = recognizer.create_stream() + s2 = recognizer.create_stream() + + s1.accept_wave_file(wave1) + s2.accept_wave_file(wave2) + + recognizer.decode_streams([s1, s2]) + print(s1.result) + print(s2.result) + + def test_icefall_ctc_model_hlg_decoding(self): + nn_model = f"{d}/icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/exp/cpu_jit.pt" + tokens = f"{d}/icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/data/lang_bpe_500/tokens.txt" + hlg = f"{d}/icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/data/lang_bpe_500/HLG.pt" + wave1 = f"{d}/icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/test_wavs/1089-134686-0001.wav" + wave2 = f"{d}/icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/test_wavs/1221-135766-0001.wav" + + if not Path(nn_model).is_file(): + print("skipping test_icefall_ctc_model_hlg_decoding()") + return + print() + print("test_icefall_ctc_model_hlg_decoding()") + + feat_config = sherpa.FeatureConfig() + + feat_config.fbank_opts.frame_opts.samp_freq = 16000 + feat_config.fbank_opts.mel_opts.num_bins = 80 + feat_config.fbank_opts.mel_opts.high_freq = -400 + feat_config.fbank_opts.frame_opts.dither = 0 + + ctc_decoder_config = sherpa.OfflineCtcDecoderConfig(hlg=hlg) + + config = sherpa.OfflineRecognizerConfig( + nn_model=nn_model, + tokens=tokens, + feat_config=feat_config, + ctc_decoder_config=ctc_decoder_config, + ) + + recognizer = sherpa.OfflineRecognizer(config) + + s1 = recognizer.create_stream() + s2 = recognizer.create_stream() + + s1.accept_wave_file(wave1) + s2.accept_wave_file(wave2) + + recognizer.decode_streams([s1, s2]) + print(s1.result) + print(s2.result) + + def test_wenet_ctc_model(self): + nn_model = f"{d}/wenet-english-model/final.zip" + tokens = f"{d}/wenet-english-model/units.txt" + wave1 = f"{d}/wenet-english-model/test_wavs/1089-134686-0001.wav" + wave2 = f"{d}/wenet-english-model/test_wavs/1221-135766-0001.wav" + + if not Path(nn_model).is_file(): + print("skipping test_wenet_ctc_model()") + return + print() + print("------test_wenet_ctc_model()------") + + # models from wenet expect un-normalized audio samples + feat_config = sherpa.FeatureConfig(normalize_samples=False) + + feat_config.fbank_opts.frame_opts.samp_freq = 16000 + feat_config.fbank_opts.mel_opts.num_bins = 80 + feat_config.fbank_opts.mel_opts.high_freq = -400 + feat_config.fbank_opts.frame_opts.dither = 0 + + config = sherpa.OfflineRecognizerConfig( + nn_model=nn_model, + tokens=tokens, + use_gpu=False, + feat_config=feat_config, + ) + + recognizer = sherpa.OfflineRecognizer(config) + + s1 = recognizer.create_stream() + s2 = recognizer.create_stream() + + s1.accept_wave_file(wave1) + s2.accept_wave_file(wave2) + + recognizer.decode_streams([s1, s2]) + print(s1.result) + print(s2.result) + + def test_torchaudio_wav2vec2_0_ctc_model(self): + nn_model = f"{d}/wav2vec2.0-torchaudio/wav2vec2_asr_base_960h.pt" + tokens = f"{d}/wav2vec2.0-torchaudio/tokens.txt" + wave1 = f"{d}/wav2vec2.0-torchaudio/test_wavs/1089-134686-0001.wav" + wave2 = f"{d}/wav2vec2.0-torchaudio/test_wavs/1221-135766-0001.wav" + + if not Path(nn_model).is_file(): + print("skipping test_torchaudio_wav2vec2_0_ctc_model()") + return + + print() + print("test_torchaudio_wav2vec2_0_ctc_model()") + + config = sherpa.OfflineRecognizerConfig( + nn_model=nn_model, + tokens=tokens, + use_gpu=False, + ) + + recognizer = sherpa.OfflineRecognizer(config) + + s1 = recognizer.create_stream() + s2 = recognizer.create_stream() + + 
s1.accept_wave_file(wave1) + s2.accept_wave_file(wave2) + + recognizer.decode_streams([s1, s2]) + print(s1.result) + print(s2.result) + + def test_nemo_ctc_en_model(self): + # Please download the model from + # https://huggingface.co/csukuangfj/sherpa-nemo-ctc-en-citrinet-512 + nn_model = f"{d}/sherpa-nemo-ctc-en-citrinet-512/model.pt" + tokens = f"{d}/sherpa-nemo-ctc-en-citrinet-512/tokens.txt" + + wave0 = f"{d}/sherpa-nemo-ctc-en-citrinet-512/test_wavs/0.wav" + wave1 = f"{d}/sherpa-nemo-ctc-en-citrinet-512/test_wavs/1.wav" + wave2 = f"{d}/sherpa-nemo-ctc-en-citrinet-512/test_wavs/2.wav" + + if not Path(nn_model).is_file(): + print("skipping test_nemo_ctc_en_model()") + return + + print() + print("test_nemo_ctc_en_model()") + + feat_config = sherpa.FeatureConfig() + + feat_config.fbank_opts.frame_opts.samp_freq = 16000 + feat_config.fbank_opts.mel_opts.num_bins = 80 + feat_config.fbank_opts.mel_opts.high_freq = -400 + feat_config.fbank_opts.frame_opts.dither = 0 + + # The following option is very important. + # + # For the model we are using for testing, it is trained + # with per_feature in its preprocessor + feat_config.nemo_normalize = "per_feature" + + config = sherpa.OfflineRecognizerConfig( + nn_model=nn_model, + tokens=tokens, + use_gpu=False, + feat_config=feat_config, + ) + + recognizer = sherpa.OfflineRecognizer(config) + + s0 = recognizer.create_stream() + s1 = recognizer.create_stream() + s2 = recognizer.create_stream() + + s0.accept_wave_file(wave0) + s1.accept_wave_file(wave1) + s2.accept_wave_file(wave2) + + recognizer.decode_streams([s0, s1, s2]) + print(s0.result) + print(s1.result) + print(s2.result) + + def test_nemo_ctc_zh_model(self): + # Please download the model from + # https://huggingface.co/csukuangfj/sherpa-nemo-ctc-zh-citrinet-512 + nn_model = f"{d}/sherpa-nemo-ctc-zh-citrinet-512/model.pt" + tokens = f"{d}/sherpa-nemo-ctc-zh-citrinet-512/tokens.txt" + + wave0 = f"{d}/sherpa-nemo-ctc-zh-citrinet-512/test_wavs/0.wav" + wave1 = f"{d}/sherpa-nemo-ctc-zh-citrinet-512/test_wavs/1.wav" + wave2 = f"{d}/sherpa-nemo-ctc-zh-citrinet-512/test_wavs/2.wav" + + if not Path(nn_model).is_file(): + print("skipping test_nemo_ctc_zh_model()") + return + + print() + print("test_nemo_ctc_zh_model()") + + feat_config = sherpa.FeatureConfig() + + feat_config.fbank_opts.frame_opts.samp_freq = 16000 + feat_config.fbank_opts.mel_opts.num_bins = 80 + feat_config.fbank_opts.mel_opts.high_freq = -400 + feat_config.fbank_opts.frame_opts.dither = 0 + + # The following option is very important. 
+ # + # For the model we are using for testing, it is trained + # with per_feature in its preprocessor + feat_config.nemo_normalize = "per_feature" + + # The vocabulary size is very large, e.g., 5207, so + # we use a modified_ctc_topo here + ctc_decoder_config = sherpa.OfflineCtcDecoderConfig(modified=True) + + config = sherpa.OfflineRecognizerConfig( + nn_model=nn_model, + tokens=tokens, + use_gpu=False, + feat_config=feat_config, + ctc_decoder_config=ctc_decoder_config, + ) + + recognizer = sherpa.OfflineRecognizer(config) + + s0 = recognizer.create_stream() + s1 = recognizer.create_stream() + s2 = recognizer.create_stream() + + s0.accept_wave_file(wave0) + s1.accept_wave_file(wave1) + s2.accept_wave_file(wave2) + + recognizer.decode_streams([s0, s1, s2]) + print(s0.result) + print(s1.result) + print(s2.result) + + def test_icefall_transducer_model(self): + nn_model = f"{d}/icefall-asr-librispeech-pruned-transducer-stateless8-2022-11-14/exp/cpu_jit.pt" + tokens = f"{d}/icefall-asr-librispeech-pruned-transducer-stateless8-2022-11-14/data/lang_bpe_500/tokens.txt" + wave1 = f"{d}/icefall-asr-librispeech-pruned-transducer-stateless8-2022-11-14/test_wavs/1089-134686-0001.wav" + wave2 = f"{d}/icefall-asr-librispeech-pruned-transducer-stateless8-2022-11-14/test_wavs/1221-135766-0001.wav" + + if not Path(nn_model).is_file(): + print("skipping test_icefall_transducer_model()") + return + + print() + print("test_icefall_transducer_model()") + + feat_config = sherpa.FeatureConfig() + + feat_config.fbank_opts.frame_opts.samp_freq = 16000 + feat_config.fbank_opts.mel_opts.num_bins = 80 + feat_config.fbank_opts.mel_opts.high_freq = -400 + feat_config.fbank_opts.frame_opts.dither = 0 + + config = sherpa.OfflineRecognizerConfig( + nn_model=nn_model, + tokens=tokens, + use_gpu=False, + feat_config=feat_config, + ) + + recognizer = sherpa.OfflineRecognizer(config) + + s1 = recognizer.create_stream() + s2 = recognizer.create_stream() + + s1.accept_wave_file(wave1) + s2.accept_wave_file(wave2) + + recognizer.decode_streams([s1, s2]) + print(s1.result) + print(s2.result) + + +if __name__ == "__main__": + unittest.main() diff --git a/sherpa/python/test/test_offline_recognizer_config.py b/sherpa/python/test/test_offline_recognizer_config.py new file mode 100755 index 000000000..386b730f8 --- /dev/null +++ b/sherpa/python/test/test_offline_recognizer_config.py @@ -0,0 +1,19 @@ +#!/usr/bin/env python3 +# To run this single test, use +# +# ctest --verbose -R test_offline_recognizer_config_py + +import unittest + +import sherpa + + +class TestOfflineRecognizerConfig(unittest.TestCase): + def test_constructor(self): + config = sherpa.OfflineRecognizerConfig(nn_model="a.pt", tokens="b.txt") + print() + print(config) + + +if __name__ == "__main__": + unittest.main() diff --git a/sherpa/python/test/test_online_endpoint.py b/sherpa/python/test/test_online_endpoint.py deleted file mode 100755 index dd45fcf52..000000000 --- a/sherpa/python/test/test_online_endpoint.py +++ /dev/null @@ -1,162 +0,0 @@ -#!/usr/bin/env python3 -# -# Copyright 2022 Xiaomi Corp. (authors: Fangjun Kuang) -# -# See LICENSE for clarification regarding multiple authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# To run this single test, use -# -# ctest --verbose -R test_online_endpoint_py - -import argparse -import sys -import unittest - -import sherpa - - -class TestOnlineEndpoint(unittest.TestCase): - def test_rule1(self): - sys.argv = [ - "--endpoint.rule1.must-contain-nonsilence=false", - "--endpoint.rule1.min-trailing-silence=1.0", - "--endpoint.rule1.min-utterance-length=0.0", - ] - online_endpoint_parser = sherpa.add_online_endpoint_arguments() - parser = argparse.ArgumentParser( - parents=[online_endpoint_parser], - formatter_class=argparse.ArgumentDefaultsHelpFormatter, - ) - - _, args = ( - parser.parse_args(), - online_endpoint_parser.parse_known_args()[0], - ) - args = vars(args) - config = sherpa.OnlineEndpointConfig.from_args(args) - - # decoded nothing, 5 seconds of trailing silence - t = sherpa.endpoint_detected( - config, - num_frames_decoded=500, - trailing_silence_frames=500, - frame_shift_in_seconds=0.01, - ) - assert t is True - - # decoded something, 0.5 second of trailing silence - f = sherpa.endpoint_detected( - config, - num_frames_decoded=500, - trailing_silence_frames=50, - frame_shift_in_seconds=0.01, - ) - assert f is False - - def test_rule2(self): - sys.argv = [ - "--endpoint.rule2.must-contain-nonsilence=true", - "--endpoint.rule2.min-trailing-silence=1.0", - "--endpoint.rule2.min-utterance-length=0.0", - ] - online_endpoint_parser = sherpa.add_online_endpoint_arguments() - parser = argparse.ArgumentParser( - parents=[online_endpoint_parser], - formatter_class=argparse.ArgumentDefaultsHelpFormatter, - ) - - _, args = ( - parser.parse_args(), - online_endpoint_parser.parse_known_args()[0], - ) - args = vars(args) - config = sherpa.OnlineEndpointConfig.from_args(args) - - # decoded nothing, 3 seconds of trailing silence - r = sherpa.endpoint_detected( - config, - num_frames_decoded=300, - trailing_silence_frames=300, - frame_shift_in_seconds=0.01, - ) - assert r is False - - # decoded something, 0.5 second of trailing silence - s = sherpa.endpoint_detected( - config, - num_frames_decoded=500, - trailing_silence_frames=50, - frame_shift_in_seconds=0.01, - ) - assert s is False - - # decoded something, 1.01 seconds of trailing silence - t = sherpa.endpoint_detected( - config, - num_frames_decoded=500, - trailing_silence_frames=101, - frame_shift_in_seconds=0.01, - ) - assert t is True - - def test_rule3(self): - sys.argv = [ - "--endpoint.rule3.must-contain-nonsilence=false", - "--endpoint.rule3.min-trailing-silence=0.0", - "--endpoint.rule3.min-utterance-length=13.0", - ] - online_endpoint_parser = sherpa.add_online_endpoint_arguments() - parser = argparse.ArgumentParser( - parents=[online_endpoint_parser], - formatter_class=argparse.ArgumentDefaultsHelpFormatter, - ) - - _, args = ( - parser.parse_args(), - online_endpoint_parser.parse_known_args()[0], - ) - args = vars(args) - config = sherpa.OnlineEndpointConfig.from_args(args) - - # decoded nothing, 0.1 second of trailing silence - r = sherpa.endpoint_detected( - config, - num_frames_decoded=1200, - trailing_silence_frames=10, - frame_shift_in_seconds=0.01, - ) - assert r is False - - # decoded 
something, 0.1 second of trailing silence - s = sherpa.endpoint_detected( - config, - num_frames_decoded=1300, - trailing_silence_frames=10, - frame_shift_in_seconds=0.01, - ) - assert s is False - - # decoded something, 0.1 seconds of trailing silence - t = sherpa.endpoint_detected( - config, - num_frames_decoded=1301, - trailing_silence_frames=10, - frame_shift_in_seconds=0.01, - ) - assert t is True - - -if __name__ == "__main__": - unittest.main() diff --git a/sherpa/python/test/test_online_recognizer.py b/sherpa/python/test/test_online_recognizer.py new file mode 100755 index 000000000..237a2c512 --- /dev/null +++ b/sherpa/python/test/test_online_recognizer.py @@ -0,0 +1,932 @@ +#!/usr/bin/env python3 +# flake8: noqa +# To run this single test, use +# +# ctest --verbose -R test_online_recognizer_py + +import unittest +import wave +from pathlib import Path + +import torch +import torchaudio + +import sherpa + + +def decode( + recognizer: sherpa.OnlineRecognizer, + s: sherpa.OnlineStream, + samples: torch.Tensor, +): + expected_sample_rate = 16000 + + tail_padding = torch.zeros(int(16000 * 0.3), dtype=torch.float32) + + chunk = int(0.2 * expected_sample_rate) # 0.2 seconds + + start = 0 + last_result = "" + while start < samples.numel(): + end = start + chunk + s.accept_waveform(expected_sample_rate, samples[start:end]) + start = end + + while recognizer.is_ready(s): + recognizer.decode_stream(s) + result = recognizer.get_result(s).text + if last_result != result: + last_result = result + print(result) + + s.accept_waveform(expected_sample_rate, tail_padding) + s.input_finished() + + while recognizer.is_ready(s): + recognizer.decode_stream(s) + result = recognizer.get_result(s).text + if last_result != result: + last_result = result + print(result) + + +d = "/tmp/icefall-models" +# Please refer to +# https://k2-fsa.github.io/sherpa/cpp/pretrained_models/online_transducer.html +# to download pre-trained models for testing +class TestOnlineRecognizer(unittest.TestCase): + def test_icefall_asr_librispeech_conv_emformer_transducer_stateless2_2022_07_05( + self, + ): + nn_model = f"{d}/icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/cpu-jit-epoch-30-avg-10-torch-1.10.0.pt" + tokens = f"{d}/icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/data/lang_bpe_500/tokens.txt" + lg = f"{d}/icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/data/lang_bpe_500/LG.pt" + wave = f"{d}/icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/test_wavs/1089-134686-0001.wav" + + if not Path(nn_model).is_file(): + print(f"{nn_model} does not exist") + print( + "skipping test_icefall_asr_librispeech_conv_emformer_transducer_stateless2_2022_07_05()" + ) + return + + feat_config = sherpa.FeatureConfig() + expected_sample_rate = 16000 + + samples, sample_rate = torchaudio.load(wave) + assert sample_rate == expected_sample_rate, ( + sample_rate, + expected_sample_rate, + ) + samples = samples.squeeze(0) + + feat_config.fbank_opts.frame_opts.samp_freq = expected_sample_rate + feat_config.fbank_opts.mel_opts.num_bins = 80 + feat_config.fbank_opts.mel_opts.high_freq = -400 + feat_config.fbank_opts.frame_opts.dither = 0 + + print("--------------------greedy search--------------------") + + config = sherpa.OnlineRecognizerConfig( + nn_model=nn_model, + tokens=tokens, + use_gpu=False, + feat_config=feat_config, + decoding_method="greedy_search", + ) + + recognizer = sherpa.OnlineRecognizer(config) + + s = recognizer.create_stream() + + 
decode(recognizer=recognizer, s=s, samples=samples) + print("--------------------modified beam search--------------------") + config = sherpa.OnlineRecognizerConfig( + nn_model=nn_model, + tokens=tokens, + use_gpu=False, + feat_config=feat_config, + decoding_method="modified_beam_search", + num_active_paths=4, + ) + + recognizer = sherpa.OnlineRecognizer(config) + + s = recognizer.create_stream() + + decode(recognizer=recognizer, s=s, samples=samples) + + print("--------------------fast beam search--------------------") + fast_beam_search_config = sherpa.FastBeamSearchConfig( + beam=20.0, + max_states=64, + max_contexts=8, + allow_partial=True, + ) + config = sherpa.OnlineRecognizerConfig( + nn_model=nn_model, + tokens=tokens, + use_gpu=False, + feat_config=feat_config, + decoding_method="fast_beam_search", + fast_beam_search_config=fast_beam_search_config, + ) + + recognizer = sherpa.OnlineRecognizer(config) + + s = recognizer.create_stream() + + decode(recognizer=recognizer, s=s, samples=samples) + + print( + "--------------------fast beam search with LG--------------------" + ) + fast_beam_search_config = sherpa.FastBeamSearchConfig( + beam=20.0, + max_states=64, + max_contexts=8, + allow_partial=True, + lg=lg, + ngram_lm_scale=0.01, + ) + config = sherpa.OnlineRecognizerConfig( + nn_model=nn_model, + tokens=tokens, + use_gpu=False, + feat_config=feat_config, + decoding_method="fast_beam_search", + fast_beam_search_config=fast_beam_search_config, + ) + + recognizer = sherpa.OnlineRecognizer(config) + + s = recognizer.create_stream() + + decode(recognizer=recognizer, s=s, samples=samples) + + def test_icefall_asr_wenetspeech_pruned_transducer_stateless5_streaming( + self, + ): + nn_model = f"{d}/icefall_asr_wenetspeech_pruned_transducer_stateless5_streaming/exp/cpu_jit_epoch_7_avg_1_torch.1.7.1.pt" + tokens = f"{d}/icefall_asr_wenetspeech_pruned_transducer_stateless5_streaming/data/lang_char/tokens.txt" + lg = f"{d}/icefall_asr_wenetspeech_pruned_transducer_stateless5_streaming/data/lang_char/LG.pt" + wave = f"{d}/icefall_asr_wenetspeech_pruned_transducer_stateless5_streaming/test_wavs/DEV_T0000000000.wav" + + if not Path(nn_model).is_file(): + print(f"{nn_model} does not exist") + print( + "skipping test_icefall_asr_wenetspeech_pruned_transducer_stateless5_streaming()" + ) + return + + feat_config = sherpa.FeatureConfig() + expected_sample_rate = 16000 + + samples, sample_rate = torchaudio.load(wave) + assert sample_rate == expected_sample_rate, ( + sample_rate, + expected_sample_rate, + ) + samples = samples.squeeze(0) + + feat_config.fbank_opts.frame_opts.samp_freq = expected_sample_rate + feat_config.fbank_opts.mel_opts.num_bins = 80 + feat_config.fbank_opts.mel_opts.high_freq = -400 + feat_config.fbank_opts.frame_opts.dither = 0 + + print("--------------------greedy search--------------------") + + config = sherpa.OnlineRecognizerConfig( + nn_model=nn_model, + tokens=tokens, + use_gpu=False, + feat_config=feat_config, + decoding_method="greedy_search", + left_context=64, + right_context=0, + chunk_size=12, + ) + + recognizer = sherpa.OnlineRecognizer(config) + + s = recognizer.create_stream() + + decode(recognizer=recognizer, s=s, samples=samples) + print("--------------------modified beam search--------------------") + config = sherpa.OnlineRecognizerConfig( + nn_model=nn_model, + tokens=tokens, + use_gpu=False, + feat_config=feat_config, + decoding_method="modified_beam_search", + num_active_paths=4, + left_context=64, + right_context=0, + chunk_size=12, + ) + + recognizer = 
sherpa.OnlineRecognizer(config) + + s = recognizer.create_stream() + + decode(recognizer=recognizer, s=s, samples=samples) + + print("--------------------fast beam search--------------------") + fast_beam_search_config = sherpa.FastBeamSearchConfig( + beam=20.0, + max_states=64, + max_contexts=8, + allow_partial=True, + ) + config = sherpa.OnlineRecognizerConfig( + nn_model=nn_model, + tokens=tokens, + use_gpu=False, + feat_config=feat_config, + decoding_method="fast_beam_search", + fast_beam_search_config=fast_beam_search_config, + left_context=64, + right_context=0, + chunk_size=12, + ) + + recognizer = sherpa.OnlineRecognizer(config) + + s = recognizer.create_stream() + + decode(recognizer=recognizer, s=s, samples=samples) + + print( + "--------------------fast beam search with LG--------------------" + ) + fast_beam_search_config = sherpa.FastBeamSearchConfig( + beam=20.0, + max_states=64, + max_contexts=8, + allow_partial=True, + lg=lg, + ngram_lm_scale=0.01, + ) + config = sherpa.OnlineRecognizerConfig( + nn_model=nn_model, + tokens=tokens, + use_gpu=False, + feat_config=feat_config, + decoding_method="fast_beam_search", + fast_beam_search_config=fast_beam_search_config, + left_context=64, + right_context=0, + chunk_size=12, + ) + + recognizer = sherpa.OnlineRecognizer(config) + + s = recognizer.create_stream() + + decode(recognizer=recognizer, s=s, samples=samples) + + def test_icefall_asr_conv_emformer_transducer_stateless2_zh( + self, + ): + nn_model = f"{d}/icefall-asr-conv-emformer-transducer-stateless2-zh/exp/cpu_jit-epoch-11-avg-1.pt" + tokens = f"{d}/icefall-asr-conv-emformer-transducer-stateless2-zh/data/lang_char_bpe/tokens.txt" + wave = f"{d}/icefall_asr_wenetspeech_pruned_transducer_stateless5_streaming/test_wavs/DEV_T0000000000.wav" + + if not Path(nn_model).is_file(): + print(f"{nn_model} does not exist") + print( + "skipping test_icefall_asr_librispeech_conv_emformer_transducer_stateless2_2022_07_05()" + ) + return + + if not Path(wave).is_file(): + print(f"{wave} does not exist") + print( + "skipping test_icefall_asr_librispeech_conv_emformer_transducer_stateless2_2022_07_05()" + ) + return + + feat_config = sherpa.FeatureConfig() + expected_sample_rate = 16000 + + samples, sample_rate = torchaudio.load(wave) + assert sample_rate == expected_sample_rate, ( + sample_rate, + expected_sample_rate, + ) + samples = samples.squeeze(0) + + feat_config.fbank_opts.frame_opts.samp_freq = expected_sample_rate + feat_config.fbank_opts.mel_opts.num_bins = 80 + feat_config.fbank_opts.mel_opts.high_freq = -400 + feat_config.fbank_opts.frame_opts.dither = 0 + + print("--------------------greedy search--------------------") + + config = sherpa.OnlineRecognizerConfig( + nn_model=nn_model, + tokens=tokens, + use_gpu=False, + feat_config=feat_config, + decoding_method="greedy_search", + ) + + recognizer = sherpa.OnlineRecognizer(config) + + s = recognizer.create_stream() + + decode(recognizer=recognizer, s=s, samples=samples) + print("--------------------modified beam search--------------------") + config = sherpa.OnlineRecognizerConfig( + nn_model=nn_model, + tokens=tokens, + use_gpu=False, + feat_config=feat_config, + decoding_method="modified_beam_search", + num_active_paths=4, + ) + + recognizer = sherpa.OnlineRecognizer(config) + + s = recognizer.create_stream() + + decode(recognizer=recognizer, s=s, samples=samples) + + print("--------------------fast beam search--------------------") + fast_beam_search_config = sherpa.FastBeamSearchConfig( + beam=20.0, + max_states=64, + 
max_contexts=8, + allow_partial=True, + ) + config = sherpa.OnlineRecognizerConfig( + nn_model=nn_model, + tokens=tokens, + use_gpu=False, + feat_config=feat_config, + decoding_method="fast_beam_search", + fast_beam_search_config=fast_beam_search_config, + ) + + recognizer = sherpa.OnlineRecognizer(config) + + s = recognizer.create_stream() + + decode(recognizer=recognizer, s=s, samples=samples) + + def test_icefall_librispeech_streaming_pruned_transducer_stateless4_20220625( + self, + ): + nn_model = f"{d}/icefall_librispeech_streaming_pruned_transducer_stateless4_20220625/exp/cpu_jit-epoch-25-avg-3.pt" + tokens = f"{d}/icefall_librispeech_streaming_pruned_transducer_stateless4_20220625/data/lang_bpe_500/tokens.txt" + lg = f"{d}/icefall_librispeech_streaming_pruned_transducer_stateless4_20220625/data/lang_bpe_500/LG.pt" + wave = f"{d}/icefall_librispeech_streaming_pruned_transducer_stateless4_20220625/test_waves/1089-134686-0001.wav" + + if not Path(nn_model).is_file(): + print(f"{nn_model} does not exist") + print( + "skipping test_icefall_librispeech_streaming_pruned_transducer_stateless4_20220625()" + ) + return + + feat_config = sherpa.FeatureConfig() + expected_sample_rate = 16000 + + samples, sample_rate = torchaudio.load(wave) + assert sample_rate == expected_sample_rate, ( + sample_rate, + expected_sample_rate, + ) + samples = samples.squeeze(0) + + feat_config.fbank_opts.frame_opts.samp_freq = expected_sample_rate + feat_config.fbank_opts.mel_opts.num_bins = 80 + feat_config.fbank_opts.mel_opts.high_freq = -400 + feat_config.fbank_opts.frame_opts.dither = 0 + + print("--------------------greedy search--------------------") + + config = sherpa.OnlineRecognizerConfig( + nn_model=nn_model, + tokens=tokens, + use_gpu=False, + feat_config=feat_config, + decoding_method="greedy_search", + left_context=64, + right_context=0, + chunk_size=12, + ) + + recognizer = sherpa.OnlineRecognizer(config) + + s = recognizer.create_stream() + + decode(recognizer=recognizer, s=s, samples=samples) + print("--------------------modified beam search--------------------") + config = sherpa.OnlineRecognizerConfig( + nn_model=nn_model, + tokens=tokens, + use_gpu=False, + feat_config=feat_config, + decoding_method="modified_beam_search", + num_active_paths=4, + left_context=64, + right_context=0, + chunk_size=12, + ) + + recognizer = sherpa.OnlineRecognizer(config) + + s = recognizer.create_stream() + + decode(recognizer=recognizer, s=s, samples=samples) + + print("--------------------fast beam search--------------------") + fast_beam_search_config = sherpa.FastBeamSearchConfig( + beam=20.0, + max_states=64, + max_contexts=8, + allow_partial=True, + ) + config = sherpa.OnlineRecognizerConfig( + nn_model=nn_model, + tokens=tokens, + use_gpu=False, + feat_config=feat_config, + decoding_method="fast_beam_search", + fast_beam_search_config=fast_beam_search_config, + left_context=64, + right_context=0, + chunk_size=12, + ) + + recognizer = sherpa.OnlineRecognizer(config) + + s = recognizer.create_stream() + + decode(recognizer=recognizer, s=s, samples=samples) + + print( + "--------------------fast beam search with LG--------------------" + ) + fast_beam_search_config = sherpa.FastBeamSearchConfig( + beam=20.0, + max_states=64, + max_contexts=8, + allow_partial=True, + lg=lg, + ngram_lm_scale=0.01, + ) + config = sherpa.OnlineRecognizerConfig( + nn_model=nn_model, + tokens=tokens, + use_gpu=False, + feat_config=feat_config, + decoding_method="fast_beam_search", + fast_beam_search_config=fast_beam_search_config, + 
left_context=64, + right_context=0, + chunk_size=12, + ) + + recognizer = sherpa.OnlineRecognizer(config) + + s = recognizer.create_stream() + + decode(recognizer=recognizer, s=s, samples=samples) + + def test_cefall_asr_librispeech_lstm_transducer_stateless2_2022_09_03( + self, + ): + encoder_model = f"{d}/icefall-asr-librispeech-lstm-transducer-stateless2-2022-09-03/exp/encoder_jit_trace-iter-468000-avg-16.pt" + decoder_model = f"{d}/icefall-asr-librispeech-lstm-transducer-stateless2-2022-09-03/exp/decoder_jit_trace-iter-468000-avg-16.pt" + joiner_model = f"{d}/icefall-asr-librispeech-lstm-transducer-stateless2-2022-09-03/exp/joiner_jit_trace-iter-468000-avg-16.pt" + + tokens = f"{d}/icefall-asr-librispeech-lstm-transducer-stateless2-2022-09-03/data/lang_bpe_500/tokens.txt" + lg = f"{d}/icefall-asr-librispeech-lstm-transducer-stateless2-2022-09-03/data/lang_bpe_500/LG.pt" + wave = f"{d}/icefall-asr-librispeech-lstm-transducer-stateless2-2022-09-03/test_wavs/1089-134686-0001.wav" + + if not Path(encoder_model).is_file(): + print(f"{encoder_model} does not exist") + print( + "skipping test_icefall_librispeech_streaming_pruned_transducer_stateless4_20220625()" + ) + return + + feat_config = sherpa.FeatureConfig() + expected_sample_rate = 16000 + + samples, sample_rate = torchaudio.load(wave) + assert sample_rate == expected_sample_rate, ( + sample_rate, + expected_sample_rate, + ) + samples = samples.squeeze(0) + + feat_config.fbank_opts.frame_opts.samp_freq = expected_sample_rate + feat_config.fbank_opts.mel_opts.num_bins = 80 + feat_config.fbank_opts.mel_opts.high_freq = -400 + feat_config.fbank_opts.frame_opts.dither = 0 + + print("--------------------greedy search--------------------") + + config = sherpa.OnlineRecognizerConfig( + nn_model="", + encoder_model=encoder_model, + decoder_model=decoder_model, + joiner_model=joiner_model, + tokens=tokens, + use_gpu=False, + feat_config=feat_config, + decoding_method="greedy_search", + ) + + recognizer = sherpa.OnlineRecognizer(config) + + s = recognizer.create_stream() + + decode(recognizer=recognizer, s=s, samples=samples) + print("--------------------modified beam search--------------------") + config = sherpa.OnlineRecognizerConfig( + nn_model="", + encoder_model=encoder_model, + decoder_model=decoder_model, + joiner_model=joiner_model, + tokens=tokens, + use_gpu=False, + feat_config=feat_config, + decoding_method="modified_beam_search", + num_active_paths=4, + ) + + recognizer = sherpa.OnlineRecognizer(config) + + s = recognizer.create_stream() + + decode(recognizer=recognizer, s=s, samples=samples) + + print("--------------------fast beam search--------------------") + fast_beam_search_config = sherpa.FastBeamSearchConfig( + beam=20.0, + max_states=64, + max_contexts=8, + allow_partial=True, + ) + config = sherpa.OnlineRecognizerConfig( + nn_model="", + encoder_model=encoder_model, + decoder_model=decoder_model, + joiner_model=joiner_model, + tokens=tokens, + use_gpu=False, + feat_config=feat_config, + decoding_method="fast_beam_search", + fast_beam_search_config=fast_beam_search_config, + ) + + recognizer = sherpa.OnlineRecognizer(config) + + s = recognizer.create_stream() + + decode(recognizer=recognizer, s=s, samples=samples) + + print( + "--------------------fast beam search with LG--------------------" + ) + fast_beam_search_config = sherpa.FastBeamSearchConfig( + beam=20.0, + max_states=64, + max_contexts=8, + allow_partial=True, + lg=lg, + ngram_lm_scale=0.01, + ) + config = sherpa.OnlineRecognizerConfig( + nn_model="", + 
encoder_model=encoder_model, + decoder_model=decoder_model, + joiner_model=joiner_model, + tokens=tokens, + use_gpu=False, + feat_config=feat_config, + decoding_method="fast_beam_search", + fast_beam_search_config=fast_beam_search_config, + ) + + recognizer = sherpa.OnlineRecognizer(config) + + s = recognizer.create_stream() + + decode(recognizer=recognizer, s=s, samples=samples) + + def test_icefall_asr_librispeech_pruned_stateless_emformer_rnnt2_2022_06_01( + self, + ): + nn_model = f"{d}/icefall-asr-librispeech-pruned-stateless-emformer-rnnt2-2022-06-01/exp/cpu_jit-epoch-39-avg-6-use-averaged-model-1.pt" + tokens = f"{d}/icefall-asr-librispeech-pruned-stateless-emformer-rnnt2-2022-06-01/data/lang_bpe_500/tokens.txt" + lg = f"{d}/icefall-asr-librispeech-pruned-stateless-emformer-rnnt2-2022-06-01/data/lang_bpe_500/LG.pt" + wave = f"{d}/icefall-asr-librispeech-pruned-stateless-emformer-rnnt2-2022-06-01/test_wavs/1089-134686-0001.wav" + + if not Path(nn_model).is_file(): + print(f"{nn_model} does not exist") + print( + "skipping test_icefall_asr_librispeech_conv_emformer_transducer_stateless2_2022_07_05()" + ) + return + + feat_config = sherpa.FeatureConfig() + expected_sample_rate = 16000 + + samples, sample_rate = torchaudio.load(wave) + assert sample_rate == expected_sample_rate, ( + sample_rate, + expected_sample_rate, + ) + samples = samples.squeeze(0) + + feat_config.fbank_opts.frame_opts.samp_freq = expected_sample_rate + feat_config.fbank_opts.mel_opts.num_bins = 80 + feat_config.fbank_opts.mel_opts.high_freq = -400 + feat_config.fbank_opts.frame_opts.dither = 0 + + print("--------------------greedy search--------------------") + + config = sherpa.OnlineRecognizerConfig( + nn_model=nn_model, + tokens=tokens, + use_gpu=False, + feat_config=feat_config, + decoding_method="greedy_search", + ) + + recognizer = sherpa.OnlineRecognizer(config) + + s = recognizer.create_stream() + + decode(recognizer=recognizer, s=s, samples=samples) + print("--------------------modified beam search--------------------") + config = sherpa.OnlineRecognizerConfig( + nn_model=nn_model, + tokens=tokens, + use_gpu=False, + feat_config=feat_config, + decoding_method="modified_beam_search", + num_active_paths=4, + ) + + recognizer = sherpa.OnlineRecognizer(config) + + s = recognizer.create_stream() + + decode(recognizer=recognizer, s=s, samples=samples) + + print("--------------------fast beam search--------------------") + fast_beam_search_config = sherpa.FastBeamSearchConfig( + beam=20.0, + max_states=64, + max_contexts=8, + allow_partial=True, + ) + config = sherpa.OnlineRecognizerConfig( + nn_model=nn_model, + tokens=tokens, + use_gpu=False, + feat_config=feat_config, + decoding_method="fast_beam_search", + fast_beam_search_config=fast_beam_search_config, + ) + + recognizer = sherpa.OnlineRecognizer(config) + + s = recognizer.create_stream() + + decode(recognizer=recognizer, s=s, samples=samples) + + print( + "--------------------fast beam search with LG--------------------" + ) + fast_beam_search_config = sherpa.FastBeamSearchConfig( + beam=20.0, + max_states=64, + max_contexts=8, + allow_partial=True, + lg=lg, + ngram_lm_scale=0.01, + ) + config = sherpa.OnlineRecognizerConfig( + nn_model=nn_model, + tokens=tokens, + use_gpu=False, + feat_config=feat_config, + decoding_method="fast_beam_search", + fast_beam_search_config=fast_beam_search_config, + ) + + recognizer = sherpa.OnlineRecognizer(config) + + s = recognizer.create_stream() + + decode(recognizer=recognizer, s=s, samples=samples) + + def 
test_k2fsa_zipformer_chinese_english_mixed(self): + nn_model = f"{d}/k2fsa-zipformer-chinese-english-mixed/exp/cpu_jit.pt" + tokens = f"{d}/k2fsa-zipformer-chinese-english-mixed/data/lang_char_bpe/tokens.txt" + wave = f"{d}/k2fsa-zipformer-chinese-english-mixed/test_wavs/0.wav" + + if not Path(nn_model).is_file(): + print(f"{nn_model} does not exist") + print("skipping test_k2fsa_zipformer_chinese_english_mixed()") + return + + feat_config = sherpa.FeatureConfig() + expected_sample_rate = 16000 + + samples, sample_rate = torchaudio.load(wave) + assert sample_rate == expected_sample_rate, ( + sample_rate, + expected_sample_rate, + ) + samples = samples.squeeze(0) + + feat_config.fbank_opts.frame_opts.samp_freq = expected_sample_rate + feat_config.fbank_opts.mel_opts.num_bins = 80 + feat_config.fbank_opts.mel_opts.high_freq = -400 + feat_config.fbank_opts.frame_opts.dither = 0 + + print("--------------------greedy search--------------------") + + config = sherpa.OnlineRecognizerConfig( + nn_model=nn_model, + tokens=tokens, + use_gpu=False, + feat_config=feat_config, + decoding_method="greedy_search", + chunk_size=32, + ) + + recognizer = sherpa.OnlineRecognizer(config) + + s = recognizer.create_stream() + + decode(recognizer=recognizer, s=s, samples=samples) + print("--------------------modified beam search--------------------") + config = sherpa.OnlineRecognizerConfig( + nn_model=nn_model, + tokens=tokens, + use_gpu=False, + feat_config=feat_config, + decoding_method="modified_beam_search", + num_active_paths=4, + chunk_size=32, + ) + + recognizer = sherpa.OnlineRecognizer(config) + + s = recognizer.create_stream() + + decode(recognizer=recognizer, s=s, samples=samples) + + print("--------------------fast beam search--------------------") + fast_beam_search_config = sherpa.FastBeamSearchConfig( + beam=20.0, + max_states=64, + max_contexts=8, + allow_partial=True, + ) + config = sherpa.OnlineRecognizerConfig( + nn_model=nn_model, + tokens=tokens, + use_gpu=False, + feat_config=feat_config, + decoding_method="fast_beam_search", + fast_beam_search_config=fast_beam_search_config, + chunk_size=32, + ) + + recognizer = sherpa.OnlineRecognizer(config) + + s = recognizer.create_stream() + + decode(recognizer=recognizer, s=s, samples=samples) + + def test_icefall_asr_librispeech_pruned_transducer_stateless7_streaming_2022_12_29( + self, + ): + nn_model = f"{d}/icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/exp/cpu_jit.pt" + tokens = f"{d}/icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/data/lang_bpe_500/tokens.txt" + lg = f"{d}/icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/data/lang_bpe_500/LG.pt" + wave = f"{d}/icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/test_wavs/1089-134686-0001.wav" + + if not Path(nn_model).is_file(): + print(f"{nn_model} does not exist") + print( + "skipping test_icefall_asr_librispeech_pruned_transducer_stateless7_streaming_2022_12_29()" + ) + return + + feat_config = sherpa.FeatureConfig() + expected_sample_rate = 16000 + + samples, sample_rate = torchaudio.load(wave) + assert sample_rate == expected_sample_rate, ( + sample_rate, + expected_sample_rate, + ) + samples = samples.squeeze(0) + + feat_config.fbank_opts.frame_opts.samp_freq = expected_sample_rate + feat_config.fbank_opts.mel_opts.num_bins = 80 + feat_config.fbank_opts.mel_opts.high_freq = -400 + feat_config.fbank_opts.frame_opts.dither = 0 + + print("--------------------greedy search--------------------") 
+ + config = sherpa.OnlineRecognizerConfig( + nn_model=nn_model, + tokens=tokens, + use_gpu=False, + feat_config=feat_config, + decoding_method="greedy_search", + chunk_size=32, + ) + + recognizer = sherpa.OnlineRecognizer(config) + + s = recognizer.create_stream() + + decode(recognizer=recognizer, s=s, samples=samples) + print("--------------------modified beam search--------------------") + config = sherpa.OnlineRecognizerConfig( + nn_model=nn_model, + tokens=tokens, + use_gpu=False, + feat_config=feat_config, + decoding_method="modified_beam_search", + num_active_paths=4, + chunk_size=32, + ) + + recognizer = sherpa.OnlineRecognizer(config) + + s = recognizer.create_stream() + + decode(recognizer=recognizer, s=s, samples=samples) + + print("--------------------fast beam search--------------------") + fast_beam_search_config = sherpa.FastBeamSearchConfig( + beam=20.0, + max_states=64, + max_contexts=8, + allow_partial=True, + ) + config = sherpa.OnlineRecognizerConfig( + nn_model=nn_model, + tokens=tokens, + use_gpu=False, + feat_config=feat_config, + decoding_method="fast_beam_search", + fast_beam_search_config=fast_beam_search_config, + chunk_size=32, + ) + + recognizer = sherpa.OnlineRecognizer(config) + + s = recognizer.create_stream() + + decode(recognizer=recognizer, s=s, samples=samples) + + print( + "--------------------fast beam search with LG--------------------" + ) + fast_beam_search_config = sherpa.FastBeamSearchConfig( + beam=20.0, + max_states=64, + max_contexts=8, + allow_partial=True, + lg=lg, + ngram_lm_scale=0.01, + ) + config = sherpa.OnlineRecognizerConfig( + nn_model=nn_model, + tokens=tokens, + use_gpu=False, + feat_config=feat_config, + decoding_method="fast_beam_search", + fast_beam_search_config=fast_beam_search_config, + chunk_size=32, + ) + + recognizer = sherpa.OnlineRecognizer(config) + + s = recognizer.create_stream() + + decode(recognizer=recognizer, s=s, samples=samples) + + +torch.set_num_threads(1) +torch.set_num_interop_threads(1) +torch._C._jit_set_profiling_executor(False) +torch._C._jit_set_profiling_mode(False) +torch._C._set_graph_executor_optimize(False) +if __name__ == "__main__": + unittest.main() diff --git a/sherpa/python/test/test_online_recognizer_config.py b/sherpa/python/test/test_online_recognizer_config.py new file mode 100755 index 000000000..8f3de52b3 --- /dev/null +++ b/sherpa/python/test/test_online_recognizer_config.py @@ -0,0 +1,19 @@ +#!/usr/bin/env python3 +# To run this single test, use +# +# ctest --verbose -R test_online_recognizer_config_py + +import unittest + +import sherpa + + +class TestOnlineRecognizerConfig(unittest.TestCase): + def test_constructor(self): + config = sherpa.OnlineRecognizerConfig(nn_model="a.pt", tokens="b.txt") + print() + print(config) + + +if __name__ == "__main__": + unittest.main() diff --git a/sherpa/python/test/test_utils.py b/sherpa/python/test/test_utils.py deleted file mode 100755 index 16832605d..000000000 --- a/sherpa/python/test/test_utils.py +++ /dev/null @@ -1,76 +0,0 @@ -#!/usr/bin/env python3 -# -# Copyright 2022 Xiaomi Corp. (authors: Fangjun Kuang) -# -# See LICENSE for clarification regarding multiple authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# To run this single test, use -# -# ctest --verbose -R test_utils.py - -import unittest - -import k2 - -import sherpa - - -class TestUtils(unittest.TestCase): - def test_count_number_trailing_zeros(self): - assert sherpa.count_num_trailing_zeros([1, 2, 3]) == 0 - assert sherpa.count_num_trailing_zeros([1, 0, 3]) == 0 - - assert sherpa.count_num_trailing_zeros([1, 0, 0]) == 2 - assert sherpa.count_num_trailing_zeros([0, 0, 0]) == 3 - - def test_get_texts_and_num_trailing_blanks_case1(self): - s1 = """ - 0 1 0 0 0.0 - 1 2 1 1 0.2 - 2 3 1 5 0.2 - 3 4 -1 -1 0.0 - 4 - """ - - s2 = """ - 0 1 0 0 0.0 - 1 2 0 1 0.2 - 2 3 -1 -1 0.0 - 3 - """ - - s3 = """ - 0 1 1 0 0.0 - 1 2 0 1 0.2 - 2 3 -1 -1 0.0 - 3 - """ - - fsa1 = k2.Fsa.from_str(s1, acceptor=False) - fsa2 = k2.Fsa.from_str(s2, acceptor=False) - fsa3 = k2.Fsa.from_str(s3, acceptor=False) - - fsa = k2.Fsa.from_fsas([fsa1, fsa2, fsa3]) - ( - aux_labels, - num_trailing_blanks, - ) = sherpa.get_texts_and_num_trailing_blanks(fsa) - - assert aux_labels == [[1, 5], [1], [1]] - assert num_trailing_blanks == [0, 2, 1] - - -if __name__ == "__main__": - unittest.main() diff --git a/triton/Dockerfile/Dockerfile.client b/triton/Dockerfile/Dockerfile.client index f18f8f9b3..b9c8221a2 100755 --- a/triton/Dockerfile/Dockerfile.client +++ b/triton/Dockerfile/Dockerfile.client @@ -1,4 +1,4 @@ -FROM nvcr.io/nvidia/tritonserver:22.05-py3-sdk +FROM nvcr.io/nvidia/tritonserver:22.08-py3-sdk # https://docs.nvidia.com/deeplearning/frameworks/support-matrix/index.html # Please choose previous tritonserver:xx.xx if you encounter cuda driver mismatch issue diff --git a/triton/Dockerfile/Dockerfile.server b/triton/Dockerfile/Dockerfile.server index 7066e8f45..ae8654a44 100755 --- a/triton/Dockerfile/Dockerfile.server +++ b/triton/Dockerfile/Dockerfile.server @@ -1,14 +1,24 @@ -FROM nvcr.io/nvidia/tritonserver:22.05-py3 +FROM nvcr.io/nvidia/tritonserver:24.07-py3 # https://docs.nvidia.com/deeplearning/frameworks/support-matrix/index.html # Please choose previous tritonserver:xx.xx if you encounter cuda driver mismatch issue - LABEL maintainer="NVIDIA" LABEL repository="tritonserver" -RUN apt-get update && apt-get -y install swig && apt-get -y install python3-dev && apt-get install -y cmake -RUN pip3 install torch==1.9.1+cu111 torchvision==0.10.1+cu111 torchaudio==0.9.1 -f https://download.pytorch.org/whl/torch_stable.html -RUN pip3 install -v kaldifeat -RUN pip3 install sentencepiece -RUN pip3 install pyyaml +RUN apt-get update && apt-get install -y cmake +RUN python3 -m pip install k2==1.24.4.dev20240725+cuda12.4.torch2.4.0 -f https://k2-fsa.github.io/k2/cuda.html && \ + python3 -m pip install -r https://raw.githubusercontent.com/k2-fsa/icefall/master/requirements.txt && \ + pip install -U "huggingface_hub[cli]" lhotse colored onnx_graphsurgeon polygraphy +# https://github.com/k2-fsa/k2/blob/master/k2/python/k2/__init__.py#L13 delete the cuda version check +RUN sed -i '/if (/,/^ )/d' /usr/local/lib/python3.10/dist-packages/k2/__init__.py WORKDIR /workspace -COPY ./scripts scripts + +RUN git clone https://github.com/csukuangfj/kaldifeat && \ + cd kaldifeat && \ 
+ sed -i 's/in running_cuda_version//g' get_version.py && \ + python3 setup.py install && \ + cd - + +RUN git clone https://github.com/k2-fsa/icefall.git +ENV PYTHONPATH "${PYTHONPATH}:/workspace/icefall" + +COPY ./scripts scripts \ No newline at end of file diff --git a/triton/README.md b/triton/README.md index 8175d4298..8be2fc407 100755 --- a/triton/README.md +++ b/triton/README.md @@ -1,260 +1,129 @@ -# Inference Serving Practice for Transducer Non-streaming ASR based on Icefall +# Inference Serving Best Practice for Transducer ASR based on Icefall -In this tutorial, we'll go through how to run a non-streaming (offline) ASR Transducer model trained by [Icefall](https://github.com/k2-fsa/icefall/tree/master/egs/librispeech/ASR/pruned_transducer_stateless3) **on GPUs**, and deploy it as service with NVIDIA [Triton Inference Server](https://github.com/triton-inference-server/server). +In this tutorial, we'll go through how to run non-streaming (offline) and streaming ASR Transducer models trained by [Icefall](https://github.com/k2-fsa/icefall/tree/master/egs/librispeech/ASR/pruned_transducer_stateless3) **on GPUs**, and deploy them as a service with NVIDIA [Triton Inference Server](https://github.com/triton-inference-server/server). ## Table of Contents - [Preparation](#preparation) - [Prepare Environment](#prepare-environment) - - [Prepare Models](#prepare-models) - [Deploy on Triton Inference Server](#deploy-on-triton-inference-server) - [Quick Start](#quick-start) - - [Advanced](#advanced) - - [Specify which GPUs for deployment](#specify-which-gpus-for-deployment) - - [Set the number of model instances per GPU](#set-the-number-of-model-instances-per-gpu) - - [Set the ports exposed by server](#set-the-ports-exposed-by-server) - - [Set the request scheduler](#set-the-request-scheduler) -- [Inference Client](#inference-client) - - [Quick Start](#quick-start-1) - - [Performance Test](#performance-test) -- [Benchmarks](#benchmarks) - - [FP32 Performance on Single GPU](#fp32-performance-on-single-gpu) - - [FP32 Performance of Small Model for Librispeech](#fp32-performance-of-small-model-for-librispeech) - - [Reference Accuracy](#reference-accuracy) +- [Benchmark using Dataset](#benchmark-using-dataset) +- [Using TensorRT acceleration](#using-tensorrt-acceleration) + - [TRT Quick start](#trt-quick-start) + - [Benchmark for Conformer TRT encoder vs ONNX](#benchmark-for-conformer-trt-encoder-vs-onnx) + ## Preparation -First of all, we need to get environment, models, codes, and data ready. +First of all, we need to get the environment and models ready.
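Before moving on, it can be useful to verify that an environment built this way is actually usable. The sketch below is not part of the patch; it simply checks that the dependencies installed by `Dockerfile.server` above (k2 with its CUDA-version check patched out, kaldifeat built from source, and icefall on `PYTHONPATH`) import cleanly inside the container:

```python
# Optional sanity check (not part of the patch): run inside a container built
# from Dockerfile.server above, or from the pre-built image, to confirm that
# the main dependencies import cleanly.
import torch
import k2         # the Dockerfile removes k2's CUDA-version check with sed
import kaldifeat  # built from source in the Dockerfile
import icefall    # available via PYTHONPATH=/workspace/icefall

print("torch:", torch.__version__, "| CUDA available:", torch.cuda.is_available())
print("icefall located at:", icefall.__file__)
```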
### Prepare Environment Clone the repository: ```bash -# Clond Icefall repo -git clone https://github.com/k2-fsa/icefall.git -cd icefall -export ICEFALL_DIR=$PWD - -# Clone k2 sherpa repo +# Clone Sherpa repo git clone https://github.com/k2-fsa/sherpa.git cd sherpa export SHERPA_SRC=$PWD - -``` - -Next, install dependencies (you need to install [Conda](https://docs.conda.io/en/latest/miniconda.html) before going ahead): - ``` -TODO -``` - -### Prepare Models - -``` -# Download pretrained models -git clone https://huggingface.co/csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless3-2022-04-29 - -# export them to three jit models: encoder_jit.pt, decoder_jit.pt, joiner_jit.pt -cp $SHERPA/triton/scripts/conformer_triton.py $ICEFALL_DIR/egs/librispeech/ASR/pruned_stateless_transducer3/ -cp $SHERPA/triton/scripts/export_jit.py $ICEFALL_DIR/egs/librispeech/ASR/pruned_stateless_transducer3/ - -cd $ICEFALL_DIR/egs/librispeech/ASR/pruned_stateless_transducer3 -python3 export_jit.py --pretrained-model --output-dir --bpe-model - -# copy bpe file to , later we would mount to the triton docker container -cp - ``` - - -## Deploy on Triton Inference Server - -Now we have exported pretrained non-streaming ASR model, then we need to consider how to deploy the model on the server as non-streaming ASR service, to allow users to send audio requests and get recognition results. Actually, [Triton Inference Server](https://github.com/triton-inference-server/server) does the most of serving work for us, it handles requests/results sending and receiving, request scheduling, load balance, and inference execution. In this part, we'll go through how to deploy the model on Triton. +We highly recommend using Docker containers to simplify environment setup. Build the server docker image: ``` +cd $SHERPA_SRC/triton docker build . -f Dockerfile/Dockerfile.server -t sherpa_triton_server:latest --network host ``` - -Build the client docker image: +Alternatively, you can directly pull the pre-built image, which is based on the Triton server image. ``` -docker build . -f Dockerfile/Dockerfile.client -t sherpa_triton_client:latest --network host +docker pull soar97/triton-k2:24.07 ``` -The model repository is provided in `model_repo` directory, you can find directories standing for each of the components. And there is a `conformer_transducer` dir which ensemble all the components into a whole pipeline. Each of those component directory contains a config file `config.pbtxt` and a version directory containing the model file. However, the version directories of encoder and decoder are still empty since we have not put the exported models into them. - -### Quick Start - -Now start server: - -```bash -# Start the docker container -docker run --gpus all -v $PWD/model_repo:/ws/model_repo -v :/ws/jit_model/ --name sherpa_server --net host --shm-size=1g --ulimit memlock=-1 -p 8000:8000 -p 8001:8001 -p 8002:8002 --ulimit stack=67108864 -it sherpa_triton_server:latest - -# Inside the docker container -bash /workspace/scripts/start_server.sh -``` - -If you meet any issues during the process, please file an issue. - -### Advanced - -Here we introduce some advanced configuration/options for deploying the ASR server. - -#### Specify which GPUs for deployment - -If you have multiple GPUs on the server machine, you can specify which GPUs will be used for deploying ASR service. To do so, just change the `-e CUDA_VISIBLE_DEVICES=` option or just use to specify which GPU to make visible in the container when starting server container.
- -For example, if you just want to use GPU 1, 2, 3 for deployment, then use the following options to start the server: - +Start the docker container: ```bash -docker run --gpus '"device=1,2,3"' -v $PWD/model_repo:/ws/model_repo -v :/ws/jit_model/ --name sherpa_server --net host --shm-size=1g --ulimit memlock=-1 -p 8000:8000 -p 8001:8001 -p 8002:8002 --ulimit stack=67108864 -it sherpa_triton_server:latest -``` - -#### Set the number of model instances per GPU - -Triton can provide multiple [instances of a model](https://github.com/triton-inference-server/server/blob/master/docs/architecture.md#concurrent-model-execution) so that multiple inference requests for that model can be handled simultaneously. You can set the number of model instances on each GPU by modifying the `config.pbtxt` file of the any of the component model in the `model_repo`, as [Triton document](https://github.com/triton-inference-server/server/blob/master/docs/model_configuration.md#instance-groups) says. - -For example, if you want to set 4 model instances for encoder component on each GPU (given that each GPU memory is large enough to handle to instances), just edit the following lines of config file model_repo/encoder/config.pbtxt`: - -``` -instance_group [ - { - count: 4 - kind: KIND_GPU - } -] +docker run --gpus all -v $SHERPA_SRC:/workspace/sherpa --name sherpa_server --net host --shm-size=1g --ulimit memlock=-1 --ulimit stack=67108864 -it soar97/triton-k2:24.07 ``` +Now, you should enter into the container successfully. -Elsewise, if you want to set only 1 model instance on each GPU, just change the `count:` field to `1`. -You can also specify which GPUs to use for deployment, and how many model instances running on those GPUs. For example, you can set 1 execution instance on GPU 0, 2 execution instances on GPU 1 and 2: - -``` -instance_group [ - { - count: 1 - kind: KIND_GPU - gpus: [ 0 ] - }, - { - count: 2 - kind: KIND_GPU - gpus: [ 1, 2 ] - } - ] -``` - -#### Set the ports exposed by server - -The default ports exposed by server to clients is: port 8000 for HTTP inference service; port 8001 for gRPC inference service; port 8002 for Metrics service. If the default ports are occupied by other services, you can change the ports that are exposed to the clients, by specify `-p` option when starting the server. - -For example, if you want to set port 8003 for HTTP inference service, 8004 for gRPC inference service, and 8005 for Metrics service, then use the following command to start the server: - -```bash -docker run --gpus all -v $PWD/model_repo:/ws/model_repo -v :/ws/jit_model/ --name sherpa_server --net host --shm-size=1g --ulimit memlock=-1 -p 8003:8003 -p 8004:8004 -p 8005:8005 --ulimit stack=67108864 -it sherpa_triton_server:latest -``` - -And then add the following options when start the Triton server: - -```bash -tritonserver --model-repository=/ws/model_repo --http-port=8003 --grpc-port=8004 --metrics-port=8005 -``` - -Please note that: if you change the exposed ports of server, you should also specify the same ports when sending requests via client program. For how to specify server port for sending requests. - -#### Set the request scheduler - -With Triton, you can choose various scheduler modes and batching strategies for coming requests, as described in [Triton document](https://github.com/triton-inference-server/server/blob/master/docs/model_configuration.md#scheduling-and-batching). 
In our project, we use [dynamic batcher](https://github.com/triton-inference-server/server/blob/master/docs/model_configuration.md#dynamic-batcher) by default, which allows inference requests to be combined by the server, so that a batch is created dynamically. - -You can change the settings of the dynamic batcher by editting `config.pbtxt` file of each model: - -``` -dynamic_batching { - preferred_batch_size: [8, 16, 32, 64] - max_queue_delay_microseconds: 10000 -} -``` - -The [preferred_batch_size](https://github.com/triton-inference-server/server/blob/master/docs/model_configuration.md#preferred-batch-sizes) property indicates the batch sizes that the dynamic batcher should attempt to create. For example, the above configuration enables dynamic batching with preferred batch sizes of 8, 16, 32 and 64. - -The `max_queue_delay_microseconds` property setting changes the dynamic batcher behavior when a batch of a preferred size cannot be created. When a batch of a preferred size cannot be created from the available requests, the dynamic batcher will delay sending the batch as long as no request is delayed longer than the configured max_queue_delay_microseconds value. If a new request arrives during this delay and allows the dynamic batcher to form a batch of a preferred batch size, then that batch is sent immediately for inferencing. If the delay expires the dynamic batcher sends the batch as is, even though it is not a preferred size. +## Deploy on Triton Inference Server -## Inference Client +In this part, we'll go through how to deploy the model on Triton. -In this section, we will show how to send requests to our deployed non-streaming ASR service, and receive the recognition results. Also, we can use client to test the accuracy of the ASR service on a test dataset. In addition, we can use [perf_analyzer](https://github.com/triton-inference-server/server/blob/main/docs/perf_analyzer.md) provided by Triton to test the performance of the service. +The model repositories are provided in `model_repo_offline` and `model_repo_streaming` directory, you can find directories standing for each of the components. And there is a `transducer` dir which ensembles all the components into a whole pipeline. Each of those component directories contains a config file `config.pbtxt` and a version directory containing the model file. ### Quick Start -We do it in the built client container, now let's start the container. +Now start server: ```bash -docker run -ti --net host --name sherpa_client -v $PWD/client:/ws/client sherpa_triton_client:latest -cd /ws/client -``` - -In the docker container, run the client script to do ASR inference. +# Inside the docker container +# If you want to use greedy search decoding +cd /Your_SHERPA_SRC/triton/ +apt-get install git-lfs +pip3 install -r ./requirements.txt +export CUDA_VISIBLE_DEVICES="your_gpu_id" -```bash -# Test one audio -python3 client.py --audio_file=./test_wavs/1089-134686-0001.wav --url=localhost:8001 +bash scripts/build_wenetspeech_zipformer_offline_trt.sh ``` -The above command sends a single audio `1089-134686-0001.wav` to the server and get the result. `--url` option specifies the IP and port of the server, in our example, we set the server and client on the same machine, therefore IP is `localhost`, and we use port `8001` since it is the default port for gRPC in Triton. But if your client is not on the same machine as the server, you should change this option. - -You can also test a bunch of audios together with the client. 
Just specify the path of `wav.scp` with `--wavscp` option, set the path of test set directory with `--data_dir` option, and set the path of ground-truth transcript file with `--trans` option, the client will infer all the audios in test set and calculate the CER upon the test set. - -```bash -# Test a bunch of audios -python3 client.py --wavscp=./test_wavs/wav.scp --data_dir=./test_wavs/ --trans=./test_wavs/trans.txt +## Benchmark using Dataset +```sh +git clone https://github.com/yuekaizhang/Triton-ASR-Client.git +cd Triton-ASR-Client +pip3 install -r requirements.txt +num_task=16 +python3 client.py \ + --server-addr localhost \ + --model-name whisper \ + --num-tasks $num_task \ + --whisper-prompt "<|startoftranscript|><|zh|><|transcribe|><|notimestamps|>" \ + --manifest-dir ./datasets/aishell1_test ``` -### Performance Test +## Using TensorRT acceleration -We use [perf_analyzer](https://github.com/triton-inference-server/server/blob/main/docs/perf_analyzer.md) to test the performance of non-streaming ASR service. Before this, we need to generate the test input data. +### TRT Quick start -Still in the container client, run: +You can directly use the following script to export TRT engine and start Triton server for Conformer Offline model: ```bash -cd /ws/client -python3 generate_perf_input.py test_wavs/1089-134686-0001.wav +bash scripts/build_librispeech_pruned_transducer_stateless3_offline_trt.sh ``` -The first argument is the path of audio file used for testing, and the second argument is the output json file loaded by perf_analyzer. +### Export to TensorRT -Then in client docker container, we can simply run the testing: +If you want to build TensorRT for your own service, you can try the following steps: -```bash -perf_analyzer -m conformer_transducer -b 1 -a -p 20000 --concurrency-range 100:200:50 -i gRPC --input-data= -u localhost:8001 -``` +#### Model export -Where: -- `-m` option indicates the name of the served model; -- `-p` option is the mearsurement window, which indicates in what time duration to collect the metrics; -- `-v` option turns on the verbose model; -- `-i` option is for choosing the networking protocol, you can choose `HTTP` or `gRPC` here; -- `-u` option sets the url of the service in the form of `:`, but notice that port `8000` corresponds to HTTP protocol while port `8001` corresponds to gRPC protocol; -- `-b` option indicates the batch size of the input requests used fo testing; since we simulate individual users sending requests, we set batch size here to `1`; -- `-a` option controls the analyzer to send requests in an asynchronous way, if this option is not applied, the requests will be sent in synchronous way; -- `--input-data` option points to the path of the json file containing the real input data -- `--concurrency-range` option is an important one, it indicates the concurrency of the requests which defines the pressure we will give to the server. -- You can also set `-f` option to set the path of testing result file; -- You can also set `--max-threads` option to set the number of threads used to send test request, it should be set to the number of CPU cores in your test machine. - -As described above, if you want to send request with batch size > 1: +You have to prepare the ONNX model by referring [here](https://icefall.readthedocs.io/en/latest/model-export/export-onnx.html#export-the-model-to-onnx) to export your models into ONNX format. Assume you have put your ONNX model in the `$model_dir` directory. 
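If you want to double-check the export before converting it, a quick inspection of the encoder's input and output names can catch mismatches early. This is only an optional sketch, not required by the build scripts; it assumes the `onnx` package is installed and that `$model_dir/encoder.onnx` exists:

```python
# Optional check before building the TRT engine: inspect the exported encoder.
# The offline encoder config template in model_repo_offline expects inputs
# "x"/"x_lens" and outputs "encoder_out"/"encoder_out_lens".
import os
import onnx

model_dir = os.environ.get("model_dir", ".")  # same $model_dir as above
model = onnx.load(os.path.join(model_dir, "encoder.onnx"))
onnx.checker.check_model(model)

print("inputs :", [i.name for i in model.graph.input])
print("outputs:", [o.name for o in model.graph.output])
```

If the printed names differ from what the Triton config templates expect, fix the export (or the templates) before running the conversion below.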
+Then, just run the command: ```bash -perf_analyzer -m attention_rescoring -b 16 -a -p20000 --concurrency-range 100:200:50 -i gRPC --input-data=./input.json -u localhost:8001 -``` - -## Benchmarks - -In this section, we show a reference performance benchamrk for the non-streaming ASR based on Icefall. - -Notice that we send the test requests all with same length, so that Triton server will pack several requests into a batch. But in the real production cases, the lengths of all the incoming requests will not be the same, therefore, in order to allow Triton batch the requests to improve throughput, you need to try to padding the input to a specific length at the client side. - -### FP32 Performance on Single GPU - -First we give the performance benchmark of FP32 precision tested on single T4. We use test audios with two different lengths, 5 seconds, 8 seconds and 10 seconds. And test requests are sent with batch size 1. - -#### FP32 Performance of Small Model (80Mb parameters) for Librispeech -TODO \ No newline at end of file +# First, use polygraphy to simplify the onnx model. +polygraphy surgeon sanitize $model_dir/encoder.onnx --fold-constant -o encoder.trt +# Using /usr/src/tensorrt/bin/trtexec tool in the tritonserver docker image. +bash scripts/build_trt.sh 16 $model_dir/encoder.onnx model_repo_offline/encoder/1/encoder.trt +``` + +The generated TRT model will be saved into `model_repo_offline/encoder/1/encoder.trt`. +Then you can start the Triton server. + + +### Benchmark for Conformer TRT encoder vs ONNX + +| Model | Batch size| Avg latency(ms) | QPS | +|--------|-----------|-----------------|----------| +| ONNX | 1 | 7.44 | 134.48 | +| | 8 | 14.92 | 536.09 | +| | 16 | 22.84 | 700.67 | +| | 32 | 41.62 | 768.84 | +| | 64 | 80.48 | 795.27 | +| | 128 | 171.97 | 744.32 | +| TRT | 1 | 5.21834 | 193.93 | +| | 8 | 11.7826 | 703.49 | +| | 16 | 20.4444 | 815.79 | +| | 32 | 37.583 | 893.56 | +| | 64 | 69.8312 | 965.40 | +| | 128 | 139.702 | 964.57 | diff --git a/triton/client/client.py b/triton/client/client.py deleted file mode 100755 index d035cdba7..000000000 --- a/triton/client/client.py +++ /dev/null @@ -1,141 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import multiprocessing -from multiprocessing import Pool - -import argparse -import os -import tritonclient.grpc as grpcclient -from utils import write_error_stats -from offline_client import SpeechClient -import numpy as np - -if __name__ == '__main__': - parser = argparse.ArgumentParser() - parser.add_argument('-v', - '--verbose', - action="store_true", - required=False, - default=False, - help='Enable verbose output') - parser.add_argument('-u', - '--url', - type=str, - required=False, - default='localhost:8001', - help='Inference server URL. 
Default is ' - 'localhost:8001.') - parser.add_argument('--model_name', - required=False, - default='conformer_transducer', - help='the model to send request to') - parser.add_argument('--wavscp', - type=str, - required=False, - default=None, - help='audio_id \t absolute_wav_path') - parser.add_argument('--trans', - type=str, - required=False, - default=None, - help='audio_id \t text') - parser.add_argument('--data_dir', - type=str, - required=False, - default=None, - help='data dir will be append to audio file if given') - parser.add_argument( - '--audio_file', - type=str, - required=False, - default=None, - help='single wav file' - ) - - parser.add_argument( - '--errs_file', - type=str, - required=False, - default="./wer_results.txt", - help='output of wer anaylasis' - ) - - FLAGS = parser.parse_args() - - # load data - filenames = [] - transcripts = [] - if FLAGS.audio_file is not None: - path = FLAGS.audio_file - if FLAGS.data_dir: - path = os.path.join(FLAGS.data_dir, path) - if os.path.exists(path): - filenames = [path] - elif FLAGS.wavscp is not None: - audio_data = {} - with open(FLAGS.wavscp, "r", encoding="utf-8") as f: - for line in f: - aid, path = line.strip().split('\t') - if FLAGS.data_dir: - path = os.path.join(FLAGS.data_dir, path) - audio_data[aid] = {'path': path} - with open(FLAGS.trans, "r", encoding="utf-8") as f: - for line in f: - aid, text = line.strip().split('\t') - audio_data[aid]['text'] = text - - for key, value in audio_data.items(): - filenames.append(value['path']) - transcripts.append(value['text']) - - num_workers = multiprocessing.cpu_count() // 2 - - def single_job(client_files): - with grpcclient.InferenceServerClient(url=FLAGS.url, - verbose=FLAGS.verbose) as triton_client: - protocol_client = grpcclient - speech_client = SpeechClient(triton_client, FLAGS.model_name, - protocol_client) - idx, audio_files = client_files - predictions = [] - for li in audio_files: - result = speech_client.recognize(li, idx) - print("Recognized {}:{}".format(li, result[0])) - predictions += result - return predictions - - # start to do inference - # Group requests in batches - predictions = [] - tasks = [] - splits = np.array_split(filenames, num_workers) - - for idx, per_split in enumerate(splits): - cur_files = per_split.tolist() - tasks.append((idx, cur_files)) - - with Pool(processes=num_workers) as pool: - predictions = pool.map(single_job, tasks) - - predictions = [item for sublist in predictions for item in sublist] - results = [] - if transcripts: - assert len(transcripts) == len(predictions) - for i in range(len(transcripts)): - results.append((transcripts[i], predictions[i])) - with open(FLAGS.errs_file, "w") as f: - wer = write_error_stats( - f, f"Testset", results, enable_log=True - ) \ No newline at end of file diff --git a/triton/client/generate_perf_input.py b/triton/client/generate_perf_input.py deleted file mode 100755 index 9b4c39955..000000000 --- a/triton/client/generate_perf_input.py +++ /dev/null @@ -1,24 +0,0 @@ -import sys -import json -import soundfile as sf -import numpy as np - - -def generate(wav_file, out): - """ - inp: a single channel, 16kHz wav file - out: the generated json test data - """ - print("Reading {}".format(wav_file)) - waveform, sample_rate = sf.read(wav_file) - batch_size = 1 - mat = np.array([waveform] * batch_size, dtype=np.float32) - - out_dict = {"data": [{"WAV_LENS": [len(waveform)], - "WAV": {"shape": [len(waveform)], "content": mat.flatten().tolist()}}]} - json.dump(out_dict, open(out, "w")) - -if __name__ == 
"__main__": - inp = sys.argv[1] - out = sys.argv[2] - generate(inp, out) diff --git a/triton/client/offline_client.py b/triton/client/offline_client.py deleted file mode 100755 index 79b60a599..000000000 --- a/triton/client/offline_client.py +++ /dev/null @@ -1,48 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from tritonclient.utils import np_to_triton_dtype -import numpy as np -import soundfile as sf - -class SpeechClient(object): - def __init__(self, triton_client, model_name, protocol_client): - self.triton_client = triton_client - self.protocol_client = protocol_client - self.model_name = model_name - - def recognize(self, wav_file, idx=0): - waveform, sample_rate = sf.read(wav_file) - - samples = np.array([waveform], dtype=np.float32) - lengths = np.array([[len(waveform)]], dtype=np.int32) - sequence_id = 10086 + idx - result = '' - inputs = [ - self.protocol_client.InferInput("WAV", samples.shape, - np_to_triton_dtype(samples.dtype)), - self.protocol_client.InferInput("WAV_LENS", lengths.shape, - np_to_triton_dtype(lengths.dtype)) - ] - inputs[0].set_data_from_numpy(samples) - inputs[1].set_data_from_numpy(lengths) - outputs = [self.protocol_client.InferRequestedOutput("TRANSCRIPTS")] - response = self.triton_client.infer(self.model_name, - inputs, - request_id=str(sequence_id), - outputs=outputs) - tmp = response.as_numpy("TRANSCRIPTS") - result = response.as_numpy("TRANSCRIPTS")[0].tolist() - result = b' '.join(result).decode('utf-8') - return [result] diff --git a/triton/client/test_wavs/1089-134686-0001.wav b/triton/client/test_wavs/1089-134686-0001.wav deleted file mode 100755 index bfe1519ea..000000000 Binary files a/triton/client/test_wavs/1089-134686-0001.wav and /dev/null differ diff --git a/triton/client/test_wavs/1221-135766-0001.wav b/triton/client/test_wavs/1221-135766-0001.wav deleted file mode 100755 index 498b3f335..000000000 Binary files a/triton/client/test_wavs/1221-135766-0001.wav and /dev/null differ diff --git a/triton/client/test_wavs/1221-135766-0002.wav b/triton/client/test_wavs/1221-135766-0002.wav deleted file mode 100755 index c76bac8aa..000000000 Binary files a/triton/client/test_wavs/1221-135766-0002.wav and /dev/null differ diff --git a/triton/client/test_wavs/trans.txt b/triton/client/test_wavs/trans.txt deleted file mode 100755 index d1512c624..000000000 --- a/triton/client/test_wavs/trans.txt +++ /dev/null @@ -1,3 +0,0 @@ -1089-134686-0001 AFTER EARLY NIGHTFALL THE YELLOW LAMPS WOULD LIGHT UP HERE AND THERE THE SQUALID QUARTER OF THE BROTHELS -1221-135766-0001 GOD AS A DIRECT CONSEQUENCE OF THE SIN WHICH MAN THUS PUNISHED HAD GIVEN HER A LOVELY CHILD WHOSE PLACE WAS ON THAT SAME DISHONOURED BOSOM TO CONNECT HER PARENT FOR EVER WITH THE RACE AND DESCENT OF MORTALS AND TO BE FINALLY A BLESSED SOUL IN HEAVEN -1221-135766-0002 YET THESE THOUGHTS AFFECTED HESTER PRYNNE LESS WITH HOPE THAN APPREHENSION diff --git a/triton/client/test_wavs/wav.scp b/triton/client/test_wavs/wav.scp 
deleted file mode 100755 index 3ffe2e510..000000000 --- a/triton/client/test_wavs/wav.scp +++ /dev/null @@ -1,3 +0,0 @@ -1089-134686-0001 1089-134686-0001.wav -1221-135766-0001 1221-135766-0001.wav -1221-135766-0002 1221-135766-0002.wav diff --git a/triton/client/utils.py b/triton/client/utils.py deleted file mode 100755 index d15fb6156..000000000 --- a/triton/client/utils.py +++ /dev/null @@ -1,181 +0,0 @@ -# Copyright 2021 Xiaomi Corp. (authors: Fangjun Kuang -# Mingshuang Luo) -# -# See ../../LICENSE for clarification regarding multiple authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import logging -from collections import defaultdict -from typing import Dict, List, TextIO, Tuple - -import kaldialign - - -# https://github.com/k2-fsa/icefall/blob/master/icefall/utils.py#L336 -# Directly copy from above link, would remove this copy after dockerfile update -def write_error_stats( - f: TextIO, - test_set_name: str, - results: List[Tuple[str, str]], - enable_log: bool = True, -) -> float: - """Write statistics based on predicted results and reference transcripts. - It will write the following to the given file: - - WER - - number of insertions, deletions, substitutions, corrects and total - reference words. For example:: - Errors: 23 insertions, 57 deletions, 212 substitutions, over 2606 - reference words (2337 correct) - - The difference between the reference transcript and predicted result. - An instance is given below:: - THE ASSOCIATION OF (EDISON->ADDISON) ILLUMINATING COMPANIES - The above example shows that the reference word is `EDISON`, - but it is predicted to `ADDISON` (a substitution error). - Another example is:: - FOR THE FIRST DAY (SIR->*) I THINK - The reference word `SIR` is missing in the predicted - results (a deletion error). - results: - An iterable of tuples. The first element is the reference transcript - while the second element is the predicted result. - enable_log: - If True, also print detailed WER to the console. - Otherwise, it is written only to the given file. - Returns: - Return None. 
- """ - subs: Dict[Tuple[str, str], int] = defaultdict(int) - ins: Dict[str, int] = defaultdict(int) - dels: Dict[str, int] = defaultdict(int) - - # `words` stores counts per word, as follows: - # corr, ref_sub, hyp_sub, ins, dels - words: Dict[str, List[int]] = defaultdict(lambda: [0, 0, 0, 0, 0]) - num_corr = 0 - ERR = "*" - for ref, hyp in results: - ali = kaldialign.align(ref, hyp, ERR) - for ref_word, hyp_word in ali: - if ref_word == ERR: - ins[hyp_word] += 1 - words[hyp_word][3] += 1 - elif hyp_word == ERR: - dels[ref_word] += 1 - words[ref_word][4] += 1 - elif hyp_word != ref_word: - subs[(ref_word, hyp_word)] += 1 - words[ref_word][1] += 1 - words[hyp_word][2] += 1 - else: - words[ref_word][0] += 1 - num_corr += 1 - ref_len = sum([len(r) for r, _ in results]) - sub_errs = sum(subs.values()) - ins_errs = sum(ins.values()) - del_errs = sum(dels.values()) - tot_errs = sub_errs + ins_errs + del_errs - tot_err_rate = "%.2f" % (100.0 * tot_errs / ref_len) - - if enable_log: - logging.info( - f"[{test_set_name}] %WER {tot_errs / ref_len:.2%} " - f"[{tot_errs} / {ref_len}, {ins_errs} ins, " - f"{del_errs} del, {sub_errs} sub ]" - ) - - print(f"%WER = {tot_err_rate}", file=f) - print( - f"Errors: {ins_errs} insertions, {del_errs} deletions, " - f"{sub_errs} substitutions, over {ref_len} reference " - f"words ({num_corr} correct)", - file=f, - ) - print( - "Search below for sections starting with PER-UTT DETAILS:, " - "SUBSTITUTIONS:, DELETIONS:, INSERTIONS:, PER-WORD STATS:", - file=f, - ) - - print("", file=f) - print("PER-UTT DETAILS: corr or (ref->hyp) ", file=f) - for ref, hyp in results: - ali = kaldialign.align(ref, hyp, ERR) - combine_successive_errors = True - if combine_successive_errors: - ali = [[[x], [y]] for x, y in ali] - for i in range(len(ali) - 1): - if ali[i][0] != ali[i][1] and ali[i + 1][0] != ali[i + 1][1]: - ali[i + 1][0] = ali[i][0] + ali[i + 1][0] - ali[i + 1][1] = ali[i][1] + ali[i + 1][1] - ali[i] = [[], []] - ali = [ - [ - list(filter(lambda a: a != ERR, x)), - list(filter(lambda a: a != ERR, y)), - ] - for x, y in ali - ] - ali = list(filter(lambda x: x != [[], []], ali)) - ali = [ - [ - ERR if x == [] else " ".join(x), - ERR if y == [] else " ".join(y), - ] - for x, y in ali - ] - - print( - " ".join( - ( - ref_word - if ref_word == hyp_word - else f"({ref_word}->{hyp_word})" - for ref_word, hyp_word in ali - ) - ), - file=f, - ) - - print("", file=f) - print("SUBSTITUTIONS: count ref -> hyp", file=f) - - for count, (ref, hyp) in sorted( - [(v, k) for k, v in subs.items()], reverse=True - ): - print(f"{count} {ref} -> {hyp}", file=f) - - print("", file=f) - print("DELETIONS: count ref", file=f) - for count, ref in sorted([(v, k) for k, v in dels.items()], reverse=True): - print(f"{count} {ref}", file=f) - - print("", file=f) - print("INSERTIONS: count hyp", file=f) - for count, hyp in sorted([(v, k) for k, v in ins.items()], reverse=True): - print(f"{count} {hyp}", file=f) - - print("", file=f) - print( - "PER-WORD STATS: word corr tot_errs count_in_ref count_in_hyp", file=f - ) - for _, word, counts in sorted( - [(sum(v[1:]), k, v) for k, v in words.items()], reverse=True - ): - (corr, ref_sub, hyp_sub, ins, dels) = counts - tot_errs = ref_sub + hyp_sub + ins + dels - ref_count = corr + ref_sub + dels - hyp_count = corr + hyp_sub + ins - - print(f"{word} {corr} {tot_errs} {ref_count} {hyp_count}", file=f) - return float(tot_err_rate) \ No newline at end of file diff --git a/triton/model_repo/greedy_search/1/model.py b/triton/model_repo/greedy_search/1/model.py 
deleted file mode 100755 index 8d8dc226b..000000000 --- a/triton/model_repo/greedy_search/1/model.py +++ /dev/null @@ -1,236 +0,0 @@ -import triton_python_backend_utils as pb_utils -import numpy as np - -import json - -import torch -import sentencepiece as spm - -class TritonPythonModel: - """Your Python model must use the same class name. Every Python model - that is created must have "TritonPythonModel" as the class name. - """ - - def initialize(self, args): - """`initialize` is called only once when the model is being loaded. - Implementing `initialize` function is optional. This function allows - the model to initialize any state associated with this model. - - Parameters - ---------- - args : dict - Both keys and values are strings. The dictionary keys and values are: - * model_config: A JSON string containing the model configuration - * model_instance_kind: A string containing model instance kind - * model_instance_device_id: A string containing model instance device ID - * model_repository: Model repository path - * model_version: Model version - * model_name: Model name - """ - self.model_config = model_config = json.loads(args['model_config']) - self.max_batch_size = max(model_config["max_batch_size"], 1) - - # Get OUTPUT0 configuration - output0_config = pb_utils.get_output_config_by_name( - model_config, "OUTPUT0") - # Convert Triton types to numpy types - self.out0_dtype = pb_utils.triton_string_to_numpy( - output0_config['data_type']) - - # Get INPUT configuration - - encoder_config = pb_utils.get_input_config_by_name( - model_config, "encoder_out__0") - self.data_type = pb_utils.triton_string_to_numpy( - encoder_config['data_type']) - - self.encoder_dim = encoder_config['dims'][-1] - - - self.init_sentence_piece(self.model_config['parameters']) - - def init_sentence_piece(self, parameters): - for key,value in parameters.items(): - parameters[key] = value["string_value"] - self.context_size = int(parameters['context_size']) - sp = spm.SentencePieceProcessor() - sp.load(parameters['bpe_model']) - self.blank_id = sp.piece_to_id("") - self.unk_id = sp.piece_to_id("") - self.vocab_size = sp.get_piece_size() - self.sp = sp - - def execute(self, requests): - """`execute` must be implemented in every Python model. `execute` - function receives a list of pb_utils.InferenceRequest as the only - argument. This function is called when an inference is requested - for this model. - - Parameters - ---------- - requests : list - A list of pb_utils.InferenceRequest - - Returns - ------- - list - A list of pb_utils.InferenceResponse. The length of this list must - be the same as `requests` - """ - # Every Python backend must iterate through list of requests and create - # an instance of pb_utils.InferenceResponse class for each of them. You - # should avoid storing any of the input Tensors in the class attributes - # as they will be overridden in subsequent inference requests. You can - # make a copy of the underlying NumPy array and store it if it is - # required. - - batch_encoder_out_list, batch_encoder_lens_list = [], [] - batchsize_lists = [] - total_seqs = 0 - encoder_max_len = 0 - - for request in requests: - # Perform inference on the request and append it to responses list... 
- in_0 = pb_utils.get_input_tensor_by_name(request, "encoder_out__0") - in_1 = pb_utils.get_input_tensor_by_name(request, "encoder_out_lens__1") - batch_encoder_out_list.append(in_0.as_numpy()) - encoder_max_len = max(encoder_max_len, batch_encoder_out_list[-1].shape[1]) - cur_b_lens = in_1.as_numpy() - batch_encoder_lens_list.append(cur_b_lens) - cur_batchsize = cur_b_lens.shape[0] - batchsize_lists.append(cur_batchsize) - total_seqs += cur_batchsize - - - encoder_out_array = np.zeros((total_seqs, encoder_max_len, self.encoder_dim), - dtype=self.data_type) - encoder_out_lens_array = np.zeros(total_seqs, dtype=np.int32) - st = 0 - for b in batchsize_lists: - t = batch_encoder_out_list.pop(0) - encoder_out_array[st:st + b, 0:t.shape[1]] = t - encoder_out_lens_array[st:st + b] = batch_encoder_lens_list.pop(0) - st += b - - encoder_out = torch.from_numpy(encoder_out_array) - - encoder_out_lens = torch.from_numpy(encoder_out_lens_array) - - packed_encoder_out = torch.nn.utils.rnn.pack_padded_sequence( - input=encoder_out, - lengths=encoder_out_lens.cpu(), - batch_first=True, - enforce_sorted=False - ) - pack_batch_size_list = packed_encoder_out.batch_sizes.tolist() - - hyps = [[self.blank_id] * self.context_size for _ in range(total_seqs)] - decoder_input = np.asarray(hyps,dtype=np.int32) - in_decoder_input_tensor = pb_utils.Tensor("decoder_input__0", decoder_input) - - inference_request = pb_utils.InferenceRequest( - model_name='decoder', - requested_output_names=['decoder_output__0'], - inputs=[in_decoder_input_tensor]) - - inference_response = inference_request.exec() - if inference_response.has_error(): - raise pb_utils.TritonModelException(inference_response.error().message()) - else: - # Extract the output tensors from the inference response. - decoder_out = pb_utils.get_output_tensor_by_name(inference_response, - 'decoder_output__0') - decoder_out = torch.utils.dlpack.from_dlpack(decoder_out.to_dlpack()).cpu().numpy() - #decoder_out = decoder_out.as_numpy() - - offset = 0 - - for batch_size in pack_batch_size_list: - - start = offset - end = offset + batch_size - current_encoder_out = packed_encoder_out.data[start:end] - current_encoder_out = current_encoder_out.unsqueeze(1).unsqueeze(1).cpu().numpy() - # current_encoder_out's shape: (batch_size, 1, 1, encoder_out_dim) - - offset = end - - decoder_out = decoder_out[:batch_size] - - in_joiner_tensor_0 = pb_utils.Tensor("encoder_out__0", current_encoder_out) - in_joiner_tensor_1 = pb_utils.Tensor("decoder_out__1", np.expand_dims(decoder_out, axis=1)) - - inference_request = pb_utils.InferenceRequest( - model_name='joiner', - requested_output_names=['logit__0'], - inputs=[in_joiner_tensor_0, in_joiner_tensor_1]) - inference_response = inference_request.exec() - if inference_response.has_error(): - raise pb_utils.TritonModelException(inference_response.error().message()) - else: - # Extract the output tensors from the inference response. 
- logits = pb_utils.get_output_tensor_by_name(inference_response, - 'logit__0') - logits = torch.utils.dlpack.from_dlpack(logits.to_dlpack()).cpu().numpy() - - #logits = logits.as_numpy() - logits = torch.from_numpy(logits) - logits = logits.squeeze(1).squeeze(1) # (batch_size, vocab_size) - - assert logits.ndim == 2, logits.shape - y = logits.argmax(dim=1).tolist() - - emitted = False - for i, v in enumerate(y): - if v not in (self.blank_id, self.unk_id): - hyps[i].append(v) - emitted = True - if emitted: - # update decoder output - decoder_input = [h[-self.context_size:] for h in hyps[:batch_size]] - - decoder_input = np.asarray(decoder_input,dtype=np.int32) - - in_decoder_input_tensor = pb_utils.Tensor("decoder_input__0", decoder_input) - - inference_request = pb_utils.InferenceRequest( - model_name='decoder', - requested_output_names=['decoder_output__0'], - inputs=[in_decoder_input_tensor]) - - inference_response = inference_request.exec() - if inference_response.has_error(): - raise pb_utils.TritonModelException(inference_response.error().message()) - else: - # Extract the output tensors from the inference response. - decoder_out = pb_utils.get_output_tensor_by_name(inference_response, - 'decoder_output__0') - decoder_out = torch.utils.dlpack.from_dlpack(decoder_out.to_dlpack()).cpu().numpy() - #decoder_out = decoder_out.as_numpy() - - sorted_ans = [h[self.context_size:] for h in hyps] - ans = [] - unsorted_indices = packed_encoder_out.unsorted_indices.tolist() - for i in range(total_seqs): - ans.append(sorted_ans[unsorted_indices[i]]) - - results = [] - for hyp in self.sp.decode(ans): - results.append(hyp.split()) - - st = 0 - responses = [] - for b in batchsize_lists: - sents = np.array(results[st:st + b]) - out0 = pb_utils.Tensor("OUTPUT0", sents.astype(self.out0_dtype)) - inference_response = pb_utils.InferenceResponse(output_tensors=[out0]) - responses.append(inference_response) - st += b - return responses - - def finalize(self): - """`finalize` is called only once when the model is being unloaded. - Implementing `finalize` function is optional. This function allows - the model to perform any necessary clean ups before exit. - """ - print('Cleaning up...') \ No newline at end of file diff --git a/triton/model_repo/encoder/1/.gitkeep b/triton/model_repo_offline/decoder/1/.gitkeep old mode 100644 new mode 100755 similarity index 100% rename from triton/model_repo/encoder/1/.gitkeep rename to triton/model_repo_offline/decoder/1/.gitkeep diff --git a/triton/model_repo_offline/decoder/config.pbtxt.template b/triton/model_repo_offline/decoder/config.pbtxt.template new file mode 100755 index 000000000..b38522f65 --- /dev/null +++ b/triton/model_repo_offline/decoder/config.pbtxt.template @@ -0,0 +1,44 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +name: "decoder" +backend: "onnxruntime" +default_model_filename: "decoder.onnx" + +max_batch_size: MAX_BATCH +input [ + { + name: "y" + data_type: TYPE_INT64 + dims: [DECODER_CONTEXT_SIZE] + } +] + +output [ + { + name: "decoder_out" + data_type: TYPE_FP32 + dims: [DECODER_DIM] + } +] + +dynamic_batching { + } + +instance_group [ + { + count: DECODER_INSTANCE_NUM + kind: KIND_GPU + } +] diff --git a/triton/model_repo/joiner/1/.gitkeep b/triton/model_repo_offline/encoder/1/.gitkeep old mode 100644 new mode 100755 similarity index 100% rename from triton/model_repo/joiner/1/.gitkeep rename to triton/model_repo_offline/encoder/1/.gitkeep diff --git a/triton/model_repo/decoder/config.pbtxt b/triton/model_repo_offline/encoder/config.pbtxt.template similarity index 60% rename from triton/model_repo/decoder/config.pbtxt rename to triton/model_repo_offline/encoder/config.pbtxt.template index 510a9fca3..a2fe7e15e 100755 --- a/triton/model_repo/decoder/config.pbtxt +++ b/triton/model_repo_offline/encoder/config.pbtxt.template @@ -12,47 +12,44 @@ # See the License for the specific language governing permissions and # limitations under the License. -name: "decoder" -backend: "pytorch" -default_model_filename: "decoder_jit.pt" +name: "encoder" +backend: "onnxruntime" +default_model_filename: "encoder.onnx" -max_batch_size: 512 +max_batch_size: MAX_BATCH input [ { - name: "decoder_input__0" - data_type: TYPE_INT32 - dims: [2] # [context_size] + name: "x" + data_type: TYPE_FP32 + dims: [-1, 80] + }, + { + name: "x_lens" + data_type: TYPE_INT64 + dims: [1] + reshape: { shape: [ ] } } ] - output [ { - name: "decoder_output__0" + name: "encoder_out" data_type: TYPE_FP32 - dims: [1, 512] # [context_size, decoder_dim] + dims: [-1, ENCODER_DIM] + }, + { + name: "encoder_out_lens" + data_type: TYPE_INT64 + dims: [1] + reshape: { shape: [ ] } } ] dynamic_batching { - preferred_batch_size: [ 16, 32 ] } instance_group [ { - count: 1 + count: ENCODER_INSTANCE_NUM kind: KIND_GPU } ] - -parameters: { -key: "INFERENCE_MODE" - value: { - string_value:"true" - } -} -parameters: { -key: "DISABLE_OPTIMIZED_EXECUTION" - value: { - string_value:"true" - } -} \ No newline at end of file diff --git a/triton/model_repo_offline/feature_extractor/1/model.py b/triton/model_repo_offline/feature_extractor/1/model.py new file mode 100755 index 000000000..26bf9ddd5 --- /dev/null +++ b/triton/model_repo_offline/feature_extractor/1/model.py @@ -0,0 +1,159 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import triton_python_backend_utils as pb_utils +from torch.utils.dlpack import to_dlpack +import torch +import numpy as np +import kaldifeat +import _kaldifeat +from typing import List +import json + +class Fbank(torch.nn.Module): + def __init__(self, opts): + super(Fbank, self).__init__() + self.fbank = kaldifeat.Fbank(opts) + + def forward(self, waves: List[torch.Tensor]): + return self.fbank(waves) + + +class TritonPythonModel: + """Your Python model must use the same class name. Every Python model + that is created must have "TritonPythonModel" as the class name. + """ + + def initialize(self, args): + """`initialize` is called only once when the model is being loaded. + Implementing `initialize` function is optional. This function allows + the model to initialize any state associated with this model. + + Parameters + ---------- + args : dict + Both keys and values are strings. The dictionary keys and values are: + * model_config: A JSON string containing the model configuration + * model_instance_kind: A string containing model instance kind + * model_instance_device_id: A string containing model instance device ID + * model_repository: Model repository path + * model_version: Model version + * model_name: Model name + """ + self.model_config = model_config = json.loads(args['model_config']) + self.max_batch_size = max(model_config["max_batch_size"], 1) + + if "GPU" in model_config["instance_group"][0]["kind"]: + self.device = "cuda" + else: + self.device = "cpu" + + # Get OUTPUT0 configuration + output0_config = pb_utils.get_output_config_by_name( + model_config, "x") + # Convert Triton types to numpy types + output0_dtype = pb_utils.triton_string_to_numpy( + output0_config['data_type']) + if output0_dtype == np.float32: + self.output0_dtype = torch.float32 + else: + self.output0_dtype = torch.float16 + + # Get OUTPUT1 configuration + output1_config = pb_utils.get_output_config_by_name( + model_config, "x_lens") + # Convert Triton types to numpy types + output1_dtype = pb_utils.triton_string_to_numpy( + output1_config['data_type']) + if output1_dtype == np.int64: + self.output1_dtype = torch.int64 + else: + self.output1_dtype = torch.int32 + + params = self.model_config['parameters'] + opts = kaldifeat.FbankOptions() + opts.frame_opts.dither = 0 + opts.frame_opts.snip_edges = False + for li in params.items(): + key, value = li + value = value["string_value"] + if key == "num_mel_bins": + opts.mel_opts.num_bins = int(value) + elif key == "frame_shift_in_ms": + opts.frame_opts.frame_shift_ms = float(value) + elif key == "frame_length_in_ms": + opts.frame_opts.frame_length_ms = float(value) + elif key == "sample_rate": + opts.frame_opts.samp_freq = int(value) + opts.device = torch.device(self.device) + self.opts = opts + self.feature_extractor = Fbank(self.opts) + self.feature_size = opts.mel_opts.num_bins + + def execute(self, requests): + """`execute` must be implemented in every Python model. `execute` + function receives a list of pb_utils.InferenceRequest as the only + argument. This function is called when an inference is requested + for this model. + + Parameters + ---------- + requests : list + A list of pb_utils.InferenceRequest + + Returns + ------- + list + A list of pb_utils.InferenceResponse. 
The length of this list must + be the same as `requests` + """ + batch_count = [] + total_waves = [] + batch_len = [] + responses = [] + for request in requests: + input0 = pb_utils.get_input_tensor_by_name(request, "wav") + input1 = pb_utils.get_input_tensor_by_name(request, "wav_lens") + + cur_b_wav = input0.as_numpy() + cur_b_wav_lens = input1.as_numpy() # b x 1 + cur_batch = cur_b_wav.shape[0] + cur_len = cur_b_wav.shape[1] + batch_count.append(cur_batch) + batch_len.append(cur_len) + for wav, wav_len in zip(cur_b_wav, cur_b_wav_lens): + wav_len = wav_len[0] + wav = torch.tensor(wav[0:wav_len], dtype=torch.float32, + device=self.device) + total_waves.append(wav) + + features = self.feature_extractor(total_waves) + for b, l in zip(batch_count, batch_len): + expect_feat_len = _kaldifeat.num_frames(l, self.opts.frame_opts) + speech = torch.zeros((b, expect_feat_len, self.feature_size), + dtype=self.output0_dtype, device=self.device) + speech_lengths = torch.zeros((b, 1), dtype=self.output1_dtype, device=self.device) + for i in range(b): + f = features.pop(0) + f_l = f.shape[0] + speech[i, 0: f_l, :] = f.to(self.output0_dtype) + speech_lengths[i][0] = f_l + speech = speech.cpu() + speech_lengths = speech_lengths.cpu() + out0 = pb_utils.Tensor.from_dlpack("x", to_dlpack(speech)) + out1 = pb_utils.Tensor.from_dlpack("x_lens", + to_dlpack(speech_lengths)) + inference_response = pb_utils.InferenceResponse(output_tensors=[out0, out1]) + responses.append(inference_response) + return responses diff --git a/triton/model_repo_offline/feature_extractor/config.pbtxt.template b/triton/model_repo_offline/feature_extractor/config.pbtxt.template new file mode 100755 index 000000000..4a1f557d6 --- /dev/null +++ b/triton/model_repo_offline/feature_extractor/config.pbtxt.template @@ -0,0 +1,72 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +name: "feature_extractor" +backend: "python" +max_batch_size: MAX_BATCH + +parameters [ + { + key: "num_mel_bins", + value: { string_value: "80"} + }, + { + key: "frame_shift_in_ms" + value: { string_value: "10"} + }, + { + key: "frame_length_in_ms" + value: { string_value: "25"} + }, + { + key: "sample_rate" + value: { string_value: "16000"} + } + +] + +input [ + { + name: "wav" + data_type: TYPE_FP32 + dims: [-1] + }, + { + name: "wav_lens" + data_type: TYPE_INT32 + dims: [1] + } +] + +output [ + { + name: "x" + data_type: TYPE_FP32 + dims: [-1, 80] + }, + { + name: "x_lens" + data_type: TYPE_INT64 + dims: [1] + } +] + +dynamic_batching { + } +instance_group [ + { + count: FEATURE_EXTRACTOR_INSTANCE_NUM + kind: KIND_GPU + } +] diff --git a/triton/model_repo_offline/joiner/1/.gitkeep b/triton/model_repo_offline/joiner/1/.gitkeep new file mode 100755 index 000000000..e69de29bb diff --git a/triton/model_repo/joiner/config.pbtxt b/triton/model_repo_offline/joiner/config.pbtxt.template similarity index 65% rename from triton/model_repo/joiner/config.pbtxt rename to triton/model_repo_offline/joiner/config.pbtxt.template index e55ec401c..cac15b65c 100755 --- a/triton/model_repo/joiner/config.pbtxt +++ b/triton/model_repo_offline/joiner/config.pbtxt.template @@ -13,51 +13,37 @@ # limitations under the License. name: "joiner" -backend: "pytorch" -default_model_filename: "joiner_jit.pt" +backend: "onnxruntime" +default_model_filename: "joiner.onnx" -max_batch_size: 512 +max_batch_size: MAX_BATCH input [ { - name: "encoder_out__0" + name: "encoder_out" data_type: TYPE_FP32 - dims: [1,1,512] + dims: [ENCODER_DIM] }, { - name: "decoder_out__1" + name: "decoder_out" data_type: TYPE_FP32 - dims: [1,1,512] + dims: [DECODER_DIM] } ] output [ { - name: "logit__0" + name: "logit" data_type: TYPE_FP32 - dims: [1,1,500] + dims: [VOCAB_SIZE] } ] dynamic_batching { - preferred_batch_size: [ 16, 32 ] } instance_group [ { - count: 1 + count: JOINER_INSTANCE_NUM kind: KIND_GPU } ] - -parameters: { -key: "INFERENCE_MODE" - value: { - string_value:"true" - } -} -parameters: { -key: "DISABLE_OPTIMIZED_EXECUTION" - value: { - string_value:"true" - } -} \ No newline at end of file diff --git a/triton/model_repo_offline/scorer/1/model.py b/triton/model_repo_offline/scorer/1/model.py new file mode 100755 index 000000000..7a2cff501 --- /dev/null +++ b/triton/model_repo_offline/scorer/1/model.py @@ -0,0 +1,348 @@ +# -*- coding: utf-8 -*- +import triton_python_backend_utils as pb_utils +import numpy as np +import json +import torch +from torch.utils.dlpack import from_dlpack, to_dlpack +import sentencepiece as spm +from icefall.lexicon import Lexicon +from typing import List, Union +import k2 + +class TritonPythonModel: + """Your Python model must use the same class name. Every Python model + that is created must have "TritonPythonModel" as the class name. + """ + + def initialize(self, args): + """`initialize` is called only once when the model is being loaded. + Implementing `initialize` function is optional. This function allows + the model to initialize any state associated with this model. + + Parameters + ---------- + args : dict + Both keys and values are strings. 
The dictionary keys and values are: + * model_config: A JSON string containing the model configuration + * model_instance_kind: A string containing model instance kind + * model_instance_device_id: A string containing model instance device ID + * model_repository: Model repository path + * model_version: Model version + * model_name: Model name + """ + self.model_config = model_config = json.loads(args['model_config']) + self.max_batch_size = max(model_config["max_batch_size"], 1) + + # Get OUTPUT0 configuration + output0_config = pb_utils.get_output_config_by_name( + model_config, "OUTPUT0") + # Convert Triton types to numpy types + self.out0_dtype = pb_utils.triton_string_to_numpy( + output0_config['data_type']) + + model_instance_kind = args['model_instance_kind'] + model_instance_device_id = args['model_instance_device_id'] + if model_instance_kind == 'GPU': + self.device = f'cuda:{model_instance_device_id}' + else: + self.device= 'cpu' + + # Get INPUT configuration + encoder_config = pb_utils.get_input_config_by_name( + model_config, "encoder_out") + self.data_type = pb_utils.triton_string_to_numpy( + encoder_config['data_type']) + if self.data_type == np.float32: + self.torch_dtype = torch.float32 + else: + assert self.data_type == np.float16 + self.torch_dtype = torch.float16 + + self.encoder_dim = encoder_config['dims'][-1] + + + self.init_parameters(self.model_config['parameters']) + + def init_parameters(self, parameters): + for key,value in parameters.items(): + parameters[key] = value["string_value"] + self.context_size = int(parameters['context_size']) + self.decoding_method = parameters['decoding_method'] + if 'bpe' in parameters['tokenizer_file']: + sp = spm.SentencePieceProcessor() + sp.load(parameters['tokenizer_file']) + self.blank_id = sp.piece_to_id("") + self.unk_id = sp.piece_to_id("") + self.vocab_size = sp.get_piece_size() + self.tokenizer = sp + else: + assert 'char' in parameters['tokenizer_file'] + lexicon = Lexicon(parameters['tokenizer_file']) + self.unk_id = lexicon.token_table[""] + self.blank_id = lexicon.token_table[""] + self.vocab_size = max(lexicon.tokens) + 1 + self.tokenizer = lexicon + if self.decoding_method == 'fast_beam_search': + # parameters for fast beam search + self.beam = int(self.model_config['parameters']['beam']) + self.max_contexts = int(self.model_config['parameters']['max_contexts']) + self.max_states = int(self.model_config['parameters']['max_states']) + self.temperature = float(self.model_config['parameters']['temperature']) + # Support fast beam search one best currently + self.decoding_graph = k2.trivial_graph( + self.vocab_size - 1, device=self.device + ) + + def forward_joiner(self, cur_encoder_out, decoder_out): + in_joiner_tensor_0 = pb_utils.Tensor.from_dlpack("encoder_out", to_dlpack(cur_encoder_out)) + in_joiner_tensor_1 = pb_utils.Tensor.from_dlpack("decoder_out", to_dlpack(decoder_out.squeeze(1))) + + inference_request = pb_utils.InferenceRequest( + model_name='joiner', + requested_output_names=['logit'], + inputs=[in_joiner_tensor_0, in_joiner_tensor_1]) + inference_response = inference_request.exec() + if inference_response.has_error(): + raise pb_utils.TritonModelException(inference_response.error().message()) + else: + # Extract the output tensors from the inference response. 
+ logits = pb_utils.get_output_tensor_by_name(inference_response, + 'logit') + logits = torch.utils.dlpack.from_dlpack(logits.to_dlpack()).cpu() + assert len(logits.shape) == 2, logits.shape + return logits + + def forward_decoder(self, hyps): + decoder_input = np.asarray(hyps,dtype=np.int64) + + in_decoder_input_tensor = pb_utils.Tensor("y", decoder_input) + + inference_request = pb_utils.InferenceRequest( + model_name='decoder', + requested_output_names=['decoder_out'], + inputs=[in_decoder_input_tensor]) + + inference_response = inference_request.exec() + if inference_response.has_error(): + raise pb_utils.TritonModelException(inference_response.error().message()) + else: + # Extract the output tensors from the inference response. + decoder_out = pb_utils.get_output_tensor_by_name(inference_response, + 'decoder_out') + decoder_out = from_dlpack(decoder_out.to_dlpack()) + return decoder_out + + + def greedy_search(self, encoder_out, encoder_out_lens): + + packed_encoder_out = torch.nn.utils.rnn.pack_padded_sequence( + input=encoder_out, + lengths=encoder_out_lens.cpu(), + batch_first=True, + enforce_sorted=False + ) + + pack_batch_size_list = packed_encoder_out.batch_sizes.tolist() + + hyps = [[self.blank_id] * self.context_size for _ in range(encoder_out.shape[0])] + contexts = [h[-self.context_size:] for h in hyps] + decoder_out = self.forward_decoder(contexts) + + offset = 0 + for batch_size in pack_batch_size_list: + start = offset + end = offset + batch_size + current_encoder_out = packed_encoder_out.data[start:end] + + offset = end + + decoder_out = decoder_out[:batch_size] + + logits = self.forward_joiner(current_encoder_out, decoder_out) + + assert logits.ndim == 2, logits.shape + y = logits.argmax(dim=1).tolist() + + emitted = False + for i, v in enumerate(y): + if v not in (self.blank_id, self.unk_id): + hyps[i].append(v) + emitted = True + if emitted: + hyp = hyps[:batch_size] + contexts = [h[-self.context_size:] for h in hyp] + decoder_out = self.forward_decoder(contexts) + + + sorted_ans = [h[self.context_size:] for h in hyps] + + ans = [] + unsorted_indices = packed_encoder_out.unsorted_indices.tolist() + for i in range(encoder_out.shape[0]): + ans.append(sorted_ans[unsorted_indices[i]]) + + return ans + + # From k2 utils.py + def get_texts(self, + best_paths: k2.Fsa, return_ragged: bool = False + ) -> Union[List[List[int]], k2.RaggedTensor]: + """Extract the texts (as word IDs) from the best-path FSAs. + Args: + best_paths: + A k2.Fsa with best_paths.arcs.num_axes() == 3, i.e. + containing multiple FSAs, which is expected to be the result + of k2.shortest_path (otherwise the returned values won't + be meaningful). + return_ragged: + True to return a ragged tensor with two axes [utt][word_id]. + False to return a list-of-list word IDs. + Returns: + Returns a list of lists of int, containing the label sequences we + decoded. + """ + if isinstance(best_paths.aux_labels, k2.RaggedTensor): + # remove 0's and -1's. + aux_labels = best_paths.aux_labels.remove_values_leq(0) + # TODO: change arcs.shape() to arcs.shape + aux_shape = best_paths.arcs.shape().compose(aux_labels.shape) + + # remove the states and arcs axes. + aux_shape = aux_shape.remove_axis(1) + aux_shape = aux_shape.remove_axis(1) + aux_labels = k2.RaggedTensor(aux_shape, aux_labels.values) + else: + # remove axis corresponding to states. + aux_shape = best_paths.arcs.shape().remove_axis(1) + aux_labels = k2.RaggedTensor(aux_shape, best_paths.aux_labels) + # remove 0's and -1's. 
+ aux_labels = aux_labels.remove_values_leq(0) + + assert aux_labels.num_axes == 2 + if return_ragged: + return aux_labels + else: + return aux_labels.tolist() + + def fast_beam_search(self, encoder_out, encoder_out_lens): + B, T, C = encoder_out.shape + + config = k2.RnntDecodingConfig( + vocab_size=self.vocab_size, + decoder_history_len=self.context_size, + beam=self.beam, + max_contexts=self.max_contexts, + max_states=self.max_states, + ) + individual_streams = [] + for i in range(B): + individual_streams.append(k2.RnntDecodingStream(self.decoding_graph)) + decoding_streams = k2.RnntDecodingStreams(individual_streams, config) + + for t in range(T): + shape, contexts = decoding_streams.get_contexts() + contexts = contexts.to(torch.int64) + + decoder_out = self.forward_decoder(contexts) + + cur_encoder_out = torch.index_select( + encoder_out[:, t:t + 1, :], 0, shape.row_ids(1).to(torch.int64) + ) + + logits = self.forward_joiner(cur_encoder_out.squeeze(1), + decoder_out) + + logits = logits.squeeze(1).squeeze(1).float() + log_probs = (logits / self.temperature).log_softmax(dim=-1) + decoding_streams.advance(log_probs) + decoding_streams.terminate_and_flush_to_streams() + lattice = decoding_streams.format_output(encoder_out_lens.tolist()) + + best_path = k2.shortest_path(lattice, use_double_scores=True) + hyps_list = self.get_texts(best_path) + + return hyps_list + + def execute(self, requests): + """`execute` must be implemented in every Python model. `execute` + function receives a list of pb_utils.InferenceRequest as the only + argument. This function is called when an inference is requested + for this model. + + Parameters + ---------- + requests : list + A list of pb_utils.InferenceRequest + + Returns + ------- + list + A list of pb_utils.InferenceResponse. The length of this list must + be the same as `requests` + """ + # Every Python backend must iterate through list of requests and create + # an instance of pb_utils.InferenceResponse class for each of them. You + # should avoid storing any of the input Tensors in the class attributes + # as they will be overridden in subsequent inference requests. You can + # make a copy of the underlying NumPy array and store it if it is + # required. + + batch_encoder_out_list, batch_encoder_lens_list = [], [] + batchsize_lists = [] + total_seqs = 0 + encoder_max_len = 0 + + for request in requests: + # Perform inference on the request and append it to responses list... 
+ in_0 = pb_utils.get_input_tensor_by_name(request, "encoder_out") + in_1 = pb_utils.get_input_tensor_by_name(request, "encoder_out_lens") + assert not in_0.is_cpu() + batch_encoder_out_list.append(from_dlpack(in_0.to_dlpack())) + encoder_max_len = max(encoder_max_len, batch_encoder_out_list[-1].shape[1]) + cur_b_lens = from_dlpack(in_1.to_dlpack()) + batch_encoder_lens_list.append(cur_b_lens) + cur_batchsize = cur_b_lens.shape[0] + batchsize_lists.append(cur_batchsize) + total_seqs += cur_batchsize + + encoder_out = torch.zeros((total_seqs, encoder_max_len, self.encoder_dim), + dtype=self.torch_dtype, device=self.device) + encoder_out_lens = torch.zeros(total_seqs, dtype=torch.int64) + st = 0 + + for b in batchsize_lists: + t = batch_encoder_out_list.pop(0) + encoder_out[st:st + b, 0:t.shape[1]] = t + encoder_out_lens[st:st + b] = batch_encoder_lens_list.pop(0) + st += b + + if self.decoding_method == 'greedy_search': + ans = self.greedy_search(encoder_out, encoder_out_lens) + elif self.decoding_method == 'fast_beam_search': + ans = self.fast_beam_search(encoder_out, encoder_out_lens) + else: + raise NotImplementedError + + results = [] + if hasattr(self.tokenizer, 'token_table'): + for i in range(len(ans)): + results.append([self.tokenizer.token_table[idx] for idx in ans[i]]) + else: + for hyp in self.tokenizer.decode(ans): + results.append(hyp.split()) + st = 0 + responses = [] + for b in batchsize_lists: + sents = np.array(results[st:st + b]) + out0 = pb_utils.Tensor("OUTPUT0", sents.astype(self.out0_dtype)) + inference_response = pb_utils.InferenceResponse(output_tensors=[out0]) + responses.append(inference_response) + st += b + return responses + + def finalize(self): + """`finalize` is called only once when the model is being unloaded. + Implementing `finalize` function is optional. This function allows + the model to perform any necessary clean ups before exit. + """ + print('Cleaning up...') diff --git a/triton/model_repo_offline/scorer/config.pbtxt.template b/triton/model_repo_offline/scorer/config.pbtxt.template new file mode 100755 index 000000000..9e4d2da97 --- /dev/null +++ b/triton/model_repo_offline/scorer/config.pbtxt.template @@ -0,0 +1,84 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
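The offline scorer above batches the encoder output of all pending requests and decodes it with `torch.nn.utils.rnn.pack_padded_sequence`, so each step of the greedy loop only touches the sequences that still have frames left. Below is a minimal, self-contained sketch of that bookkeeping with the decoder and joiner replaced by a random linear stub (toy shapes; `toy_greedy_search` and its defaults are hypothetical, not part of this repo). It only illustrates how the active batch shrinks and how the original request order is restored.

```python
import torch

def toy_greedy_search(encoder_out, encoder_out_lens, context_size=2, blank_id=0, vocab_size=500):
    # Pack the padded batch: packed.batch_sizes says, per time step, how many
    # sequences are still active (sorted longest-first internally).
    packed = torch.nn.utils.rnn.pack_padded_sequence(
        encoder_out, encoder_out_lens.cpu(), batch_first=True, enforce_sorted=False)
    hyps = [[blank_id] * context_size for _ in range(encoder_out.shape[0])]
    joiner_stub = torch.nn.Linear(encoder_out.shape[-1], vocab_size)  # stand-in for decoder + joiner
    offset = 0
    for batch_size in packed.batch_sizes.tolist():
        cur = packed.data[offset:offset + batch_size]   # frames of the still-active sequences
        offset += batch_size
        y = joiner_stub(cur).argmax(dim=1).tolist()
        for i, v in enumerate(y):
            if v != blank_id:                           # the real scorer also skips unk_id
                hyps[i].append(v)
    # Undo the internal length-sorting to restore the caller's request order.
    order = packed.unsorted_indices.tolist()
    return [hyps[order[i]][context_size:] for i in range(encoder_out.shape[0])]

print(toy_greedy_search(torch.randn(3, 10, 512), torch.tensor([10, 7, 4])))
```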
+ +name: "scorer" +backend: "python" +max_batch_size: MAX_BATCH + +parameters [ + { + key: "context_size", + value: { string_value: "DECODER_CONTEXT_SIZE"} + }, + { + key: "tokenizer_file", + value: { string_value: "TOKENIZER_FILE"} + }, + { + key: "FORCE_CPU_ONLY_INPUT_TENSORS", + value: {string_value:"no"} + }, + { + key: "decoding_method", + value: { string_value: "greedy_search"} + }, + { + key: "beam", + value: { string_value: "4"} + }, + { + key: "max_contexts", + value: { string_value: "4"} + }, + { + key: "max_states", + value: { string_value: "32"} + }, + { + key: "temperature", + value: { string_value: "1.0"} + } +] + + +input [ + { + name: "encoder_out" + data_type: TYPE_FP32 + dims: [-1, ENCODER_DIM] + }, + { + name: "encoder_out_lens" + data_type: TYPE_INT64 + dims: [1] + reshape: { shape: [ ] } + } +] + +output [ + { + name: "OUTPUT0" + data_type: TYPE_STRING + dims: [1] + } +] + +dynamic_batching { + } +instance_group [ + { + count: SCORER_INSTANCE_NUM + kind: KIND_CPU + } + ] diff --git a/triton/model_repo_offline/transducer/1/.gitkeep b/triton/model_repo_offline/transducer/1/.gitkeep new file mode 100755 index 000000000..e69de29bb diff --git a/triton/model_repo_offline/transducer/config.pbtxt.template b/triton/model_repo_offline/transducer/config.pbtxt.template new file mode 120000 index 000000000..1b4226572 --- /dev/null +++ b/triton/model_repo_offline/transducer/config.pbtxt.template @@ -0,0 +1 @@ +../../model_repo_streaming/transducer/config.pbtxt.template \ No newline at end of file diff --git a/triton/model_repo_streaming/decoder/1/.gitkeep b/triton/model_repo_streaming/decoder/1/.gitkeep new file mode 100755 index 000000000..e69de29bb diff --git a/triton/model_repo_streaming/decoder/config.pbtxt.template b/triton/model_repo_streaming/decoder/config.pbtxt.template new file mode 100755 index 000000000..d7a28fbb1 --- /dev/null +++ b/triton/model_repo_streaming/decoder/config.pbtxt.template @@ -0,0 +1,44 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +name: "decoder" +backend: "onnxruntime" +default_model_filename: "decoder.onnx" + +max_batch_size: MAX_BATCH +input [ + { + name: "y" + data_type: TYPE_INT64 + dims: [DECODER_CONTEXT_SIZE] + } +] + +output [ + { + name: "decoder_out" + data_type: TYPE_FP16 + dims: [1, DECODER_DIM] + } +] + +dynamic_batching { + } + +instance_group [ + { + count: DECODER_INSTANCE_NUM + kind: KIND_GPU + } +] \ No newline at end of file diff --git a/triton/model_repo_streaming/encoder/1/.gitignore b/triton/model_repo_streaming/encoder/1/.gitignore new file mode 100755 index 000000000..e69de29bb diff --git a/triton/model_repo_streaming/encoder/config.pbtxt.template b/triton/model_repo_streaming/encoder/config.pbtxt.template new file mode 100755 index 000000000..eb19b3886 --- /dev/null +++ b/triton/model_repo_streaming/encoder/config.pbtxt.template @@ -0,0 +1,99 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +name: "encoder" +backend: "onnxruntime" +default_model_filename: "encoder.onnx" + +max_batch_size: MAX_BATCH + +sequence_batching{ + max_sequence_idle_microseconds: 5000000 + oldest { + max_candidate_sequences: 1024 + max_queue_delay_microseconds: 5000 + } + control_input [ + ] + state [ + { + input_name: "attn_cache" + output_name: "next_attn_cache" + data_type: TYPE_FP16 + dims: [ ENCODER_LEFT_CONTEXT, ENCODER_LAYERS, ENCODER_DIM ] + initial_state: { + data_type: TYPE_FP16 + dims: [ ENCODER_LEFT_CONTEXT, ENCODER_LAYERS, ENCODER_DIM ] + zero_data: true + name: "initial state" + } + }, + { + input_name: "cnn_cache" + output_name: "next_cnn_cache" + data_type: TYPE_FP16 + dims: [ CNN_MODULE_KERNEL_MINUS_ONE, ENCODER_LAYERS, ENCODER_DIM ] + initial_state: { + data_type: TYPE_FP16 + dims: [CNN_MODULE_KERNEL_MINUS_ONE, ENCODER_LAYERS, ENCODER_DIM] + zero_data: true + name: "initial state" + } + }, + { + input_name: "processed_lens" + output_name: "next_processed_lens" + data_type: TYPE_INT64 + dims: [ 1 ] + initial_state: { + data_type: TYPE_INT64 + dims: [ 1 ] + zero_data: true + name: "initial state" + } + } + ] +} +input [ + { + name: "speech" + data_type: TYPE_FP16 + dims: [-1, 80] + }, + { + name: "speech_lengths" + data_type: TYPE_INT64 + dims: [ 1 ] + reshape: { shape: [] } + } +] +output [ + { + name: "encoder_out" + data_type: TYPE_FP16 + dims: [-1, ENCODER_DIM] + }, + { + name: "encoder_out_lens" + data_type: TYPE_INT64 + dims: [1] + reshape: { shape: [] } + } +] +instance_group [ + { + count: ENCODER_INSTANCE_NUM + kind: KIND_GPU + } +] \ No newline at end of file diff --git a/triton/model_repo_streaming/feature_extractor/1/model.py b/triton/model_repo_streaming/feature_extractor/1/model.py new file mode 100755 index 000000000..c84845206 --- /dev/null +++ b/triton/model_repo_streaming/feature_extractor/1/model.py @@ -0,0 +1,266 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
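The `state` section in the streaming encoder config above makes Triton manage the caches implicitly: for each correlation ID it feeds the `next_attn_cache`/`next_cnn_cache`/`next_processed_lens` outputs of one chunk back as the corresponding inputs of the next chunk, starting from zero-filled tensors. The pure-NumPy toy below (shapes taken from the values the streaming build script substitutes; `fake_encoder` is a stub, not the real ONNX encoder) only illustrates that feedback loop.

```python
import numpy as np

# Stand-ins for the template placeholders (values from the streaming build script):
LEFT_CTX, LAYERS, DIM, KERNEL_M1 = 64, 12, 512, 30

def fake_encoder(speech, attn_cache, cnn_cache, processed_lens):
    # A real deployment runs encoder.onnx here; this stub only updates the caches.
    next_attn = np.roll(attn_cache, -1, axis=0)
    next_cnn = cnn_cache                      # left unchanged in this toy
    next_lens = processed_lens + speech.shape[0]
    return speech, next_attn, next_cnn, next_lens

state = {}   # what Triton keeps per correlation ID (CORRID)

def infer_chunk(corrid, speech):
    if corrid not in state:                   # new sequence: start from the zero-filled initial_state
        state[corrid] = (
            np.zeros((LEFT_CTX, LAYERS, DIM), np.float16),    # attn_cache
            np.zeros((KERNEL_M1, LAYERS, DIM), np.float16),   # cnn_cache
            np.zeros((1,), np.int64),                         # processed_lens
        )
    out, *new_state = fake_encoder(speech, *state[corrid])
    state[corrid] = tuple(new_state)          # next_* outputs become the next chunk's inputs
    return out

infer_chunk(corrid=17, speech=np.zeros((67, 80), np.float16))
infer_chunk(corrid=17, speech=np.zeros((67, 80), np.float16))
```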
+ +import triton_python_backend_utils as pb_utils +from torch.utils.dlpack import from_dlpack +import torch +import kaldifeat +import _kaldifeat +from typing import List +import json +import numpy as np + +class Fbank(torch.nn.Module): + def __init__(self, opts): + super(Fbank, self).__init__() + self.fbank = kaldifeat.Fbank(opts) + + def forward(self, waves: List[torch.Tensor]): + return self.fbank(waves) + +class Feat(object): + def __init__(self, seqid, offset_ms, sample_rate, + first_chunk_sz, frame_stride, device='cpu'): + self.seqid = seqid + self.sample_rate = sample_rate + self.wav = torch.tensor([], device=device) + self.offset = int(offset_ms / 1000 * sample_rate) + self.frames = None + self.frame_stride = int(frame_stride) + self.first_chunk_sz = first_chunk_sz + self.device = device + + def add_wavs(self, wav: torch.tensor): + if len(self.wav) == 0 and len(wav) < self.first_chunk_sz: + raise Exception("Invalid first chunk size", len(wav), self.first_chunk_sz) + wav = wav.to(self.device) + self.wav = torch.cat([self.wav, wav], axis=0) + + def get_seg_wav(self): + seg = self.wav[:] + self.wav = self.wav[-self.offset:] + return seg + + def add_frames(self, frames: torch.tensor): + """ + frames: seq_len x feat_sz + """ + if self.frames is None: + self.frames = frames + else: + self.frames = torch.cat([self.frames, frames], axis=0) + + def get_frames(self, num_frames: int): + seg = self.frames[0: num_frames] + self.frames = self.frames[self.frame_stride:] + return seg + +class TritonPythonModel: + """Your Python model must use the same class name. Every Python model + that is created must have "TritonPythonModel" as the class name. + """ + + def initialize(self, args): + """`initialize` is called only once when the model is being loaded. + Implementing `initialize` function is optional. This function allows + the model to initialize any state associated with this model. + Parameters + ---------- + args : dict + Both keys and values are strings. 
The dictionary keys and values are: + * model_config: A JSON string containing the model configuration + * model_instance_kind: A string containing model instance kind + * model_instance_device_id: A string containing model instance device ID + * model_repository: Model repository path + * model_version: Model version + * model_name: Model name + """ + self.model_config = model_config = json.loads(args['model_config']) + self.max_batch_size = max(model_config["max_batch_size"], 1) + + if "GPU" in model_config["instance_group"][0]["kind"]: + self.device = "cuda" + else: + self.device = "cpu" + + # Get OUTPUT0 configuration + output0_config = pb_utils.get_output_config_by_name( + model_config, "x") + # Convert Triton types to numpy types + self.output0_dtype = pb_utils.triton_string_to_numpy( + output0_config['data_type']) + + if self.output0_dtype == np.float32: + self.dtype = torch.float32 + else: + self.dtype = torch.float16 + + self.feature_size = output0_config['dims'][-1] + self.decoding_window = output0_config['dims'][-2] + # Get OUTPUT1 configuration + output1_config = pb_utils.get_output_config_by_name( + model_config, "x_lens") + # Convert Triton types to numpy types + self.output1_dtype = pb_utils.triton_string_to_numpy( + output1_config['data_type']) + + feat_opt = self.parse_model_params(model_config["parameters"]) + + opts = kaldifeat.FbankOptions() + opts.frame_opts.dither = 0 + opts.mel_opts.num_bins = self.feature_size + frame_length_ms = feat_opt["frame_length_ms"] + frame_shift_ms = feat_opt["frame_shift_ms"] + opts.frame_opts.frame_length_ms = frame_length_ms + opts.frame_opts.frame_shift_ms = frame_shift_ms + opts.frame_opts.samp_freq = feat_opt["sample_rate"] + opts.device = torch.device(self.device) + self.opts = opts + self.feature_extractor = Fbank(self.opts) + self.seq_feat = {} + chunk_size_s = feat_opt["chunk_size_s"] + sample_rate = feat_opt["sample_rate"] + self.chunk_size = int(chunk_size_s * sample_rate) + self.frame_stride = (chunk_size_s * 1000) // frame_shift_ms + + first_chunk_size = int(self.chunk_size) + cur_frames = _kaldifeat.num_frames(first_chunk_size, opts.frame_opts) + while cur_frames < self.decoding_window: + first_chunk_size += frame_shift_ms * sample_rate // 1000 + cur_frames = _kaldifeat.num_frames(first_chunk_size, opts.frame_opts) + # self.pad_silence = first_chunk_size - self.chunk_size + self.first_chunk_size = first_chunk_size + self.offset_ms = self.get_offset(frame_length_ms, frame_shift_ms) + self.sample_rate = sample_rate + self.min_seg = frame_length_ms * sample_rate // 1000 + print("MIN SEG IS", self.min_seg) + + def get_offset(self, frame_length_ms, frame_shift_ms): + offset_ms = 0 + while offset_ms + frame_shift_ms < frame_length_ms: + offset_ms += frame_shift_ms + return offset_ms + + def parse_model_params(self, model_params): + model_p = { + "frame_length_ms": 25, + "frame_shift_ms": 10, + "sample_rate": 16000, + "decode_chunk_size": 0.64} + # get parameter configurations + for li in model_params.items(): + key, value = li + true_value = value["string_value"] + if key not in model_p: + continue + key_type = type(model_p[key]) + if key_type == type(None): + model_p[key] = true_value + else: + model_p[key] = key_type(true_value) + # convert frames after 4x subsampling into seconds + model_p["chunk_size_s"] = model_p["decode_chunk_size"] * 4 * 10 /1000 + return model_p + + def execute(self, requests): + """`execute` must be implemented in every Python model. 
`execute` + function receives a list of pb_utils.InferenceRequest as the only + argument. This function is called when an inference is requested + for this model. + Parameters + ---------- + requests : list + A list of pb_utils.InferenceRequest + Returns + ------- + list + A list of pb_utils.InferenceResponse. The length of this list must + be the same as `requests` + """ + total_waves = [] + responses = [] + batch_seqid = [] + end_seqid = {} + for request in requests: + input0 = pb_utils.get_input_tensor_by_name(request, "wav") + # wavs = input0.as_numpy()[0] + wavs = from_dlpack(input0.to_dlpack())[0] + + input1 = pb_utils.get_input_tensor_by_name(request, "wav_lens") + # wav_lens = input1.as_numpy()[0][0] + wav_lens = from_dlpack(input1.to_dlpack())[0] + + in_start = pb_utils.get_input_tensor_by_name(request, "START") + start = in_start.as_numpy()[0][0] + in_ready = pb_utils.get_input_tensor_by_name(request, "READY") + ready = in_ready.as_numpy()[0][0] + in_corrid = pb_utils.get_input_tensor_by_name(request, "CORRID") + corrid = in_corrid.as_numpy()[0][0] + in_end = pb_utils.get_input_tensor_by_name(request, "END") + end = in_end.as_numpy()[0][0] + + if start: + self.seq_feat[corrid] = Feat(corrid, self.offset_ms, + self.sample_rate, + self.first_chunk_size, + self.frame_stride, + self.device) + if ready: + self.seq_feat[corrid].add_wavs(wavs[0:wav_lens]) + + batch_seqid.append(corrid) + if end: + end_seqid[corrid] = 1 + + # if not start + # check chunk ms size + + # wav = self.seq_feat[corrid].get_seg_wav() * 32768 + wav = self.seq_feat[corrid].get_seg_wav() + if len(wav) < self.min_seg: + temp = torch.zeros(self.min_seg, dtype=torch.float32, + device=self.device) + temp[0:len(wav)] = wav[:] + wav = temp + total_waves.append(wav) + + features = self.feature_extractor(total_waves) + + batch_size = len(batch_seqid) + batch_speech = torch.zeros((batch_size, self.decoding_window, + self.feature_size), dtype=self.dtype) + batch_speech_lens = torch.zeros((batch_size, 1), dtype=torch.int64) + i = 0 + for corrid, frames in zip(batch_seqid, features): + self.seq_feat[corrid].add_frames(frames) + r_frames = self.seq_feat[corrid].get_frames(self.decoding_window) + speech = batch_speech[i: i + 1] + speech_lengths = batch_speech_lens[i: i + 1] + i += 1 + speech_lengths[0] = r_frames.size(0) + speech[0][0:r_frames.size(0)] = r_frames.to(speech.device) + # out_tensor0 = pb_utils.Tensor.from_dlpack("speech", to_dlpack(speech)) + # out_tensor1 = pb_utils.Tensor.from_dlpack("speech_lengths", + # to_dlpack(speech_lengths)) + out_tensor0 = pb_utils.Tensor("x", speech.numpy()) + out_tensor1 = pb_utils.Tensor("x_lens", speech_lengths.numpy().astype(np.int64)) + output_tensors = [out_tensor0, out_tensor1] + response = pb_utils.InferenceResponse(output_tensors=output_tensors) + responses.append(response) + if corrid in end_seqid: + del self.seq_feat[corrid] + return responses + + def finalize(self): + print("Remove feature extractor!") diff --git a/triton/model_repo_streaming/feature_extractor/config.pbtxt.template b/triton/model_repo_streaming/feature_extractor/config.pbtxt.template new file mode 100755 index 000000000..7bf74d79a --- /dev/null +++ b/triton/model_repo_streaming/feature_extractor/config.pbtxt.template @@ -0,0 +1,110 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +name: "feature_extractor" +backend: "python" +max_batch_size: MAX_BATCH + +parameters [ + { + key: "frame_length_ms", + value: { string_value: "25" } + }, + { + key: "frame_shift_ms" + value: { string_value: "10" } + }, + { + key: "sample_rate" + value: { string_value: "16000" } + }, + { + key: "decode_chunk_size", + value: { string_value: "DECODE_CHUNK_SIZE" } + } +] +sequence_batching{ + max_sequence_idle_microseconds: 5000000 + oldest { + max_candidate_sequences: 512 + } + control_input [ + { + name: "START", + control [ + { + kind: CONTROL_SEQUENCE_START + fp32_false_true: [0, 1] + } + ] + }, + { + name: "READY" + control [ + { + kind: CONTROL_SEQUENCE_READY + fp32_false_true: [0, 1] + } + ] + }, + { + name: "CORRID", + control [ + { + kind: CONTROL_SEQUENCE_CORRID + data_type: TYPE_UINT64 + } + ] + }, + { + name: "END", + control [ + { + kind: CONTROL_SEQUENCE_END + fp32_false_true: [0, 1] + } + ] + } + ] +} +input [ + { + name: "wav" + data_type: TYPE_FP16 + dims: [-1] + }, + { + name: "wav_lens" + data_type: TYPE_INT32 + dims: [1] + } +] +output [ + { + name: "x" + data_type: TYPE_FP16 + dims: [DECODE_WINDOW_SIZE, 80] + }, + { + name: "x_lens" + data_type: TYPE_INT64 + dims: [1] + } +] +instance_group [ + { + count: FEATURE_EXTRACTOR_INSTANCE_NUM + kind: KIND_GPU + } +] \ No newline at end of file diff --git a/triton/model_repo_streaming/joiner/1/.gitkeep b/triton/model_repo_streaming/joiner/1/.gitkeep new file mode 100755 index 000000000..e69de29bb diff --git a/triton/model_repo_streaming/joiner/config.pbtxt.template b/triton/model_repo_streaming/joiner/config.pbtxt.template new file mode 100755 index 000000000..883f90f53 --- /dev/null +++ b/triton/model_repo_streaming/joiner/config.pbtxt.template @@ -0,0 +1,49 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
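The `feature_extractor` template above is parameterised by DECODE_CHUNK_SIZE (frames counted after 4x subsampling) and DECODE_WINDOW_SIZE. A short worked example, plugging in the defaults the streaming build script substitutes later in this diff (chunk size 16, right context 4), shows how the chunk duration and the padded decoding window come out:

```python
# Worked numbers for the streaming feature extractor, using the defaults the
# build script substitutes: DECODE_CHUNK_SIZE counts frames after 4x
# subsampling, so it is converted to seconds with chunk * 4 * 10 ms, and the
# padded decoding window is (chunk + 2 + right_context) * 4 + 3 frames.
decode_chunk_size = 16      # DECODE_CHUNK_SIZE
right_context = 4           # ENCODER_RIGHT_CONTEXT
sample_rate = 16000

chunk_size_s = decode_chunk_size * 4 * 10 / 1000                  # 0.64 s
chunk_samples = int(chunk_size_s * sample_rate)                   # 10240 samples per chunk
decode_window = (decode_chunk_size + 2 + right_context) * 4 + 3   # 91 feature frames

print(chunk_size_s, chunk_samples, decode_window)                 # 0.64 10240 91
```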
+ +name: "joiner" +backend: "onnxruntime" +default_model_filename: "joiner.onnx" + +max_batch_size: MAX_BATCH +input [ + { + name: "encoder_out" + data_type: TYPE_FP16 + dims: [ENCODER_DIM] + }, + { + name: "decoder_out" + data_type: TYPE_FP16 + dims: [DECODER_DIM] + } +] + +output [ + { + name: "logit" + data_type: TYPE_FP16 + dims: [VOCAB_SIZE] + } +] + +dynamic_batching { + } + +instance_group [ + { + count: JOINER_INSTANCE_NUM + kind: KIND_GPU + } +] \ No newline at end of file diff --git a/triton/model_repo_streaming/scorer/1/model.py b/triton/model_repo_streaming/scorer/1/model.py new file mode 100755 index 000000000..c2d4b10f1 --- /dev/null +++ b/triton/model_repo_streaming/scorer/1/model.py @@ -0,0 +1,375 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import triton_python_backend_utils as pb_utils +import numpy as np +import json +import torch +import sentencepiece as spm +from typing import Union, List + +from icefall.lexicon import Lexicon +import k2 + +class TritonPythonModel: + """Your Python model must use the same class name. Every Python model + that is created must have "TritonPythonModel" as the class name. + """ + + def initialize(self, args): + """`initialize` is called only once when the model is being loaded. + Implementing `initialize` function is optional. This function allows + the model to initialize any state associated with this model. + + Parameters + ---------- + args : dict + Both keys and values are strings. 
The dictionary keys and values are: + * model_config: A JSON string containing the model configuration + * model_instance_kind: A string containing model instance kind + * model_instance_device_id: A string containing model instance device ID + * model_repository: Model repository path + * model_version: Model version + * model_name: Model name + """ + self.model_config = model_config = json.loads(args['model_config']) + self.max_batch_size = max(model_config["max_batch_size"], 1) + + # Get OUTPUT0 configuration + output0_config = pb_utils.get_output_config_by_name( + model_config, "OUTPUT0") + # Convert Triton types to numpy types + self.out0_dtype = pb_utils.triton_string_to_numpy( + output0_config['data_type']) + + # Get INPUT configuration + if "GPU" in args['model_instance_kind']: + self.device = f"cuda:{args['model_instance_device_id']}" + else: + self.device = "cpu" + + encoder_config = pb_utils.get_input_config_by_name( + model_config, "encoder_out") + self.data_type = pb_utils.triton_string_to_numpy( + encoder_config['data_type']) + + self.encoder_dim = encoder_config['dims'][-1] + + + self.init_sentence_piece(self.model_config['parameters']) + + self.decoding_method = self.model_config['parameters']['decoding_method'] + # parameters for fast beam search + if self.decoding_method == 'fast_beam_search': + self.temperature = float(self.model_config['parameters']['temperature']) + + self.beam = int(self.model_config['parameters']['beam']) + self.max_contexts = int(self.model_config['parameters']['max_contexts']) + self.max_states = int(self.model_config['parameters']['max_states']) + + self.fast_beam_config = k2.RnntDecodingConfig( + vocab_size=self.vocab_size, + decoder_history_len=self.context_size, + beam=self.beam, + max_contexts=self.max_contexts, + max_states=self.max_states, + ) + + self.decoding_graph = k2.trivial_graph( + self.vocab_size - 1, device=self.device + ) + # use to record every sequence state + self.seq_states = {} + print("Finish Init") + + def init_sentence_piece(self, parameters): + for key,value in parameters.items(): + parameters[key] = value["string_value"] + self.context_size = int(parameters['context_size']) + if 'bpe' in parameters['tokenizer_file']: + sp = spm.SentencePieceProcessor() + sp.load(parameters['tokenizer_file']) + self.blank_id = sp.piece_to_id("") + self.unk_id = sp.piece_to_id("") + self.vocab_size = sp.get_piece_size() + self.tokenizer = sp + else: + assert 'char' in parameters['tokenizer_file'] + lexicon = Lexicon(parameters['tokenizer_file']) + self.unk_id = lexicon.token_table[""] + self.blank_id = lexicon.token_table[""] + self.vocab_size = max(lexicon.tokens) + 1 + self.tokenizer = lexicon + + + def forward_joiner(self, cur_encoder_out, decoder_out): + in_joiner_tensor_0 = pb_utils.Tensor("encoder_out", cur_encoder_out.cpu().numpy()) + in_joiner_tensor_1 = pb_utils.Tensor("decoder_out", decoder_out.cpu().numpy()) + + inference_request = pb_utils.InferenceRequest( + model_name='joiner', + requested_output_names=['logit'], + inputs=[in_joiner_tensor_0, in_joiner_tensor_1]) + inference_response = inference_request.exec() + if inference_response.has_error(): + raise pb_utils.TritonModelException(inference_response.error().message()) + else: + # Extract the output tensors from the inference response. 
+ logits = pb_utils.get_output_tensor_by_name(inference_response, + 'logit') + logits = torch.utils.dlpack.from_dlpack(logits.to_dlpack()).cpu() + assert len(logits.shape) == 2, logits.shape + return logits + + + def forward_decoder(self,contexts): + decoder_input = np.asarray(contexts, dtype=np.int64) + in_decoder_input_tensor = pb_utils.Tensor("y", decoder_input) + + inference_request = pb_utils.InferenceRequest( + model_name='decoder', + requested_output_names=['decoder_out'], + inputs=[in_decoder_input_tensor]) + + inference_response = inference_request.exec() + if inference_response.has_error(): + raise pb_utils.TritonModelException(inference_response.error().message()) + else: + # Extract the output tensors from the inference response. + decoder_out = pb_utils.get_output_tensor_by_name(inference_response, + 'decoder_out') + decoder_out = torch.utils.dlpack.from_dlpack(decoder_out.to_dlpack()).cpu() + assert len(decoder_out.shape)==3, decoder_out.shape + decoder_out = decoder_out.squeeze(1) + return decoder_out + + def greedy_search(self, encoder_out, encoder_out_lens, hyps_list): + emitted = False + # add blank_id as prefix + hyps_list = [[self.blank_id] * self.context_size + h for h in hyps_list] + contexts = [h[-self.context_size:] for h in hyps_list] + decoder_out = self.forward_decoder(contexts) + assert encoder_out.shape[0] == decoder_out.shape[0] + for t in range(encoder_out.shape[1]): + if emitted: + contexts = [h[-self.context_size:] for h in hyps_list] + decoder_out = self.forward_decoder(contexts) + + cur_encoder_out = encoder_out[:,t] + logits = self.forward_joiner(cur_encoder_out, decoder_out) + + assert logits.ndim == 2, logits.shape + y = logits.argmax(dim=1).tolist() + + for i, v in enumerate(y): + if v not in (self.blank_id, self.unk_id): + hyps_list[i].append(v) + emitted = True + # remove prefix blank_id + hyps_list = [h[self.context_size:] for h in hyps_list] + return hyps_list + + # From k2 utils.py + def get_texts(self, + best_paths: k2.Fsa, return_ragged: bool = False + ) -> Union[List[List[int]], k2.RaggedTensor]: + """Extract the texts (as word IDs) from the best-path FSAs. + Args: + best_paths: + A k2.Fsa with best_paths.arcs.num_axes() == 3, i.e. + containing multiple FSAs, which is expected to be the result + of k2.shortest_path (otherwise the returned values won't + be meaningful). + return_ragged: + True to return a ragged tensor with two axes [utt][word_id]. + False to return a list-of-list word IDs. + Returns: + Returns a list of lists of int, containing the label sequences we + decoded. + """ + if isinstance(best_paths.aux_labels, k2.RaggedTensor): + # remove 0's and -1's. + aux_labels = best_paths.aux_labels.remove_values_leq(0) + # TODO: change arcs.shape() to arcs.shape + aux_shape = best_paths.arcs.shape().compose(aux_labels.shape) + + # remove the states and arcs axes. + aux_shape = aux_shape.remove_axis(1) + aux_shape = aux_shape.remove_axis(1) + aux_labels = k2.RaggedTensor(aux_shape, aux_labels.values) + else: + # remove axis corresponding to states. + aux_shape = best_paths.arcs.shape().remove_axis(1) + aux_labels = k2.RaggedTensor(aux_shape, best_paths.aux_labels) + # remove 0's and -1's. 
+ aux_labels = aux_labels.remove_values_leq(0) + + assert aux_labels.num_axes == 2 + if return_ragged: + return aux_labels + else: + return aux_labels.tolist() + + def fast_beam_search(self, encoder_out, encoder_out_lens, states_list): + streams_list = [state[0] for state in states_list] + processed_lens_list = [state[1] for state in states_list] + + decoding_streams = k2.RnntDecodingStreams(streams_list, self.fast_beam_config) + + for t in range(encoder_out.shape[1]): + shape, contexts = decoding_streams.get_contexts() + contexts = contexts.to(torch.int64) + + decoder_out = self.forward_decoder(contexts) + + cur_encoder_out = torch.index_select( + encoder_out[:, t, :], 0, shape.row_ids(1).to(torch.int64) + ) + + logits = self.forward_joiner(cur_encoder_out, + decoder_out) + + logits = logits.squeeze(1).squeeze(1).float() + log_probs = (logits / self.temperature).log_softmax(dim=-1) + decoding_streams.advance(log_probs) + decoding_streams.terminate_and_flush_to_streams() + lattice = decoding_streams.format_output(processed_lens_list) + + best_path = k2.shortest_path(lattice, use_double_scores=True) + hyps_list = self.get_texts(best_path) + + return hyps_list + + def execute(self, requests): + """`execute` must be implemented in every Python model. `execute` + function receives a list of pb_utils.InferenceRequest as the only + argument. This function is called when an inference is requested + for this model. + + Parameters + ---------- + requests : list + A list of pb_utils.InferenceRequest + + Returns + ------- + list + A list of pb_utils.InferenceResponse. The length of this list must + be the same as `requests` + """ + # Every Python backend must iterate through list of requests and create + # an instance of pb_utils.InferenceResponse class for each of them. You + # should avoid storing any of the input Tensors in the class attributes + # as they will be overridden in subsequent inference requests. You can + # make a copy of the underlying NumPy array and store it if it is + # required. + batch_encoder_out_list, batch_encoder_lens_list = [], [] + batch_idx = 0 + encoder_max_len = 0 + + batch_idx2_corrid = {} + + states_list = [] + end_idx = set() + + for request in requests: + # Perform inference on the request and append it to responses list... + in_0 = pb_utils.get_input_tensor_by_name(request, "encoder_out") + in_1 = pb_utils.get_input_tensor_by_name(request, "encoder_out_lens") + + # TODO: directly use torch tensor from_dlpack(in_0.to_dlpack()) + batch_encoder_out_list.append(in_0.as_numpy()) + # For streaming ASR, assert each request sent from client has batch size 1. 
+ assert batch_encoder_out_list[-1].shape[0] == 1 + encoder_max_len = max(encoder_max_len, batch_encoder_out_list[-1].shape[1]) + + cur_b_lens = in_1.as_numpy() + batch_encoder_lens_list.append(cur_b_lens) + + in_start = pb_utils.get_input_tensor_by_name(request, "START") + start = in_start.as_numpy()[0][0] + + in_ready = pb_utils.get_input_tensor_by_name(request, "READY") + ready = in_ready.as_numpy()[0][0] + + in_corrid = pb_utils.get_input_tensor_by_name(request, "CORRID") + corrid = in_corrid.as_numpy()[0][0] + + in_end = pb_utils.get_input_tensor_by_name(request, "END") + end = in_end.as_numpy()[0][0] + + if start and ready: + # intialize states + if self.decoding_method == 'fast_beam_search': + processed_len = cur_b_lens + state = [k2.RnntDecodingStream(self.decoding_graph), processed_len] + else: + state = [] + self.seq_states[corrid] = state + + if end and ready: + end_idx.add(batch_idx) + + if ready: + state = self.seq_states[corrid] + batch_idx2_corrid[batch_idx] = corrid + states_list.append(state) + + batch_idx += 1 + + encoder_out_array = np.zeros((batch_idx, encoder_max_len, self.encoder_dim), + dtype=self.data_type) + encoder_out_lens_array = np.zeros(batch_idx, dtype=np.int32) + + for i, t in enumerate(batch_encoder_out_list): + encoder_out_array[i, 0:t.shape[1]] = t + encoder_out_lens_array[i] = batch_encoder_lens_list[i] + + encoder_out = torch.from_numpy(encoder_out_array) + encoder_out_lens = torch.from_numpy(encoder_out_lens_array) + + if self.decoding_method == "fast_beam_search": + hyps_list = self.fast_beam_search(encoder_out, encoder_out_lens, states_list) + else: + hyps_list = self.greedy_search(encoder_out, encoder_out_lens, states_list) + + responses = [] + for i in range(len(hyps_list)): + hyp = hyps_list[i] + if hasattr(self.tokenizer, 'token_table'): + sent = [self.tokenizer.token_table[idx] for idx in hyp] + else: + sent = self.tokenizer.decode(hyp).split() + sent = np.array(sent) + out_tensor_0 = pb_utils.Tensor("OUTPUT0", sent.astype(self.out0_dtype)) + inference_response = pb_utils.InferenceResponse(output_tensors=[out_tensor_0]) + responses.append(inference_response) + # update states + corr = batch_idx2_corrid[i] + if i in end_idx: + del self.seq_states[corr] + else: + if self.decoding_method == 'fast_beam_search': + self.seq_states[corr][1] += batch_encoder_lens_list[i] # stream decoding state is updated in fast_beam_search + else: + self.seq_states[corr] = hyp + + assert len(requests) == len(responses) + return responses + + def finalize(self): + """`finalize` is called only once when the model is being unloaded. + Implementing `finalize` function is optional. This function allows + the model to perform any necessary clean ups before exit. + """ + print('Cleaning up...') \ No newline at end of file diff --git a/triton/model_repo_streaming/scorer/config.pbtxt.template b/triton/model_repo_streaming/scorer/config.pbtxt.template new file mode 100755 index 000000000..ede46670d --- /dev/null +++ b/triton/model_repo_streaming/scorer/config.pbtxt.template @@ -0,0 +1,128 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +name: "scorer" +backend: "python" +max_batch_size: MAX_BATCH + +sequence_batching{ + max_sequence_idle_microseconds: 5000000 + oldest { + max_candidate_sequences: 1024 + } + control_input [ + { + name: "START", + control [ + { + kind: CONTROL_SEQUENCE_START + fp32_false_true: [0, 1] + } + ] + }, + { + name: "READY" + control [ + { + kind: CONTROL_SEQUENCE_READY + fp32_false_true: [0, 1] + } + ] + }, + { + name: "CORRID", + control [ + { + kind: CONTROL_SEQUENCE_CORRID + data_type: TYPE_UINT64 + } + ] + }, + { + name: "END", + control [ + { + kind: CONTROL_SEQUENCE_END + fp32_false_true: [0, 1] + } + ] + } + ] +} + + +parameters [ + { + key: "context_size", + value: { string_value: "DECODER_CONTEXT_SIZE"} + }, + { + key: "tokenizer_file", + value: { string_value: "TOKENIZER_FILE"} + }, + { + key: "FORCE_CPU_ONLY_INPUT_TENSORS", + value: {string_value:"yes"} + }, + { + key: "decoding_method", + value: { string_value: "greedy_search"} # fast_beam_search + }, + { + key: "beam", + value: { string_value: "4"} + }, + { + key: "max_contexts", + value: { string_value: "4"} + }, + { + key: "max_states", + value: { string_value: "32"} + }, + { + key: "temperature", + value: { string_value: "1.0"} + } +] + + +input [ + { + name: "encoder_out" + data_type: TYPE_FP16 + dims: [-1, ENCODER_DIM] + }, + { + name: "encoder_out_lens" + data_type: TYPE_INT64 + dims: [1] + reshape: { shape: [ ] } + } +] + +output [ + { + name: "OUTPUT0" + data_type: TYPE_STRING + dims: [1] + } +] + +instance_group [ + { + count: SCORER_INSTANCE_NUM + kind: KIND_CPU + } + ] diff --git a/triton/model_repo_streaming/transducer/1/.gitkeep b/triton/model_repo_streaming/transducer/1/.gitkeep new file mode 100755 index 000000000..e69de29bb diff --git a/triton/model_repo/conformer_transducer/config.pbtxt b/triton/model_repo_streaming/transducer/config.pbtxt.template similarity index 70% rename from triton/model_repo/conformer_transducer/config.pbtxt rename to triton/model_repo_streaming/transducer/config.pbtxt.template index cb1532cbb..c506ec9e0 100755 --- a/triton/model_repo/conformer_transducer/config.pbtxt +++ b/triton/model_repo_streaming/transducer/config.pbtxt.template @@ -12,14 +12,14 @@ # See the License for the specific language governing permissions and # limitations under the License. 
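The streaming scorer above keeps one entry per correlation ID in `self.seq_states`: the entry is created on the START flag, read and updated for every READY chunk, and deleted on END. The toy below (plain Python with Triton and the decoder/joiner calls stripped away; `on_chunk` and its arguments are hypothetical) mirrors that lifecycle for the greedy-search case, where the state is simply the running hypothesis.

```python
# Plain-Python toy of the per-sequence state kept by the streaming scorer.
seq_states = {}

def on_chunk(corrid, start, end, new_tokens):
    if start:
        seq_states[corrid] = []                               # START: create an empty hypothesis
    seq_states[corrid] = seq_states[corrid] + new_tokens      # READY: extend with this chunk's tokens
    hyp = seq_states[corrid]
    if end:
        del seq_states[corrid]                                # END: drop the finished sequence
    return hyp

on_chunk(7, start=True, end=False, new_tokens=[23, 45])
print(on_chunk(7, start=False, end=True, new_tokens=[99]))    # [23, 45, 99]
```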
-name: "conformer_transducer" +name: "transducer" platform: "ensemble" -max_batch_size: 512 #MAX_BATCH +max_batch_size: MAX_BATCH input [ { name: "WAV" - data_type: TYPE_FP32 + data_type: TYPE_FP16 dims: [-1] }, { @@ -51,44 +51,44 @@ ensemble_scheduling { value: "WAV_LENS" } output_map { - key: "speech__0" - value: "SPEECH" + key: "x" + value: "x" } output_map { - key: "speech_lengths__1" - value: "SPEECH_LENGTHS" + key: "x_lens" + value: "x_lens" } }, { model_name: "encoder" model_version: -1 input_map { - key: "speech__0" - value: "SPEECH" + key: "x" + value: "x" } input_map { - key: "speech_lengths__1" - value: "SPEECH_LENGTHS" + key: "x_lens" + value: "x_lens" } output_map { - key: "encoder_out__0" - value: "encoder_out__0" + key: "encoder_out" + value: "encoder_out" } output_map { - key: "encoder_out_lens__1" - value: "encoder_out_lens__1" + key: "encoder_out_lens" + value: "encoder_out_lens" } }, { - model_name: "greedy_search" + model_name: "scorer" model_version: -1 input_map { - key: "encoder_out__0" - value: "encoder_out__0" + key: "encoder_out" + value: "encoder_out" } input_map { - key: "encoder_out_lens__1" - value: "encoder_out_lens__1" + key: "encoder_out_lens" + value: "encoder_out_lens" } output_map { key: "OUTPUT0" @@ -96,4 +96,4 @@ ensemble_scheduling { } } ] -} +} \ No newline at end of file diff --git a/triton/model_repo_streaming_zipformer/decoder/1/.gitkeep b/triton/model_repo_streaming_zipformer/decoder/1/.gitkeep new file mode 100755 index 000000000..e69de29bb diff --git a/triton/model_repo_streaming_zipformer/decoder/config.pbtxt.template b/triton/model_repo_streaming_zipformer/decoder/config.pbtxt.template new file mode 120000 index 000000000..0d552cd5c --- /dev/null +++ b/triton/model_repo_streaming_zipformer/decoder/config.pbtxt.template @@ -0,0 +1 @@ +../../model_repo_streaming/decoder/config.pbtxt.template \ No newline at end of file diff --git a/triton/model_repo_streaming_zipformer/encoder/1/.gitkeep b/triton/model_repo_streaming_zipformer/encoder/1/.gitkeep new file mode 100755 index 000000000..e69de29bb diff --git a/triton/model_repo_streaming_zipformer/encoder/config.pbtxt.template b/triton/model_repo_streaming_zipformer/encoder/config.pbtxt.template new file mode 100755 index 000000000..454fb2f17 --- /dev/null +++ b/triton/model_repo_streaming_zipformer/encoder/config.pbtxt.template @@ -0,0 +1,113 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +name: "encoder" +backend: "onnxruntime" +default_model_filename: "encoder.onnx" + +max_batch_size: MAX_BATCH + +sequence_batching{ + max_sequence_idle_microseconds: 5000000 + oldest { + max_candidate_sequences: 1024 + max_queue_delay_microseconds: 5000 + } + control_input [ + ] + state [ + { + input_name: "attn_cache" + output_name: "new_attn_cache" + data_type: TYPE_FP16 + dims: [ ENCODER_LEFT_CONTEXT, ENCODER_LAYERS_3X, ENCODER_DIM_HALF ] + initial_state: { + data_type: TYPE_FP16 + dims: [ ENCODER_LEFT_CONTEXT, ENCODER_LAYERS_3X, ENCODER_DIM_HALF ] + zero_data: true + name: "initial state" + } + }, + { + input_name: "cnn_cache" + output_name: "new_cnn_cache" + data_type: TYPE_FP16 + dims: [ ENCODER_LAYERS_2X, ENCODER_DIM , CNN_MODULE_KERNEL_MINUS_ONE] + initial_state: { + data_type: TYPE_FP16 + dims: [ ENCODER_LAYERS_2X, ENCODER_DIM , CNN_MODULE_KERNEL_MINUS_ONE] + zero_data: true + name: "initial state" + } + }, + + { + input_name: "avg_cache" + output_name: "new_avg_cache" + data_type: TYPE_FP16 + dims: [ ENCODER_LAYERS, ENCODER_DIM ] + initial_state: { + data_type: TYPE_FP16 + dims: [ ENCODER_LAYERS, ENCODER_DIM ] + zero_data: true + name: "initial state" + } + }, + + { + input_name: "len_cache" + output_name: "new_len_cache" + data_type: TYPE_INT64 + dims: [ ENCODER_LAYERS ] + initial_state: { + data_type: TYPE_INT64 + dims: [ ENCODER_LAYERS ] + zero_data: true + name: "initial state" + } + } + ] +} +input [ + { + name: "x" + data_type: TYPE_FP16 + dims: [-1, 80] + }, + { + name: "x_lens" + data_type: TYPE_INT64 + dims: [ 1 ] + reshape: { shape: [] } + } +] +output [ + { + name: "encoder_out" + data_type: TYPE_FP16 + dims: [-1, -1] + }, + { + name: "encoder_out_lens" + data_type: TYPE_INT64 + dims: [1] + reshape: { shape: [] } + } +] +instance_group [ + { + count: ENCODER_INSTANCE_NUM + kind: KIND_GPU + } +] \ No newline at end of file diff --git a/triton/model_repo_streaming_zipformer/feature_extractor/1/model.py b/triton/model_repo_streaming_zipformer/feature_extractor/1/model.py new file mode 120000 index 000000000..e79619511 --- /dev/null +++ b/triton/model_repo_streaming_zipformer/feature_extractor/1/model.py @@ -0,0 +1 @@ +../../../model_repo_streaming/feature_extractor/1/model.py \ No newline at end of file diff --git a/triton/model_repo_streaming_zipformer/feature_extractor/config.pbtxt.template b/triton/model_repo_streaming_zipformer/feature_extractor/config.pbtxt.template new file mode 120000 index 000000000..2114a93bb --- /dev/null +++ b/triton/model_repo_streaming_zipformer/feature_extractor/config.pbtxt.template @@ -0,0 +1 @@ +../../model_repo_streaming/feature_extractor/config.pbtxt.template \ No newline at end of file diff --git a/triton/model_repo_streaming_zipformer/joiner/1/.gitkeep b/triton/model_repo_streaming_zipformer/joiner/1/.gitkeep new file mode 100755 index 000000000..e69de29bb diff --git a/triton/model_repo_streaming_zipformer/joiner/config.pbtxt.template b/triton/model_repo_streaming_zipformer/joiner/config.pbtxt.template new file mode 120000 index 000000000..5f44e67be --- /dev/null +++ b/triton/model_repo_streaming_zipformer/joiner/config.pbtxt.template @@ -0,0 +1 @@ +../../model_repo_streaming/joiner/config.pbtxt.template \ No newline at end of file diff --git a/triton/model_repo_streaming_zipformer/scorer/1/model.py b/triton/model_repo_streaming_zipformer/scorer/1/model.py new file mode 120000 index 000000000..b25b7dbf8 --- /dev/null +++ b/triton/model_repo_streaming_zipformer/scorer/1/model.py @@ -0,0 +1 @@ 
+../../../model_repo_streaming/scorer/1/model.py \ No newline at end of file diff --git a/triton/model_repo_streaming_zipformer/scorer/config.pbtxt.template b/triton/model_repo_streaming_zipformer/scorer/config.pbtxt.template new file mode 120000 index 000000000..e8aa74c4e --- /dev/null +++ b/triton/model_repo_streaming_zipformer/scorer/config.pbtxt.template @@ -0,0 +1 @@ +../../model_repo_streaming/scorer/config.pbtxt.template \ No newline at end of file diff --git a/triton/model_repo_streaming_zipformer/transducer/1/.gitkeep b/triton/model_repo_streaming_zipformer/transducer/1/.gitkeep new file mode 100755 index 000000000..e69de29bb diff --git a/triton/model_repo_streaming_zipformer/transducer/config.pbtxt.template b/triton/model_repo_streaming_zipformer/transducer/config.pbtxt.template new file mode 120000 index 000000000..1b4226572 --- /dev/null +++ b/triton/model_repo_streaming_zipformer/transducer/config.pbtxt.template @@ -0,0 +1 @@ +../../model_repo_streaming/transducer/config.pbtxt.template \ No newline at end of file diff --git a/triton/requirements.txt b/triton/requirements.txt new file mode 100644 index 000000000..8049042dc --- /dev/null +++ b/triton/requirements.txt @@ -0,0 +1,3 @@ +onnx +onnxruntime-gpu +onnxmltools diff --git a/triton/scripts/build_librispeech_pruned_transducer_stateless3_offline.sh b/triton/scripts/build_librispeech_pruned_transducer_stateless3_offline.sh new file mode 100644 index 000000000..b3db5134f --- /dev/null +++ b/triton/scripts/build_librispeech_pruned_transducer_stateless3_offline.sh @@ -0,0 +1,126 @@ +#!/bin/bash +stage=-2 +stop_stage=2 + +# whether to use convert ONNX model to FP16 +fp16=true + +pretrained_model_dir=/workspace/icefall/egs/librispeech/ASR/icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13 +model_repo_path=./model_repo_offline + +# modify model specific parameters according to $pretrained_model_dir/exp/onnx_export.log +VOCAB_SIZE=500 + +DECODER_CONTEXT_SIZE=2 +DECODER_DIM=512 + +ENCODER_LAYERS=12 +ENCODER_DIM=512 +CNN_MODULE_KERNEL=31 + +if [ -d "$pretrained_model_dir/data/lang_char" ] +then + echo "pretrained model using char" + TOKENIZER_FILE=$pretrained_model_dir/data/lang_char +else + echo "pretrained model using bpe" + TOKENIZER_FILE=$pretrained_model_dir/data/lang_bpe_500/bpe.model +fi + +MAX_BATCH=512 +# model instance num +FEATURE_EXTRACTOR_INSTANCE_NUM=2 +ENCODER_INSTANCE_NUM=2 +JOINER_INSTANCE_NUM=1 +DECODER_INSTANCE_NUM=1 +SCORER_INSTANCE_NUM=2 + + +icefall_dir=/workspace/icefall +export PYTHONPATH=$PYTHONPATH:$icefall_dir +recipe_dir=$icefall_dir/egs/librispeech/ASR/pruned_transducer_stateless3 + +if [ ${stage} -le -2 ] && [ ${stop_stage} -ge -2 ]; then + if [ -d "$pretrained_model_dir" ] + then + echo "skip download pretrained model" + else + pushd $icefall_dir/egs/librispeech/ASR/ + GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13 + pushd icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13 + git lfs pull --include "exp/pretrained-iter-1224000-avg-14.pt" + git lfs pull --include "data/lang_bpe_500/bpe.model" + popd + ln -s ./icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/pretrained-iter-1224000-avg-14.pt ./icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/epoch-9999.pt + popd + fi +fi + +if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then + echo "export onnx" + cd $recipe_dir + ./export-onnx.py \ + --bpe-model $TOKENIZER_FILE \ + --epoch 9999 \ + --avg 1 \ + --exp-dir 
$pretrained_model_dir/exp/ + cd - +fi + +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then + echo "auto gen config.pbtxt" + dirs="encoder decoder feature_extractor joiner scorer transducer" + + if [ ! -d $model_repo_path ]; then + echo "Please cd to $model_repo_path" + exit 1 + fi + + cp -r $TOKENIZER_FILE $model_repo_path/scorer/ + TOKENIZER_FILE=$model_repo_path/scorer/$(basename $TOKENIZER_FILE) + for dir in $dirs + do + cp $model_repo_path/$dir/config.pbtxt.template $model_repo_path/$dir/config.pbtxt + + sed -i "s|VOCAB_SIZE|${VOCAB_SIZE}|g" $model_repo_path/$dir/config.pbtxt + sed -i "s|DECODER_CONTEXT_SIZE|${DECODER_CONTEXT_SIZE}|g" $model_repo_path/$dir/config.pbtxt + sed -i "s|DECODER_DIM|${DECODER_DIM}|g" $model_repo_path/$dir/config.pbtxt + sed -i "s|ENCODER_LAYERS|${ENCODER_LAYERS}|g" $model_repo_path/$dir/config.pbtxt + sed -i "s|ENCODER_DIM|${ENCODER_DIM}|g" $model_repo_path/$dir/config.pbtxt + sed -i "s|ENCODER_LEFT_CONTEXT|${ENCODER_LEFT_CONTEXT}|g" $model_repo_path/$dir/config.pbtxt + sed -i "s|ENCODER_RIGHT_CONTEXT|${ENCODER_RIGHT_CONTEXT}|g" $model_repo_path/$dir/config.pbtxt + + sed -i "s|TOKENIZER_FILE|${TOKENIZER_FILE}|g" $model_repo_path/$dir/config.pbtxt + + sed -i "s|MAX_BATCH|${MAX_BATCH}|g" $model_repo_path/$dir/config.pbtxt + + sed -i "s|FEATURE_EXTRACTOR_INSTANCE_NUM|${FEATURE_EXTRACTOR_INSTANCE_NUM}|g" $model_repo_path/$dir/config.pbtxt + sed -i "s|ENCODER_INSTANCE_NUM|${ENCODER_INSTANCE_NUM}|g" $model_repo_path/$dir/config.pbtxt + sed -i "s|JOINER_INSTANCE_NUM|${JOINER_INSTANCE_NUM}|g" $model_repo_path/$dir/config.pbtxt + sed -i "s|DECODER_INSTANCE_NUM|${DECODER_INSTANCE_NUM}|g" $model_repo_path/$dir/config.pbtxt + sed -i "s|SCORER_INSTANCE_NUM|${SCORER_INSTANCE_NUM}|g" $model_repo_path/$dir/config.pbtxt + + done + +fi + +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + if [ $fp16 == true ]; then + echo "Convert to FP16..." 
+ polygraphy convert --fp-to-fp16 -o $pretrained_model_dir/exp/encoder_fp16.onnx $pretrained_model_dir/exp/encoder-epoch-9999-avg-1.onnx + polygraphy convert --fp-to-fp16 -o $pretrained_model_dir/exp/decoder_fp16.onnx $pretrained_model_dir/exp/decoder-epoch-9999-avg-1.onnx + polygraphy convert --fp-to-fp16 -o $pretrained_model_dir/exp/joiner_fp16.onnx $pretrained_model_dir/exp/joiner-epoch-9999-avg-1.onnx + + cp $pretrained_model_dir/exp/encoder_fp16.onnx $model_repo_path/encoder/1/encoder.onnx + cp $pretrained_model_dir/exp/decoder_fp16.onnx $model_repo_path/decoder/1/decoder.onnx + cp $pretrained_model_dir/exp/joiner_fp16.onnx $model_repo_path/joiner/1/joiner.onnx + else + cp $pretrained_model_dir/exp/encoder-epoch-9999-avg-1.onnx $model_repo_path/encoder/1/encoder.onnx + cp $pretrained_model_dir/exp/decoder-epoch-9999-avg-1.onnx $model_repo_path/decoder/1/decoder.onnx + cp $pretrained_model_dir/exp/joiner-epoch-9999-avg-1.onnx $model_repo_path/joiner/1/joiner.onnx + fi +fi + +if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then + tritonserver --model-repository=$model_repo_path --pinned-memory-pool-byte-size=512000000 --cuda-memory-pool-byte-size=0:1024000000 --http-port 10086 +fi diff --git a/triton/scripts/build_librispeech_pruned_transducer_stateless3_offline_trt.sh b/triton/scripts/build_librispeech_pruned_transducer_stateless3_offline_trt.sh new file mode 100644 index 000000000..f5bffafcb --- /dev/null +++ b/triton/scripts/build_librispeech_pruned_transducer_stateless3_offline_trt.sh @@ -0,0 +1,125 @@ +#!/bin/bash +stage=1 +stop_stage=3 + + +pretrained_model_dir=/workspace/icefall/egs/librispeech/ASR/icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13 +model_repo_path=./model_repo_offline + +# modify model specific parameters according to $pretrained_model_dir/exp/onnx_export.log +VOCAB_SIZE=500 + +DECODER_CONTEXT_SIZE=2 +DECODER_DIM=512 + +ENCODER_LAYERS=12 +ENCODER_DIM=512 +CNN_MODULE_KERNEL=31 + +if [ -d "$pretrained_model_dir/data/lang_char" ] +then + echo "pretrained model using char" + TOKENIZER_FILE=$pretrained_model_dir/data/lang_char +else + echo "pretrained model using bpe" + TOKENIZER_FILE=$pretrained_model_dir/data/lang_bpe_500/bpe.model +fi + +MAX_BATCH=512 +# model instance num +FEATURE_EXTRACTOR_INSTANCE_NUM=2 +ENCODER_INSTANCE_NUM=2 +JOINER_INSTANCE_NUM=1 +DECODER_INSTANCE_NUM=1 +SCORER_INSTANCE_NUM=2 + + +icefall_dir=/workspace/icefall +export PYTHONPATH=$PYTHONPATH:$icefall_dir +recipe_dir=$icefall_dir/egs/librispeech/ASR/pruned_transducer_stateless3 + +if [ ${stage} -le -2 ] && [ ${stop_stage} -ge -2 ]; then + if [ -d "$pretrained_model_dir" ] + then + echo "skip download pretrained model" + else + pushd $icefall_dir/egs/librispeech/ASR/ + GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13 + pushd icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13 + git lfs pull --include "exp/pretrained-iter-1224000-avg-14.pt" + git lfs pull --include "data/lang_bpe_500/bpe.model" + popd + ln -s ./icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/pretrained-iter-1224000-avg-14.pt ./icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/epoch-9999.pt + popd + fi +fi + +if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then + echo "export onnx" + cd ${recipe_dir} + ./export-onnx.py \ + --bpe-model $TOKENIZER_FILE \ + --epoch 9999 \ + --avg 1 \ + --exp-dir $pretrained_model_dir/exp/ + cd - +fi + +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then + 
echo "Buiding TRT engine..." + bash scripts/build_trt.sh $MAX_BATCH $pretrained_model_dir/exp/encoder-epoch-9999-avg-1.onnx $model_repo_path/encoder/1/encoder.trt +fi + +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + echo "auto gen config.pbtxt" + dirs="encoder decoder feature_extractor joiner scorer transducer" + + if [ ! -d $model_repo_path ]; then + echo "Please cd to $model_repo_path" + exit 1 + fi + + cp -r $TOKENIZER_FILE $model_repo_path/scorer/ + TOKENIZER_FILE=$model_repo_path/scorer/$(basename $TOKENIZER_FILE) + for dir in $dirs + do + cp $model_repo_path/$dir/config.pbtxt.template $model_repo_path/$dir/config.pbtxt + + sed -i "s|VOCAB_SIZE|${VOCAB_SIZE}|g" $model_repo_path/$dir/config.pbtxt + sed -i "s|DECODER_CONTEXT_SIZE|${DECODER_CONTEXT_SIZE}|g" $model_repo_path/$dir/config.pbtxt + sed -i "s|DECODER_DIM|${DECODER_DIM}|g" $model_repo_path/$dir/config.pbtxt + sed -i "s|ENCODER_LAYERS|${ENCODER_LAYERS}|g" $model_repo_path/$dir/config.pbtxt + sed -i "s|ENCODER_DIM|${ENCODER_DIM}|g" $model_repo_path/$dir/config.pbtxt + sed -i "s|ENCODER_LEFT_CONTEXT|${ENCODER_LEFT_CONTEXT}|g" $model_repo_path/$dir/config.pbtxt + sed -i "s|ENCODER_RIGHT_CONTEXT|${ENCODER_RIGHT_CONTEXT}|g" $model_repo_path/$dir/config.pbtxt + + sed -i "s|TOKENIZER_FILE|${TOKENIZER_FILE}|g" $model_repo_path/$dir/config.pbtxt + + sed -i "s|MAX_BATCH|${MAX_BATCH}|g" $model_repo_path/$dir/config.pbtxt + + sed -i "s|FEATURE_EXTRACTOR_INSTANCE_NUM|${FEATURE_EXTRACTOR_INSTANCE_NUM}|g" $model_repo_path/$dir/config.pbtxt + sed -i "s|ENCODER_INSTANCE_NUM|${ENCODER_INSTANCE_NUM}|g" $model_repo_path/$dir/config.pbtxt + sed -i "s|JOINER_INSTANCE_NUM|${JOINER_INSTANCE_NUM}|g" $model_repo_path/$dir/config.pbtxt + sed -i "s|DECODER_INSTANCE_NUM|${DECODER_INSTANCE_NUM}|g" $model_repo_path/$dir/config.pbtxt + sed -i "s|SCORER_INSTANCE_NUM|${SCORER_INSTANCE_NUM}|g" $model_repo_path/$dir/config.pbtxt + + done + + # modify TRT specific parameters + sed -i "s|TYPE_INT64|TYPE_INT32|g" $model_repo_path/feature_extractor/config.pbtxt + sed -i "s|TYPE_INT64|TYPE_INT32|g" $model_repo_path/encoder/config.pbtxt + sed -i "s|TYPE_INT64|TYPE_INT32|g" $model_repo_path/scorer/config.pbtxt + sed -i "s|onnxruntime|tensorrt|g" $model_repo_path/encoder/config.pbtxt + sed -i "s|encoder.onnx|encoder.trt|g" $model_repo_path/encoder/config.pbtxt + +fi + + +if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then + cp $pretrained_model_dir/exp/decoder-epoch-9999-avg-1.onnx $model_repo_path/decoder/1/decoder.onnx + cp $pretrained_model_dir/exp/joiner-epoch-9999-avg-1.onnx $model_repo_path/joiner/1/joiner.onnx +fi + +if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then + tritonserver --model-repository=$model_repo_path --pinned-memory-pool-byte-size=512000000 --cuda-memory-pool-byte-size=0:1024000000 --http-port 10086 +fi diff --git a/triton/scripts/build_librispeech_pruned_transducer_stateless3_streaming.sh b/triton/scripts/build_librispeech_pruned_transducer_stateless3_streaming.sh new file mode 100644 index 000000000..ff64a87bc --- /dev/null +++ b/triton/scripts/build_librispeech_pruned_transducer_stateless3_streaming.sh @@ -0,0 +1,132 @@ +#!/bin/bash +stage=-2 +stop_stage=2 + + +pretrained_model_dir=/workspace/icefall/egs/librispeech/ASR/icefall_librispeech_streaming_pruned_transducer_stateless3_giga_0.9_20220625 +model_repo_path=./model_repo_streaming + +# modify model specific parameters according to $pretrained_model_dir/exp/onnx_export.log +VOCAB_SIZE=500 + +DECODER_CONTEXT_SIZE=2 +DECODER_DIM=512 + +ENCODER_LAYERS=12 +ENCODER_DIM=512 
+CNN_MODULE_KERNEL=31 + +# for streaming ASR +ENCODER_LEFT_CONTEXT=64 +ENCODER_RIGHT_CONTEXT=4 + +if [ -d "$pretrained_model_dir/data/lang_char" ] +then + echo "pretrained model using char" + TOKENIZER_FILE=$pretrained_model_dir/data/lang_char +else + echo "pretrained model using bpe" + TOKENIZER_FILE=$pretrained_model_dir/data/lang_bpe_500/bpe.model +fi + +MAX_BATCH=512 +# for streaming ASR +CNN_MODULE_KERNEL_MINUS_ONE=$(($CNN_MODULE_KERNEL - 1)) +DECODE_CHUNK_SIZE=16 +# decode_window_size = (decode_chunk_size + 2 + decode_right_context) * subsampling_factor + 3 +DECODE_WINDOW_SIZE=$((($DECODE_CHUNK_SIZE+2+$ENCODER_RIGHT_CONTEXT)*4+3)) +# model instance num +FEATURE_EXTRACTOR_INSTANCE_NUM=2 +ENCODER_INSTANCE_NUM=2 +JOINER_INSTANCE_NUM=1 +DECODER_INSTANCE_NUM=1 +SCORER_INSTANCE_NUM=2 + + +icefall_dir=/workspace/icefall +export PYTHONPATH=$PYTHONPATH:$icefall_dir +recipe_dir=$icefall_dir/egs/librispeech/ASR/pruned_transducer_stateless3 + +if [ ${stage} -le -2 ] && [ ${stop_stage} -ge -2 ]; then + if [ -d "$pretrained_model_dir" ] + then + echo "skip download pretrained model" + else + pushd $icefall_dir/egs/librispeech/ASR/ + GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/pkufool/icefall_librispeech_streaming_pruned_transducer_stateless3_giga_0.9_20220625 + pushd icefall_librispeech_streaming_pruned_transducer_stateless3_giga_0.9_20220625 + git lfs pull --include "exp/pretrained-epoch-25-avg-12.pt" + git lfs pull --include "data/lang_bpe_500/bpe.model" + popd + ln -s ./icefall_librispeech_streaming_pruned_transducer_stateless3_giga_0.9_20220625/exp/pretrained-epoch-25-avg-12.pt ./icefall_librispeech_streaming_pruned_transducer_stateless3_giga_0.9_20220625/exp/epoch-999.pt + popd + fi +fi + +if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then + echo "export onnx" + cp scripts/*onnx*.py ${recipe_dir}/ + cd ${recipe_dir} + ./export_onnx.py \ + --exp-dir ${pretrained_model_dir}/exp \ + --tokenizer-file $TOKENIZER_FILE \ + --epoch 999 \ + --avg 1 \ + --streaming-model 1\ + --causal-convolution 1 \ + --onnx 1 \ + --left-context $ENCODER_LEFT_CONTEXT \ + --right-context $ENCODER_RIGHT_CONTEXT \ + --fp16 + cd - +fi + +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then + echo "auto gen config.pbtxt" + dirs="encoder decoder feature_extractor joiner scorer transducer" + + if [ ! 
-d $model_repo_path ]; then + echo "Please cd to $model_repo_path" + exit 1 + fi + + cp -r $TOKENIZER_FILE $model_repo_path/scorer/ + TOKENIZER_FILE=$model_repo_path/scorer/$(basename $TOKENIZER_FILE) + for dir in $dirs + do + cp $model_repo_path/$dir/config.pbtxt.template $model_repo_path/$dir/config.pbtxt + + sed -i "s|VOCAB_SIZE|${VOCAB_SIZE}|g" $model_repo_path/$dir/config.pbtxt + sed -i "s|DECODER_CONTEXT_SIZE|${DECODER_CONTEXT_SIZE}|g" $model_repo_path/$dir/config.pbtxt + sed -i "s|DECODER_DIM|${DECODER_DIM}|g" $model_repo_path/$dir/config.pbtxt + sed -i "s|ENCODER_LAYERS|${ENCODER_LAYERS}|g" $model_repo_path/$dir/config.pbtxt + sed -i "s|ENCODER_DIM|${ENCODER_DIM}|g" $model_repo_path/$dir/config.pbtxt + sed -i "s|ENCODER_LEFT_CONTEXT|${ENCODER_LEFT_CONTEXT}|g" $model_repo_path/$dir/config.pbtxt + sed -i "s|ENCODER_RIGHT_CONTEXT|${ENCODER_RIGHT_CONTEXT}|g" $model_repo_path/$dir/config.pbtxt + + sed -i "s|TOKENIZER_FILE|${TOKENIZER_FILE}|g" $model_repo_path/$dir/config.pbtxt + + sed -i "s|MAX_BATCH|${MAX_BATCH}|g" $model_repo_path/$dir/config.pbtxt + sed -i "s|CNN_MODULE_KERNEL_MINUS_ONE|${CNN_MODULE_KERNEL_MINUS_ONE}|g" $model_repo_path/$dir/config.pbtxt + sed -i "s|DECODE_WINDOW_SIZE|${DECODE_WINDOW_SIZE}|g" $model_repo_path/$dir/config.pbtxt + sed -i "s|DECODE_CHUNK_SIZE|${DECODE_CHUNK_SIZE}|g" $model_repo_path/$dir/config.pbtxt + + sed -i "s|FEATURE_EXTRACTOR_INSTANCE_NUM|${FEATURE_EXTRACTOR_INSTANCE_NUM}|g" $model_repo_path/$dir/config.pbtxt + sed -i "s|ENCODER_INSTANCE_NUM|${ENCODER_INSTANCE_NUM}|g" $model_repo_path/$dir/config.pbtxt + sed -i "s|JOINER_INSTANCE_NUM|${JOINER_INSTANCE_NUM}|g" $model_repo_path/$dir/config.pbtxt + sed -i "s|DECODER_INSTANCE_NUM|${DECODER_INSTANCE_NUM}|g" $model_repo_path/$dir/config.pbtxt + sed -i "s|SCORER_INSTANCE_NUM|${SCORER_INSTANCE_NUM}|g" $model_repo_path/$dir/config.pbtxt + + done + +fi + +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + cp $pretrained_model_dir/exp/encoder_fp16.onnx $model_repo_path/encoder/1/encoder.onnx + cp $pretrained_model_dir/exp/decoder_fp16.onnx $model_repo_path/decoder/1/decoder.onnx + cp $pretrained_model_dir/exp/joiner_fp16.onnx $model_repo_path/joiner/1/joiner.onnx +fi + +if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then + tritonserver --model-repository=$model_repo_path --pinned-memory-pool-byte-size=512000000 --cuda-memory-pool-byte-size=0:1024000000 --http-port 10086 +fi diff --git a/triton/scripts/build_librispeech_pruned_transducer_stateless7_offline.sh b/triton/scripts/build_librispeech_pruned_transducer_stateless7_offline.sh new file mode 100644 index 000000000..7c674ec0d --- /dev/null +++ b/triton/scripts/build_librispeech_pruned_transducer_stateless7_offline.sh @@ -0,0 +1,115 @@ +#!/bin/bash +stage=0 +stop_stage=2 + +# change to your own model directory +pretrained_model_dir=/mnt/samsung-t7/wend/github/icefall/egs/librispeech/ASR/pruned_transducer_stateless7/exp/ +model_repo_path=./model_repo_offline + +# modify model specific parameters according to $pretrained_model_dir/exp/onnx_export.log +VOCAB_SIZE=500 + +DECODER_CONTEXT_SIZE=2 +DECODER_DIM=512 + +ENCODER_LAYERS=12 +ENCODER_DIM=512 +CNN_MODULE_KERNEL=31 + +if [ -d "$pretrained_model_dir/data/lang_char" ] +then + echo "pretrained model using char" + TOKENIZER_FILE_OR_DIR=$pretrained_model_dir/data/lang_char +else + echo "pretrained model using bpe" + TOKENIZER_FILE_OR_DIR=$pretrained_model_dir/data/lang_bpe_500/bpe.model +fi + +MAX_BATCH=512 +# model instance num +FEATURE_EXTRACTOR_INSTANCE_NUM=2 +ENCODER_INSTANCE_NUM=2 
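+# The *_INSTANCE_NUM values below are substituted into the config.pbtxt
+# templates by the sed commands in stage 0; they typically become the
+# instance_group count of each Triton model, i.e. how many copies run in
+# parallel. The numbers here are reasonable defaults rather than tuned
+# values; raising ENCODER_INSTANCE_NUM can improve throughput at the cost
+# of GPU memory.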
+JOINER_INSTANCE_NUM=1 +DECODER_INSTANCE_NUM=1 +SCORER_INSTANCE_NUM=2 + + +icefall_dir=/mnt/samsung-t7/wend/asr/icefall +export PYTHONPATH=$PYTHONPATH:$icefall_dir +recipe_dir=$icefall_dir/egs/librispeech/ASR/pruned_transducer_stateless7 + +if [ ${stage} -le -2 ] && [ ${stop_stage} -ge -2 ]; then + if [ -d "$pretrained_model_dir" ] + then + echo "skip downloading pretrained model" + else + pushd $icefall_dir/egs/librispeech/ASR/ + GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless7-2022-11-11 + pushd icefall-asr-librispeech-pruned-transducer-stateless7-2022-11-11 + git lfs pull --include "exp/pretrained-epoch-30-avg-9.pt" + git lfs pull --include "data/lang_bpe_500/bpe.model" + ln -rs icefall-asr-librispeech-pruned-transducer-stateless7-2022-11-11/exp/pretrained-epoch-30-avg-9.pt icefall-asr-librispeech-pruned-transducer-stateless7-2022-11-11/exp/epoch-999.pt + popd + fi +fi + +if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then + echo "export onnx" + cd ${recipe_dir} + ./export-onnx.py \ + --exp-dir ${pretrained_model_dir}/exp \ + --bpe-model $TOKENIZER_FILE_OR_DIR \ + --epoch 999 \ + --avg 1 \ + --use-averaged-model 0 + cd - +fi + +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then + echo "auto gen config.pbtxt" + dirs="encoder decoder feature_extractor joiner scorer transducer" + + if [ ! -d $model_repo_path ]; then + echo "Please cd to $model_repo_path" + exit 1 + fi + + cp -r $TOKENIZER_FILE_OR_DIR $model_repo_path/scorer/ + TOKENIZER_FILE=$model_repo_path/scorer/$(basename $TOKENIZER_FILE_OR_DIR) + for dir in $dirs + do + cp $model_repo_path/$dir/config.pbtxt.template $model_repo_path/$dir/config.pbtxt + + sed -i "s|VOCAB_SIZE|${VOCAB_SIZE}|g" $model_repo_path/$dir/config.pbtxt + sed -i "s|DECODER_CONTEXT_SIZE|${DECODER_CONTEXT_SIZE}|g" $model_repo_path/$dir/config.pbtxt + sed -i "s|DECODER_DIM|${DECODER_DIM}|g" $model_repo_path/$dir/config.pbtxt + sed -i "s|ENCODER_LAYERS|${ENCODER_LAYERS}|g" $model_repo_path/$dir/config.pbtxt + sed -i "s|ENCODER_DIM|${ENCODER_DIM}|g" $model_repo_path/$dir/config.pbtxt + sed -i "s|ENCODER_LEFT_CONTEXT|${ENCODER_LEFT_CONTEXT}|g" $model_repo_path/$dir/config.pbtxt + sed -i "s|ENCODER_RIGHT_CONTEXT|${ENCODER_RIGHT_CONTEXT}|g" $model_repo_path/$dir/config.pbtxt + + sed -i "s|TOKENIZER_FILE|${TOKENIZER_FILE}|g" $model_repo_path/$dir/config.pbtxt + + sed -i "s|MAX_BATCH|${MAX_BATCH}|g" $model_repo_path/$dir/config.pbtxt + + sed -i "s|FEATURE_EXTRACTOR_INSTANCE_NUM|${FEATURE_EXTRACTOR_INSTANCE_NUM}|g" $model_repo_path/$dir/config.pbtxt + sed -i "s|ENCODER_INSTANCE_NUM|${ENCODER_INSTANCE_NUM}|g" $model_repo_path/$dir/config.pbtxt + sed -i "s|JOINER_INSTANCE_NUM|${JOINER_INSTANCE_NUM}|g" $model_repo_path/$dir/config.pbtxt + sed -i "s|DECODER_INSTANCE_NUM|${DECODER_INSTANCE_NUM}|g" $model_repo_path/$dir/config.pbtxt + sed -i "s|SCORER_INSTANCE_NUM|${SCORER_INSTANCE_NUM}|g" $model_repo_path/$dir/config.pbtxt + + done + +fi + +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + cp $pretrained_model_dir/exp/encoder-epoch-999-avg-1.onnx $model_repo_path/encoder/1/encoder.onnx + cp $pretrained_model_dir/exp/decoder-epoch-999-avg-1.onnx $model_repo_path/decoder/1/decoder.onnx + cp $pretrained_model_dir/exp/joiner-epoch-999-avg-1.onnx $model_repo_path/joiner/1/joiner.onnx + cp $TOKENIZER_FILE /workspace/ +fi + +if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then + tritonserver --model-repository=$model_repo_path --pinned-memory-pool-byte-size=512000000 
--cuda-memory-pool-byte-size=0:1024000000 --http-port 10086 +fi + diff --git a/triton/scripts/build_librispeech_pruned_transducer_stateless7_streaming.sh b/triton/scripts/build_librispeech_pruned_transducer_stateless7_streaming.sh new file mode 100755 index 000000000..bec487a66 --- /dev/null +++ b/triton/scripts/build_librispeech_pruned_transducer_stateless7_streaming.sh @@ -0,0 +1,196 @@ +#!/bin/bash + +set -e # exit on error + +stage=-2 +stop_stage=2 + + +pretrained_model_dir=/workspace/icefall/egs/librispeech/ASR/icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29 +model_repo_path=$(pwd)/model_repo_streaming_zipformer_test +#conformer_streaming_model_repo_path=$(pwd)/model_repo_streaming + +# modify model specific parameters according to $pretrained_model_dir/exp/onnx_export.log +VOCAB_SIZE=500 + +DECODER_CONTEXT_SIZE=2 +DECODER_DIM=512 + +#ENCODER_LAYERS=24 + +ENCODER_LAYERS=15 +ENCODER_LAYERS_2X=$((2*$ENCODER_LAYERS)) +ENCODER_LAYERS_3X=$((3*$ENCODER_LAYERS)) + + +ENCODER_DIM=384 +ENCODER_DIM_HALF=$(($ENCODER_DIM/2)) +CNN_MODULE_KERNEL=31 + +# for streaming ASR +ENCODER_LEFT_CONTEXT=64 +ENCODER_RIGHT_CONTEXT=2 + +if [ -d "$pretrained_model_dir/data/lang_char" ] +then + echo "pretrained model using char" + TOKENIZER_FILE=$pretrained_model_dir/data/lang_char +else + echo "pretrained model using bpe" + TOKENIZER_FILE=$pretrained_model_dir/data/lang_bpe_500/bpe.model +fi + +MAX_BATCH=512 +# for streaming ASR +CNN_MODULE_KERNEL_MINUS_ONE=$(($CNN_MODULE_KERNEL - 1)) +DECODE_CHUNK_SIZE=16 +# decode_window_size = (decode_chunk_size + 2 + decode_right_context) * subsampling_factor + 3 +DECODE_WINDOW_SIZE=$((($DECODE_CHUNK_SIZE+2+$ENCODER_RIGHT_CONTEXT)*4+3)) +# model instance num +FEATURE_EXTRACTOR_INSTANCE_NUM=2 +ENCODER_INSTANCE_NUM=2 +JOINER_INSTANCE_NUM=1 +DECODER_INSTANCE_NUM=1 +SCORER_INSTANCE_NUM=2 + +icefall_dir=/workspace/icefall +export PYTHONPATH=$PYTHONPATH:$icefall_dir +recipe_dir=$icefall_dir/egs/librispeech/ASR/pruned_transducer_stateless7_streaming + +FP_32=true + +if [ ${stage} -le -2 ] && [ ${stop_stage} -ge -2 ]; then + if [ -d "$pretrained_model_dir" ] + then + echo "skip download pretrained model" + else + + cd $icefall_dir/egs/librispeech/ASR/ + MODEL_LOCATION=https://huggingface.co/Zengwei/icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29 + GIT_LFS_SKIP_SMUDGE=1 git clone $MODEL_LOCATION + cd icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29 + git lfs pull --include "exp/epoch-30.pt" + git lfs pull --include "data/lang_bpe_500/bpe.model" + + fi +fi + +if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then + echo "export onnx" + + cd ${recipe_dir} + + if [ "$FP_32" = false ] ; then + + echo "Using FP16" + + ./export.py \ + --exp-dir ${pretrained_model_dir}/exp \ + --bpe-model $TOKENIZER_FILE \ + --use-averaged-model False \ + --epoch 30 \ + --avg 1 \ + --fp16 \ + --onnx-triton 1 \ + --onnx 1 + + sed -i "s|TYPE_FP32|TYPE_FP16|g" "${model_repo_path}"/*/config.pbtxt.template + + else + + echo "Using FP32" + + ./export.py \ + --exp-dir ${pretrained_model_dir}/exp \ + --bpe-model $TOKENIZER_FILE \ + --use-averaged-model False \ + --epoch 30 \ + --avg 1 \ + --onnx-triton 1 \ + --onnx 1 + + sed -i "s|TYPE_FP16|TYPE_FP32|g" "${model_repo_path}"/*/config.pbtxt.template + + fi + + cd - +fi + + + +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then + echo "auto gen config.pbtxt" + dirs="decoder encoder feature_extractor joiner scorer transducer" + + if [ ! 
-d $model_repo_path ]; then + echo "Please cd to $model_repo_path" + exit 1 + fi + + cp -r $TOKENIZER_FILE $model_repo_path/scorer/ + + + TOKENIZER_FILE=$model_repo_path/scorer/$(basename $TOKENIZER_FILE) + for dir in $dirs + do + cp $model_repo_path/$dir/config.pbtxt.template $model_repo_path/$dir/config.pbtxt + done + + sed -i "s|ENCODER_LAYERS_2X|${ENCODER_LAYERS_2X}|g" $model_repo_path/*/config.pbtxt + sed -i "s|ENCODER_LAYERS_3X|${ENCODER_LAYERS_3X}|g" $model_repo_path/*/config.pbtxt + sed -i "s|ENCODER_DIM_HALF|${ENCODER_DIM_HALF}|g" $model_repo_path/*/config.pbtxt + + + sed -i "s|VOCAB_SIZE|${VOCAB_SIZE}|g" $model_repo_path/*/config.pbtxt + sed -i "s|DECODER_CONTEXT_SIZE|${DECODER_CONTEXT_SIZE}|g" $model_repo_path/*/config.pbtxt + sed -i "s|DECODER_DIM|${DECODER_DIM}|g" $model_repo_path/*/config.pbtxt + sed -i "s|ENCODER_LAYERS|${ENCODER_LAYERS}|g" $model_repo_path/*/config.pbtxt + sed -i "s|ENCODER_DIM|${ENCODER_DIM}|g" $model_repo_path/*/config.pbtxt + sed -i "s|ENCODER_LEFT_CONTEXT|${ENCODER_LEFT_CONTEXT}|g" $model_repo_path/*/config.pbtxt + sed -i "s|ENCODER_RIGHT_CONTEXT|${ENCODER_RIGHT_CONTEXT}|g" $model_repo_path/*/config.pbtxt + + sed -i "s|TOKENIZER_FILE|${TOKENIZER_FILE}|g" $model_repo_path/*/config.pbtxt + + sed -i "s|MAX_BATCH|${MAX_BATCH}|g" $model_repo_path/*/config.pbtxt + sed -i "s|CNN_MODULE_KERNEL_MINUS_ONE|${CNN_MODULE_KERNEL_MINUS_ONE}|g" $model_repo_path/*/config.pbtxt + sed -i "s|DECODE_WINDOW_SIZE|${DECODE_WINDOW_SIZE}|g" $model_repo_path/*/config.pbtxt + sed -i "s|DECODE_CHUNK_SIZE|${DECODE_CHUNK_SIZE}|g" $model_repo_path/*/config.pbtxt + + sed -i "s|FEATURE_EXTRACTOR_INSTANCE_NUM|${FEATURE_EXTRACTOR_INSTANCE_NUM}|g" $model_repo_path/*/config.pbtxt + sed -i "s|ENCODER_INSTANCE_NUM|${ENCODER_INSTANCE_NUM}|g" $model_repo_path/*/config.pbtxt + sed -i "s|JOINER_INSTANCE_NUM|${JOINER_INSTANCE_NUM}|g" $model_repo_path/*/config.pbtxt + sed -i "s|DECODER_INSTANCE_NUM|${DECODER_INSTANCE_NUM}|g" $model_repo_path/*/config.pbtxt + sed -i "s|SCORER_INSTANCE_NUM|${SCORER_INSTANCE_NUM}|g" $model_repo_path/*/config.pbtxt + + sed -i "s|ENCODER_LAYERS_2X|${ENCODER_LAYERS_2X}|g" $model_repo_path/*/config.pbtxt + sed -i "s|ENCODER_LAYERS_3X|${ENCODER_LAYERS_3X}|g" $model_repo_path/*/config.pbtxt + sed -i "s|ENCODER_DIM_HALF|${ENCODER_DIM_HALF}|g" $model_repo_path/*/config.pbtxt + + +fi + +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + + if [ "$FP_32" = true ] ; then + + cp -f $pretrained_model_dir/exp/encoder.onnx $model_repo_path/encoder/1/encoder.onnx + cp -f $pretrained_model_dir/exp/decoder.onnx $model_repo_path/decoder/1/decoder.onnx + cp -f $pretrained_model_dir/exp/joiner.onnx $model_repo_path/joiner/1/joiner.onnx + + else + + cp -f $pretrained_model_dir/exp/encoder_fp16.onnx $model_repo_path/encoder/1/encoder.onnx + cp -f $pretrained_model_dir/exp/decoder_fp16.onnx $model_repo_path/decoder/1/decoder.onnx + cp -f $pretrained_model_dir/exp/joiner_fp16.onnx $model_repo_path/joiner/1/joiner.onnx + + + fi + + + +fi + + +if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then + tritonserver --model-repository=$model_repo_path --pinned-memory-pool-byte-size=512000000 --cuda-memory-pool-byte-size=0:1024000000 --http-port 10086 +fi diff --git a/triton/scripts/build_trt.sh b/triton/scripts/build_trt.sh new file mode 100644 index 000000000..bc2bdba6c --- /dev/null +++ b/triton/scripts/build_trt.sh @@ -0,0 +1,35 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# parameters for TRT engines
+MIN_BATCH=1
+OPT_BATCH=4
+MAX_BATCH=$1
+onnx_model=$2
+trt_model=$3
+
+ENC_MIN_LEN=16
+ENC_OPT_LEN=512
+ENC_MAX_LEN=2000
+
+/usr/src/tensorrt/bin/trtexec \
+--onnx=$onnx_model \
+--minShapes=x:${MIN_BATCH}x${ENC_MIN_LEN}x80,x_lens:${MIN_BATCH} \
+--optShapes=x:${OPT_BATCH}x${ENC_OPT_LEN}x80,x_lens:${OPT_BATCH} \
+--maxShapes=x:${MAX_BATCH}x${ENC_MAX_LEN}x80,x_lens:${MAX_BATCH} \
+--fp16 \
+--loadInputs=x:scripts/test_features/input_tensor_fp32.dat,x_lens:scripts/test_features/shape.bin \
+--shapes=x:1x663x80,x_lens:1 \
+--saveEngine=$trt_model
+
diff --git a/triton/scripts/build_wenetspeech_pruned_transducer_stateless5_streaming.sh b/triton/scripts/build_wenetspeech_pruned_transducer_stateless5_streaming.sh
new file mode 100644
index 000000000..7e8eb9d0b
--- /dev/null
+++ b/triton/scripts/build_wenetspeech_pruned_transducer_stateless5_streaming.sh
@@ -0,0 +1,129 @@
+#!/bin/bash
+stage=-2
+stop_stage=2
+
+pretrained_model_dir=/workspace/icefall/egs/wenetspeech/ASR/icefall_asr_wenetspeech_pruned_transducer_stateless5_streaming
+model_repo_path=./model_repo_streaming
+
+# modify model specific parameters according to $pretrained_model_dir/exp/onnx_export.log
+VOCAB_SIZE=5537
+
+DECODER_CONTEXT_SIZE=2
+DECODER_DIM=512
+
+ENCODER_LAYERS=24
+ENCODER_DIM=384
+CNN_MODULE_KERNEL=31
+
+# for streaming ASR
+ENCODER_LEFT_CONTEXT=32
+ENCODER_RIGHT_CONTEXT=2
+
+if [ -d "$pretrained_model_dir/data/lang_char" ]
+then
+  echo "pretrained model using char"
+  TOKENIZER_FILE=$pretrained_model_dir/data/lang_char
+else
+  echo "pretrained model using bpe"
+  TOKENIZER_FILE=$pretrained_model_dir/data/lang_bpe_500/bpe.model
+fi
+
+MAX_BATCH=512
+# for streaming ASR
+CNN_MODULE_KERNEL_MINUS_ONE=$(($CNN_MODULE_KERNEL - 1))
+DECODE_CHUNK_SIZE=16
+# decode_window_size = (decode_chunk_size + 2 + decode_right_context) * subsampling_factor + 3
+DECODE_WINDOW_SIZE=$((($DECODE_CHUNK_SIZE+2+$ENCODER_RIGHT_CONTEXT)*4+3))
+# model instance num
+FEATURE_EXTRACTOR_INSTANCE_NUM=2
+ENCODER_INSTANCE_NUM=2
+JOINER_INSTANCE_NUM=1
+DECODER_INSTANCE_NUM=1
+SCORER_INSTANCE_NUM=2
+
+icefall_dir=/workspace/icefall
+export PYTHONPATH=$PYTHONPATH:$icefall_dir
+recipe_dir=$icefall_dir/egs/wenetspeech/ASR/pruned_transducer_stateless5
+
+if [ ${stage} -le -2 ] && [ ${stop_stage} -ge -2 ]; then
+  if [ -d "$pretrained_model_dir" ]
+  then
+    echo "skip download pretrained model"
+  else
+    pushd $icefall_dir/egs/wenetspeech/ASR/
+    GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/luomingshuang/icefall_asr_wenetspeech_pruned_transducer_stateless5_streaming
+    pushd icefall_asr_wenetspeech_pruned_transducer_stateless5_streaming
+    git lfs pull --include "exp/pretrained_epoch_7_avg_1.pt,data/lang_char/Linv.pt"
+    popd
+    ln -s $icefall_dir/egs/wenetspeech/ASR/icefall_asr_wenetspeech_pruned_transducer_stateless5_streaming/exp/pretrained_epoch_7_avg_1.pt $icefall_dir/egs/wenetspeech/ASR/icefall_asr_wenetspeech_pruned_transducer_stateless5_streaming/exp/epoch-999.pt
+    popd
+  fi
+fi + +if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then + echo "export onnx" + cp scripts/*onnx*.py ${recipe_dir}/ + cd ${recipe_dir} + ./export_onnx.py \ + --exp-dir ${pretrained_model_dir}/exp \ + --tokenizer-file $TOKENIZER_FILE \ + --epoch 999 \ + --avg 1 \ + --streaming-model 1\ + --causal-convolution 1 \ + --onnx 1 \ + --left-context $ENCODER_LEFT_CONTEXT \ + --right-context $ENCODER_RIGHT_CONTEXT \ + --fp16 + cd - +fi + +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then + echo "auto gen config.pbtxt" + dirs="encoder decoder feature_extractor joiner scorer transducer" + + if [ ! -d $model_repo_path ]; then + echo "Please cd to $model_repo_path" + exit 1 + fi + + cp -r $TOKENIZER_FILE $model_repo_path/scorer/ + TOKENIZER_FILE=$model_repo_path/scorer/$(basename $TOKENIZER_FILE) + for dir in $dirs + do + cp $model_repo_path/$dir/config.pbtxt.template $model_repo_path/$dir/config.pbtxt + + sed -i "s|VOCAB_SIZE|${VOCAB_SIZE}|g" $model_repo_path/$dir/config.pbtxt + sed -i "s|DECODER_CONTEXT_SIZE|${DECODER_CONTEXT_SIZE}|g" $model_repo_path/$dir/config.pbtxt + sed -i "s|DECODER_DIM|${DECODER_DIM}|g" $model_repo_path/$dir/config.pbtxt + sed -i "s|ENCODER_LAYERS|${ENCODER_LAYERS}|g" $model_repo_path/$dir/config.pbtxt + sed -i "s|ENCODER_DIM|${ENCODER_DIM}|g" $model_repo_path/$dir/config.pbtxt + sed -i "s|ENCODER_LEFT_CONTEXT|${ENCODER_LEFT_CONTEXT}|g" $model_repo_path/$dir/config.pbtxt + sed -i "s|ENCODER_RIGHT_CONTEXT|${ENCODER_RIGHT_CONTEXT}|g" $model_repo_path/$dir/config.pbtxt + + sed -i "s|TOKENIZER_FILE|${TOKENIZER_FILE}|g" $model_repo_path/$dir/config.pbtxt + + sed -i "s|MAX_BATCH|${MAX_BATCH}|g" $model_repo_path/$dir/config.pbtxt + sed -i "s|CNN_MODULE_KERNEL_MINUS_ONE|${CNN_MODULE_KERNEL_MINUS_ONE}|g" $model_repo_path/$dir/config.pbtxt + sed -i "s|DECODE_WINDOW_SIZE|${DECODE_WINDOW_SIZE}|g" $model_repo_path/$dir/config.pbtxt + sed -i "s|DECODE_CHUNK_SIZE|${DECODE_CHUNK_SIZE}|g" $model_repo_path/$dir/config.pbtxt + + sed -i "s|FEATURE_EXTRACTOR_INSTANCE_NUM|${FEATURE_EXTRACTOR_INSTANCE_NUM}|g" $model_repo_path/$dir/config.pbtxt + sed -i "s|ENCODER_INSTANCE_NUM|${ENCODER_INSTANCE_NUM}|g" $model_repo_path/$dir/config.pbtxt + sed -i "s|JOINER_INSTANCE_NUM|${JOINER_INSTANCE_NUM}|g" $model_repo_path/$dir/config.pbtxt + sed -i "s|DECODER_INSTANCE_NUM|${DECODER_INSTANCE_NUM}|g" $model_repo_path/$dir/config.pbtxt + sed -i "s|SCORER_INSTANCE_NUM|${SCORER_INSTANCE_NUM}|g" $model_repo_path/$dir/config.pbtxt + + done + +fi + +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + cp $pretrained_model_dir/exp/encoder_fp16.onnx $model_repo_path/encoder/1/encoder.onnx + cp $pretrained_model_dir/exp/decoder_fp16.onnx $model_repo_path/decoder/1/decoder.onnx + cp $pretrained_model_dir/exp/joiner_fp16.onnx $model_repo_path/joiner/1/joiner.onnx +fi + +if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then + tritonserver --model-repository=$model_repo_path --pinned-memory-pool-byte-size=512000000 --cuda-memory-pool-byte-size=0:1024000000 --http-port 10086 +fi diff --git a/triton/scripts/build_wenetspeech_zipformer_offline_trt.sh b/triton/scripts/build_wenetspeech_zipformer_offline_trt.sh new file mode 100644 index 000000000..638a2d435 --- /dev/null +++ b/triton/scripts/build_wenetspeech_zipformer_offline_trt.sh @@ -0,0 +1,131 @@ +#!/bin/bash +stage=-1 +stop_stage=3 + +export CUDA_VISIBLE_DEVICES=1 + +pretrained_model_dir=/workspace/icefall-asr-zipformer-wenetspeech-20230615 +model_repo_path=./model_repo_offline + +# modify model specific parameters according to $pretrained_model_dir/exp/ log 
files +VOCAB_SIZE=5537 + +DECODER_CONTEXT_SIZE=2 +DECODER_DIM=512 +ENCODER_DIM=512 # max(_to_int_tuple(params.encoder_dim) + + +if [ -d "$pretrained_model_dir/data/lang_char" ] +then + echo "pretrained model using char" + TOKENIZER_FILE=$pretrained_model_dir/data/lang_char +else + echo "pretrained model using bpe" + TOKENIZER_FILE=$pretrained_model_dir/data/lang_bpe_500/bpe.model +fi + +MAX_BATCH=16 +# model instance num +FEATURE_EXTRACTOR_INSTANCE_NUM=2 +ENCODER_INSTANCE_NUM=1 +JOINER_INSTANCE_NUM=1 +DECODER_INSTANCE_NUM=1 +SCORER_INSTANCE_NUM=2 + + +icefall_dir=/workspace/icefall +export PYTHONPATH=$PYTHONPATH:$icefall_dir +recipe_dir=$icefall_dir/egs/wenetspeech/ASR/zipformer + +if [ ${stage} -le -2 ] && [ ${stop_stage} -ge -2 ]; then + if [ -d "$pretrained_model_dir" ] + then + echo "skip download pretrained model" + else + echo "downloading pretrained model" + cd /workspace + GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/pkufool/icefall-asr-zipformer-wenetspeech-20230615 + pushd icefall-asr-zipformer-wenetspeech-20230615 + git lfs pull --include "exp/pretrained.pt" + ln -s ./exp/pretrained.pt ./exp/epoch-9999.pt + popd + cd - + fi +fi + +if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then + echo "export onnx" + cd ${recipe_dir} + # WAR: please comment https://github.com/k2-fsa/icefall/blob/master/egs/librispeech/ASR/zipformer/zipformer.py#L1422-L1427 + # if you would like to use the exported onnx to build trt engine later. + python3 ./export-onnx.py \ + --tokens $TOKENIZER_FILE/tokens.txt \ + --use-averaged-model 0 \ + --epoch 9999 \ + --avg 1 \ + --exp-dir $pretrained_model_dir/exp/ \ + --num-encoder-layers "2,2,3,4,3,2" \ + --downsampling-factor "1,2,4,8,4,2" \ + --feedforward-dim "512,768,1024,1536,1024,768" \ + --num-heads "4,4,4,8,4,4" \ + --encoder-dim "192,256,384,512,384,256" \ + --query-head-dim 32 \ + --value-head-dim 12 \ + --causal False || exit 1 + + cd - +fi + +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then + echo "auto gen config.pbtxt" + dirs="encoder decoder feature_extractor joiner scorer transducer" + + if [ ! 
-d $model_repo_path ]; then
+  echo "Please cd to $model_repo_path"
+  exit 1
+  fi
+
+  cp -r $TOKENIZER_FILE $model_repo_path/scorer/
+  TOKENIZER_FILE=$model_repo_path/scorer/$(basename $TOKENIZER_FILE)
+  for dir in $dirs
+  do
+    cp $model_repo_path/$dir/config.pbtxt.template $model_repo_path/$dir/config.pbtxt
+
+    sed -i "s|VOCAB_SIZE|${VOCAB_SIZE}|g" $model_repo_path/$dir/config.pbtxt
+    sed -i "s|DECODER_CONTEXT_SIZE|${DECODER_CONTEXT_SIZE}|g" $model_repo_path/$dir/config.pbtxt
+    sed -i "s|DECODER_DIM|${DECODER_DIM}|g" $model_repo_path/$dir/config.pbtxt
+    sed -i "s|ENCODER_LAYERS|${ENCODER_LAYERS}|g" $model_repo_path/$dir/config.pbtxt
+    sed -i "s|ENCODER_DIM|${ENCODER_DIM}|g" $model_repo_path/$dir/config.pbtxt
+    sed -i "s|ENCODER_LEFT_CONTEXT|${ENCODER_LEFT_CONTEXT}|g" $model_repo_path/$dir/config.pbtxt
+    sed -i "s|ENCODER_RIGHT_CONTEXT|${ENCODER_RIGHT_CONTEXT}|g" $model_repo_path/$dir/config.pbtxt
+
+    sed -i "s|TOKENIZER_FILE|${TOKENIZER_FILE}|g" $model_repo_path/$dir/config.pbtxt
+
+    sed -i "s|MAX_BATCH|${MAX_BATCH}|g" $model_repo_path/$dir/config.pbtxt
+
+    sed -i "s|FEATURE_EXTRACTOR_INSTANCE_NUM|${FEATURE_EXTRACTOR_INSTANCE_NUM}|g" $model_repo_path/$dir/config.pbtxt
+    sed -i "s|ENCODER_INSTANCE_NUM|${ENCODER_INSTANCE_NUM}|g" $model_repo_path/$dir/config.pbtxt
+    sed -i "s|JOINER_INSTANCE_NUM|${JOINER_INSTANCE_NUM}|g" $model_repo_path/$dir/config.pbtxt
+    sed -i "s|DECODER_INSTANCE_NUM|${DECODER_INSTANCE_NUM}|g" $model_repo_path/$dir/config.pbtxt
+    sed -i "s|SCORER_INSTANCE_NUM|${SCORER_INSTANCE_NUM}|g" $model_repo_path/$dir/config.pbtxt
+  done
+fi
+
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+  cp $pretrained_model_dir/exp/encoder-epoch-9999-avg-1.onnx $model_repo_path/encoder/1/encoder.onnx
+  cp $pretrained_model_dir/exp/decoder-epoch-9999-avg-1.onnx $model_repo_path/decoder/1/decoder.onnx
+  cp $pretrained_model_dir/exp/joiner-epoch-9999-avg-1.onnx $model_repo_path/joiner/1/joiner.onnx
+fi
+
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+  echo "Building TRT engine..., skip the stage if you would like to use onnxruntime"
+  polygraphy surgeon sanitize $pretrained_model_dir/exp/encoder-epoch-9999-avg-1.onnx --fold-constant -o $pretrained_model_dir/exp/encoder.onnx
+  bash scripts/build_trt.sh $MAX_BATCH $pretrained_model_dir/exp/encoder.onnx $model_repo_path/encoder/1/encoder.trt || exit 1
+
+  sed -i "s|onnxruntime|tensorrt|g" $model_repo_path/encoder/config.pbtxt
+  sed -i "s|encoder.onnx|encoder.trt|g" $model_repo_path/encoder/config.pbtxt
+fi
+
+if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
+  tritonserver --model-repository=$model_repo_path --pinned-memory-pool-byte-size=512000000 --cuda-memory-pool-byte-size=0:1024000000 --http-port 10086
+fi
diff --git a/triton/scripts/conformer_triton.py b/triton/scripts/conformer_triton.py
deleted file mode 100755
index 553c050d5..000000000
--- a/triton/scripts/conformer_triton.py
+++ /dev/null
@@ -1,1066 +0,0 @@
-#!/usr/bin/env python3
-# Copyright (c) 2021 University of Chinese Academy of Sciences (author: Han Zhu)
-#
-# See ../../../../LICENSE for clarification regarding multiple authors
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import copy -import math -import warnings -from typing import Optional, Tuple - -import torch -from encoder_interface import EncoderInterface -from scaling import ( - ActivationBalancer, - BasicNorm, - DoubleSwish, - ScaledConv1d, - ScaledConv2d, - ScaledLinear, -) -from torch import Tensor, nn - -#from icefall.utils import make_pad_mask -def make_pad_mask(lengths: torch.Tensor) -> torch.Tensor: - """ - Args: - lengths: - A 1-D tensor containing sentence lengths. - Returns: - Return a 2-D bool tensor, where masked positions - are filled with `True` and non-masked positions are - filled with `False`. - - >>> lengths = torch.tensor([1, 3, 2, 5]) - >>> make_pad_mask(lengths) - tensor([[False, True, True, True, True], - [False, False, False, True, True], - [False, False, True, True, True], - [False, False, False, False, False]]) - """ - assert lengths.ndim == 1, lengths.ndim - - max_len = lengths.max() - n = lengths.size(0) - - expaned_lengths = torch.arange(max_len).expand(n, max_len).to(lengths) - - return expaned_lengths >= lengths.unsqueeze(1) - -class Conformer(EncoderInterface): - """ - Args: - num_features (int): Number of input features - subsampling_factor (int): subsampling factor of encoder (the convolution layers before transformers) - d_model (int): attention dimension, also the output dimension - nhead (int): number of head - dim_feedforward (int): feedforward dimention - num_encoder_layers (int): number of encoder layers - dropout (float): dropout rate - layer_dropout (float): layer-dropout rate. - cnn_module_kernel (int): Kernel size of convolution module - vgg_frontend (bool): whether to use vgg frontend. - """ - - def __init__( - self, - num_features: int, - subsampling_factor: int = 4, - d_model: int = 256, - nhead: int = 4, - dim_feedforward: int = 2048, - num_encoder_layers: int = 12, - dropout: float = 0.1, - layer_dropout: float = 0.075, - cnn_module_kernel: int = 31, - ) -> None: - super(Conformer, self).__init__() - - self.num_features = num_features - self.subsampling_factor = subsampling_factor - if subsampling_factor != 4: - raise NotImplementedError("Support only 'subsampling_factor=4'.") - - # self.encoder_embed converts the input of shape (N, T, num_features) - # to the shape (N, T//subsampling_factor, d_model). - # That is, it does two things simultaneously: - # (1) subsampling: T -> T//subsampling_factor - # (2) embedding: num_features -> d_model - self.encoder_embed = Conv2dSubsampling(num_features, d_model) - - self.encoder_pos = RelPositionalEncoding(d_model, dropout) - - encoder_layer = ConformerEncoderLayer( - d_model, - nhead, - dim_feedforward, - dropout, - layer_dropout, - cnn_module_kernel, - ) - self.encoder = ConformerEncoder(encoder_layer, num_encoder_layers) - - def forward( - self, x: torch.Tensor, x_lens: torch.Tensor - ) -> Tuple[torch.Tensor, torch.Tensor]: - """ - Args: - x: - The input tensor. Its shape is (batch_size, seq_len, feature_dim). - x_lens: - A tensor of shape (batch_size,) containing the number of frames in - `x` before padding. 
- warmup: - A floating point value that gradually increases from 0 throughout - training; when it is >= 1.0 we are "fully warmed up". It is used - to turn modules on sequentially. - Returns: - Return a tuple containing 2 tensors: - - embeddings: its shape is (batch_size, output_seq_len, d_model) - - lengths, a tensor of shape (batch_size,) containing the number - of frames in `embeddings` before padding. - """ - warmup = 1.0 - x = self.encoder_embed(x) - x, pos_emb = self.encoder_pos(x) - x = x.permute(1, 0, 2) # (N, T, C) -> (T, N, C) - - # Caution: We assume the subsampling factor is 4! - - # lengths = ((x_lens - 1) // 2 - 1) // 2 # issue an warning - # - # Note: rounding_mode in torch.div() is available only in torch >= 1.8.0 - lengths = (((x_lens - 1) >> 1) - 1) >> 1 - - assert x.size(0) == lengths.max().item() - mask = make_pad_mask(lengths) - - x = self.encoder( - x, pos_emb, src_key_padding_mask=mask, warmup=warmup - ) # (T, N, C) - - x = x.permute(1, 0, 2) # (T, N, C) ->(N, T, C) - - return x, lengths - - -class ConformerEncoderLayer(nn.Module): - """ - ConformerEncoderLayer is made up of self-attn, feedforward and convolution networks. - See: "Conformer: Convolution-augmented Transformer for Speech Recognition" - - Args: - d_model: the number of expected features in the input (required). - nhead: the number of heads in the multiheadattention models (required). - dim_feedforward: the dimension of the feedforward network model (default=2048). - dropout: the dropout value (default=0.1). - cnn_module_kernel (int): Kernel size of convolution module. - - Examples:: - >>> encoder_layer = ConformerEncoderLayer(d_model=512, nhead=8) - >>> src = torch.rand(10, 32, 512) - >>> pos_emb = torch.rand(32, 19, 512) - >>> out = encoder_layer(src, pos_emb) - """ - - def __init__( - self, - d_model: int, - nhead: int, - dim_feedforward: int = 2048, - dropout: float = 0.1, - layer_dropout: float = 0.075, - cnn_module_kernel: int = 31, - ) -> None: - super(ConformerEncoderLayer, self).__init__() - - self.layer_dropout = layer_dropout - - self.d_model = d_model - - self.self_attn = RelPositionMultiheadAttention( - d_model, nhead, dropout=0.0 - ) - - self.feed_forward = nn.Sequential( - ScaledLinear(d_model, dim_feedforward), - ActivationBalancer(channel_dim=-1), - DoubleSwish(), - nn.Dropout(dropout), - ScaledLinear(dim_feedforward, d_model, initial_scale=0.25), - ) - - self.feed_forward_macaron = nn.Sequential( - ScaledLinear(d_model, dim_feedforward), - ActivationBalancer(channel_dim=-1), - DoubleSwish(), - nn.Dropout(dropout), - ScaledLinear(dim_feedforward, d_model, initial_scale=0.25), - ) - - self.conv_module = ConvolutionModule(d_model, cnn_module_kernel) - - self.norm_final = BasicNorm(d_model) - - # try to ensure the output is close to zero-mean (or at least, zero-median). - self.balancer = ActivationBalancer( - channel_dim=-1, min_positive=0.45, max_positive=0.55, max_abs=6.0 - ) - - self.dropout = nn.Dropout(dropout) - - def forward( - self, - src: Tensor, - pos_emb: Tensor, - src_mask: Optional[Tensor] = None, - src_key_padding_mask: Optional[Tensor] = None, - warmup: float = 1.0, - ) -> Tensor: - """ - Pass the input through the encoder layer. - - Args: - src: the sequence to the encoder layer (required). - pos_emb: Positional embedding tensor (required). - src_mask: the mask for the src sequence (optional). - src_key_padding_mask: the mask for the src keys per batch (optional). - warmup: controls selective bypass of of layers; if < 1.0, we will - bypass layers more frequently. 
- - Shape: - src: (S, N, E). - pos_emb: (N, 2*S-1, E) - src_mask: (S, S). - src_key_padding_mask: (N, S). - S is the source sequence length, N is the batch size, E is the feature number - """ - src_orig = src - - warmup_scale = min(0.1 + warmup, 1.0) - # alpha = 1.0 means fully use this encoder layer, 0.0 would mean - # completely bypass it. - if self.training: - alpha = ( - warmup_scale - if torch.rand(()).item() <= (1.0 - self.layer_dropout) - else 0.1 - ) - else: - alpha = 1.0 - - # macaron style feed forward module - src = src + self.dropout(self.feed_forward_macaron(src)) - - # multi-headed self-attention module - src_att = self.self_attn( - src, - src, - src, - pos_emb=pos_emb, - attn_mask=src_mask, - key_padding_mask=src_key_padding_mask, - )[0] - src = src + self.dropout(src_att) - - # convolution module - src = src + self.dropout(self.conv_module(src)) - - # feed forward module - src = src + self.dropout(self.feed_forward(src)) - - src = self.norm_final(self.balancer(src)) - - if alpha != 1.0: - src = alpha * src + (1 - alpha) * src_orig - - return src - - -class ConformerEncoder(nn.Module): - r"""ConformerEncoder is a stack of N encoder layers - - Args: - encoder_layer: an instance of the ConformerEncoderLayer() class (required). - num_layers: the number of sub-encoder-layers in the encoder (required). - - Examples:: - >>> encoder_layer = ConformerEncoderLayer(d_model=512, nhead=8) - >>> conformer_encoder = ConformerEncoder(encoder_layer, num_layers=6) - >>> src = torch.rand(10, 32, 512) - >>> pos_emb = torch.rand(32, 19, 512) - >>> out = conformer_encoder(src, pos_emb) - """ - - def __init__(self, encoder_layer: nn.Module, num_layers: int) -> None: - super().__init__() - self.layers = nn.ModuleList( - [copy.deepcopy(encoder_layer) for i in range(num_layers)] - ) - self.num_layers = num_layers - - def forward( - self, - src: Tensor, - pos_emb: Tensor, - mask: Optional[Tensor] = None, - src_key_padding_mask: Optional[Tensor] = None, - warmup: float = 1.0, - ) -> Tensor: - r"""Pass the input through the encoder layers in turn. - - Args: - src: the sequence to the encoder (required). - pos_emb: Positional embedding tensor (required). - mask: the mask for the src sequence (optional). - src_key_padding_mask: the mask for the src keys per batch (optional). - - Shape: - src: (S, N, E). - pos_emb: (N, 2*S-1, E) - mask: (S, S). - src_key_padding_mask: (N, S). - S is the source sequence length, T is the target sequence length, N is the batch size, E is the feature number - - """ - output = src - - for i, mod in enumerate(self.layers): - output = mod( - output, - pos_emb, - src_mask=mask, - src_key_padding_mask=src_key_padding_mask, - warmup=warmup, - ) - - return output - - -class RelPositionalEncoding(torch.nn.Module): - """Relative positional encoding module. - - See : Appendix B in "Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context" - Modified from https://github.com/espnet/espnet/blob/master/espnet/nets/pytorch_backend/transformer/embedding.py - - Args: - d_model: Embedding dimension. - dropout_rate: Dropout rate. - max_len: Maximum input length. 
- - """ - - def __init__( - self, d_model: int, dropout_rate: float, max_len: int = 5000 - ) -> None: - """Construct an PositionalEncoding object.""" - super(RelPositionalEncoding, self).__init__() - self.d_model = d_model - self.dropout = torch.nn.Dropout(p=dropout_rate) - self.pe = None - self.extend_pe(torch.tensor(0.0).expand(1, max_len)) - - def extend_pe(self, x: Tensor) -> None: - """Reset the positional encodings.""" - if self.pe is not None: - # self.pe contains both positive and negative parts - # the length of self.pe is 2 * input_len - 1 - if self.pe.size(1) >= x.size(1) * 2 - 1: - # Note: TorchScript doesn't implement operator== for torch.Device - if self.pe.dtype != x.dtype or str(self.pe.device) != str( - x.device - ): - self.pe = self.pe.to(dtype=x.dtype, device=x.device) - return - # Suppose `i` means to the position of query vecotr and `j` means the - # position of key vector. We use position relative positions when keys - # are to the left (i>j) and negative relative positions otherwise (i Tuple[Tensor, Tensor]: - """Add positional encoding. - - Args: - x (torch.Tensor): Input tensor (batch, time, `*`). - - Returns: - torch.Tensor: Encoded tensor (batch, time, `*`). - torch.Tensor: Encoded tensor (batch, 2*time-1, `*`). - - """ - self.extend_pe(x) - pos_emb = self.pe[ - :, - self.pe.size(1) // 2 - - x.size(1) - + 1 : self.pe.size(1) // 2 # noqa E203 - + x.size(1), - ] - return self.dropout(x), self.dropout(pos_emb) - - -class RelPositionMultiheadAttention(nn.Module): - r"""Multi-Head Attention layer with relative position encoding - - See reference: "Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context" - - Args: - embed_dim: total dimension of the model. - num_heads: parallel attention heads. - dropout: a Dropout layer on attn_output_weights. Default: 0.0. - - Examples:: - - >>> rel_pos_multihead_attn = RelPositionMultiheadAttention(embed_dim, num_heads) - >>> attn_output, attn_output_weights = multihead_attn(query, key, value, pos_emb) - """ - - def __init__( - self, - embed_dim: int, - num_heads: int, - dropout: float = 0.0, - ) -> None: - super(RelPositionMultiheadAttention, self).__init__() - self.embed_dim = embed_dim - self.num_heads = num_heads - self.dropout = dropout - self.head_dim = embed_dim // num_heads - assert ( - self.head_dim * num_heads == self.embed_dim - ), "embed_dim must be divisible by num_heads" - - self.in_proj = ScaledLinear(embed_dim, 3 * embed_dim, bias=True) - self.out_proj = ScaledLinear( - embed_dim, embed_dim, bias=True, initial_scale=0.25 - ) - - # linear transformation for positional encoding. 
- self.linear_pos = ScaledLinear(embed_dim, embed_dim, bias=False) - # these two learnable bias are used in matrix c and matrix d - # as described in "Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context" Section 3.3 - self.pos_bias_u = nn.Parameter(torch.Tensor(num_heads, self.head_dim)) - self.pos_bias_v = nn.Parameter(torch.Tensor(num_heads, self.head_dim)) - self.pos_bias_u_scale = nn.Parameter(torch.zeros(()).detach()) - self.pos_bias_v_scale = nn.Parameter(torch.zeros(()).detach()) - self._reset_parameters() - - def _pos_bias_u(self): - return self.pos_bias_u * self.pos_bias_u_scale.exp() - - def _pos_bias_v(self): - return self.pos_bias_v * self.pos_bias_v_scale.exp() - - def _reset_parameters(self) -> None: - nn.init.normal_(self.pos_bias_u, std=0.01) - nn.init.normal_(self.pos_bias_v, std=0.01) - - def forward( - self, - query: Tensor, - key: Tensor, - value: Tensor, - pos_emb: Tensor, - key_padding_mask: Optional[Tensor] = None, - need_weights: bool = True, - attn_mask: Optional[Tensor] = None, - ) -> Tuple[Tensor, Optional[Tensor]]: - r""" - Args: - query, key, value: map a query and a set of key-value pairs to an output. - pos_emb: Positional embedding tensor - key_padding_mask: if provided, specified padding elements in the key will - be ignored by the attention. When given a binary mask and a value is True, - the corresponding value on the attention layer will be ignored. When given - a byte mask and a value is non-zero, the corresponding value on the attention - layer will be ignored - need_weights: output attn_output_weights. - attn_mask: 2D or 3D mask that prevents attention to certain positions. A 2D mask will be broadcasted for all - the batches while a 3D mask allows to specify a different mask for the entries of each batch. - - Shape: - - Inputs: - - query: :math:`(L, N, E)` where L is the target sequence length, N is the batch size, E is - the embedding dimension. - - key: :math:`(S, N, E)`, where S is the source sequence length, N is the batch size, E is - the embedding dimension. - - value: :math:`(S, N, E)` where S is the source sequence length, N is the batch size, E is - the embedding dimension. - - pos_emb: :math:`(N, 2*L-1, E)` where L is the target sequence length, N is the batch size, E is - the embedding dimension. - - key_padding_mask: :math:`(N, S)` where N is the batch size, S is the source sequence length. - If a ByteTensor is provided, the non-zero positions will be ignored while the position - with the zero positions will be unchanged. If a BoolTensor is provided, the positions with the - value of ``True`` will be ignored while the position with the value of ``False`` will be unchanged. - - attn_mask: 2D mask :math:`(L, S)` where L is the target sequence length, S is the source sequence length. - 3D mask :math:`(N*num_heads, L, S)` where N is the batch size, L is the target sequence length, - S is the source sequence length. attn_mask ensure that position i is allowed to attend the unmasked - positions. If a ByteTensor is provided, the non-zero positions are not allowed to attend - while the zero positions will be unchanged. If a BoolTensor is provided, positions with ``True`` - is not allowed to attend while ``False`` values will be unchanged. If a FloatTensor - is provided, it will be added to the attention weight. - - - Outputs: - - attn_output: :math:`(L, N, E)` where L is the target sequence length, N is the batch size, - E is the embedding dimension. 
- - attn_output_weights: :math:`(N, L, S)` where N is the batch size, - L is the target sequence length, S is the source sequence length. - """ - return self.multi_head_attention_forward( - query, - key, - value, - pos_emb, - self.embed_dim, - self.num_heads, - self.in_proj.get_weight(), - self.in_proj.get_bias(), - self.dropout, - self.out_proj.get_weight(), - self.out_proj.get_bias(), - training=self.training, - key_padding_mask=key_padding_mask, - need_weights=need_weights, - attn_mask=attn_mask, - ) - - def rel_shift(self, x: Tensor) -> Tensor: - """Compute relative positional encoding. - - Args: - x: Input tensor (batch, head, time1, 2*time1-1). - time1 means the length of query vector. - - Returns: - Tensor: tensor of shape (batch, head, time1, time2) - (note: time2 has the same value as time1, but it is for - the key, while time1 is for the query). - """ - (batch_size, num_heads, time1, n) = x.shape - assert n == 2 * time1 - 1 - # Note: TorchScript requires explicit arg for stride() - batch_stride = x.stride(0) - head_stride = x.stride(1) - time1_stride = x.stride(2) - n_stride = x.stride(3) - return x.as_strided( - (batch_size, num_heads, time1, time1), - (batch_stride, head_stride, time1_stride - n_stride, n_stride), - storage_offset=n_stride * (time1 - 1), - ) - - def multi_head_attention_forward( - self, - query: Tensor, - key: Tensor, - value: Tensor, - pos_emb: Tensor, - embed_dim_to_check: int, - num_heads: int, - in_proj_weight: Tensor, - in_proj_bias: Tensor, - dropout_p: float, - out_proj_weight: Tensor, - out_proj_bias: Tensor, - training: bool = True, - key_padding_mask: Optional[Tensor] = None, - need_weights: bool = True, - attn_mask: Optional[Tensor] = None, - ) -> Tuple[Tensor, Optional[Tensor]]: - r""" - Args: - query, key, value: map a query and a set of key-value pairs to an output. - pos_emb: Positional embedding tensor - embed_dim_to_check: total dimension of the model. - num_heads: parallel attention heads. - in_proj_weight, in_proj_bias: input projection weight and bias. - dropout_p: probability of an element to be zeroed. - out_proj_weight, out_proj_bias: the output projection weight and bias. - training: apply dropout if is ``True``. - key_padding_mask: if provided, specified padding elements in the key will - be ignored by the attention. This is an binary mask. When the value is True, - the corresponding value on the attention layer will be filled with -inf. - need_weights: output attn_output_weights. - attn_mask: 2D or 3D mask that prevents attention to certain positions. A 2D mask will be broadcasted for all - the batches while a 3D mask allows to specify a different mask for the entries of each batch. - - Shape: - Inputs: - - query: :math:`(L, N, E)` where L is the target sequence length, N is the batch size, E is - the embedding dimension. - - key: :math:`(S, N, E)`, where S is the source sequence length, N is the batch size, E is - the embedding dimension. - - value: :math:`(S, N, E)` where S is the source sequence length, N is the batch size, E is - the embedding dimension. - - pos_emb: :math:`(N, 2*L-1, E)` or :math:`(1, 2*L-1, E)` where L is the target sequence - length, N is the batch size, E is the embedding dimension. - - key_padding_mask: :math:`(N, S)` where N is the batch size, S is the source sequence length. - If a ByteTensor is provided, the non-zero positions will be ignored while the zero positions - will be unchanged. 
If a BoolTensor is provided, the positions with the - value of ``True`` will be ignored while the position with the value of ``False`` will be unchanged. - - attn_mask: 2D mask :math:`(L, S)` where L is the target sequence length, S is the source sequence length. - 3D mask :math:`(N*num_heads, L, S)` where N is the batch size, L is the target sequence length, - S is the source sequence length. attn_mask ensures that position i is allowed to attend the unmasked - positions. If a ByteTensor is provided, the non-zero positions are not allowed to attend - while the zero positions will be unchanged. If a BoolTensor is provided, positions with ``True`` - are not allowed to attend while ``False`` values will be unchanged. If a FloatTensor - is provided, it will be added to the attention weight. - - Outputs: - - attn_output: :math:`(L, N, E)` where L is the target sequence length, N is the batch size, - E is the embedding dimension. - - attn_output_weights: :math:`(N, L, S)` where N is the batch size, - L is the target sequence length, S is the source sequence length. - """ - - tgt_len, bsz, embed_dim = query.size() - assert embed_dim == embed_dim_to_check - assert key.size(0) == value.size(0) and key.size(1) == value.size(1) - - head_dim = embed_dim // num_heads - assert ( - head_dim * num_heads == embed_dim - ), "embed_dim must be divisible by num_heads" - - scaling = float(head_dim) ** -0.5 - - if torch.equal(query, key) and torch.equal(key, value): - # self-attention - q, k, v = nn.functional.linear( - query, in_proj_weight, in_proj_bias - ).chunk(3, dim=-1) - - elif torch.equal(key, value): - # encoder-decoder attention - # This is inline in_proj function with in_proj_weight and in_proj_bias - _b = in_proj_bias - _start = 0 - _end = embed_dim - _w = in_proj_weight[_start:_end, :] - if _b is not None: - _b = _b[_start:_end] - q = nn.functional.linear(query, _w, _b) - - # This is inline in_proj function with in_proj_weight and in_proj_bias - _b = in_proj_bias - _start = embed_dim - _end = None - _w = in_proj_weight[_start:, :] - if _b is not None: - _b = _b[_start:] - k, v = nn.functional.linear(key, _w, _b).chunk(2, dim=-1) - - else: - # This is inline in_proj function with in_proj_weight and in_proj_bias - _b = in_proj_bias - _start = 0 - _end = embed_dim - _w = in_proj_weight[_start:_end, :] - if _b is not None: - _b = _b[_start:_end] - q = nn.functional.linear(query, _w, _b) - - # This is inline in_proj function with in_proj_weight and in_proj_bias - _b = in_proj_bias - _start = embed_dim - _end = embed_dim * 2 - _w = in_proj_weight[_start:_end, :] - if _b is not None: - _b = _b[_start:_end] - k = nn.functional.linear(key, _w, _b) - - # This is inline in_proj function with in_proj_weight and in_proj_bias - _b = in_proj_bias - _start = embed_dim * 2 - _end = None - _w = in_proj_weight[_start:, :] - if _b is not None: - _b = _b[_start:] - v = nn.functional.linear(value, _w, _b) - - if attn_mask is not None: - assert ( - attn_mask.dtype == torch.float32 - or attn_mask.dtype == torch.float64 - or attn_mask.dtype == torch.float16 - or attn_mask.dtype == torch.uint8 - or attn_mask.dtype == torch.bool - ), "Only float, byte, and bool types are supported for attn_mask, not {}".format( - attn_mask.dtype - ) - if attn_mask.dtype == torch.uint8: - warnings.warn( - "Byte tensor for attn_mask is deprecated. Use bool tensor instead." 
- ) - attn_mask = attn_mask.to(torch.bool) - - if attn_mask.dim() == 2: - attn_mask = attn_mask.unsqueeze(0) - if list(attn_mask.size()) != [1, query.size(0), key.size(0)]: - raise RuntimeError( - "The size of the 2D attn_mask is not correct." - ) - elif attn_mask.dim() == 3: - if list(attn_mask.size()) != [ - bsz * num_heads, - query.size(0), - key.size(0), - ]: - raise RuntimeError( - "The size of the 3D attn_mask is not correct." - ) - else: - raise RuntimeError( - "attn_mask's dimension {} is not supported".format( - attn_mask.dim() - ) - ) - # attn_mask's dim is 3 now. - - # convert ByteTensor key_padding_mask to bool - if ( - key_padding_mask is not None - and key_padding_mask.dtype == torch.uint8 - ): - warnings.warn( - "Byte tensor for key_padding_mask is deprecated. Use bool tensor instead." - ) - key_padding_mask = key_padding_mask.to(torch.bool) - - q = (q * scaling).contiguous().view(tgt_len, bsz, num_heads, head_dim) - k = k.contiguous().view(-1, bsz, num_heads, head_dim) - v = v.contiguous().view(-1, bsz * num_heads, head_dim).transpose(0, 1) - - src_len = k.size(0) - - if key_padding_mask is not None: - assert key_padding_mask.size(0) == bsz, "{} == {}".format( - key_padding_mask.size(0), bsz - ) - assert key_padding_mask.size(1) == src_len, "{} == {}".format( - key_padding_mask.size(1), src_len - ) - - q = q.transpose(0, 1) # (batch, time1, head, d_k) - - pos_emb_bsz = pos_emb.size(0) - assert pos_emb_bsz in (1, bsz) # actually it is 1 - p = self.linear_pos(pos_emb).view(pos_emb_bsz, -1, num_heads, head_dim) - p = p.transpose(1, 2) # (batch, head, 2*time1-1, d_k) - - q_with_bias_u = (q + self._pos_bias_u()).transpose( - 1, 2 - ) # (batch, head, time1, d_k) - - q_with_bias_v = (q + self._pos_bias_v()).transpose( - 1, 2 - ) # (batch, head, time1, d_k) - - # compute attention score - # first compute matrix a and matrix c - # as described in "Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context" Section 3.3 - k = k.permute(1, 2, 3, 0) # (batch, head, d_k, time2) - matrix_ac = torch.matmul( - q_with_bias_u, k - ) # (batch, head, time1, time2) - - # compute matrix b and matrix d - matrix_bd = torch.matmul( - q_with_bias_v, p.transpose(-2, -1) - ) # (batch, head, time1, 2*time1-1) - matrix_bd = self.rel_shift(matrix_bd) - - attn_output_weights = ( - matrix_ac + matrix_bd - ) # (batch, head, time1, time2) - - attn_output_weights = attn_output_weights.view( - bsz * num_heads, tgt_len, -1 - ) - - assert list(attn_output_weights.size()) == [ - bsz * num_heads, - tgt_len, - src_len, - ] - - if attn_mask is not None: - if attn_mask.dtype == torch.bool: - attn_output_weights.masked_fill_(attn_mask, float("-inf")) - else: - attn_output_weights += attn_mask - - if key_padding_mask is not None: - attn_output_weights = attn_output_weights.view( - bsz, num_heads, tgt_len, src_len - ) - attn_output_weights = attn_output_weights.masked_fill( - key_padding_mask.unsqueeze(1).unsqueeze(2), - float("-inf"), - ) - attn_output_weights = attn_output_weights.view( - bsz * num_heads, tgt_len, src_len - ) - - attn_output_weights = nn.functional.softmax(attn_output_weights, dim=-1) - attn_output_weights = nn.functional.dropout( - attn_output_weights, p=dropout_p, training=training - ) - - attn_output = torch.bmm(attn_output_weights, v) - assert list(attn_output.size()) == [bsz * num_heads, tgt_len, head_dim] - attn_output = ( - attn_output.transpose(0, 1) - .contiguous() - .view(tgt_len, bsz, embed_dim) - ) - attn_output = nn.functional.linear( - attn_output, out_proj_weight, 
out_proj_bias - ) - - if need_weights: - # average attention weights over heads - attn_output_weights = attn_output_weights.view( - bsz, num_heads, tgt_len, src_len - ) - return attn_output, attn_output_weights.sum(dim=1) / num_heads - else: - return attn_output, None - - -class ConvolutionModule(nn.Module): - """ConvolutionModule in Conformer model. - Modified from https://github.com/espnet/espnet/blob/master/espnet/nets/pytorch_backend/conformer/convolution.py - - Args: - channels (int): The number of channels of conv layers. - kernel_size (int): Kernerl size of conv layers. - bias (bool): Whether to use bias in conv layers (default=True). - - """ - - def __init__( - self, channels: int, kernel_size: int, bias: bool = True - ) -> None: - """Construct an ConvolutionModule object.""" - super(ConvolutionModule, self).__init__() - # kernerl_size should be a odd number for 'SAME' padding - assert (kernel_size - 1) % 2 == 0 - - self.pointwise_conv1 = ScaledConv1d( - channels, - 2 * channels, - kernel_size=1, - stride=1, - padding=0, - bias=bias, - ) - - # after pointwise_conv1 we put x through a gated linear unit (nn.functional.glu). - # For most layers the normal rms value of channels of x seems to be in the range 1 to 4, - # but sometimes, for some reason, for layer 0 the rms ends up being very large, - # between 50 and 100 for different channels. This will cause very peaky and - # sparse derivatives for the sigmoid gating function, which will tend to make - # the loss function not learn effectively. (for most layers the average absolute values - # are in the range 0.5..9.0, and the average p(x>0), i.e. positive proportion, - # at the output of pointwise_conv1.output is around 0.35 to 0.45 for different - # layers, which likely breaks down as 0.5 for the "linear" half and - # 0.2 to 0.3 for the part that goes into the sigmoid. The idea is that if we - # constrain the rms values to a reasonable range via a constraint of max_abs=10.0, - # it will be in a better position to start learning something, i.e. to latch onto - # the correct range. - self.deriv_balancer1 = ActivationBalancer( - channel_dim=1, max_abs=10.0, min_positive=0.05, max_positive=1.0 - ) - - self.depthwise_conv = ScaledConv1d( - channels, - channels, - kernel_size, - stride=1, - padding=(kernel_size - 1) // 2, - groups=channels, - bias=bias, - ) - - self.deriv_balancer2 = ActivationBalancer( - channel_dim=1, min_positive=0.05, max_positive=1.0 - ) - - self.activation = DoubleSwish() - - self.pointwise_conv2 = ScaledConv1d( - channels, - channels, - kernel_size=1, - stride=1, - padding=0, - bias=bias, - initial_scale=0.25, - ) - - def forward(self, x: Tensor) -> Tensor: - """Compute convolution module. - - Args: - x: Input tensor (#time, batch, channels). - - Returns: - Tensor: Output tensor (#time, batch, channels). - - """ - # exchange the temporal dimension and the feature dimension - x = x.permute(1, 2, 0) # (#batch, channels, time). - - # GLU mechanism - x = self.pointwise_conv1(x) # (batch, 2*channels, time) - - x = self.deriv_balancer1(x) - x = nn.functional.glu(x, dim=1) # (batch, channels, time) - - # 1D Depthwise Conv - x = self.depthwise_conv(x) - - x = self.deriv_balancer2(x) - x = self.activation(x) - - x = self.pointwise_conv2(x) # (batch, channel, time) - - return x.permute(2, 0, 1) - - -class Conv2dSubsampling(nn.Module): - """Convolutional 2D subsampling (to 1/4 length). 
- - Convert an input of shape (N, T, idim) to an output - with shape (N, T', odim), where - T' = ((T-1)//2 - 1)//2, which approximates T' == T//4 - - It is based on - https://github.com/espnet/espnet/blob/master/espnet/nets/pytorch_backend/transformer/subsampling.py # noqa - """ - - def __init__( - self, - in_channels: int, - out_channels: int, - layer1_channels: int = 8, - layer2_channels: int = 32, - layer3_channels: int = 128, - ) -> None: - """ - Args: - in_channels: - Number of channels in. The input shape is (N, T, in_channels). - Caution: It requires: T >=7, in_channels >=7 - out_channels - Output dim. The output shape is (N, ((T-1)//2 - 1)//2, out_channels) - layer1_channels: - Number of channels in layer1 - layer1_channels: - Number of channels in layer2 - """ - assert in_channels >= 7 - super().__init__() - - self.conv = nn.Sequential( - ScaledConv2d( - in_channels=1, - out_channels=layer1_channels, - kernel_size=3, - padding=1, - ), - ActivationBalancer(channel_dim=1), - DoubleSwish(), - ScaledConv2d( - in_channels=layer1_channels, - out_channels=layer2_channels, - kernel_size=3, - stride=2, - ), - ActivationBalancer(channel_dim=1), - DoubleSwish(), - ScaledConv2d( - in_channels=layer2_channels, - out_channels=layer3_channels, - kernel_size=3, - stride=2, - ), - ActivationBalancer(channel_dim=1), - DoubleSwish(), - ) - self.out = ScaledLinear( - layer3_channels * (((in_channels - 1) // 2 - 1) // 2), out_channels - ) - # set learn_eps=False because out_norm is preceded by `out`, and `out` - # itself has learned scale, so the extra degree of freedom is not - # needed. - self.out_norm = BasicNorm(out_channels, learn_eps=False) - # constrain median of output to be close to zero. - self.out_balancer = ActivationBalancer( - channel_dim=-1, min_positive=0.45, max_positive=0.55 - ) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - """Subsample x. - - Args: - x: - Its shape is (N, T, idim). - - Returns: - Return a tensor of shape (N, ((T-1)//2 - 1)//2, odim) - """ - # On entry, x is (N, T, idim) - x = x.unsqueeze(1) # (N, T, idim) -> (N, 1, T, idim) i.e., (N, C, H, W) - x = self.conv(x) - # Now x is of shape (N, odim, ((T-1)//2 - 1)//2, ((idim-1)//2 - 1)//2) - b, c, t, f = x.size() - x = self.out(x.transpose(1, 2).contiguous().view(b, t, c * f)) - # Now x is of shape (N, ((T-1)//2 - 1))//2, odim) - x = self.out_norm(x) - x = self.out_balancer(x) - return x - - -if __name__ == "__main__": - feature_dim = 50 - c = Conformer(num_features=feature_dim, d_model=128, nhead=4) - batch_size = 5 - seq_len = 20 - # Just make sure the forward pass runs. - f = c( - torch.randn(batch_size, seq_len, feature_dim), - torch.full((batch_size,), seq_len, dtype=torch.int64), - warmup=0.5, - ) diff --git a/triton/scripts/export_jit.py b/triton/scripts/export_jit.py deleted file mode 100755 index 95329a654..000000000 --- a/triton/scripts/export_jit.py +++ /dev/null @@ -1,422 +0,0 @@ -#!/usr/bin/env python3 -# -# Copyright 2021 Xiaomi Corporation (Author: Fangjun Kuang) -# -# See ../../../../LICENSE for clarification regarding multiple authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -# This script converts a pretrained transducer model into three jit models: -# encoder_jit.pt decoder_jit.pt joiner_jit.pt -""" -Usage: -./pruned_transducer_stateless3/export_jit.py \ - --pretrained-model ./pruned_transducer_stateless3/exp \ - --output-dir ./pruned_transducer_stateless3/exp \ - --bpe-model ./bpe.model - -It will generate three jit files under ouput_dir, then you should put these -models under corresponding trtion mdoel_repo modules. - -""" - -import argparse -import logging -import os - -import sentencepiece as spm -import torch -from torch import nn - -import torch.nn.functional as F -from scaling import ScaledConv1d, ScaledEmbedding, ScaledLinear - -from conformer_triton import Conformer - -class Decoder(nn.Module): - """This class modifies the stateless decoder from the following paper: - - RNN-transducer with stateless prediction network - https://ieeexplore.ieee.org/stamp/stamp.jsp?arnumber=9054419 - - It removes the recurrent connection from the decoder, i.e., the prediction - network. Different from the above paper, it adds an extra Conv1d - right after the embedding layer. - - TODO: Implement https://arxiv.org/pdf/2109.07513.pdf - """ - - def __init__( - self, - vocab_size: int, - decoder_dim: int, - blank_id: int, - context_size: int, - ): - """ - Args: - vocab_size: - Number of tokens of the modeling unit including blank. - decoder_dim: - Dimension of the input embedding, and of the decoder output. - blank_id: - The ID of the blank symbol. - context_size: - Number of previous words to use to predict the next word. - 1 means bigram; 2 means trigram. n means (n+1)-gram. - """ - super().__init__() - - self.embedding = ScaledEmbedding( - num_embeddings=vocab_size, - embedding_dim=decoder_dim, - padding_idx=blank_id, - ) - self.blank_id = blank_id - - assert context_size >= 1, context_size - self.context_size = context_size - self.vocab_size = vocab_size - if context_size > 1: - self.conv = ScaledConv1d( - in_channels=decoder_dim, - out_channels=decoder_dim, - kernel_size=context_size, - padding=0, - groups=decoder_dim, - bias=False, - ) - - def forward(self, y: torch.Tensor) -> torch.Tensor: - """ - Args: - y: - A 2-D tensor of shape (N, U). - need_pad: - True to left pad the input. Should be True during training. - False to not pad the input. Should be False during inference. - Returns: - Return a tensor of shape (N, U, decoder_dim). - """ - y = y.to(torch.int64) - embedding_out = self.embedding(y) - if self.context_size > 1: - embedding_out = embedding_out.permute(0, 2, 1) - - # During inference time, there is no need to do extra padding - # as we only need one output - assert embedding_out.size(-1) == self.context_size - embedding_out = self.conv(embedding_out) - embedding_out = embedding_out.permute(0, 2, 1) - embedding_out = F.relu(embedding_out) - return embedding_out - -class Joiner(nn.Module): - def __init__( - self, - encoder_dim: int, - decoder_dim: int, - joiner_dim: int, - vocab_size: int, - ): - super().__init__() - - self.encoder_proj = ScaledLinear(encoder_dim, joiner_dim) - self.decoder_proj = ScaledLinear(decoder_dim, joiner_dim) - self.output_linear = ScaledLinear(joiner_dim, vocab_size) - - def forward( - self, - encoder_out: torch.Tensor, - decoder_out: torch.Tensor, - ) -> torch.Tensor: - """ - Args: - encoder_out: - Output from the encoder. Its shape is (N, T, s_range, C). - decoder_out: - Output from the decoder. 
Its shape is (N, T, s_range, C). - project_input: - If true, apply input projections encoder_proj and decoder_proj. - If this is false, it is the user's responsibility to do this - manually. - Returns: - Return a tensor of shape (N, T, s_range, C). - """ - - assert encoder_out.ndim == decoder_out.ndim == 4 - assert encoder_out.shape[:-1] == decoder_out.shape[:-1] - - - logit = self.encoder_proj(encoder_out) + self.decoder_proj( - decoder_out - ) - - - logit = self.output_linear(torch.tanh(logit)) - - return logit - -class AttributeDict(dict): - def __getattr__(self, key): - if key in self: - return self[key] - raise AttributeError(f"No such attribute '{key}'") - - def __setattr__(self, key, value): - self[key] = value - - def __delattr__(self, key): - if key in self: - del self[key] - return - raise AttributeError(f"No such attribute '{key}'") - -def str2bool(v): - """Used in argparse.ArgumentParser.add_argument to indicate - that a type is a bool type and user can enter - - - yes, true, t, y, 1, to represent True - - no, false, f, n, 0, to represent False - - See https://stackoverflow.com/questions/15008758/parsing-boolean-values-with-argparse # noqa - """ - if isinstance(v, bool): - return v - if v.lower() in ("yes", "true", "t", "y", "1"): - return True - elif v.lower() in ("no", "false", "f", "n", "0"): - return False - else: - raise argparse.ArgumentTypeError("Boolean value expected.") - -def get_params() -> AttributeDict: - """Return a dict containing training parameters. - - All training related parameters that are not passed from the commandline - are saved in the variable `params`. - - Commandline options are merged into `params` after they are parsed, so - you can also access them via `params`. - - Explanation of options saved in `params`: - - - best_train_loss: Best training loss so far. It is used to select - the model that has the lowest training loss. It is - updated during the training. - - - best_valid_loss: Best validation loss so far. It is used to select - the model that has the lowest validation loss. It is - updated during the training. - - - best_train_epoch: It is the epoch that has the best training loss. - - - best_valid_epoch: It is the epoch that has the best validation loss. - - - batch_idx_train: Used to writing statistics to tensorboard. It - contains number of batches trained so far across - epochs. - - - log_interval: Print training loss if batch_idx % log_interval` is 0 - - - reset_interval: Reset statistics if batch_idx % reset_interval is 0 - - - valid_interval: Run validation if batch_idx % valid_interval is 0 - - - feature_dim: The model input dim. It has to match the one used - in computing features. - - - subsampling_factor: The subsampling factor for the model. - - - encoder_dim: Hidden dim for multi-head attention model. - - - num_decoder_layers: Number of decoder layer of transformer decoder. - - - warm_step: The warm_step for Noam optimizer. 
- """ - params = AttributeDict( - { - "best_train_loss": float("inf"), - "best_valid_loss": float("inf"), - "best_train_epoch": -1, - "best_valid_epoch": -1, - "batch_idx_train": 0, - "log_interval": 50, - "reset_interval": 200, - "valid_interval": 3000, # For the 100h subset, use 800 - # parameters for conformer - "feature_dim": 80, - "subsampling_factor": 4, - "encoder_dim": 512, - "nhead": 8, - "dim_feedforward": 2048, - "num_encoder_layers": 12, - # parameters for decoder - "decoder_dim": 512, - # parameters for joiner - "joiner_dim": 512, - # parameters for Noam - "model_warm_step": 3000, # arg given to model, not for lrate - "env_info": {}, # remove k2 etc dependency - } - ) - - return params - -def get_encoder_model(params: AttributeDict) -> nn.Module: - # TODO: We can add an option to switch between Conformer and Transformer - encoder = Conformer( - num_features=params.feature_dim, - subsampling_factor=params.subsampling_factor, - d_model=params.encoder_dim, - nhead=params.nhead, - dim_feedforward=params.dim_feedforward, - num_encoder_layers=params.num_encoder_layers, - ) - return encoder - -def get_decoder_model(params: AttributeDict) -> nn.Module: - decoder = Decoder( - vocab_size=params.vocab_size, - decoder_dim=params.decoder_dim, - blank_id=params.blank_id, - context_size=params.context_size, - ) - return decoder - -def get_joiner_model(params: AttributeDict) -> nn.Module: - joiner = Joiner( - encoder_dim=params.encoder_dim, - decoder_dim=params.decoder_dim, - joiner_dim=params.joiner_dim, - vocab_size=params.vocab_size, - ) - return joiner - -def get_parser(): - parser = argparse.ArgumentParser( - formatter_class=argparse.ArgumentDefaultsHelpFormatter - ) - - parser.add_argument( - "--pretrained-model", - type=str, - required=True, - help="""It specifies the path of input pretrained_transducer.pt torch model. - """, - ) - - parser.add_argument( - "--bpe-model", - type=str, - default="data/lang_bpe_500/bpe.model", - help="Path to the BPE model", - ) - - parser.add_argument( - "--output-dir", - type=str, - default="exp", - help="""It specifies the directory where output jit models located - """, - ) - - parser.add_argument( - "--context-size", - type=int, - default=2, - help="The context size in the decoder. 1 means bigram; " - "2 means tri-gram", - ) - - return parser - - -def main(): - args = get_parser().parse_args() - - params = get_params() - params.update(vars(args)) - - assert torch.cuda.is_available() - # for cpu models, you need to modify the config.pbtxt files under model_repo - device = torch.device("cuda", 0) - - logging.info(f"device: {device}") - - sp = spm.SentencePieceProcessor() - sp.load(params.bpe_model) - - # is defined in local/train_bpe_model.py - params.blank_id = sp.piece_to_id("") - params.vocab_size = sp.get_piece_size() - - logging.info(params) - - logging.info("About to create models") - - encoder = get_encoder_model(params) - decoder = get_decoder_model(params) - joiner = get_joiner_model(params) - - checkpoint = torch.load(args.pretrained_model, map_location="cpu") - - # remove the prefix, e.g. 
encoder.encoder.layre1.bias --> encoder.layer1.bias - for old_key in list(checkpoint["model"].keys()): - key_list = old_key.split(".")[1:] - if len(key_list) > 1: - new_key = ".".join(key_list) - else: - new_key = old_key - checkpoint["model"][new_key] = checkpoint["model"].pop(old_key) - - miss_keys,_ = encoder.load_state_dict(checkpoint["model"], strict=False) - assert len(miss_keys) == 0 - miss_keys,_ = decoder.load_state_dict(checkpoint["model"], strict=False) - assert len(miss_keys) == 0 - miss_keys,_ = joiner.load_state_dict(checkpoint["model"], strict=False) - assert len(miss_keys) == 0 - - encoder.cuda() - encoder.eval() - - decoder.cuda() - decoder.eval() - - joiner.cuda() - joiner.eval() - - - os.makedirs(args.output_dir, exist_ok=True) - encoder = torch.jit.script(encoder) - filename = args.output_dir + "/encoder_jit.pt" - encoder.save(filename) - logging.info("Export encoder jit finished.") - - decoder = torch.jit.script(decoder) - filename = args.output_dir + "/decoder_jit.pt" - decoder.save(filename) - logging.info("Export decoder jit finished.") - - joiner = torch.jit.script(joiner) - filename = args.output_dir + "/joiner_jit.pt" - joiner.save(filename) - logging.info("Export joiner jit finished.") - - -if __name__ == "__main__": - formatter = ( - "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s" - ) - logging.basicConfig(format=formatter, level=logging.INFO) - main() diff --git a/triton/scripts/export_onnx.py b/triton/scripts/export_onnx.py new file mode 100755 index 000000000..f85a9199e --- /dev/null +++ b/triton/scripts/export_onnx.py @@ -0,0 +1,657 @@ +#!/usr/bin/env python3 +# +# Copyright 2021 Xiaomi Corporation (Author: Fangjun Kuang) +# 2022 Nvidia (Author: Yuekai Zhang) +# +# See ../../../../LICENSE for clarification regarding multiple authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# This script converts several saved checkpoints +# to a single one using model averaging. 
+""" + +Usage: + +(1) Export to ONNX format with streaming ASR model +mv export_onnx.py /pruned_transducer_stateless3/ +mv onnx_triton_utils.py /pruned_transducer_stateless3/ +./pruned_transducer_stateless3/export_onnx.py \ + --exp-dir ./icefall_librispeech_streaming_pruned_transducer_stateless3_giga_0.9_20220625/exp \ + --tokenizer-file ./icefall_librispeech_streaming_pruned_transducer_stateless3_giga_0.9_20220625/data/lang_bpe_500/bpe.model \ + --epoch 999 \ + --avg 1 \ + --streaming-model 1\ + --causal-convolution 1 \ + --onnx 1 \ + --left-context 64 \ + --right-context 4 \ + --fp16 + +(2) Export to ONNX format with offline ASR model +./pruned_transducer_stateless3/export_onnx.py \ + --exp-dir ./icefall_librispeech_streaming_pruned_transducer_stateless3_giga_0.9_20220625/exp \ + --tokenizer-file ./icefall_librispeech_streaming_pruned_transducer_stateless3_giga_0.9_20220625/data/lang_bpe_500/bpe.model \ + --epoch 999 \ + --avg 1 \ + --onnx 1 \ + --fp16 + +(3) Export to ONNX format with streaming Chinese ASR model +pretrained_model_dir=./icefall_asr_wenetspeech_pruned_transducer_stateless5_streaming +./pruned_transducer_stateless5/export_onnx.py \ + --exp-dir ${pretrained_model_dir}/exp \ + --tokenizer-file ${pretrained_model_dir}/data/lang_char \ + --epoch 999 \ + --avg 1 \ + --streaming-model 1\ + --causal-convolution 1 \ + --onnx 1 \ + --left-context 64 \ + --right-context 4 \ + --fp16 + +(4) Export to ONNX format with offline Chinese ASR model +pretrained_model_dir=./icefall_asr_wenetspeech_pruned_transducer_stateless5_offline +./pruned_transducer_stateless5/export_onnx.py \ + --exp-dir ${pretrained_model_dir}/exp \ + --tokenizer-file ${pretrained_model_dir}/data/lang_char \ + --epoch 999 \ + --avg 1 \ + --onnx 1 \ + --fp16 + +It will generate the following six files in the given `exp_dir`. + + - encoder.onnx + - decoder.onnx + - joiner.onnx + - encoder_fp16.onnx + - decoder_fp16.onnx + - joiner_fp16.onnx + +Note: If you don't want to train a model from scratch, we have +provided one for you. You can get it at + +https://huggingface.co/pkufool/icefall_librispeech_streaming_pruned_transducer_stateless3_giga_0.9_20220625 + +with the following commands: + + sudo apt-get install git-lfs + git lfs install + git clone https://huggingface.co/pkufool/icefall_librispeech_streaming_pruned_transducer_stateless3_giga_0.9_20220625 + # You will find the pre-trained model in icefall_librispeech_streaming_pruned_transducer_stateless3_giga_0.9_20220625/exp + +For Chinese WenetSpeech pretrained model: + +git lfs install +git clone https://huggingface.co/luomingshuang/icefall_asr_wenetspeech_pruned_transducer_stateless5_offline +""" + +import argparse +import logging +from pathlib import Path + +import onnx +import onnxruntime +import sentencepiece as spm +import torch +import torch.nn as nn + +from onnx_triton_utils import StreamingEncoder, OfflineEncoder, get_transducer_model +from scaling_converter import convert_scaled_to_non_scaled +from train import add_model_arguments, get_params + +from icefall.checkpoint import ( + average_checkpoints, + find_checkpoints, + load_checkpoint, +) +from icefall.utils import str2bool +from icefall.lexicon import Lexicon + +def get_parser(): + parser = argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter + ) + + parser.add_argument( + "--epoch", + type=int, + default=28, + help="""It specifies the checkpoint to use for averaging. + Note: Epoch counts from 0. 
+ You can specify --avg to use more checkpoints for model averaging.""", + ) + + parser.add_argument( + "--iter", + type=int, + default=0, + help="""If positive, --epoch is ignored and it + will use the checkpoint exp_dir/checkpoint-iter.pt. + You can specify --avg to use more checkpoints for model averaging. + """, + ) + + parser.add_argument( + "--avg", + type=int, + default=15, + help="Number of checkpoints to average. Automatically select " + "consecutive checkpoints before the checkpoint specified by " + "'--epoch' and '--iter'", + ) + + parser.add_argument( + "--exp-dir", + type=str, + default="pruned_transducer_stateless3/exp", + help="""It specifies the directory where all training related + files, e.g., checkpoints, log, etc, are saved + """, + ) + + parser.add_argument( + "--tokenizer-file", + type=str, + default="data/lang_bpe_500/bpe.model", + help="Path to the BPE model or char dict file", + ) + + parser.add_argument( + "--onnx", + type=str2bool, + default=False, + help="""If True, --jit is ignored and it exports the model + to onnx format. Three files will be generated: + + - encoder.onnx + - decoder.onnx + - joiner.onnx + + Check ./onnx_check.py and ./onnx_pretrained.py for how to use them. + """, + ) + + parser.add_argument( + "--context-size", + type=int, + default=2, + help="The context size in the decoder. 1 means bigram; " + "2 means tri-gram", + ) + + parser.add_argument( + "--left-context", + type=int, + default=64, + help="The left context for streaming encoder", + ) + + parser.add_argument( + "--right-context", + type=int, + default=4, + help="The right context for streaming encoder", + ) + + parser.add_argument( + "--streaming-model", + type=str2bool, + default=False, + help="""Whether to export a streaming model, if the models in exp-dir + are streaming model, this should be True. + """, + ) + + parser.add_argument('--fp16', + action='store_true', + help='whether to export fp16 model, default false') + + add_model_arguments(parser) + + return parser + +def to_numpy(tensors): + out = [] + if type(tensors) == torch.tensor: + tensors = [tensors] + for tensor in tensors: + if tensor.requires_grad: + tensor = tensor.detach().cpu().numpy() + else: + tensor = tensor.cpu().numpy() + out.append(tensor) + return out + +def export_encoder_model_onnx_streaming( + encoder_model: nn.Module, + encoder_filename: str, + opset_version: int = 11, + left_context: int = 64, + right_context: int = 4, + chunk_size: int = 16, + warmup: float = 1.0, +) -> None: + """Export the given encoder model to ONNX format. + The exported model has two inputs: + + - x, a tensor of shape (N, T, C); dtype is torch.float32 + - x_lens, a tensor of shape (N,); dtype is torch.int64 + + and it has two outputs: + + - encoder_out, a tensor of shape (N, T, C) + - encoder_out_lens, a tensor of shape (N,) + + Note: The warmup argument is fixed to 1. + + Args: + encoder_model: + The input encoder model + encoder_filename: + The filename to save the exported ONNX model. + opset_version: + The opset version to use. 
+ """ + encoder_model = StreamingEncoder( + encoder_model, left_context, right_context, chunk_size, warmup + ) + encoder_model.eval() + x = torch.zeros(2, 51, 80, dtype=torch.float32) + x_lens = torch.tensor([51,42], dtype=torch.int64) #TODO FIX int32 + states = [ + torch.zeros( + 2, + encoder_model.left_context, + encoder_model.encoder_layers, + encoder_model.d_model, + ), + torch.zeros( + 2, + encoder_model.cnn_module_kernel - 1, + encoder_model.encoder_layers, + encoder_model.d_model, + ), + ] + + attn_cache, cnn_cache = states[0], states[1] + + processed_lens = torch.tensor([0,0], dtype=torch.int64) + + processed_lens = processed_lens.unsqueeze(-1) + + # encoder_model = torch.jit.script(encoder_model) + # It throws the following error for the above statement + # + # RuntimeError: Exporting the operator __is_ to ONNX opset version + # 11 is not supported. Please feel free to request support or + # submit a pull request on PyTorch GitHub. + # + # I cannot find which statement causes the above error. + # torch.onnx.export() will use torch.jit.trace() internally, which + # works well for the current reworked model + + torch.onnx.export( + encoder_model, + (x, x_lens, attn_cache, cnn_cache, processed_lens), + encoder_filename, + verbose=False, + opset_version=opset_version, + input_names=[ + "x", + "x_lens", + "attn_cache", + "cnn_cache", + "processed_lens", + ], + output_names=[ + "encoder_out", + "encoder_out_lens", + "next_attn_cache", + "next_cnn_cache", + "next_processed_lens", + ], + dynamic_axes={ + "x": {0: "B", 1: "T"}, + "x_lens": {0: "B"}, + "attn_cache": {0: "B"}, + "cnn_cache": {0: "B"}, + "processed_lens": {0: "B"}, + "encoder_out": {0: "B", 1: "T"}, + "encoder_out_lens": {0: "B"}, + "next_attn_cache": {0: "B"}, + "next_cnn_cache": {0: "B"}, + "next_processed_lens": {0: "B"}, + }, + ) + + with torch.no_grad(): + o0, o1, o2, o3, o4 = encoder_model(x, x_lens, attn_cache, cnn_cache, processed_lens) + print(o4.shape,o4) + + providers = ["CUDAExecutionProvider"] + ort_session = onnxruntime.InferenceSession(str(encoder_filename), + providers=providers) + ort_inputs = {'x': to_numpy(x), + 'x_lens': to_numpy(x_lens), + 'attn_cache': to_numpy(attn_cache), + 'cnn_cache': to_numpy(cnn_cache), + 'processed_lens': to_numpy(processed_lens)} + ort_outs = ort_session.run(None, ort_inputs) + + logging.info(f"Saved to {encoder_filename}") + + +def export_encoder_model_onnx_triton( + encoder_model: nn.Module, + encoder_filename: str, + opset_version: int = 11, +) -> None: + """Export the given encoder model to ONNX format. + The exported model has two inputs: + + - x, a tensor of shape (N, T, C); dtype is torch.float32 + - x_lens, a tensor of shape (N,); dtype is torch.int64 + + and it has two outputs: + + - encoder_out, a tensor of shape (N, T, C) + - encoder_out_lens, a tensor of shape (N,) + + Args: + encoder_model: + The input encoder model + encoder_filename: + The filename to save the exported ONNX model. + opset_version: + The opset version to use. 
+ """ + encoder_model = OfflineEncoder(encoder_model) + encoder_model.eval() + x = torch.zeros(1, 51, 80, dtype=torch.float32) + x_lens = torch.tensor([51], dtype=torch.int64) #TODO FIX int32 + + torch.onnx.export( + encoder_model, + (x, x_lens), + encoder_filename, + verbose=False, + opset_version=opset_version, + input_names=[ + "speech", + "speech_lengths", + ], + output_names=[ + "encoder_out", + "encoder_out_lens", + ], + dynamic_axes={ + "speech": {0: "B", 1: "T"}, + "speech_lengths": {0: "B"}, + "encoder_out": {0: "B", 1: "T"}, + "encoder_out_lens": {0: "B"}, + }, + ) + + logging.info(f"Saved to {encoder_filename}") + +def export_decoder_model_onnx_triton( + decoder_model: nn.Module, + decoder_filename: str, + opset_version: int = 11, +) -> None: + """Export the decoder model to ONNX format. + + The exported model has one input: + + - y: a torch.int64 tensor of shape (N, decoder_model.context_size) + + and has one output: + + - decoder_out: a torch.float32 tensor of shape (N, 1, C) + + Note: The argument need_pad is fixed to False. + + Args: + decoder_model: + The decoder model to be exported. + decoder_filename: + Filename to save the exported ONNX model. + opset_version: + The opset version to use. + """ + y = torch.zeros(10, decoder_model.context_size, dtype=torch.int64) + + decoder_model.eval() + + # Note(fangjun): torch.jit.trace() is more efficient than torch.jit.script() + # in this case + torch.onnx.export( + decoder_model, + (y,), + decoder_filename, + verbose=False, + opset_version=opset_version, + input_names=["y"], + output_names=["decoder_out"], + dynamic_axes={ + "y": {0: "N"}, + "decoder_out": {0: "N"}, + }, + ) + logging.info(f"Saved to {decoder_filename}") + + +def export_joiner_model_onnx_triton( + joiner_model: nn.Module, + joiner_filename: str, + opset_version: int = 11, +) -> None: + """Export the joiner model to ONNX format. + The exported model has two inputs: + + - encoder_out: a tensor of shape (N, encoder_out_dim) + - decoder_out: a tensor of shape (N, decoder_out_dim) + + and has one output: + + - joiner_out: a tensor of shape (N, vocab_size) + + Note: The argument project_input is fixed to True. A user should not + project the encoder_out/decoder_out by himself/herself. The exported joiner + will do that for the user. 
+ """ + encoder_out_dim = joiner_model.encoder_proj.weight.shape[1] + decoder_out_dim = joiner_model.decoder_proj.weight.shape[1] + encoder_out = torch.rand(1, encoder_out_dim, dtype=torch.float32) + decoder_out = torch.rand(1, decoder_out_dim, dtype=torch.float32) + + project_input = True + joiner_model.eval() + # Note: It uses torch.jit.trace() internally + torch.onnx.export( + joiner_model, + (encoder_out, decoder_out), + joiner_filename, + verbose=False, + opset_version=opset_version, + input_names=["encoder_out", "decoder_out"], + output_names=["logit"], + dynamic_axes={ + "encoder_out": {0: "N"}, + "decoder_out": {0: "N"}, + "logit": {0: "N"}, + }, + ) + logging.info(f"Saved to {joiner_filename}") + +@torch.no_grad() +def main(): + args = get_parser().parse_args() + args.exp_dir = Path(args.exp_dir) + + params = get_params() + params.update(vars(args)) + + device = torch.device("cpu") + if torch.cuda.is_available(): + device = torch.device("cuda", 0) + + logging.info(f"device: {device}") + + if 'bpe' in params.tokenizer_file: + sp = spm.SentencePieceProcessor() + sp.load(params.tokenizer_file) + + # is defined in local/train_bpe_model.py + params.blank_id = sp.piece_to_id("") + params.vocab_size = sp.get_piece_size() + else: + assert 'char' in params.tokenizer_file + lexicon = Lexicon(params.tokenizer_file) + + params.blank_id = lexicon.token_table[""] + params.vocab_size = max(lexicon.tokens) + 1 + + + if params.streaming_model: + assert params.causal_convolution + + logging.info(params) + + logging.info("About to create model") + if params.onnx: + model = get_transducer_model(params) + else: + raise NotImplementedError + + model.to(device) + + if params.iter > 0: + filenames = find_checkpoints(params.exp_dir, iteration=-params.iter)[ + : params.avg + ] + if len(filenames) == 0: + raise ValueError( + f"No checkpoints found for" + f" --iter {params.iter}, --avg {params.avg}" + ) + elif len(filenames) < params.avg: + raise ValueError( + f"Not enough checkpoints ({len(filenames)}) found for" + f" --iter {params.iter}, --avg {params.avg}" + ) + logging.info(f"averaging {filenames}") + model.to(device) + model.load_state_dict( + average_checkpoints(filenames, device=device), strict=False + ) + elif params.avg == 1: + load_checkpoint(f"{params.exp_dir}/epoch-{params.epoch}.pt", model) + else: + start = params.epoch - params.avg + 1 + filenames = [] + for i in range(start, params.epoch + 1): + if start >= 0: + filenames.append(f"{params.exp_dir}/epoch-{i}.pt") + logging.info(f"averaging {filenames}") + model.to(device) + model.load_state_dict( + average_checkpoints(filenames, device=device), strict=False + ) + + model.to("cpu") + model.eval() + + if params.onnx is True: + convert_scaled_to_non_scaled(model, inplace=True) + opset_version = 11 + logging.info("Exporting to onnx format") + encoder_filename = params.exp_dir / "encoder.onnx" + if params.streaming_model: + export_encoder_model_onnx_streaming( + model.encoder, + encoder_filename, + opset_version=opset_version, + left_context=params.left_context, + right_context=params.right_context, + ) + else: + export_encoder_model_onnx_triton( + model.encoder, encoder_filename, opset_version=opset_version + ) + + decoder_filename = params.exp_dir / "decoder.onnx" + + export_decoder_model_onnx_triton( + model.decoder, + decoder_filename, + opset_version=opset_version, + ) + + joiner_filename = params.exp_dir / "joiner.onnx" + export_joiner_model_onnx_triton( + model.joiner, + joiner_filename, + opset_version=opset_version, + ) + + 
cnn_module_kernel = model.encoder.cnn_module_kernel + export_log_filename = params.exp_dir / "onnx_export.log" + with open(export_log_filename, 'w') as log_f: + log_f.write(f"ENCODER_LEFT_CONTEXT: {params.left_context}\n") + log_f.write(f"ENCODER_RIGHT_CONTEXT: {params.right_context}\n") + log_f.write(f"ENCODER_DIM: {params.encoder_dim}\n") + log_f.write(f"DECODER_DIM: {params.decoder_dim}\n") + log_f.write(f"VOCAB_SIZE: {params.vocab_size}\n") + log_f.write(f"DECODER_CONTEXT_SIZE: {params.context_size}\n") + log_f.write(f"CNN_MODULE_KERNEL: {cnn_module_kernel}\n") + log_f.write(f"ENCODER_LAYERS: {params.num_encoder_layers}\n") + log_f.write(f"All params:{params}") + + if params.fp16: + try: + import onnxmltools + from onnxmltools.utils.float16_converter import convert_float_to_float16 + except ImportError: + print('Please install onnxmltools!') + import sys + sys.exit(1) + def export_onnx_fp16(onnx_fp32_path, onnx_fp16_path): + onnx_fp32_model = onnxmltools.utils.load_model(onnx_fp32_path) + onnx_fp16_model = convert_float_to_float16(onnx_fp32_model) + onnxmltools.utils.save_model(onnx_fp16_model, onnx_fp16_path) + encoder_fp16_filename = params.exp_dir / "encoder_fp16.onnx" + export_onnx_fp16(encoder_filename, encoder_fp16_filename) + decoder_fp16_filename = params.exp_dir / "decoder_fp16.onnx" + export_onnx_fp16(decoder_filename, decoder_fp16_filename) + joiner_fp16_filename = params.exp_dir / "joiner_fp16.onnx" + export_onnx_fp16(joiner_filename, joiner_fp16_filename) + else: + logging.info("Not using onnx") + # Save it using a format so that it can be loaded + # by :func:`load_checkpoint` + filename = params.exp_dir / "pretrained.pt" + torch.save({"model": model.state_dict()}, str(filename)) + logging.info(f"Saved to {filename}") + + +if __name__ == "__main__": + formatter = ( + "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s" + ) + + logging.basicConfig(format=formatter, level=logging.INFO) + main() diff --git a/triton/scripts/onnx_triton_utils.py b/triton/scripts/onnx_triton_utils.py new file mode 100755 index 000000000..5b8465d51 --- /dev/null +++ b/triton/scripts/onnx_triton_utils.py @@ -0,0 +1,459 @@ +import argparse +import logging +import os +from typing import Optional, Tuple + +import sentencepiece as spm +import torch +from torch import nn + +import torch.nn.functional as F +from scaling import ScaledConv1d, ScaledEmbedding, ScaledLinear + +from conformer import Conformer +from model import Transducer + +from icefall.dist import cleanup_dist, setup_dist +from icefall.env import get_env_info +from icefall.utils import ( + AttributeDict, + MetricsTracker, + display_and_save_batch, + setup_logger, + str2bool, +) +from icefall.utils import is_jit_tracing, make_pad_mask + +class StreamingEncoder(torch.nn.Module): + """ + Args: + left_context: + How many previous frames the attention can see in current chunk. + Note: It's not that each individual frame has `left_context` frames + of left context, some have more. + right_context: + How many future frames the attention can see in current chunk. + Note: It's not that each individual frame has `right_context` frames + of right context, some have more. + chunk_size: + The chunk size for decoding, this will be used to simulate streaming + decoding using masking. + warmup: + A floating point value that gradually increases from 0 throughout + training; when it is >= 1.0 we are "fully warmed up". It is used + to turn modules on sequentially. 
+ """ + + def __init__(self, model, left_context, right_context, chunk_size, warmup): + super().__init__() + self.encoder = model.encoder + self.encoder_embed = model.encoder_embed + self.encoder_layers = model.encoder_layers + self.d_model = model.d_model + self.cnn_module_kernel = model.cnn_module_kernel + self.encoder_pos = model.encoder_pos + self.left_context = left_context + self.right_context = right_context + self.chunk_size = chunk_size + self.warmup = warmup + + def forward( + self, + x: torch.Tensor, + x_lens: torch.Tensor, + attn_cache: torch.tensor, + cnn_cache: torch.tensor, + processed_lens: Optional[torch.Tensor] = None, + ) -> Tuple[ + torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor + ]: + """ + Args: + x: + The input tensor. Its shape is (batch_size, seq_len, feature_dim). + x_lens: + A tensor of shape (batch_size,) containing the number of frames in + `x` before padding. + states: + The decode states for previous frames which contains the cached data. + It has two elements, the first element is the attn_cache which has + a shape of (encoder_layers, left_context, batch, attention_dim), + the second element is the conv_cache which has a shape of + (encoder_layers, cnn_module_kernel-1, batch, conv_dim). + Note: states will be modified in this function. + processed_lens: + How many frames (after subsampling) have been processed for each sequence. + + Returns: + Return a tuple containing 2 tensors: + - logits, its shape is (batch_size, output_seq_len, output_dim) + - logit_lens, a tensor of shape (batch_size,) containing the number + of frames in `logits` before padding. + - decode_states, the updated states including the information + of current chunk. + """ + + # x: [N, T, C] + # Caution: We assume the subsampling factor is 4! + + # lengths = ((x_lens - 1) // 2 - 1) // 2 # issue an warning + # + # Note: rounding_mode in torch.div() is available only in torch >= 1.8.0 + lengths = (((x_lens - 1) >> 1) - 1) >> 1 + attn_cache = attn_cache.transpose(0, 2) + cnn_cache = cnn_cache.transpose(0, 2) + states = [attn_cache, cnn_cache] + assert states is not None + assert processed_lens is not None + assert ( + len(states) == 2 + and states[0].shape + == (self.encoder_layers, self.left_context, x.size(0), self.d_model) + and states[1].shape + == ( + self.encoder_layers, + self.cnn_module_kernel - 1, + x.size(0), + self.d_model, + ) + ), f"""The length of states MUST be equal to 2, and the shape of + first element should be {(self.encoder_layers, self.left_context, x.size(0), self.d_model)}, + given {states[0].shape}. the shape of second element should be + {(self.encoder_layers, self.cnn_module_kernel - 1, x.size(0), self.d_model)}, + given {states[1].shape}.""" + + lengths -= ( + 2 # we will cut off 1 frame on each side of encoder_embed output + ) + + embed = self.encoder_embed(x) + + # cut off 1 frame on each size of embed as they see the padding + # value which causes a training and decoding mismatch. 
+ embed = embed[:, 1:-1, :] + + embed, pos_enc = self.encoder_pos(embed, self.left_context) + embed = embed.permute(1, 0, 2) # (B, T, F) -> (T, B, F) + + src_key_padding_mask = make_pad_mask(lengths, embed.size(0)) + + processed_mask = torch.arange( + self.left_context, device=x.device + ).expand(x.size(0), self.left_context) + + processed_mask = (processed_lens <= processed_mask).flip(1) + + src_key_padding_mask = torch.cat( + [processed_mask, src_key_padding_mask], dim=1 + ) + + x, states = self.encoder.chunk_forward( + embed, + pos_enc, + src_key_padding_mask=src_key_padding_mask, + warmup=self.warmup, + states=states, + left_context=self.left_context, + right_context=self.right_context, + ) # (T, B, F) + if self.right_context > 0: + x = x[: -self.right_context, ...] + lengths -= self.right_context + + x = x.permute(1, 0, 2) # (T, N, C) ->(N, T, C) + processed_lens = processed_lens + lengths.unsqueeze(-1) + assert processed_lens.shape[1] == 1, processed_lens.shape + + return ( + x, + lengths, + states[0].transpose(0, 2), + states[1].transpose(0, 2), + processed_lens, + ) + +class OfflineEncoder(torch.nn.Module): + """ + Args: + model: Conformer Encoder + """ + + def __init__( + self, + model + ) -> None: + super().__init__() + + self.num_features = model.num_features + self.subsampling_factor = model.subsampling_factor + if self.subsampling_factor != 4: + raise NotImplementedError("Support only 'subsampling_factor=4'.") + + # self.encoder_embed converts the input of shape (N, T, num_features) + # to the shape (N, T//subsampling_factor, d_model). + # That is, it does two things simultaneously: + # (1) subsampling: T -> T//subsampling_factor + # (2) embedding: num_features -> d_model + self.encoder_embed = model.encoder_embed + + self.encoder_layers = model.encoder_layers + self.d_model = model.d_model + self.cnn_module_kernel = model.cnn_module_kernel + self.causal = model.causal + self.dynamic_chunk_training = model.dynamic_chunk_training + self.short_chunk_threshold = model.short_chunk_threshold + self.short_chunk_size = model.short_chunk_size + self.num_left_chunks = model.num_left_chunks + + self.encoder_pos = model.encoder_pos + self.encoder = model.encoder + + + def forward( + self, x: torch.Tensor, x_lens: torch.Tensor + ) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Args: + x: + The input tensor. Its shape is (batch_size, seq_len, feature_dim). + x_lens: + A tensor of shape (batch_size,) containing the number of frames in + `x` before padding. + Returns: + Return a tuple containing 2 tensors: + - embeddings: its shape is (batch_size, output_seq_len, d_model) + - lengths, a tensor of shape (batch_size,) containing the number + of frames in `embeddings` before padding. + """ + + # Note warmup is fixed to 1.0. + warmup = 1.0 + x = self.encoder_embed(x) + x, pos_emb = self.encoder_pos(x) + x = x.permute(1, 0, 2) # (N, T, C) -> (T, N, C) + + # Caution: We assume the subsampling factor is 4! + + # lengths = ((x_lens - 1) // 2 - 1) // 2 # issue an warning + # + # Note: rounding_mode in torch.div() is available only in torch >= 1.8.0 + lengths = (((x_lens - 1) >> 1) - 1) >> 1 + + if not is_jit_tracing(): + assert x.size(0) == lengths.max().item() + + src_key_padding_mask = make_pad_mask(lengths, x.size(0)) + + if self.dynamic_chunk_training: + assert ( + self.causal + ), "Causal convolution is required for streaming conformer." 
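The bit-shift form of the subsampled-length rule used in both encoder wrappers is just `((x_lens - 1) // 2 - 1) // 2` written without `torch.div`'s `rounding_mode`; a tiny check of the equivalence for valid input lengths:

```python
import torch

x_lens = torch.tensor([7, 51, 100, 1000])
shifted = (((x_lens - 1) >> 1) - 1) >> 1
divided = ((x_lens - 1) // 2 - 1) // 2
assert torch.equal(shifted, divided)
```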
+ max_len = x.size(0) + chunk_size = torch.randint(1, max_len, (1,)).item() + if chunk_size > (max_len * self.short_chunk_threshold): + chunk_size = max_len + else: + chunk_size = chunk_size % self.short_chunk_size + 1 + + mask = ~subsequent_chunk_mask( + size=x.size(0), + chunk_size=chunk_size, + num_left_chunks=self.num_left_chunks, + device=x.device, + ) + x = self.encoder( + x, + pos_emb, + mask=mask, + src_key_padding_mask=src_key_padding_mask, + warmup=warmup, + ) # (T, N, C) + else: + x = self.encoder( + x, + pos_emb, + mask=None, + src_key_padding_mask=src_key_padding_mask, + warmup=warmup, + ) # (T, N, C) + + x = x.permute(1, 0, 2) # (T, N, C) ->(N, T, C) + return x, lengths + +class Decoder(nn.Module): + """This class modifies the stateless decoder from the following paper: + + RNN-transducer with stateless prediction network + https://ieeexplore.ieee.org/stamp/stamp.jsp?arnumber=9054419 + + It removes the recurrent connection from the decoder, i.e., the prediction + network. Different from the above paper, it adds an extra Conv1d + right after the embedding layer. + + TODO: Implement https://arxiv.org/pdf/2109.07513.pdf + """ + + def __init__( + self, + vocab_size: int, + decoder_dim: int, + blank_id: int, + context_size: int, + ): + """ + Args: + vocab_size: + Number of tokens of the modeling unit including blank. + decoder_dim: + Dimension of the input embedding, and of the decoder output. + blank_id: + The ID of the blank symbol. + context_size: + Number of previous words to use to predict the next word. + 1 means bigram; 2 means trigram. n means (n+1)-gram. + """ + super().__init__() + + self.embedding = ScaledEmbedding( + num_embeddings=vocab_size, + embedding_dim=decoder_dim, + padding_idx=blank_id, + ) + self.blank_id = blank_id + + assert context_size >= 1, context_size + self.context_size = context_size + self.vocab_size = vocab_size + if context_size > 1: + self.conv = ScaledConv1d( + in_channels=decoder_dim, + out_channels=decoder_dim, + kernel_size=context_size, + padding=0, + groups=decoder_dim, + bias=False, + ) + + def forward(self, y: torch.Tensor) -> torch.Tensor: + """ + Args: + y: + A 2-D tensor of shape (N, U). + need_pad: + True to left pad the input. Should be True during training. + False to not pad the input. Should be False during inference. + Returns: + Return a tensor of shape (N, U, decoder_dim). + """ + y = y.to(torch.int64) + embedding_out = self.embedding(y) + if self.context_size > 1: + embedding_out = embedding_out.permute(0, 2, 1) + + # During inference time, there is no need to do extra padding + # as we only need one output + assert embedding_out.size(-1) == self.context_size + embedding_out = self.conv(embedding_out) + embedding_out = embedding_out.permute(0, 2, 1) + embedding_out = F.relu(embedding_out) + return embedding_out + +class Joiner(nn.Module): + def __init__( + self, + encoder_dim: int, + decoder_dim: int, + joiner_dim: int, + vocab_size: int, + ): + super().__init__() + + self.encoder_proj = ScaledLinear(encoder_dim, joiner_dim) + self.decoder_proj = ScaledLinear(decoder_dim, joiner_dim) + self.output_linear = ScaledLinear(joiner_dim, vocab_size) + + def forward( + self, + encoder_out: torch.Tensor, + decoder_out: torch.Tensor, + ) -> torch.Tensor: + """ + Args: + encoder_out: + Output from the encoder. Its shape is (N, T, s_range, C). + decoder_out: + Output from the decoder. Its shape is (N, T, s_range, C). + project_input: + If true, apply input projections encoder_proj and decoder_proj. 
+ If this is false, it is the user's responsibility to do this + manually. + Returns: + Return a tensor of shape (N, T, s_range, C). + """ + if not is_jit_tracing(): + assert encoder_out.ndim == decoder_out.ndim + assert encoder_out.ndim in (2, 4) + assert encoder_out.shape == decoder_out.shape + + + logit = self.encoder_proj(encoder_out) + self.decoder_proj( + decoder_out + ) + + logit = self.output_linear(torch.tanh(logit)) + + return logit + +def get_encoder_model(params: AttributeDict) -> nn.Module: + # TODO: We can add an option to switch between Conformer and Transformer + encoder = Conformer( + num_features=params.feature_dim, + subsampling_factor=params.subsampling_factor, + d_model=params.encoder_dim, + nhead=params.nhead, + dim_feedforward=params.dim_feedforward, + num_encoder_layers=params.num_encoder_layers, + dynamic_chunk_training=params.dynamic_chunk_training, + short_chunk_size=params.short_chunk_size, + num_left_chunks=params.num_left_chunks, + causal=params.causal_convolution, + ) + return encoder + + +def get_decoder_model(params: AttributeDict) -> nn.Module: + decoder = Decoder( + vocab_size=params.vocab_size, + decoder_dim=params.decoder_dim, + blank_id=params.blank_id, + context_size=params.context_size, + ) + return decoder + + +def get_joiner_model(params: AttributeDict) -> nn.Module: + joiner = Joiner( + encoder_dim=params.encoder_dim, + decoder_dim=params.decoder_dim, + joiner_dim=params.joiner_dim, + vocab_size=params.vocab_size, + ) + return joiner + +def get_transducer_model( + params: AttributeDict, +) -> nn.Module: + encoder = get_encoder_model(params) + decoder = get_decoder_model(params) + joiner = get_joiner_model(params) + + model = Transducer( + encoder=encoder, + decoder=decoder, + joiner=joiner, + encoder_dim=params.encoder_dim, + decoder_dim=params.decoder_dim, + joiner_dim=params.joiner_dim, + vocab_size=params.vocab_size, + ) + return model \ No newline at end of file diff --git a/triton/scripts/test_features/input_tensor_fp32.dat b/triton/scripts/test_features/input_tensor_fp32.dat new file mode 100644 index 000000000..5b9f133b3 Binary files /dev/null and b/triton/scripts/test_features/input_tensor_fp32.dat differ diff --git a/triton/scripts/test_features/shape.bin b/triton/scripts/test_features/shape.bin new file mode 100644 index 000000000..10c3372f1 Binary files /dev/null and b/triton/scripts/test_features/shape.bin differ diff --git a/triton/speech_llm/Dockerfile.server b/triton/speech_llm/Dockerfile.server new file mode 100755 index 000000000..3b9cea401 --- /dev/null +++ b/triton/speech_llm/Dockerfile.server @@ -0,0 +1,10 @@ +FROM nvcr.io/nvidia/tritonserver:24.11-trtllm-python-py3 +WORKDIR /workspace +RUN pip install kaldialign soundfile tritonclient[grpc]==2.31 +COPY build.sh . +COPY model_repo_whisper_qwen_trtllm model_repo_whisper_qwen_trtllm +COPY fill_template.py . + + + + diff --git a/triton/speech_llm/README.md b/triton/speech_llm/README.md new file mode 100755 index 000000000..098a3ae53 --- /dev/null +++ b/triton/speech_llm/README.md @@ -0,0 +1,61 @@ +## Triton Inference Serving Best Practice for Speech LLM + +### Model Training +See https://github.com/k2-fsa/icefall/tree/master/egs/speech_llm/ASR_LLM. + +### Quick Start +Directly launch the service using docker compose. +```sh +# MODEL_ID supports whisper_qwen_1.5B and whisper_qwen_7B +MODEL_ID=whisper_qwen_1.5B docker compose up +``` + +### Build Image +Build the docker image from scratch. +```sh +# build from scratch, cd to the parent dir of Dockerfile.server +docker build . 
-f Dockerfile.server -t soar97/triton-speech-llm:24.11 +``` + +### Create Docker Container +```sh +your_mount_dir=/mnt:/mnt +docker run -it --name "whisper-server" --gpus all --net host -v $your_mount_dir --shm-size=2g soar97/triton-speech-llm:24.11 +``` + +### Export Models to TensorRT-LLM and Launch Server +Inside docker container, we would follow the official guide of TensorRT-LLM to build qwen and whisper TensorRT-LLM engines. See [here](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/whisper). + +```sh +bash build.sh +``` + + +### Benchmark using Dataset +```sh +git clone https://github.com/yuekaizhang/Triton-ASR-Client.git +cd Triton-ASR-Client +num_task=16 +python3 client.py \ + --server-addr localhost \ + --model-name infer_bls \ + --num-tasks $num_task \ + --manifest-dir ./datasets/aishell1_test \ + --compute-cer +``` + +### Benchmark Results +Decoding on a single A10 GPU, audios without padding, using aishell1 test set files + +| Model | Backend | Concurrency | RTFx | RTF | +|-------|-----------|-----------------------|---------|--| +| Whisper Large-v2 Encoder + Qwen 1.5B | python backend speech encoder + trt-llm backend llm | 8 | 156 | 0.0064| \ No newline at end of file diff --git a/triton/speech_llm/build.sh b/triton/speech_llm/build.sh new file mode 100755 index 000000000..eb1810f98 --- /dev/null +++ b/triton/speech_llm/build.sh @@ -0,0 +1,38 @@ +CUDA_VISIBLE_DEVICES=0 +MODEL_ID=$1 +if [[ "$MODEL_ID" == "whisper_qwen_1.5B" ]]; then + huggingface_checkpoint_dir="./whisper_qwen_1.5B" + repo="yuekai/whisper_qwen_multi_hans_zh_triton_checkpoint" + adapter_dir="$huggingface_checkpoint_dir/icefall_asr_multi-hans_whisper_qwen2_1.5B/epoch-2-avg-6.pt" + engine_path="$huggingface_checkpoint_dir/qwen2_1.5B_instruct_fp16_merged" +elif [[ "$MODEL_ID" == "whisper_qwen_7B" ]]; then + huggingface_checkpoint_dir="./whisper_qwen_7B" + repo="yuekai/whisper_qwen_7b_multi_hans_zh_triton_checkpoint" + adapter_dir="$huggingface_checkpoint_dir/icefall_asr_multi-hans_whisper_qwen2_7B/epoch-999.pt" + engine_path="$huggingface_checkpoint_dir/qwen2_7B_instruct_int8_woq_merged" +else + echo "Error: Invalid Model_id provided. Please use 'whisper_qwen_1.5B' or 'whisper_qwen_7B'." 
+ exit 1 +fi + +huggingface-cli download --local-dir $huggingface_checkpoint_dir $repo +cd $huggingface_checkpoint_dir && bash build_qwen.sh && bash build_whisper_encoder.sh && cd - + +model_repo=./model_repo_whisper_qwen_trtllm_exp +rm -rf $model_repo +cp -r ./model_repo_whisper_qwen_trtllm $model_repo || exit 1 + + +encoder_engine_dir=$huggingface_checkpoint_dir/whisper_multi_zh + +max_batch=16 +decoupled_mode=false +max_queue_delay_microseconds=0 +n_mels=80 +n_instances=8 +python3 fill_template.py -i $model_repo/tensorrt_llm/config.pbtxt triton_backend:tensorrtllm,triton_max_batch_size:$max_batch,decoupled_mode:${decoupled_mode},max_beam_width:1,engine_dir:${engine_path},max_tokens_in_paged_kv_cache:2560,kv_cache_free_gpu_mem_fraction:0.5,exclude_input_in_output:True,enable_kv_cache_reuse:False,batching_strategy:inflight_fused_batching,max_queue_delay_microseconds:${max_queue_delay_microseconds} +python3 fill_template.py -i $model_repo/speech_encoder/config.pbtxt triton_max_batch_size:$max_batch,n_mels:$n_mels,adapter_dir:$adapter_dir,encoder_engine_dir:$encoder_engine_dir,max_queue_delay_microseconds:${max_queue_delay_microseconds} +python3 fill_template.py -i $model_repo/infer_bls/config.pbtxt triton_max_batch_size:$max_batch,n_instances:$n_instances,decoupled_mode:${decoupled_mode},max_queue_delay_microseconds:${max_queue_delay_microseconds} + + +tritonserver --model-repository=$model_repo/ diff --git a/triton/speech_llm/docker-compose.yml b/triton/speech_llm/docker-compose.yml new file mode 100755 index 000000000..3b192e58f --- /dev/null +++ b/triton/speech_llm/docker-compose.yml @@ -0,0 +1,20 @@ +services: + asr: + image: soar97/triton-speech-llm:24.11 + shm_size: '1gb' + ports: + - "8000:8000" + - "8001:8001" + - "8002:8002" + environment: + - PYTHONIOENCODING=utf-8 + - MODEL_ID=${MODEL_ID} + deploy: + resources: + reservations: + devices: + - driver: nvidia + device_ids: ['0'] + capabilities: [gpu] + command: > + /bin/bash -c "bash build.sh ${MODEL_ID}" diff --git a/triton/speech_llm/fill_template.py b/triton/speech_llm/fill_template.py new file mode 100755 index 000000000..584a9f420 --- /dev/null +++ b/triton/speech_llm/fill_template.py @@ -0,0 +1,42 @@ +#! /usr/bin/env python3 +from argparse import ArgumentParser +from string import Template + + +def main(file_path, substitutions, in_place, participant_ids): + with open(file_path) as f: + pbtxt = Template(f.read()) + + sub_dict = {"max_queue_size": 0} + sub_dict["participant_ids"] = participant_ids + for sub in substitutions.split(","): + key, value = sub.split(":") + sub_dict[key] = value + + pbtxt = pbtxt.safe_substitute(sub_dict) + + if in_place: + with open(file_path, "w") as f: + f.write(pbtxt) + else: + print(pbtxt) + + +if __name__ == "__main__": + parser = ArgumentParser() + parser.add_argument("file_path", help="path of the .pbtxt to modify") + parser.add_argument( + "substitutions", + help= + "substitutions to perform, in the format variable_name_1:value_1,variable_name_2:value_2..." 
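For reference, the substitution that `fill_template.py` performs on the `config.pbtxt` templates is plain `string.Template.safe_substitute` over `${...}` placeholders. A toy illustration (the pbtxt fragment here is made up; the placeholder names `triton_max_batch_size` and `n_mels` are the ones passed from `build.sh`):

```python
from string import Template

template = Template(
    'max_batch_size: ${triton_max_batch_size}\n'
    'parameters { key: "n_mels" value: { string_value: "${n_mels}" } }\n'
)
print(template.safe_substitute({"triton_max_batch_size": "16", "n_mels": "80"}))
```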
+ ) + parser.add_argument("--in_place", + "-i", + action="store_true", + help="do the operation in-place") + parser.add_argument("--participant_ids", + help="Participant IDs for the model", + default="") + args = parser.parse_args() + + main(**vars(args)) diff --git a/triton/speech_llm/model_repo_whisper_qwen_trtllm/infer_bls/1/model.py b/triton/speech_llm/model_repo_whisper_qwen_trtllm/infer_bls/1/model.py new file mode 100755 index 000000000..94ed62923 --- /dev/null +++ b/triton/speech_llm/model_repo_whisper_qwen_trtllm/infer_bls/1/model.py @@ -0,0 +1,298 @@ +# -*- coding: utf-8 -*- +import triton_python_backend_utils as pb_utils +import numpy as np +import json +import torch +from torch.utils.dlpack import from_dlpack, to_dlpack +import transformers +from transformers import AutoTokenizer +from typing import Dict +from pathlib import Path +import traceback + +DEFAULT_SPEECH_TOKEN = "" +TEMPLATE = "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content']}}{% if loop.last %}{{''}}{% else %}{{ '<|im_end|>\n' }}{% endif %}{% endfor %}" + +class TritonPythonModel: + """Your Python model must use the same class name. Every Python model + that is created must have "TritonPythonModel" as the class name. + """ + + def initialize(self, args): + """`initialize` is called only once when the model is being loaded. + Implementing `initialize` function is optional. This function allows + the model to initialize any state associated with this model. + + Parameters + ---------- + args : dict + Both keys and values are strings. The dictionary keys and values are: + * model_config: A JSON string containing the model configuration + * model_instance_kind: A string containing model instance kind + * model_instance_device_id: A string containing model instance device ID + * model_repository: Model repository path + * model_version: Model version + * model_name: Model name + """ + self.model_config = model_config = json.loads(args['model_config']) + self.device = torch.device("cuda") + tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-1.5B-Instruct") + tokenizer.padding_side = "left" + special_tokens_dict = {"additional_special_tokens": [DEFAULT_SPEECH_TOKEN]} + tokenizer.add_special_tokens(special_tokens_dict) + self.tokenizer = tokenizer + + self.eos = self.tokenizer.eos_token_id + self.default_speech_token_id = tokenizer.convert_tokens_to_ids( + DEFAULT_SPEECH_TOKEN + ) + # https://huggingface.co/Qwen/Qwen2-1.5B-Instruct/blob/main/config.json#L26 + self.vocab_size = 151936 + self.logger = pb_utils.Logger + + # TODO: get the decoupled flag from the model config + self.decoupled = False + + def _tokenize(self, num_speech_tokens, prompt=None): + def preprocess( + messages, + tokenizer: transformers.PreTrainedTokenizer, + max_len: int = 128, + ) -> Dict: + """Preprocesses the data for supervised fine-tuning.""" + texts = [] + for i, msg in enumerate(messages): + texts.append( + tokenizer.apply_chat_template( + msg, + tokenize=True, + add_generation_prompt=False, + chat_template=TEMPLATE, + padding="longest", + max_length=max_len, + truncation=True, + ) + ) + max_len_texts = max([len(text) for text in texts]) + if tokenizer.padding_side == "right": + texts = [ + text + [tokenizer.pad_token_id] * (max_len_texts - len(text)) + for text in texts + ] + else: + texts = [ + [tokenizer.pad_token_id] * (max_len_texts - len(text)) + text + for text in texts + ] + + input_ids = torch.tensor(texts, dtype=torch.int) + + attention_mask = input_ids.ne(tokenizer.pad_token_id) + + return 
input_ids, attention_mask + + if prompt is None: + prompts = [ + [ + {"role": "user", "content": f"{DEFAULT_SPEECH_TOKEN}请转写音频为文字"}, + {"role": "assistant", "content": ""}, + ] + ] + input_ids, _ = preprocess(prompts, self.tokenizer) + input_ids = input_ids.tolist()[0] + speech_token_index = input_ids.index(self.default_speech_token_id) + prompt_ids = input_ids[:speech_token_index] + list(range(self.vocab_size, self.vocab_size + num_speech_tokens)) + input_ids[speech_token_index + 1:] + return prompt_ids + + def _prepare_inputs(self, request, speech_embeddings, input_ids): + """ + Prepares inputs for the language model based on the parameters in the + request, image features, and prompt. It tokenizes prompt, + extracts and processes additional parameters from the request: + - max_tokens: Maximum number of tokens to generate (default: 50) + - temperature: Controls randomness in generation (default: 0.5) + - top_k: Top K sampling parameter (default: 1) + - frequency_penalty: Penalizes frequent tokens (default: 0.7) + - seed: Random seed for generation (default: 10) + + Final llm input dictionary is combined out of all processed parameters, + prompt's tokens and image features. The latter will be passed to llm + through `prompt_embedding_table`. + + Parameters + ---------- + - request: The original request object containing additional parameters. + - image_features (list): A list containing image feature tensors. + - prompt (str): The text prompt to be processed. + + Returns + ------- + - dict: A dictionary containing all the prepared inputs for the language model. + """ + input_ids = np.array(input_ids, dtype=np.int32) + # TODO: max_tokens should be in the model config + max_tokens = 200 + input_len = input_ids.shape[0] + + embedding_args = { + "prompt_vocab_size": np.array( + [[speech_embeddings.shape[1]]], dtype=np.int32 + ), + "prompt_embedding_table": speech_embeddings.detach().cpu().numpy(), + } + + input_dict = { + "input_ids": np.expand_dims(input_ids, 0), + "input_lengths": np.array([[input_len]], dtype=np.int32), + "request_output_len": np.array([[max_tokens]], dtype=np.int32), + "runtime_top_k": np.array([[1]], dtype=np.int32), + "end_id": np.array([[self.tokenizer.eos_token_id]], dtype=np.int32), + "pad_id": np.array([[self.tokenizer.pad_token_id]], dtype=np.int32), + "streaming": np.array([[0]], dtype=np.bool_), + **embedding_args, + } + + input_tensor_list = [pb_utils.Tensor(k, v) for k, v in input_dict.items()] + return input_tensor_list + + def _prepare_llm_response(self, llm_request_inputs): + """ + Prepares the response from the language model based on the provided + inputs. Creates a `pb_utils.InferenceRequest` object with passed + `llm_request_inputs` to send to a decoupled TensorRTLLM model. + For each response from the language model: + - Checks for errors and raise an exception if any are found. + - Extracts the "output_ids" tensor from the response. + - Determines the finish reason based on the presence of the + end-of-sequence token or reaching the maximum length. + - Appends the generated token IDs to `output_ids`. + - If the finish reason is determined, decodes the output IDs to text + and prepares the final response. + + The final response includes the generated text, finish reason, + completion tokens, prompt tokens, and total tokens. + + Parameters + ---------- + - llm_request_inputs (dict): A dictionary containing the inputs for the language model. 
+ + Returns + ------- + - pb_utils.InferenceResponse: The response object containing the generated text and additional metadata. + """ + + llm_request = pb_utils.InferenceRequest( + model_name="tensorrt_llm", + requested_output_names=["output_ids", "sequence_length"], + inputs=llm_request_inputs, + ) + output_ids, output_len = [], 0 + responses = llm_request.exec(decoupled=False) + responses = [responses] + for llm_response in responses: + if llm_response.has_error(): + raise pb_utils.TritonModelException(llm_response.error().message()) + stream_output_ids = ( + pb_utils.get_output_tensor_by_name(llm_response, "output_ids") + .as_numpy() + .flatten() + .tolist() + ) + # TODO: support finish_reason + finish_reason = "test" + if len(stream_output_ids) == 0 or ( + len(stream_output_ids) != 0 + and stream_output_ids[-1] == self.eos + ): + finish_reason = "stop" + + output_ids += stream_output_ids + + last_response = finish_reason != "" + output_len = len(output_ids) + if last_response: + output_text = self.tokenizer.decode(output_ids).strip() + response = pb_utils.InferenceResponse( + output_tensors=[ + pb_utils.Tensor("TRANSCRIPTS", np.array([output_text], np.object_)), + ] + ) + yield response + + def _extract_speech_embeddings(self, wav, wav_len): + wav = torch.from_numpy(wav[0]).to(self.device) + wav_tensor = pb_utils.Tensor.from_dlpack("WAV", to_dlpack(wav.unsqueeze(0))) + wav_len_tensor = pb_utils.Tensor("WAV_LENS", np.array([[wav_len]], np.int32)) + + infer_request = pb_utils.InferenceRequest( + model_name="speech_encoder", + requested_output_names=["speech_features"], + inputs=[wav_tensor, wav_len_tensor], + ) + inference_response = infer_request.exec() + if inference_response.has_error(): + raise pb_utils.TritonModelException(inference_response.error().message()) + else: + speech_features = pb_utils.get_output_tensor_by_name(inference_response, "speech_features") + speech_features = torch.utils.dlpack.from_dlpack(speech_features.to_dlpack()) + + return speech_features + + def execute(self, requests): + responses = [] + for request in requests: + wav = pb_utils.get_input_tensor_by_name(request, "WAV").as_numpy() + assert wav.shape[0] == 1, "Only support batch size 1 for now" + wav_len = pb_utils.get_input_tensor_by_name(request, "WAV_LENS").as_numpy() + wav_len = wav_len.item() + + speech_embeddings = self._extract_speech_embeddings(wav, wav_len) + #TODO: get the prompts from input tensors + input_ids = self._tokenize(num_speech_tokens=speech_embeddings.shape[1]) + + if self.decoupled: + response_sender = request.get_response_sender() + try: + + llm_request_inputs = self._prepare_inputs( + request, speech_embeddings, input_ids + ) + if isinstance(llm_request_inputs, pb_utils.TritonError): + error = pb_utils.InferenceResponse(error=llm_request_inputs) + if self.decoupled: + response_sender.send( + error, flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL + ) + else: + responses.append(error) + llm_responses = self._prepare_llm_response(llm_request_inputs) + + for triton_response in llm_responses: + if self.decoupled: + response_sender.send(triton_response) + else: + responses.append(triton_response) + + if self.decoupled: + response_sender.send( + flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL) + + except Exception: + self.logger.log_error(traceback.format_exc()) + # If encountering an error, send a response with err msg + error_response = pb_utils.InferenceResponse( + output_tensors=[], + error=pb_utils.TritonError(traceback.format_exc())) + + if self.decoupled: + 
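+                    # Decoupled mode streams the error back through the response
+                    # sender (the FINAL flag follows on the next send); otherwise
+                    # the error is returned in the batched response list that
+                    # execute() hands back to Triton.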
response_sender.send(error_response) + response_sender.send( + flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL) + else: + responses.append(error_response) + + if self.decoupled: + return None + else: + assert len(responses) == len(requests) + return responses diff --git a/triton/speech_llm/model_repo_whisper_qwen_trtllm/infer_bls/config.pbtxt b/triton/speech_llm/model_repo_whisper_qwen_trtllm/infer_bls/config.pbtxt new file mode 100755 index 000000000..08ca0529b --- /dev/null +++ b/triton/speech_llm/model_repo_whisper_qwen_trtllm/infer_bls/config.pbtxt @@ -0,0 +1,60 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +name: "infer_bls" +backend: "python" +max_batch_size: ${triton_max_batch_size} + +model_transaction_policy { + decoupled: ${decoupled_mode} +} + +dynamic_batching { + preferred_batch_size: [ ${triton_max_batch_size} ] + max_queue_delay_microseconds: ${max_queue_delay_microseconds} + } + +input [ + { + name: "TEXT_PREFIX" + data_type: TYPE_STRING + dims: [1] + }, + { + name: "WAV" + data_type: TYPE_FP32 + dims: [-1] + }, + { + name: "WAV_LENS" + data_type: TYPE_INT32 + dims: [1] + } +] + +output [ + { + name: "TRANSCRIPTS" + data_type: TYPE_STRING + dims: [1] + } +] + + +instance_group [ + { + count: ${n_instances} + kind: KIND_CPU + } + ] diff --git a/triton/speech_llm/model_repo_whisper_qwen_trtllm/speech_encoder/1/fbank.py b/triton/speech_llm/model_repo_whisper_qwen_trtllm/speech_encoder/1/fbank.py new file mode 100755 index 000000000..f60927fd9 --- /dev/null +++ b/triton/speech_llm/model_repo_whisper_qwen_trtllm/speech_encoder/1/fbank.py @@ -0,0 +1,98 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# Reference: https://github.com/openai/whisper/blob/main/whisper/audio.py +import numpy as np +import torch +import torch.nn.functional as F +from typing import Union +import os + +def mel_filters(device, n_mels: int =128) -> torch.Tensor: + """ + load the mel filterbank matrix for projecting STFT into a Mel spectrogram. 
+ Allows decoupling librosa dependency; saved using: + + np.savez_compressed( + "mel_filters.npz", + mel_128=librosa.filters.mel(sr=16000, n_fft=400, n_mels=128), + ) + """ + assert n_mels == 80 or n_mels == 128 , f"Unsupported n_mels: {n_mels}" + with np.load( + os.path.join(os.path.dirname(__file__), "mel_filters.npz") + ) as f: + return torch.from_numpy(f[f"mel_{n_mels}"]).to(device) + + +def log_mel_spectrogram( + audio: Union[torch.Tensor], + filters: torch.Tensor, + n_mels: int = 128, + n_fft: int = 400, + hop_length: int = 160, +): + """ + Compute the log-Mel spectrogram of + + Parameters + ---------- + audio: Union[str, np.ndarray, torch.Tensor], shape = (*) + The path to audio or either a NumPy array or Tensor containing the audio waveform in 16 kHz + + n_mels: int + The number of Mel-frequency filters, only 80 or 128 is supported + + filters: torch.Tensor + + Returns + ------- + torch.Tensor, shape = (128, n_frames) + A Tensor that contains the Mel spectrogram + """ + window = torch.hann_window(n_fft).to(audio.device) + stft = torch.stft(audio, n_fft, hop_length, window=window, return_complex=True) + magnitudes = stft[..., :-1].abs() ** 2 + + mel_spec = filters @ magnitudes + log_spec = torch.clamp(mel_spec, min=1e-10).log10() + log_spec = torch.maximum(log_spec, log_spec.max() - 8.0) + log_spec = (log_spec + 4.0) / 4.0 + # cast to float 16 + log_spec = log_spec.half() + return log_spec + +class FeatureExtractor(torch.nn.Module): + """Your Python model must use the same class name. Every Python model + that is created must have "TritonPythonModel" as the class name. + """ + + def __init__(self, n_mels: int = 128): + self.device = torch.device("cuda") + self.n_mels = n_mels + self.filters = mel_filters(self.device, n_mels=self.n_mels) + + def compute_feature(self, wav, padding_target_len: int = 3000): + """ + Compute the log-Mel spectrogram of the input audio waveform. + mel: [1, feature_dim, seq_len] + """ + mel = log_mel_spectrogram(wav, self.filters) + assert padding_target_len <= 3000, f"padding must be less than 3000, got {padding}" + if mel.shape[1] < padding_target_len: + mel = F.pad(mel, (0, padding_target_len - mel.shape[1]), mode='constant') + if mel.shape[1] % 2: + # pad to even length for remove_padding case, since conv1d requires even length + mel = torch.nn.functional.pad(mel, (0, 1)) + mel = mel.unsqueeze(0) + return mel \ No newline at end of file diff --git a/triton/speech_llm/model_repo_whisper_qwen_trtllm/speech_encoder/1/mel_filters.npz b/triton/speech_llm/model_repo_whisper_qwen_trtllm/speech_encoder/1/mel_filters.npz new file mode 100755 index 000000000..28ea26909 Binary files /dev/null and b/triton/speech_llm/model_repo_whisper_qwen_trtllm/speech_encoder/1/mel_filters.npz differ diff --git a/triton/speech_llm/model_repo_whisper_qwen_trtllm/speech_encoder/1/model.py b/triton/speech_llm/model_repo_whisper_qwen_trtllm/speech_encoder/1/model.py new file mode 100755 index 000000000..0bab93556 --- /dev/null +++ b/triton/speech_llm/model_repo_whisper_qwen_trtllm/speech_encoder/1/model.py @@ -0,0 +1,270 @@ + +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. 
+# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +import json +from collections import OrderedDict +from pathlib import Path + +from typing import List, Dict, Any, Tuple +from .fbank import FeatureExtractor +import torch +import torch.nn as nn +from torch.utils.dlpack import from_dlpack, to_dlpack + +import tensorrt_llm +import tensorrt_llm.logger as logger +from tensorrt_llm._utils import (str_dtype_to_torch, str_dtype_to_trt, + trt_dtype_to_torch) +from tensorrt_llm.runtime import ModelConfig, SamplingConfig +from tensorrt_llm.runtime.session import Session, TensorInfo +import triton_python_backend_utils as pb_utils +import math + +def remove_tensor_padding(input_tensor, + input_tensor_lengths=None, + pad_value=None): + if pad_value: + assert input_tensor_lengths is None, "input_tensor_lengths should be None when pad_value is provided" + # Text tensor case: batch, seq_len + assert torch.all( + input_tensor[:, 0] != pad_value + ), "First token in each sequence should not be pad_value" + assert input_tensor_lengths is None + + # Create a mask for all non-pad tokens + mask = input_tensor != pad_value + + # Apply the mask to input_tensor to remove pad tokens + output_tensor = input_tensor[mask].view(1, -1) + + else: + # Audio tensor case: batch, seq_len, feature_len + # position_ids case: batch, seq_len + assert input_tensor_lengths is not None, "input_tensor_lengths must be provided for 3D input_tensor" + + # Initialize a list to collect valid sequences + valid_sequences = [] + + for i in range(input_tensor.shape[0]): + valid_length = input_tensor_lengths[i] + valid_sequences.append(input_tensor[i, :valid_length]) + + # Concatenate all valid sequences along the batch dimension + output_tensor = torch.cat(valid_sequences, dim=0) + return output_tensor + +def read_config(component, engine_dir): + config_path = engine_dir / component / 'config.json' + with open(config_path, 'r') as f: + config = json.load(f) + model_config = OrderedDict() + model_config.update(config['pretrained_config']) + model_config.update(config['build_config']) + return model_config + +class WhisperEncoding: + + def __init__(self, engine_dir): + self.session = self.get_session(engine_dir) + config = read_config('encoder', engine_dir) + self.n_mels = config['n_mels'] + self.dtype = config['dtype'] + self.num_languages = config['num_languages'] + self.encoder_config = config + + def get_session(self, 
engine_dir): + serialize_path = engine_dir / 'encoder' / 'rank0.engine' + with open(serialize_path, 'rb') as f: + session = Session.from_serialized_engine(f.read()) + return session + + def get_audio_features(self, + mel, + mel_input_lengths, + encoder_downsampling_factor=2): + if isinstance(mel, list): + longest_mel = max([f.shape[-1] for f in mel]) + mel = [ + torch.nn.functional.pad(f, (0, longest_mel - f.shape[-1]), mode='constant') + for f in mel + ] + mel = torch.cat(mel, dim=0).type(str_dtype_to_torch("float16")).contiguous() + bsz, seq_len = mel.shape[0], mel.shape[2] + position_ids = torch.arange( + math.ceil(seq_len / encoder_downsampling_factor), + dtype=torch.int32, + device=mel.device).expand(bsz, -1).contiguous() + if self.encoder_config['plugin_config']['remove_input_padding']: + # mel B,D,T -> B,T,D -> BxT, D + mel = mel.transpose(1, 2) + mel = remove_tensor_padding(mel, mel_input_lengths) + position_ids = remove_tensor_padding(position_ids, + mel_input_lengths // encoder_downsampling_factor) + inputs = OrderedDict() + inputs['input_features'] = mel + inputs['input_lengths'] = mel_input_lengths + inputs['position_ids'] = position_ids + + output_list = [ + TensorInfo('input_features', str_dtype_to_trt(self.dtype), + mel.shape), + TensorInfo('input_lengths', str_dtype_to_trt('int32'), + mel_input_lengths.shape), + TensorInfo('position_ids', str_dtype_to_trt('int32'), + inputs['position_ids'].shape) + ] + + output_info = (self.session).infer_shapes(output_list) + + logger.debug(f'output info {output_info}') + outputs = { + t.name: torch.empty(tuple(t.shape), + dtype=trt_dtype_to_torch(t.dtype), + device='cuda') + for t in output_info + } + stream = torch.cuda.current_stream() + ok = self.session.run(inputs=inputs, + outputs=outputs, + stream=stream.cuda_stream) + assert ok, 'Engine execution failed' + stream.synchronize() + encoder_output = outputs['encoder_output'] + encoder_output_lengths = mel_input_lengths // encoder_downsampling_factor + return encoder_output, encoder_output_lengths + +class EncoderProjector(torch.nn.Module): + """ + The encoder projector module. It is used to project the encoder outputs to the same dimension as the language model. + Modified from https://github.com/X-LANCE/SLAM-LLM/blob/main/src/slam_llm/models/projector.py. + Args: + encoder_dim (:obj:`int`): The dimension of the encoder outputs. + llm_dim (:obj:`int`): The dimension of the language model. + downsample_rate (:obj:`int`, `optional`, defaults to 5): The downsample rate to use. 
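+
+    Example (shapes only, using the defaults defined below):
+        encoder output x:                 (batch, seq_len, 1280)
+        after frame stacking (rate 8):    (batch, seq_len // 8, 1280 * 8)
+        after linear1 -> relu -> linear2: (batch, seq_len // 8, 1536)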
+ """ + + def __init__(self, encoder_dim=1280, llm_dim=1536, downsample_rate=8): + super().__init__() + self.downsample_rate = downsample_rate + self.linear1 = nn.Linear(encoder_dim * self.downsample_rate, llm_dim) + self.relu = nn.ReLU() + self.linear2 = nn.Linear(llm_dim, llm_dim) + + def forward(self, x): + + batch_size, seq_len, feat_dim = x.size() + num_frames_to_discard = seq_len % self.downsample_rate + if num_frames_to_discard > 0: + x = x[:, :-num_frames_to_discard, :] + seq_len = x.size(1) + + x = x.contiguous() + x = x.view( + batch_size, seq_len // self.downsample_rate, feat_dim * self.downsample_rate + ) + + x = self.linear1(x) + x = self.relu(x) + x = self.linear2(x) + return x + +class WhisperTRTLLM(nn.Module): + + def __init__(self, engine_dir, llm_dim=1536): + super().__init__() + world_size = 1 + runtime_rank = tensorrt_llm.mpi_rank() + runtime_mapping = tensorrt_llm.Mapping(world_size, runtime_rank) + torch.cuda.set_device(runtime_rank % runtime_mapping.gpus_per_node) + engine_dir = Path(engine_dir) + + self.encoder = WhisperEncoding(engine_dir) + self.encoder_projector = EncoderProjector(llm_dim=llm_dim) + self.encoder_projector = self.encoder_projector.half().to("cuda") + + def process_batch(self, mel): + mel_input_lengths = torch.tensor([f.shape[-1] for f in mel], + dtype=torch.int32, + device='cuda') + encoder_outputs, encoder_output_lengths = self.encoder.get_audio_features(mel, mel_input_lengths) + if len(encoder_outputs.shape) == 3: + speech_features = self.encoder_projector(encoder_outputs) + speech_features = speech_features.to(torch.float16) + else: + assert len(encoder_outputs.shape) == 2 + speech_features = [] + start = 0 + for length in encoder_output_lengths: + encoder_output = encoder_outputs[start:start + length].unsqueeze(0) + start += length + speech_feature = self.encoder_projector(encoder_output).to(torch.float16).squeeze(0) + speech_features.append(speech_feature) + assert start == encoder_outputs.shape[0] + return speech_features + +class TritonPythonModel: + def initialize(self, args): + device = "cuda" + device_id = args["model_instance_device_id"] + self.device = f"{device}:{device_id}" + self.feature_extractor = FeatureExtractor(n_mels=80) + self.init_model(json.loads(args['model_config'])['parameters']) + + def init_model(self, parameters): + for key,value in parameters.items(): + parameters[key] = value["string_value"] + engine_dir = parameters["engine_dir"] + adapter_dir=parameters["adapter_dir"] + checkpoint = torch.load( + adapter_dir, map_location="cpu" + ) + self.llm_dim = checkpoint["encoder_projector.linear1.weight"].shape[0] + self.model = WhisperTRTLLM(engine_dir, llm_dim=self.llm_dim) + missing_keys, _ = self.model.load_state_dict(checkpoint, strict=False) + assert len(missing_keys) == 0, f"Missing keys: {missing_keys}" + n_mels = int(parameters["n_mels"]) + self.feature_extractor = FeatureExtractor(n_mels=n_mels) + + def execute(self, requests): + """ + This function receives a list of requests (`pb_utils.InferenceRequest`), + performs inference on every request and appends it to responses. 
+ """ + responses, batch_mel_list = [], [] + for request in requests: + wav_tensor = pb_utils.get_input_tensor_by_name(request, "WAV") + wav_len = pb_utils.get_input_tensor_by_name(request, "WAV_LENS").as_numpy().item() + wav = from_dlpack(wav_tensor.to_dlpack()) + wav = wav[:, :wav_len] + padding = 3000 if self.llm_dim == 3584 else 0 # WAR: whisper_llm_7b model needs padding + mel = self.feature_extractor.compute_feature(wav[0].to('cuda'), padding_target_len=padding) + batch_mel_list.append(mel) + + speech_features_list = self.model.process_batch(batch_mel_list) + for i in range(len(requests)): + out_0 = pb_utils.Tensor.from_dlpack("speech_features", to_dlpack(speech_features_list[i].unsqueeze(0))) + responses.append(pb_utils.InferenceResponse([out_0])) + return responses \ No newline at end of file diff --git a/triton/speech_llm/model_repo_whisper_qwen_trtllm/speech_encoder/config.pbtxt b/triton/speech_llm/model_repo_whisper_qwen_trtllm/speech_encoder/config.pbtxt new file mode 100755 index 000000000..a7ef1e208 --- /dev/null +++ b/triton/speech_llm/model_repo_whisper_qwen_trtllm/speech_encoder/config.pbtxt @@ -0,0 +1,72 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +name: "speech_encoder" +backend: "python" +max_batch_size: ${triton_max_batch_size} + +dynamic_batching { + preferred_batch_size: [ ${triton_max_batch_size} ] + max_queue_delay_microseconds: ${max_queue_delay_microseconds} +} + +parameters [ + { + key: "adapter_dir", + value: { string_value: "${adapter_dir}"} + }, + { + key: "engine_dir" + value: { string_value: "${encoder_engine_dir}"} + }, + { + key: "n_mels", + value: {string_value:"${n_mels}"} # 128 dim for large-v3, 80 dim for large-v2 + } +] + +input [ + { + name: "mel" + data_type: TYPE_FP16 + dims: [ -1, -1 ] + optional: True + }, + { + name: "WAV" + data_type: TYPE_FP32 + dims: [-1] + optional: True + }, + { + name: "WAV_LENS" + data_type: TYPE_INT32 + dims: [1] + optional: True + } +] +output [ + { + name: "speech_features" + data_type: TYPE_FP16 + dims: [ -1, -1] + } +] + +instance_group [ + { + count: 1 + kind: KIND_GPU + } +] \ No newline at end of file diff --git a/triton/speech_llm/model_repo_whisper_qwen_trtllm/tensorrt_llm/1/.gitkeep b/triton/speech_llm/model_repo_whisper_qwen_trtllm/tensorrt_llm/1/.gitkeep new file mode 100755 index 000000000..e69de29bb diff --git a/triton/speech_llm/model_repo_whisper_qwen_trtllm/tensorrt_llm/1/model.py b/triton/speech_llm/model_repo_whisper_qwen_trtllm/tensorrt_llm/1/model.py new file mode 100755 index 000000000..51c5bc789 --- /dev/null +++ b/triton/speech_llm/model_repo_whisper_qwen_trtllm/tensorrt_llm/1/model.py @@ -0,0 +1,947 @@ +import datetime +import json +import os +import sys +import time +from random import randint +from threading import Lock, Thread + +import numpy as np +import torch +import triton_python_backend_utils as pb_utils +from torch import from_numpy +from torch.utils.dlpack import from_dlpack + +import tensorrt_llm.bindings.executor as 
trtllm + + +def get_input_tensor_by_name(request, + name, + expected_batch_size=None, + batch_index=None): + tensor = pb_utils.get_input_tensor_by_name(request, name) + if tensor is None: + return None + + if tensor.is_cpu(): + tensor = tensor.as_numpy() + else: + tensor = from_dlpack(tensor.to_dlpack()) + + if expected_batch_size is not None and tensor.shape[ + 0] != expected_batch_size: + raise pb_utils.TritonModelException( + f"Expected batch size doesn't match batch size for tensor {name}. Expected {expected_batch_size} got {tensor.shape[0]}" + ) + + if batch_index is not None and expected_batch_size is not None and batch_index >= expected_batch_size: + raise pb_utils.TritonModelException( + f"Invalid batch index in get_input_tensor_by_name for {name}") + + if batch_index is not None: + # Add leading 1 batch dimension + if isinstance(tensor, np.ndarray): + return np.expand_dims(tensor[batch_index], axis=0) + elif isinstance(tensor, torch.Tensor): + return torch.unsqueeze(tensor[batch_index], dim=0) + else: + return tensor + + +def get_input_scalar_by_name(request, + name, + expected_batch_size=1, + batch_index=0): + tensor = pb_utils.get_input_tensor_by_name(request, name) + if tensor is None: + return None + tensor = tensor.as_numpy() + + if tensor.size != expected_batch_size: + raise pb_utils.TritonModelException( + f"Expected a scalar tensor for tensor {name}") + + return tensor.item(batch_index) + + +def read_parameter_as_type(value, name, pytype=str): + if value == "": + return None + if value.startswith("${") and value.endswith("}"): + return None + if pytype is bool: + return value.lower() in ["1", "true"] + try: + result = pytype(value) + return result + except: + pb_utils.Logger.log_warning( + f"Could not read parameter '{name}' with value '{value}', will use default." 
+ ) + return None + + +def get_parameter(model_config, name, pytype=str): + if name not in model_config['parameters']: + return None + return read_parameter_as_type( + model_config['parameters'][name]['string_value'], name, pytype) + + +def convert_word_list(word_list): + if word_list is None: + return None + word_list = word_list.tolist() + if len(word_list) == 0 or len(word_list[0]) != 2: + raise pb_utils.TritonModelException(f"Invalid format for word list.") + words, indices = word_list[0] + result = [] + current_index = 0 + for i in indices: + if i == -1: + continue + if i > len(words): + raise pb_utils.TritonModelException( + f"Invalid format for word list.") + current_word = [] + while current_index < i: + current_word.append(words[current_index]) + current_index += 1 + result.append(current_word) + return result + + +def parse_medusa_choices(medusa_choices): + if medusa_choices is None: + return None + try: + result = json.loads( + "[" + medusa_choices.replace("{", "[").replace("}", "]") + "]") + assert isinstance(result, list) and len(result) > 0 + assert all([isinstance(x, list) for x in result]) + assert all([isinstance(y, int) for x in result for y in x]) + except Exception: + raise pb_utils.TritonModelException( + "Invalid format for medusa_choices") + return result + + +def get_sampling_config_from_request(request, batch_size=1, batch_index=0): + kwargs = {} + kwargs['beam_width'] = get_input_scalar_by_name( + request, 'beam_width', batch_size, batch_index) or 1 + kwargs['top_k'] = get_input_scalar_by_name(request, 'runtime_top_k', + batch_size, batch_index) + kwargs['top_p'] = get_input_scalar_by_name(request, 'runtime_top_p', + batch_size, batch_index) + kwargs['top_p'] = None if kwargs['top_p'] is None or kwargs[ + 'top_p'] <= 0 else kwargs['top_p'] + kwargs['random_seed'] = get_input_scalar_by_name(request, 'random_seed', + batch_size, batch_index) + kwargs['temperature'] = get_input_scalar_by_name(request, 'temperature', + batch_size, batch_index) + kwargs['min_length'] = get_input_scalar_by_name(request, 'min_length', + batch_size, batch_index) + kwargs['repetition_penalty'] = get_input_scalar_by_name( + request, 'repetition_penalty', batch_size, batch_index) + kwargs['presence_penalty'] = get_input_scalar_by_name( + request, 'presence_penalty', batch_size, batch_index) + kwargs['frequency_penalty'] = get_input_scalar_by_name( + request, 'frequency_penalty', batch_size, batch_index) + kwargs['length_penalty'] = get_input_scalar_by_name( + request, 'len_penalty', batch_size, batch_index) + kwargs['top_p_min'] = get_input_scalar_by_name(request, + 'runtime_top_p_min', + batch_size, batch_index) + kwargs['top_p_reset_ids'] = get_input_scalar_by_name( + request, 'runtime_top_p_reset_ids', batch_size, batch_index) + kwargs['top_p_decay'] = get_input_scalar_by_name(request, + 'runtime_top_p_decay', + batch_size, batch_index) + kwargs['beam_search_diversity_rate'] = get_input_scalar_by_name( + request, 'beam_search_diversity_rate', batch_size, batch_index) + kwargs['early_stopping'] = get_input_scalar_by_name( + request, 'early_stopping', batch_size, batch_index) + kwargs = {k: v for k, v in kwargs.items() if v is not None} + return trtllm.SamplingConfig(**kwargs) + + +def get_output_config_from_request(request, + exclude_input_from_output, + batch_size=1, + batch_index=0): + kwargs = {} + kwargs["return_log_probs"] = get_input_scalar_by_name( + request, 'return_log_probs', batch_size, batch_index) + kwargs["return_context_logits"] = get_input_scalar_by_name( + request, 
'return_context_logits', batch_size, batch_index) + kwargs["return_generation_logits"] = get_input_scalar_by_name( + request, 'return_generation_logits', batch_size, batch_index) + kwargs["exclude_input_from_output"] = exclude_input_from_output + kwargs = {k: v for k, v in kwargs.items() if v is not None} + return trtllm.OutputConfig(**kwargs) + + +def get_external_draft_tokens_config_from_request(request, + batch_size=1, + batch_index=0): + kwargs = {} + draft_input_ids = get_input_tensor_by_name(request, 'draft_input_ids', + batch_size, batch_index) + if draft_input_ids is not None: + kwargs['tokens'] = draft_input_ids[0].tolist() + draft_logits = get_input_tensor_by_name(request, 'draft_logits', + batch_size, batch_index) + if draft_logits is not None: + kwargs['logits'] = from_numpy(draft_logits).squeeze() + kwargs['acceptance_threshold'] = get_input_scalar_by_name( + request, 'draft_acceptance_threshold', batch_size, batch_index) + kwargs = {k: v for k, v in kwargs.items() if v is not None} + if len(kwargs) > 0: + return trtllm.ExternalDraftTokensConfig(**kwargs) + return None + + +def get_prompt_tuning_config_from_request(request, + batch_size=1, + batch_index=0): + # prompt_vocab_size is unused by executor. + kwargs = {} + prompt_embedding_table = get_input_tensor_by_name( + request, 'prompt_embedding_table', batch_size, batch_index) + if prompt_embedding_table is not None: + if isinstance(prompt_embedding_table, np.ndarray): + kwargs["embedding_table"] = from_numpy( + prompt_embedding_table).squeeze() + elif isinstance(prompt_embedding_table, torch.Tensor): + kwargs["embedding_table"] = from_dlpack( + prompt_embedding_table.to_dlpack()).squeeze(dim=0) + kwargs = {k: v for k, v in kwargs.items() if v is not None} + if len(kwargs) > 0: + return trtllm.PromptTuningConfig(**kwargs) + return None + + +def get_lora_config_from_request(request, batch_size=1, batch_index=0): + kwargs = {} + kwargs["task_id"] = get_input_scalar_by_name(request, 'lora_task_id', + batch_size, batch_index) + lora_weights = get_input_tensor_by_name(request, 'lora_weights', + batch_size, batch_index) + if lora_weights is not None: + kwargs["weights"] = from_numpy(lora_weights).squeeze() + lora_config = get_input_tensor_by_name(request, 'lora_config', batch_size, + batch_index) + if lora_config is not None: + kwargs["config"] = from_numpy(lora_config).squeeze() + kwargs = {k: v for k, v in kwargs.items() if v is not None} + if len(kwargs) > 0: + return trtllm.LoraConfig(**kwargs) + return None + + +def convert_request(request, exclude_input_from_output, decoupled): + inputs = {} + input_token_ids = get_input_tensor_by_name(request, 'input_ids') + if input_token_ids is None: + raise pb_utils.TritonModelException( + "A value is required for input_ids") + if len(input_token_ids.shape) != 2: + raise pb_utils.TritonModelException(f"Invalid format for input_ids") + batch_size = input_token_ids.shape[0] + requests = [] + for batch_index in range(0, batch_size): + input_token_ids = get_input_tensor_by_name(request, 'input_ids', + batch_size, batch_index)[0] + if input_token_ids is None: + raise pb_utils.TritonModelException( + "A value is required for input_ids") + input_token_ids = input_token_ids.tolist() + if len(input_token_ids) == 0: + raise pb_utils.TritonModelException( + f"Invalid format for input_ids") + + input_length = get_input_scalar_by_name(request, 'input_lengths', + batch_size, batch_index) + if input_length is None: + input_length = len(input_token_ids) + # Trim input token ids with input_lengths + 
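+        # (a padded batch sends a fixed-width input_ids tensor, so only the
+        # first input_length tokens of each row are real tokens)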
inputs['input_token_ids'] = input_token_ids[0:input_length] + + inputs['max_new_tokens'] = get_input_scalar_by_name( + request, 'request_output_len', batch_size, batch_index) + if inputs['max_new_tokens'] is None: + raise pb_utils.TritonModelException( + "A value is required for request_output_len") + inputs['streaming'] = get_input_scalar_by_name(request, 'streaming', + batch_size, batch_index) + if inputs['streaming'] and not decoupled: + raise pb_utils.TritonModelException( + "Streaming is only supported in decoupled mode.") + inputs['end_id'] = get_input_scalar_by_name(request, 'end_id', + batch_size, batch_index) + inputs['pad_id'] = get_input_scalar_by_name(request, 'pad_id', + batch_size, batch_index) + inputs['stop_words'] = convert_word_list( + get_input_tensor_by_name(request, 'stop_words_list', batch_size, + batch_index)) + inputs['bad_words'] = convert_word_list( + get_input_tensor_by_name(request, 'bad_words_list', batch_size, + batch_index)) + embedding_bias = get_input_tensor_by_name(request, 'embedding_bias', + batch_size, batch_index) + if embedding_bias is not None and embedding_bias.size != 0: + inputs['embedding_bias'] = from_numpy(embedding_bias).squeeze() + + sampling_config = get_sampling_config_from_request( + request, batch_size, batch_index) + output_config = get_output_config_from_request( + request, exclude_input_from_output, batch_size, batch_index) + external_draft_tokens_config = get_external_draft_tokens_config_from_request( + request, batch_size, batch_index) + prompt_tuning_config = get_prompt_tuning_config_from_request( + request, batch_size, batch_index) + lora_config = get_lora_config_from_request(request, batch_size, + batch_index) + + requests.append( + trtllm.Request( + **inputs, + sampling_config=sampling_config, + output_config=output_config, + external_draft_tokens_config=external_draft_tokens_config, + prompt_tuning_config=prompt_tuning_config, + lora_config=lora_config, + )) + return requests + + +def convert_response(response, batch_index): + if response.has_error(): + return pb_utils.InferenceResponse(output_tensors=[], + error=pb_utils.TritonError( + response.error_msg)), True + result = response.result + beam_lengths = np.expand_dims( + np.array([len(beam) for beam in result.output_token_ids], np.int32), 0) + max_beam_length = max([len(beam) for beam in result.output_token_ids]) + output_ids = np.full((1, len(result.output_token_ids), max_beam_length), + -1, np.int32) + for idx, beam in enumerate(result.output_token_ids): + output_ids[0, idx, :len(beam)] = beam + output_tensors = [ + pb_utils.Tensor("output_ids", output_ids), + pb_utils.Tensor("sequence_length", beam_lengths), + ] + output_tensors.append( + pb_utils.Tensor( + "cum_log_probs", + np.expand_dims(np.array(result.cum_log_probs, np.float32), 0) + if result.cum_log_probs is not None else np.zeros( + (1, 1), np.float32))) + output_tensors.append( + pb_utils.Tensor( + "output_log_probs", + np.expand_dims(np.array(result.log_probs, np.float32), 0) if + result.log_probs is not None else np.zeros((1, 1, 1), np.float32))) + output_tensors.append( + pb_utils.Tensor( + "context_logits", + np.expand_dims(np.array(result.context_logits, np.float32), 0) + if result.context_logits is not None else np.zeros( + (1, 1, 1), np.float32))) + output_tensors.append( + pb_utils.Tensor( + "generation_logits", + np.expand_dims(np.array(result.generation_logits, np.float32), 0) + if result.generation_logits is not None else np.zeros( + (1, 1, 1, 1), np.float32))) + output_tensors.append( + 
pb_utils.Tensor("batch_index", + np.expand_dims(np.array([batch_index], np.int32), 0))) + + return pb_utils.InferenceResponse(output_tensors), result.is_final + + +def convert_scheduler_policy(batch_scheduler_policy: str): + if batch_scheduler_policy.lower() == "max_utilization": + return trtllm.CapacitySchedulerPolicy.MAX_UTILIZATION + elif batch_scheduler_policy.lower() == "guaranteed_no_evict": + return trtllm.CapacitySchedulerPolicy.GUARANTEED_NO_EVICT + raise pb_utils.TritonModelException( + f"batch_scheduler_policy value of '{batch_scheduler_policy}' is not supported." + ) + + +def convert_batching_type(gpt_model_type: str): + if gpt_model_type is None: + return None + if gpt_model_type.lower( + ) == "inflight_fused_batching" or gpt_model_type.lower( + ) == "inflight_batching": + return trtllm.BatchingType.INFLIGHT + elif gpt_model_type.lower() == "v1": + return trtllm.BatchingType.STATIC + raise pb_utils.TritonModelException( + f"gpt_model_type value of '{gpt_model_type}' is not supported.") + + +def convert_decoding_mode(decoding_mode: str): + if decoding_mode is None: + return None + elif decoding_mode == "auto": + return trtllm.DecodingMode.Auto() + elif decoding_mode == "top_k": + return trtllm.DecodingMode.TopK() + elif decoding_mode == "top_p": + return trtllm.DecodingMode.TopP() + elif decoding_mode == "top_k_top_p": + return trtllm.DecodingMode.TopKTopP() + elif decoding_mode == "beam_search": + return trtllm.DecodingMode.BeamSearch() + elif decoding_mode == "medusa": + return trtllm.DecodingMode.Medusa() + raise pb_utils.TritonModelException( + f"decoding_mode value of '{decoding_mode}' is not supported.") + + +def convert_timestamp_to_seconds(timestamp: str): + return int( + datetime.datetime.strptime(timestamp, + "%m-%d-%Y %H:%M:%S.%f").timestamp()) + + +class TritonPythonModel: + """Your Python model must use the same class name. Every Python model + that is created must have "TritonPythonModel" as the class name. 
+ """ + + def get_scheduler_config(self, model_config): + batch_scheduler_policy = get_parameter(model_config, + "batch_scheduler_policy") + if batch_scheduler_policy is None: + return trtllm.SchedulerConfig() + return trtllm.SchedulerConfig( + convert_scheduler_policy(batch_scheduler_policy)) + + def get_kv_cache_config(self, model_config): + kwargs = { + "enable_block_reuse": + get_parameter(model_config, "enable_kv_cache_reuse", bool), + "max_tokens": + get_parameter(model_config, "max_tokens_in_paged_kv_cache", int), + "sink_token_length": + get_parameter(model_config, "sink_token_length", int), + "free_gpu_memory_fraction": + get_parameter(model_config, "kv_cache_free_gpu_mem_fraction", + float), + "host_cache_size": + get_parameter(model_config, "kv_cache_host_memory_bytes", int), + "onboard_blocks": + get_parameter(model_config, "kv_cache_onboard_blocks", bool), + } + max_attention_window_size = get_parameter(model_config, + "max_attention_window_size") + if max_attention_window_size: + kwargs["max_attention_window"] = [ + int(x) for x in max_attention_window_size.split(",") + ] + kwargs = {k: v for k, v in kwargs.items() if v is not None} + return trtllm.KvCacheConfig(**kwargs) + + def get_parallel_config(self, model_config): + kwargs = {} + gpu_device_ids = get_parameter(model_config, "gpu_device_ids") + if gpu_device_ids: + kwargs["device_ids"] = [int(x) for x in gpu_device_ids.split(",")] + self.use_orchestrator_mode = os.environ.get("TRTLLM_ORCHESTRATOR", + "0") == "1" + if self.use_orchestrator_mode: + kwargs[ + "communication_mode"] = trtllm.CommunicationMode.ORCHESTRATOR + worker_path = get_parameter(model_config, "worker_path") + if worker_path is not None: + raise pb_utils.TritonModelException( + "worker_path parameter is specified, but this is no longer supported. Please specify executor_worker_path instead to specify the location of the trtllmExecutorWorker executable." 
+ ) + executor_worker_path = get_parameter(model_config, + "executor_worker_path") + kwargs["orchestrator_config"] = trtllm.OrchestratorConfig( + True, executor_worker_path) + if len(kwargs) > 0: + return trtllm.ParallelConfig(**kwargs) + return None + + def get_peft_cache_config(self, model_config): + kwargs = { + "optimal_adapter_size": + get_parameter(model_config, "lora_cache_optimal_adapter_size", + int), + "max_adapter_size": + get_parameter(model_config, "lora_cache_max_adapter_size", int), + "device_cache_percent": + get_parameter(model_config, "lora_cache_gpu_memory_fraction", + float), + "host_cache_size": + get_parameter(model_config, "lora_cache_host_memory_bytes", int), + } + kwargs = {k: v for k, v in kwargs.items() if v is not None} + return trtllm.PeftCacheConfig(**kwargs) + + def get_decoding_config(self, model_config): + kwargs = { + "medusa_choices": + parse_medusa_choices(get_parameter(model_config, + "medusa_choices")), + "decoding_mode": + convert_decoding_mode(get_parameter(model_config, + "decoding_mode")), + } + print(kwargs) + kwargs = {k: v for k, v in kwargs.items() if v is not None} + return trtllm.DecodingConfig(**kwargs) + + def get_extended_runtime_perf_knob_config(self, model_config): + kwargs = { + "multi_block_mode": + get_parameter(model_config, "multi_block_mode", bool), + "enable_context_fmha_fp32_acc": + get_parameter(model_config, "enable_context_fmha_fp32_acc", bool) + } + kwargs = {k: v for k, v in kwargs.items() if v is not None} + return trtllm.ExtendedRuntimePerfKnobConfig(**kwargs) + + def get_executor_config(self, model_config): + kwargs = { + "max_beam_width": + get_parameter(model_config, "max_beam_width", int), + "scheduler_config": + self.get_scheduler_config(model_config), + "kv_cache_config": + self.get_kv_cache_config(model_config), + "enable_chunked_context": + get_parameter(model_config, "enable_chunked_context", bool), + "normalize_log_probs": + get_parameter(model_config, "normalize_log_probs", bool), + "batching_type": + convert_batching_type(get_parameter(model_config, + "gpt_model_type")), + "parallel_config": + self.get_parallel_config(model_config), + "peft_cache_config": + self.get_peft_cache_config(model_config), + "decoding_config": + self.get_decoding_config(model_config), + "max_queue_size": + model_config.get( + "dynamic_batching", + {}, + ).get( + "default_queue_policy", + {}, + ).get("max_queue_size"), + "extended_runtime_perf_knob_config": + self.get_extended_runtime_perf_knob_config(model_config) + } + kwargs = {k: v for k, v in kwargs.items() if v is not None} + return trtllm.ExecutorConfig(**kwargs) + + def create_metrics(self, model: str, version: str, is_v1_model: bool): + self.request_metric_family = pb_utils.MetricFamily( + name="nv_trt_llm_request_metrics", + description="TRT LLM request metrics", + kind=pb_utils.MetricFamily.GAUGE, + ) + self.runtime_memory_metric_family = pb_utils.MetricFamily( + name="nv_trt_llm_runtime_memory_metrics", + description="TRT LLM runtime memory metrics", + kind=pb_utils.MetricFamily.GAUGE, + ) + self.kv_cache_metric_family = pb_utils.MetricFamily( + name="nv_trt_llm_kv_cache_block_metrics", + description="TRT LLM KV cache block metrics", + kind=pb_utils.MetricFamily.GAUGE, + ) + model_type = "v1" if is_v1_model else "inflight_batcher" + self.model_type_metric_family = pb_utils.MetricFamily( + name=f"nv_trt_llm_{model_type}_metrics", + description=f"TRT LLM {model_type}-specific metrics", + kind=pb_utils.MetricFamily.GAUGE, + ) + self.general_metric_family = 
pb_utils.MetricFamily( + name="nv_trt_llm_general_metrics", + description="General TRT LLM metrics", + kind=pb_utils.MetricFamily.GAUGE, + ) + common_labels = {"model": model, "version": version} + self.all_metrics = { + # Request metrics + "num_active_requests": + self.request_metric_family.Metric(labels={ + "request_type": "active", + **common_labels + }), + "max_num_active_requests": + self.request_metric_family.Metric(labels={ + "request_type": "max", + **common_labels + }), + "num_scheduled_requests": + self.request_metric_family.Metric(labels={ + "request_type": "scheduled", + **common_labels + }), + "num_context_requests": + self.request_metric_family.Metric(labels={ + "request_type": "context", + **common_labels + }), + # Runtime metrics + "cpu_mem_usage": + self.runtime_memory_metric_family.Metric(labels={ + "memory_type": "cpu", + **common_labels + }), + "gpu_mem_usage": + self.runtime_memory_metric_family.Metric(labels={ + "memory_type": "gpu", + **common_labels + }), + "pinned_mem_usage": + self.runtime_memory_metric_family.Metric(labels={ + "memory_type": "pinned", + **common_labels + }), + # KV cache metrics + "max_num_blocks": + self.kv_cache_metric_family.Metric(labels={ + "kv_cache_block_type": "max", + **common_labels + }), + "free_num_blocks": + self.kv_cache_metric_family.Metric(labels={ + "kv_cache_block_type": "free", + **common_labels + }), + "used_num_blocks": + self.kv_cache_metric_family.Metric(labels={ + "kv_cache_block_type": "used", + **common_labels + }), + "tokens_per_block": + self.kv_cache_metric_family.Metric(labels={ + "kv_cache_block_type": "tokens_per", + **common_labels + }), + # General metrics + "timestamp": + self.general_metric_family.Metric(labels={ + "general_type": "timestamp", + **common_labels + }), + "iter": + self.general_metric_family.Metric(labels={ + "general_type": "iteration_counter", + **common_labels + }), + } + if is_v1_model: + self.all_metrics.update({ + "num_ctx_tokens": + self.model_type_metric_family.Metric(labels={ + "v1_specific_metric": "total_context_tokens", + **common_labels + }), + "num_gen_tokens": + self.model_type_metric_family.Metric( + labels={ + "v1_specific_metric": "total_generation_tokens", + **common_labels + }), + "empty_gen_slots": + self.model_type_metric_family.Metric( + labels={ + "v1_specific_metric": "empty_generation_slots", + **common_labels + }), + }) + else: + self.all_metrics.update({ + "num_ctx_tokens": + self.model_type_metric_family.Metric( + labels={ + "inflight_batcher_specific_metric": + "total_context_tokens", + **common_labels + }), + "num_gen_requests": + self.model_type_metric_family.Metric( + labels={ + "inflight_batcher_specific_metric": + "generation_requests", + **common_labels + }), + "micro_batch_id": + self.model_type_metric_family.Metric( + labels={ + "inflight_batcher_specific_metric": "micro_batch_id", + **common_labels + }), + "num_paused_requests": + self.model_type_metric_family.Metric( + labels={ + "inflight_batcher_specific_metric": "paused_requests", + **common_labels + }), + }) + + def initialize(self, args): + """`initialize` is called only once when the model is being loaded. + Implementing `initialize` function is optional. This function allows + the model to initialize any state associated with this model. + + Parameters + ---------- + args : dict + Both keys and values are strings. 
The dictionary keys and values are: + * model_config: A JSON string containing the model configuration + * model_instance_kind: A string containing model instance kind + * model_instance_device_id: A string containing model instance device ID + * model_repository: Model repository path + * model_version: Model version + * model_name: Model name + """ + model_config = json.loads(args['model_config']) + gpt_model_path = get_parameter(model_config, "gpt_model_path") + if get_parameter(model_config, "enable_trt_overlap", bool): + raise pb_utils.TritonModelException( + f"enable_trt_overlap=true is not supported.") + self.exclude_input_from_output = get_parameter( + model_config, "exclude_input_in_output", bool) + executor_config = self.get_executor_config(model_config) + self.executor = trtllm.Executor(gpt_model_path, + trtllm.ModelType.DECODER_ONLY, + executor_config) + self.decoupled = pb_utils.using_decoupled_model_transaction_policy( + model_config) + self.cancellation_check_period_ms = get_parameter( + model_config, "cancellation_check_period_ms", int) or 100 + self.stats_check_period_ms = get_parameter( + model_config, "stats_check_period_ms", int) or 100 + + if not self.decoupled: + raise pb_utils.TritonModelException( + "Please enable decoupled transaction policy in the model configuration to serve this model" + ) + + self.create_metrics(args["model_name"], + args["model_version"], + is_v1_model=executor_config.batching_type == + trtllm.BatchingType.STATIC) + self.triton_user_id_to_req_ids = {} + self.triton_req_id_to_req_ids = {} + self.req_id_to_request_data = {} + self.lock = Lock() + self.running = False + self.awaiter_thread = Thread(target=self.awaiter_loop) + self.cancellation_thread = Thread(target=self.cancellation_loop) + self.metrics_thread = Thread(target=self.metrics_loop) + if self.executor.can_enqueue_requests(): + self.running = True + self.awaiter_thread.start() + self.cancellation_thread.start() + self.metrics_thread.start() + else: + # In leader mode, worker ranks will wait here until leader is done. + self.executor.shutdown() + + def handle_stop_request(self, triton_user_id, response_sender): + if triton_user_id is None or triton_user_id == "": + response_sender.send( + pb_utils.InferenceResponse(error=pb_utils.TritonError( + "A request id must be provided for request cancellation")), + flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL) + return + + with self.lock: + if triton_user_id in self.triton_user_id_to_req_ids: + req_ids = self.triton_user_id_to_req_ids[triton_user_id] + for req_id in req_ids: + self.executor.cancel_request(req_id) + + response_sender.send( + pb_utils.InferenceResponse(), + flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL) + + def execute(self, requests): + """`execute` must be implemented in every Python model. `execute` + function receives a list of pb_utils.InferenceRequest as the only + argument. This function is called when an inference is requested + for this model. + + Parameters + ---------- + requests : list + A list of pb_utils.InferenceRequest + + Returns + ------- + list + A list of pb_utils.InferenceResponse. The length of this list must + be the same as `requests` + """ + if not self.executor.can_enqueue_requests(): + return + + # Convert to executor requests. 
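+        # A single Triton request may carry a batch, so convert_request() can
+        # return several executor requests; batch_indices records which row of
+        # the original request each executor request came from, and that index
+        # is echoed back to the client in the "batch_index" output tensor.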
+ + triton_requests = [] + executor_requests = [] + batch_indices = [] + triton_user_ids = [] + triton_req_ids = [] + + for request in requests: + + triton_user_id = request.request_id() + + response_sender = request.get_response_sender() + stop = get_input_scalar_by_name(request, 'stop') + + if stop: + self.handle_stop_request(triton_user_id, response_sender) + else: + #Unique request id used to identify each triton request + triton_req_id = str(randint(0, sys.maxsize)) + self.triton_req_id_to_req_ids[triton_req_id] = set() + if triton_user_id is not None and triton_user_id != "": + self.triton_user_id_to_req_ids[triton_user_id] = set() + + try: + converted_reqs = convert_request( + request, self.exclude_input_from_output, + self.decoupled) + except Exception as e: + response_sender.send( + pb_utils.InferenceResponse(error=pb_utils.TritonError( + f"An error occurred when processing the input values for request id {request.request_id()}, the error was '{e}'" + )), + flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL) + else: + for batch_index, converted_req in enumerate( + converted_reqs): + triton_requests.append(request) + executor_requests.append(converted_req) + triton_user_ids.append(triton_user_id) + triton_req_ids.append(triton_req_id) + batch_indices.append(batch_index) + + with self.lock: + request_ids = self.executor.enqueue_requests(executor_requests) + for req_id, triton_req_id, triton_user_id, triton_request, batch_index in zip( + request_ids, triton_req_ids, triton_user_ids, + triton_requests, batch_indices): + self.req_id_to_request_data[ + req_id] = triton_req_id, triton_user_id, batch_index, triton_request.get_response_sender( + ) + self.triton_req_id_to_req_ids[triton_req_id].add(req_id) + if triton_user_id is not None and triton_user_id != "": + self.triton_user_id_to_req_ids[triton_user_id].add(req_id) + + return None + + def awaiter_loop(self): + """Gets responses from executor and returns the results.""" + while self.running: + for response in self.executor.await_responses( + timeout=datetime.timedelta(milliseconds=1)): + req_id = response.request_id + with self.lock: + if req_id not in self.req_id_to_request_data: + continue + triton_req_id, triton_user_id, batch_index, response_sender = self.req_id_to_request_data[ + req_id] + + triton_response, is_final = convert_response( + response, batch_index) + + triton_request_final = False + if is_final: + with self.lock: + # Check if all executor requests part of that triton request are finished + self.triton_req_id_to_req_ids[triton_req_id].remove( + req_id) + if len(self.triton_req_id_to_req_ids[triton_req_id] + ) == 0: + pb_utils.Logger.log_info( + f"DELETING Req id {req_id}, triton_req_id {triton_req_id} " + ) + triton_request_final = True + del self.triton_req_id_to_req_ids[triton_req_id] + if triton_user_id is not None and triton_user_id != "": + del self.triton_user_id_to_req_ids[ + triton_user_id] + del self.req_id_to_request_data[req_id] + + response_sender.send( + triton_response, + flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL + if triton_request_final else 0) + + # Remove local reference so response_sender can be cleaned properly. 
+ del response_sender + + def cancellation_loop(self): + """Checks if any pending requests have been cancelled.""" + while self.running: + time.sleep(self.cancellation_check_period_ms / 1000.0) + with self.lock: + for req_id, (triton_req_id, triton_user_id, batch_index, + response_sender + ) in self.req_id_to_request_data.items(): + if response_sender.is_cancelled(): + self.executor.cancel_request(req_id) + # Remove local reference so response_sender can be cleaned properly. + del response_sender + + def metrics_loop(self): + """Updates triton metrics using stats from the executor.""" + while self.running: + time.sleep(self.stats_check_period_ms / 1000.0) + for stat in self.executor.get_latest_iteration_stats(): + try: + for key, metric in self.all_metrics.items(): + value = None + if hasattr(stat, key): + value = getattr(stat, key) + elif stat.kv_cache_stats is not None and hasattr( + stat.kv_cache_stats, key): + value = getattr(stat.kv_cache_stats, key) + elif stat.static_batching_stats is not None and hasattr( + stat.static_batching_stats, key): + value = getattr(stat.static_batching_stats, key) + elif stat.inflight_batching_stats is not None and hasattr( + stat.inflight_batching_stats, key): + value = getattr(stat.inflight_batching_stats, key) + if value is not None: + if key == "timestamp": + value = convert_timestamp_to_seconds(value) + metric.set(value) + else: + pb_utils.Logger.log_warn( + f"Metric \"{key}\" not found.") + except Exception as e: + pb_utils.Logger.log_warn( + f"Error while processing metrics: {e}") + + def finalize(self): + """`finalize` is called only once when the model is being unloaded. + Implementing `finalize` function is optional. This function allows + the model to perform any necessary clean ups before exit. + """ + if self.executor.can_enqueue_requests(): + self.running = False + self.awaiter_thread.join() + self.cancellation_thread.join() + self.metrics_thread.join() + self.executor.shutdown() diff --git a/triton/speech_llm/model_repo_whisper_qwen_trtllm/tensorrt_llm/config.pbtxt b/triton/speech_llm/model_repo_whisper_qwen_trtllm/tensorrt_llm/config.pbtxt new file mode 100755 index 000000000..4bd761bba --- /dev/null +++ b/triton/speech_llm/model_repo_whisper_qwen_trtllm/tensorrt_llm/config.pbtxt @@ -0,0 +1,577 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +name: "tensorrt_llm" +backend: "${triton_backend}" +max_batch_size: ${triton_max_batch_size} + +model_transaction_policy { + decoupled: ${decoupled_mode} +} + +dynamic_batching { + preferred_batch_size: [ ${triton_max_batch_size} ] + max_queue_delay_microseconds: ${max_queue_delay_microseconds} + default_queue_policy: { max_queue_size: ${max_queue_size} } +} + +input [ + { + name: "input_ids" + data_type: TYPE_INT32 + dims: [ -1 ] + allow_ragged_batch: true + optional: true + }, + { + name: "encoder_input_features" + data_type: TYPE_FP16 + dims: [ -1, -1 ] + allow_ragged_batch: true + optional: true + }, + { + name: "encoder_output_lengths" + data_type: TYPE_INT32 + dims: [ 1 ] + reshape: { shape: [ ] } + optional: true + }, + { + name: "input_lengths" + data_type: TYPE_INT32 + dims: [ 1 ] + reshape: { shape: [ ] } + }, + { + name: "request_output_len" + data_type: TYPE_INT32 + dims: [ 1 ] + reshape: { shape: [ ] } + }, + { + name: "draft_input_ids" + data_type: TYPE_INT32 + dims: [ -1 ] + optional: true + allow_ragged_batch: true + }, + { + name: "decoder_input_ids" + data_type: TYPE_INT32 + dims: [ -1 ] + optional: true + allow_ragged_batch: true + }, + { + name: "decoder_input_lengths" + data_type: TYPE_INT32 + dims: [ 1 ] + optional: true + reshape: { shape: [ ] } + }, + { + name: "draft_logits" + data_type: TYPE_FP32 + dims: [ -1, -1 ] + optional: true + allow_ragged_batch: true + }, + { + name: "draft_acceptance_threshold" + data_type: TYPE_FP32 + dims: [ 1 ] + reshape: { shape: [ ] } + optional: true + }, + { + name: "end_id" + data_type: TYPE_INT32 + dims: [ 1 ] + reshape: { shape: [ ] } + optional: true + }, + { + name: "pad_id" + data_type: TYPE_INT32 + dims: [ 1 ] + reshape: { shape: [ ] } + optional: true + }, + { + name: "stop_words_list" + data_type: TYPE_INT32 + dims: [ 2, -1 ] + optional: true + allow_ragged_batch: true + }, + { + name: "bad_words_list" + data_type: TYPE_INT32 + dims: [ 2, -1 ] + optional: true + allow_ragged_batch: true + }, + { + name: "embedding_bias" + data_type: TYPE_FP32 + dims: [ -1 ] + optional: true + allow_ragged_batch: true + }, + { + name: "beam_width" + data_type: TYPE_INT32 + dims: [ 1 ] + reshape: { shape: [ ] } + optional: true + }, + { + name: "temperature" + data_type: TYPE_FP32 + dims: [ 1 ] + reshape: { shape: [ ] } + optional: true + }, + { + name: "runtime_top_k" + data_type: TYPE_INT32 + dims: [ 1 ] + reshape: { shape: [ ] } + optional: true + }, + { + name: "runtime_top_p" + data_type: TYPE_FP32 + dims: [ 1 ] + reshape: { shape: [ ] } + optional: true + }, + { + name: "runtime_top_p_min" + data_type: TYPE_FP32 + dims: [ 1 ] + reshape: { shape: [ ] } + optional: true + }, + { + name: "runtime_top_p_decay" + data_type: TYPE_FP32 + dims: [ 1 ] + reshape: { shape: [ ] } + optional: true + }, + { + name: "runtime_top_p_reset_ids" + data_type: TYPE_INT32 + dims: [ 1 ] + reshape: { shape: [ ] } + optional: true + }, + { + name: "len_penalty" + data_type: TYPE_FP32 + dims: [ 1 ] + reshape: { shape: [ ] } + 
optional: true + }, + { + name: "early_stopping" + data_type: TYPE_BOOL + dims: [ 1 ] + reshape: { shape: [ ] } + optional: true + }, + { + name: "repetition_penalty" + data_type: TYPE_FP32 + dims: [ 1 ] + reshape: { shape: [ ] } + optional: true + }, + { + name: "min_length" + data_type: TYPE_INT32 + dims: [ 1 ] + reshape: { shape: [ ] } + optional: true + }, + { + name: "beam_search_diversity_rate" + data_type: TYPE_FP32 + dims: [ 1 ] + reshape: { shape: [ ] } + optional: true + }, + { + name: "presence_penalty" + data_type: TYPE_FP32 + dims: [ 1 ] + reshape: { shape: [ ] } + optional: true + }, + { + name: "frequency_penalty" + data_type: TYPE_FP32 + dims: [ 1 ] + reshape: { shape: [ ] } + optional: true + }, + { + name: "random_seed" + data_type: TYPE_UINT64 + dims: [ 1 ] + reshape: { shape: [ ] } + optional: true + }, + { + name: "return_log_probs" + data_type: TYPE_BOOL + dims: [ 1 ] + reshape: { shape: [ ] } + optional: true + }, + { + name: "return_context_logits" + data_type: TYPE_BOOL + dims: [ 1 ] + reshape: { shape: [ ] } + optional: true + }, + { + name: "return_generation_logits" + data_type: TYPE_BOOL + dims: [ 1 ] + reshape: { shape: [ ] } + optional: true + }, + { + name: "stop" + data_type: TYPE_BOOL + dims: [ 1 ] + reshape: { shape: [ ] } + optional: true + }, + { + name: "streaming" + data_type: TYPE_BOOL + dims: [ 1 ] + reshape: { shape: [ ] } + optional: true + }, + { + name: "prompt_embedding_table" + data_type: TYPE_FP16 + dims: [ -1, -1 ] + optional: true + allow_ragged_batch: true + }, + { + name: "prompt_vocab_size" + data_type: TYPE_INT32 + dims: [ 1 ] + reshape: { shape: [ ] } + optional: true + }, + # the unique task ID for the given LoRA. + # To perform inference with a specific LoRA for the first time `lora_task_id` `lora_weights` and `lora_config` must all be given. + # The LoRA will be cached, so that subsequent requests for the same task only require `lora_task_id`. + # If the cache is full the oldest LoRA will be evicted to make space for new ones. An error is returned if `lora_task_id` is not cached. + { + name: "lora_task_id" + data_type: TYPE_UINT64 + dims: [ 1 ] + reshape: { shape: [ ] } + optional: true + }, + # weights for a lora adapter shape [ num_lora_modules_layers, D x Hi + Ho x D ] + # where the last dimension holds the in / out adapter weights for the associated module (e.g. attn_qkv) and model layer + # each of the in / out tensors are first flattened and then concatenated together in the format above. + # D=adapter_size (R value), Hi=hidden_size_in, Ho=hidden_size_out. 
+ { + name: "lora_weights" + data_type: TYPE_FP16 + dims: [ -1, -1 ] + optional: true + allow_ragged_batch: true + }, + # module identifier (same size a first dimension of lora_weights) + # See LoraModule::ModuleType for model id mapping + # + # "attn_qkv": 0 # compbined qkv adapter + # "attn_q": 1 # q adapter + # "attn_k": 2 # k adapter + # "attn_v": 3 # v adapter + # "attn_dense": 4 # adapter for the dense layer in attention + # "mlp_h_to_4h": 5 # for llama2 adapter for gated mlp layer after attention / RMSNorm: up projection + # "mlp_4h_to_h": 6 # for llama2 adapter for gated mlp layer after attention / RMSNorm: down projection + # "mlp_gate": 7 # for llama2 adapter for gated mlp later after attention / RMSNorm: gate + # + # last dim holds [ module_id, layer_idx, adapter_size (D aka R value) ] + { + name: "lora_config" + data_type: TYPE_INT32 + dims: [ -1, 3 ] + optional: true + allow_ragged_batch: true + } +] +output [ + { + name: "output_ids" + data_type: TYPE_INT32 + dims: [ -1, -1 ] + }, + { + name: "sequence_length" + data_type: TYPE_INT32 + dims: [ -1 ] + }, + { + name: "cum_log_probs" + data_type: TYPE_FP32 + dims: [ -1 ] + }, + { + name: "output_log_probs" + data_type: TYPE_FP32 + dims: [ -1, -1 ] + }, + { + name: "context_logits" + data_type: TYPE_FP32 + dims: [ -1, -1 ] + }, + { + name: "generation_logits" + data_type: TYPE_FP32 + dims: [ -1, -1, -1 ] + }, + { + name: "batch_index" + data_type: TYPE_INT32 + dims: [ 1 ] + } +] +instance_group [ + { + count: 1 + kind : KIND_CPU + } +] +parameters: { + key: "max_beam_width" + value: { + string_value: "${max_beam_width}" + } +} +parameters: { + key: "FORCE_CPU_ONLY_INPUT_TENSORS" + value: { + string_value: "no" + } +} +parameters: { + key: "gpt_model_type" + value: { + string_value: "${batching_strategy}" + } +} +parameters: { + key: "gpt_model_path" + value: { + string_value: "${engine_dir}" + } +} +parameters: { + key: "encoder_model_path" + value: { + string_value: "${encoder_engine_dir}" + } +} +parameters: { + key: "max_tokens_in_paged_kv_cache" + value: { + string_value: "${max_tokens_in_paged_kv_cache}" + } +} +parameters: { + key: "max_attention_window_size" + value: { + string_value: "${max_attention_window_size}" + } +} +parameters: { + key: "sink_token_length" + value: { + string_value: "${sink_token_length}" + } +} +parameters: { + key: "batch_scheduler_policy" + value: { + string_value: "${batch_scheduler_policy}" + } +} +parameters: { + key: "kv_cache_free_gpu_mem_fraction" + value: { + string_value: "${kv_cache_free_gpu_mem_fraction}" + } +} +parameters: { + key: "kv_cache_host_memory_bytes" + value: { + string_value: "${kv_cache_host_memory_bytes}" + } +} +parameters: { + key: "kv_cache_onboard_blocks" + value: { + string_value: "${kv_cache_onboard_blocks}" + } +} +# enable_trt_overlap is deprecated and doesn't have any effect on the runtime +# parameters: { +# key: "enable_trt_overlap" +# value: { +# string_value: "${enable_trt_overlap}" +# } +# } +parameters: { + key: "exclude_input_in_output" + value: { + string_value: "${exclude_input_in_output}" + } +} +parameters: { + key: "cancellation_check_period_ms" + value: { + string_value: "${cancellation_check_period_ms}" + } +} +parameters: { + key: "stats_check_period_ms" + value: { + string_value: "${stats_check_period_ms}" + } +} +parameters: { + key: "iter_stats_max_iterations" + value: { + string_value: "${iter_stats_max_iterations}" + } +} +parameters: { + key: "request_stats_max_iterations" + value: { + string_value: "${request_stats_max_iterations}" + } +} 
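+# Note: every "${...}" entry in this file is a template placeholder, not a
+# literal value; all placeholders must be substituted with concrete settings
+# before this model repository is loaded by Triton. As an illustrative sketch
+# only (it assumes the fill_template.py helper shipped with the TensorRT-LLM
+# backend is available; the paths and values below are made up):
+#
+#   python3 tools/fill_template.py -i tensorrt_llm/config.pbtxt \
+#     "triton_backend:tensorrtllm,triton_max_batch_size:16,decoupled_mode:True,engine_dir:/workspace/qwen_engine,encoder_engine_dir:/workspace/whisper_engine,max_beam_width:1,kv_cache_free_gpu_mem_fraction:0.5"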
+parameters: { + key: "enable_kv_cache_reuse" + value: { + string_value: "${enable_kv_cache_reuse}" + } +} +parameters: { + key: "normalize_log_probs" + value: { + string_value: "${normalize_log_probs}" + } +} +parameters: { + key: "enable_chunked_context" + value: { + string_value: "${enable_chunked_context}" + } +} +parameters: { + key: "gpu_device_ids" + value: { + string_value: "${gpu_device_ids}" + } +} +parameters: { + key: "lora_cache_optimal_adapter_size" + value: { + string_value: "${lora_cache_optimal_adapter_size}" + } +} +parameters: { + key: "lora_cache_max_adapter_size" + value: { + string_value: "${lora_cache_max_adapter_size}" + } +} +parameters: { + key: "lora_cache_gpu_memory_fraction" + value: { + string_value: "${lora_cache_gpu_memory_fraction}" + } +} +parameters: { + key: "lora_cache_host_memory_bytes" + value: { + string_value: "${lora_cache_host_memory_bytes}" + } +} +parameters: { + key: "decoding_mode" + value: { + string_value: "${decoding_mode}" + } +} +parameters: { + key: "executor_worker_path" + value: { + string_value: "/opt/tritonserver/backends/tensorrtllm/trtllmExecutorWorker" + } +} +parameters: { + key: "medusa_choices" + value: { + string_value: "${medusa_choices}" + } +} +parameters: { + key: "gpu_weights_percent" + value: { + string_value: "${gpu_weights_percent}" + } +} +parameters: { + key: "enable_context_fmha_fp32_acc" + value: { + string_value: "${enable_context_fmha_fp32_acc}" + } +} +parameters: { + key: "multi_block_mode" + value: { + string_value: "${multi_block_mode}" + } +} diff --git a/triton/transducer-scorer-backend/CMakeLists.txt b/triton/transducer-scorer-backend/CMakeLists.txt new file mode 100644 index 000000000..84533e60a --- /dev/null +++ b/triton/transducer-scorer-backend/CMakeLists.txt @@ -0,0 +1,215 @@ +# Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
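+
+# Build sketch (mirrors build.sh and README.md in this directory): install
+# rapidjson-dev and CMake >= 3.17 inside the server container, point Torch_DIR
+# at an unpacked libtorch built with the cxx11 ABI, then build out of tree:
+#
+#   export Torch_DIR=$(pwd)/libtorch
+#   mkdir -p build && cd build
+#   cmake -DCMAKE_INSTALL_PREFIX:PATH=`pwd`/install ..
+#   make install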
+ +cmake_minimum_required(VERSION 3.17...3.22 FATAL_ERROR) + +project(TransducerScorerBackend LANGUAGES C CXX) + +# +# Options +# +# Must include options required for this project as well as any +# projects included in this one by FetchContent. +# +option(TRITON_ENABLE_GPU "Enable GPU support in backend" ON) +option(TRITON_ENABLE_STATS "Include statistics collections in backend" ON) + +set(TRITON_COMMON_REPO_TAG "r22.11" CACHE STRING "Tag for triton-inference-server/common repo") +set(TRITON_CORE_REPO_TAG "r22.11" CACHE STRING "Tag for triton-inference-server/core repo") +set(TRITON_BACKEND_REPO_TAG "r22.11" CACHE STRING "Tag for triton-inference-server/backend repo") + +if(NOT CMAKE_BUILD_TYPE) + set(CMAKE_BUILD_TYPE Release) +endif() + +# +# Dependencies +# +# FetchContent requires us to include the transitive closure of all +# repos that we depend on so that we can override the tags. +# +include(FetchContent) + +FetchContent_Declare( + repo-common + GIT_REPOSITORY https://github.com/triton-inference-server/common.git + GIT_TAG ${TRITON_COMMON_REPO_TAG} + GIT_SHALLOW ON +) +FetchContent_Declare( + repo-core + GIT_REPOSITORY https://github.com/triton-inference-server/core.git + GIT_TAG ${TRITON_CORE_REPO_TAG} + GIT_SHALLOW ON +) +FetchContent_Declare( + repo-backend + GIT_REPOSITORY https://github.com/triton-inference-server/backend.git + GIT_TAG ${TRITON_BACKEND_REPO_TAG} + GIT_SHALLOW ON +) +FetchContent_MakeAvailable(repo-common repo-core repo-backend) + + +# +# CUDA +# +if(${TRITON_ENABLE_GPU}) + find_package(CUDAToolkit REQUIRED) +endif() + +find_package(Python3 REQUIRED COMPONENTS Development) +# +# The backend must be built into a shared library. Use an ldscript to +# hide all symbols except for the TRITONBACKEND API. +# +configure_file(src/libtriton_scorer.ldscript libtriton_scorer.ldscript COPYONLY) + +add_library( + triton-transudcer-scorer-backend SHARED + src/scorer.cc + src/scorer_utils.h + src/bls.h + src/bls.cc + src/bls_utils.h + src/bls_utils.cc + src/symbol-table.cc + src/symbol-table.h +) + +add_library( + TransducerScorerBackend::triton-transducer-scorer-backend ALIAS triton-transudcer-scorer-backend +) + +list(APPEND CMAKE_MODULE_PATH ${CMAKE_SOURCE_DIR}/cmake) +find_package(Torch REQUIRED) +message(STATUS " torch include path: ${TORCH_INCLUDE_DIRS}") +message(STATUS " torch lib path : ${TORCH_LIBRARIES} ") + +target_include_directories( + triton-transudcer-scorer-backend + PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR}/src + ${Python3_INCLUDE_DIRS} +) + +target_include_directories( + triton-transudcer-scorer-backend + PRIVATE ${TORCH_INCLUDE_DIRS} +) + +target_compile_features(triton-transudcer-scorer-backend PRIVATE cxx_std_11) +target_compile_options( + triton-transudcer-scorer-backend PRIVATE + $<$,$,$>: + -Wall -Wextra -Wno-unused-parameter -Wno-type-limits -Werror> + $<$:/Wall /D_WIN32_WINNT=0x0A00 /EHsc> +) + +if(${TRITON_ENABLE_GPU}) + target_compile_definitions( + triton-transudcer-scorer-backend + PRIVATE TRITON_ENABLE_GPU=1 + ) +endif() # TRITON_ENABLE_GPU + +target_link_libraries( + triton-transudcer-scorer-backend + PRIVATE + triton-core-serverapi # from repo-core + triton-core-backendapi # from repo-core + triton-core-serverstub # from repo-core + triton-backend-utils # from repo-backend + ${TORCH_LIBRARIES} +) + +if(${TRITON_ENABLE_GPU}) + target_link_libraries( + triton-transudcer-scorer-backend + PRIVATE + CUDA::cudart + ) +endif() # TRITON_ENABLE_GPU + + +set_target_properties( + triton-transudcer-scorer-backend PROPERTIES + POSITION_INDEPENDENT_CODE ON + 
OUTPUT_NAME triton_scorer + LINK_DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/libtriton_scorer.ldscript + LINK_FLAGS "-Wl,--version-script libtriton_scorer.ldscript" +) + + +# +# Install +# +include(GNUInstallDirs) +set(INSTALL_CONFIGDIR ${CMAKE_INSTALL_LIBDIR}/cmake/TransducerScorerBackend) + +install( + TARGETS + triton-transudcer-scorer-backend + EXPORT + triton-transudcer-scorer-backend-targets + LIBRARY DESTINATION ${CMAKE_INSTALL_PREFIX}/backends/scorer + RUNTIME DESTINATION ${CMAKE_INSTALL_PREFIX}/backends/scorer +) + +install( + EXPORT + triton-transudcer-scorer-backend-targets + FILE + TransducerScorerBackendTargets.cmake + NAMESPACE + TransducerScorerBackend:: + DESTINATION + ${INSTALL_CONFIGDIR} +) + +include(CMakePackageConfigHelpers) +configure_package_config_file( + ${CMAKE_CURRENT_LIST_DIR}/cmake/TransducerScorerBackendConfig.cmake.in + ${CMAKE_CURRENT_BINARY_DIR}/TransducerScorerBackendConfig.cmake + INSTALL_DESTINATION ${INSTALL_CONFIGDIR} +) + +install( + FILES + ${CMAKE_CURRENT_BINARY_DIR}/TransducerScorerBackendConfig.cmake + DESTINATION ${INSTALL_CONFIGDIR} +) + +# +# Export from build tree +# +export( + EXPORT triton-transudcer-scorer-backend-targets + FILE ${CMAKE_CURRENT_BINARY_DIR}/TransducerScorerBackendTargets.cmake + NAMESPACE TransducerScorerBackend:: +) + +export(PACKAGE TransducerScorerBackend) \ No newline at end of file diff --git a/triton/transducer-scorer-backend/README.md b/triton/transducer-scorer-backend/README.md new file mode 100644 index 000000000..d68c6b0f6 --- /dev/null +++ b/triton/transducer-scorer-backend/README.md @@ -0,0 +1,25 @@ +### Custom backend for scorer module + +This module implements a custom triton backend for scorer module in model_repo_offline. +(Currently, only greedy search method is supported.) + +Comparing with default python scorer backend, this c++ custom backend has better performance but less flexibility. + +``` +# In server docker container, +apt-get install rapidjson-dev +pip3 install cmake==3.22 +rm /usr/bin/cmake +ln /usr/local/bin/cmake /usr/bin/cmake +cmake --version + +# To avoid torch ABI issue, download libtorch here. +wget https://download.pytorch.org/libtorch/cu116/libtorch-cxx11-abi-shared-with-deps-1.13.1%2Bcu116.zip +unzip -d $(pwd)/ libtorch-cxx11-abi-shared-with-deps-1.13.1+cu116.zip +export Torch_DIR=$(pwd)/libtorch +bash build.sh + +# Put the generated libtriton_scorer.so under model_repo_offline/scorer/2 +# Also change backend name in model_repo_offline/scorer/config.pbtxt from backend:"python" to backend: "scorer" + +``` diff --git a/triton/transducer-scorer-backend/build.sh b/triton/transducer-scorer-backend/build.sh new file mode 100644 index 000000000..a5365b01a --- /dev/null +++ b/triton/transducer-scorer-backend/build.sh @@ -0,0 +1,4 @@ +#!/usr/bin/bash + +mkdir -p build && cd build && cmake -DCMAKE_INSTALL_PREFIX:PATH=`pwd`/install .. +make install diff --git a/triton/transducer-scorer-backend/cmake/TransducerScorerBackendConfig.cmake.in b/triton/transducer-scorer-backend/cmake/TransducerScorerBackendConfig.cmake.in new file mode 100644 index 000000000..009425def --- /dev/null +++ b/triton/transducer-scorer-backend/cmake/TransducerScorerBackendConfig.cmake.in @@ -0,0 +1,39 @@ +# Copyright 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +include(CMakeFindDependencyMacro) + +get_filename_component( + TRANSDUCERSCORERBACKEND_CMAKE_DIR "${CMAKE_CURRENT_LIST_FILE}" PATH +) + +list(APPEND CMAKE_MODULE_PATH ${TRANSDUCERSCORERBACKEND_CMAKE_DIR}) + +if(NOT TARGET TransducerScorerBackend::triton-transducer-scorer-backend) + include("${TRANSDUCERSCORERBACKEND_CMAKE_DIR}/TransducerScorerBackendTargets.cmake") +endif() + +set(TRANSDUCERSCORERBACKEND_LIBRARIES TransducerScorerBackend::triton-transducer-scorer-backend) diff --git a/triton/transducer-scorer-backend/src/bls.cc b/triton/transducer-scorer-backend/src/bls.cc new file mode 100644 index 000000000..8a9e72051 --- /dev/null +++ b/triton/transducer-scorer-backend/src/bls.cc @@ -0,0 +1,202 @@ +// Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#include "bls.h" + +namespace triton { +namespace backend { +namespace scorer { + +BLSExecutor::BLSExecutor(TRITONSERVER_Server* server) + : server_(server), model_executor_(server) {} + +TRITONSERVER_Error* BLSExecutor::PrepareInferenceRequest( + TRITONSERVER_InferenceRequest** irequest, const std::string& model_name) { + // Create an inference request object. The inference request object + // is where we set the name of the model we want to use for + // inference and the input tensors. + RETURN_IF_ERROR(TRITONSERVER_InferenceRequestNew( + irequest, server_, model_name.c_str(), -1 /* model_version */)); + + RETURN_IF_ERROR(TRITONSERVER_InferenceRequestSetReleaseCallback( + *irequest, InferRequestComplete, nullptr /* request_release_userp */)); + + return nullptr; // success +} + +TRITONSERVER_Error* BLSExecutor::PrepareInferenceInput( + const std::vector& input_tensors, + const std::vector& input_names, + TRITONSERVER_InferenceRequest* irequest) { + size_t input_count; + input_count = input_tensors.size(); + + const char* name; + TRITONSERVER_DataType datatype = TRITONSERVER_TYPE_FP16; + + TRITONSERVER_MemoryType data_memory_type = TRITONSERVER_MEMORY_GPU; + int64_t data_memory_id = 0; // TODO: get from config + + for (size_t count = 0; count < input_count; count++) { + name = input_names[count]; + if (std::strcmp(name, "y") == 0) { + // FIX ME, hard-code for decoder + datatype = TRITONSERVER_TYPE_INT64; + } + std::vector input_shapes; + auto shape = input_tensors[count].sizes(); + input_shapes.reserve(shape.size()); + for (auto itr = shape.begin(); itr != shape.end(); itr++) { + input_shapes.push_back(*itr); + } + uint32_t dims_count = (uint32_t)input_shapes.size(); + + const char* data_buffer = + reinterpret_cast(input_tensors[count].data_ptr()); + size_t data_byte_size = input_tensors[count].nbytes(); + + RETURN_IF_ERROR(TRITONSERVER_InferenceRequestAddInput( + irequest, name, datatype, &input_shapes[0], dims_count)); + + RETURN_IF_ERROR(TRITONSERVER_InferenceRequestAppendInputData( + irequest, name, data_buffer, data_byte_size, data_memory_type, + data_memory_id)); + } + + return nullptr; // success +} + +TRITONSERVER_Error* BLSExecutor::PrepareInferenceOutput( + const std::vector& output_names, + TRITONSERVER_InferenceRequest* irequest) { + // Indicate the output tensors to be calculated and returned + // for the inference request. + + for (const auto& output_name : output_names) { + RETURN_IF_ERROR( + TRITONSERVER_InferenceRequestAddRequestedOutput(irequest, output_name)); + } + + return nullptr; // success +} + +torch::Tensor BLSExecutor::Execute(std::vector& input_tensors, + std::vector& input_names, + std::vector& output_names, + std::string model_name) { + // Check if both models are valid before executing request. + // Check if the model is ready. 
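+  //
+  // Flow of this helper, for orientation: build a TRITONSERVER_InferenceRequest
+  // for 'model_name', attach the caller's tensors as inputs and register the
+  // requested outputs, submit the request asynchronously through ModelExecutor,
+  // then block on the returned future and convert the single response back into
+  // a torch::Tensor in ConstructFinalResponse().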
+ bool is_ready = false; + THROW_IF_TRITON_ERROR(TRITONSERVER_ServerModelIsReady( + server_, model_name.c_str(), -1 /* model_version */, &is_ready)); + if (!is_ready) { + throw BLSBackendException( + (std::string("Failed to execute the inference request. Model '") + + model_name.c_str() + "' is not ready.") + .c_str()); + } + + // Prepare std::future for model. + std::vector> futures(1); + + // The inference request object for sending internal requests. + TRITONSERVER_InferenceRequest* irequest = nullptr; + + try { + THROW_IF_TRITON_ERROR(PrepareInferenceRequest(&irequest, model_name)); + THROW_IF_TRITON_ERROR( + PrepareInferenceInput(input_tensors, input_names, irequest)); + THROW_IF_TRITON_ERROR(PrepareInferenceOutput(output_names, irequest)); + + // Execute inference request. + THROW_IF_TRITON_ERROR(model_executor_.AsyncExecute(irequest, &futures[0])); + } catch (const BLSBackendException& bls_exception) { + LOG_MESSAGE(TRITONSERVER_LOG_ERROR, bls_exception.what()); + LOG_IF_ERROR(TRITONSERVER_InferenceRequestDelete(irequest), + "Failed to delete inference request."); + } + + // If both internal requests are sent successfully, retrieve the output from + // each request and construct the final response. + torch::Tensor r = ConstructFinalResponse(std::move(futures)); + return r; +} + +torch::Tensor BLSExecutor::ConstructFinalResponse( + std::vector> futures) { + std::vector completed_responses = {nullptr}; + + const char* output_name; + TRITONSERVER_DataType output_datatype; + const int64_t* output_shape; + uint64_t dims_count; + size_t output_byte_size; + TRITONSERVER_MemoryType output_memory_type; + int64_t output_memory_id; + const void* output_base; + void* userp; + size_t icount = 0; + // Retrieve the corresponding TRITONSERVER_InferenceResponse object from + // 'futures'. The InferResponseComplete function sets the std::promise + // so that this thread will block until the response is returned. + completed_responses[icount] = futures[icount].get(); + try { + THROW_IF_TRITON_ERROR( + TRITONSERVER_InferenceResponseError(completed_responses[icount])); + } catch (const BLSBackendException& bls_exception) { + LOG_MESSAGE(TRITONSERVER_LOG_ERROR, bls_exception.what()); + + if (completed_responses[icount] != nullptr) { + LOG_IF_ERROR( + TRITONSERVER_InferenceResponseDelete(completed_responses[icount]), + "Failed to delete inference response."); + } + } + // Retrieve outputs from 'completed_responses'. + + TRITONSERVER_InferenceResponseOutput( + completed_responses[icount], icount, &output_name, &output_datatype, + &output_shape, &dims_count, &output_base, &output_byte_size, + &output_memory_type, &output_memory_id, &userp); + + // TODO: FIX ME, currently put all tensors on cpu. + auto updated_options = + torch::TensorOptions().dtype(torch::kHalf).device(torch::kCPU); + + std::vector batchn_shape(output_shape, output_shape + dims_count); + torch::Tensor output_tensor = torch::from_blob(const_cast(output_base), + batchn_shape, updated_options) + .clone(); + + LOG_IF_ERROR( + TRITONSERVER_InferenceResponseDelete(completed_responses[icount]), + "Failed to delete inference response."); + return output_tensor; +} + +} // namespace scorer +} // namespace backend +} // namespace triton diff --git a/triton/transducer-scorer-backend/src/bls.h b/triton/transducer-scorer-backend/src/bls.h new file mode 100644 index 000000000..6727531eb --- /dev/null +++ b/triton/transducer-scorer-backend/src/bls.h @@ -0,0 +1,87 @@ +// Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#include + +#include "bls_utils.h" +#include "torch/script.h" +#include "triton/backend/backend_common.h" +#include "triton/core/tritonbackend.h" +#include "triton/core/tritonserver.h" + +namespace triton { +namespace backend { +namespace scorer { + +// +// BLSExecutor +// +// Includes the custom BLS logic for this backend. +// This class shows how to utilize Triton in-process C-API to build the +// execution pipeline. +// It does not take ownership of the server. +// +class BLSExecutor { + public: + explicit BLSExecutor(TRITONSERVER_Server* server); + + // Prepares the inference request that will be used internally. + TRITONSERVER_Error* PrepareInferenceRequest( + TRITONSERVER_InferenceRequest** irequest, const std::string& model_name); + + // Prepares the input for the internal inference request. + TRITONSERVER_Error* PrepareInferenceInput( + const std::vector& input_tensors, + const std::vector& input_names, + TRITONSERVER_InferenceRequest* irequest); + + // Prepares the output for the internal inference request. + TRITONSERVER_Error* PrepareInferenceOutput( + const std::vector& output_names, + TRITONSERVER_InferenceRequest* irequest); + + // Performs the whole BLS pipeline. + torch::Tensor Execute(std::vector& input_tensors, + std::vector& input_names, + std::vector& output_names, + std::string model_name); + + // Constructs the final response. + torch::Tensor ConstructFinalResponse( + std::vector> futures); + + private: + // The server object that encapsulates all the functionality of the Triton + // server and allows access to the Triton server API. + TRITONSERVER_Server* server_; + + // The ModelExecutor object for executing inference request on a model. 
+ ModelExecutor model_executor_; +}; + +} // namespace scorer +} // namespace backend +} // namespace triton diff --git a/triton/transducer-scorer-backend/src/bls_utils.cc b/triton/transducer-scorer-backend/src/bls_utils.cc new file mode 100644 index 000000000..c2a173ed7 --- /dev/null +++ b/triton/transducer-scorer-backend/src/bls_utils.cc @@ -0,0 +1,208 @@ +// Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#include "bls_utils.h" + +namespace triton { +namespace backend { +namespace scorer { + +TRITONSERVER_Error* ResponseAlloc(TRITONSERVER_ResponseAllocator* allocator, + const char* tensor_name, size_t byte_size, + TRITONSERVER_MemoryType preferred_memory_type, + int64_t preferred_memory_type_id, void* userp, + void** buffer, void** buffer_userp, + TRITONSERVER_MemoryType* actual_memory_type, + int64_t* actual_memory_type_id) { + auto allocate_start_time = std::chrono::system_clock::now(); + + // Initially attempt to make the actual memory type and id that we allocate be + // the same as preferred memory type + *actual_memory_type = preferred_memory_type; + *actual_memory_type_id = preferred_memory_type_id; + + // If 'byte_size' is zero just return 'buffer' == nullptr, we don't need to do + // any other book-keeping. 
+ if (byte_size == 0) { + *buffer = nullptr; + *buffer_userp = nullptr; + } else { + void* allocated_ptr = nullptr; + + switch (*actual_memory_type) { + case TRITONSERVER_MEMORY_CPU_PINNED: { + auto err = + cudaHostAlloc(&allocated_ptr, byte_size, cudaHostAllocPortable); + if (err != cudaSuccess) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + std::string("cudaHostAlloc failed: " + + std::string(cudaGetErrorString(err))) + .c_str()); + } + + break; + } + case TRITONSERVER_MEMORY_GPU: { + auto err = cudaMalloc(&allocated_ptr, byte_size); + if (err != cudaSuccess) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + std::string("cudaMalloc failed: " + + std::string(cudaGetErrorString(err))) + .c_str()); + } + + break; + } + // Use CPU memory if the requested memory type is unknown (default case). + case TRITONSERVER_MEMORY_CPU: + default: { + *actual_memory_type = TRITONSERVER_MEMORY_CPU; + allocated_ptr = malloc(byte_size); + break; + } + } + + // Pass the tensor name with buffer_userp so we can show it when releasing + // the buffer. + if (allocated_ptr != nullptr) { + *buffer = allocated_ptr; + *buffer_userp = new std::string(tensor_name); + } + } + + auto allocate_end_time = std::chrono::system_clock::now(); + std::chrono::duration allocate_dur_seconds = + allocate_end_time - allocate_start_time; + + return nullptr; // Success +} + +TRITONSERVER_Error* ResponseRelease(TRITONSERVER_ResponseAllocator* allocator, + void* buffer, void* buffer_userp, + size_t byte_size, + TRITONSERVER_MemoryType memory_type, + int64_t memory_type_id) { + std::string* name = nullptr; + if (buffer_userp != nullptr) { + name = reinterpret_cast(buffer_userp); + } else { + name = new std::string(""); + } + + std::stringstream ss; + ss << buffer; + std::string buffer_str = ss.str(); + + LOG_MESSAGE( + TRITONSERVER_LOG_VERBOSE, + ("Releasing buffer " + buffer_str + " of size " + + std::to_string(byte_size) + " in " + + TRITONSERVER_MemoryTypeString(memory_type) + " for result '" + *name) + .c_str()); + + switch (memory_type) { + case TRITONSERVER_MEMORY_CPU: + free(buffer); + break; + default: + LOG_MESSAGE( + TRITONSERVER_LOG_ERROR, + std::string( + "error: unexpected buffer allocated in CUDA managed memory") + .c_str()); + break; + } + + delete name; + + return nullptr; // Success +} + +void InferRequestComplete(TRITONSERVER_InferenceRequest* request, + const uint32_t flags, void* userp) { + if (request != nullptr) { + LOG_IF_ERROR(TRITONSERVER_InferenceRequestDelete(request), + "Failed to delete inference request."); + } +} + +void InferResponseComplete(TRITONSERVER_InferenceResponse* response, + const uint32_t flags, void* userp) { + // The following logic only works for non-decoupled models as for decoupled + // models it may send multiple responses for a request or not send any + // responses for a request. Need to modify this function if the model is using + // decoupled API. + if (response != nullptr) { + // Send 'response' to the future. + std::promise* p = + reinterpret_cast*>(userp); + p->set_value(response); + delete p; + } +} + +ModelExecutor::ModelExecutor(TRITONSERVER_Server* server) : server_(server) { + // When triton needs a buffer to hold an output tensor, it will ask + // us to provide the buffer. In this way we can have any buffer + // management and sharing strategy that we want. To communicate to + // triton the functions that we want it to call to perform the + // allocations, we create a "response allocator" object. 
We pass + // this response allocate object to triton when requesting + // inference. We can reuse this response allocator object for any + // number of inference requests. + allocator_ = nullptr; + // THROW_IF_TRITON_ERROR(TRITONSERVER_ResponseAllocatorNew( + // &allocator_, CPUAllocator, ResponseRelease, nullptr /* start_fn */)); + THROW_IF_TRITON_ERROR(TRITONSERVER_ResponseAllocatorNew( + &allocator_, ResponseAlloc, ResponseRelease, nullptr /* start_fn */)); +} + +TRITONSERVER_Error* ModelExecutor::AsyncExecute( + TRITONSERVER_InferenceRequest* irequest, + std::future* future) { + // Perform inference by calling TRITONSERVER_ServerInferAsync. This + // call is asychronous and therefore returns immediately. The + // completion of the inference and delivery of the response is done + // by triton by calling the "response complete" callback functions + // (InferResponseComplete in this case). + auto p = new std::promise(); + *future = p->get_future(); + + RETURN_IF_ERROR(TRITONSERVER_InferenceRequestSetResponseCallback( + irequest, allocator_, nullptr /* response_allocator_userp */, + InferResponseComplete, reinterpret_cast(p))); + + RETURN_IF_ERROR( + TRITONSERVER_ServerInferAsync(server_, irequest, nullptr /* trace */)); + + return nullptr; // success +} + +} // namespace scorer +} // namespace backend +} // namespace triton diff --git a/triton/transducer-scorer-backend/src/bls_utils.h b/triton/transducer-scorer-backend/src/bls_utils.h new file mode 100644 index 000000000..390688184 --- /dev/null +++ b/triton/transducer-scorer-backend/src/bls_utils.h @@ -0,0 +1,103 @@ +// Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
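+
+// Helper utilities for the in-process inference path used by the scorer:
+// the response-allocator callbacks (ResponseAlloc / ResponseRelease), the
+// request/response completion callbacks, and ModelExecutor, which wraps
+// TRITONSERVER_ServerInferAsync behind a std::future-based interface.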
+ +#include +#include + +#include "triton/backend/backend_common.h" +#include "triton/core/tritonbackend.h" +#include "triton/core/tritonserver.h" + +namespace triton { +namespace backend { +namespace scorer { + +#define THROW_IF_TRITON_ERROR(X) \ + do { \ + TRITONSERVER_Error* tie_err__ = (X); \ + if (tie_err__ != nullptr) { \ + throw BLSBackendException(TRITONSERVER_ErrorMessage(tie_err__)); \ + } \ + } while (false) + +// +// BLSBackendException +// +// Exception thrown if error occurs in BLSBackend. +// +struct BLSBackendException : std::exception { + BLSBackendException(const std::string& message) : message_(message) {} + + const char* what() const throw() { return message_.c_str(); } + + std::string message_; +}; + +TRITONSERVER_Error* ResponseAlloc(TRITONSERVER_ResponseAllocator* allocator, + const char* tensor_name, size_t byte_size, + TRITONSERVER_MemoryType preferred_memory_type, + int64_t preferred_memory_type_id, void* userp, + void** buffer, void** buffer_userp, + TRITONSERVER_MemoryType* actual_memory_type, + int64_t* actual_memory_type_id); + +// Callback functions for server inference. +TRITONSERVER_Error* ResponseRelease(TRITONSERVER_ResponseAllocator* allocator, + void* buffer, void* buffer_userp, + size_t byte_size, + TRITONSERVER_MemoryType memory_type, + int64_t memory_type_id); +void InferRequestComplete(TRITONSERVER_InferenceRequest* request, + const uint32_t flags, void* userp); +void InferResponseComplete(TRITONSERVER_InferenceResponse* response, + const uint32_t flags, void* userp); + +// +// ModelExecutor +// +// Execute inference request on a model. +// +class ModelExecutor { + public: + ModelExecutor(TRITONSERVER_Server* server); + + // Performs async inference request. + TRITONSERVER_Error* AsyncExecute( + TRITONSERVER_InferenceRequest* irequest, + std::future* future); + + private: + // The server object that encapsulates all the functionality of the Triton + // server and allows access to the Triton server API. + TRITONSERVER_Server* server_; + + // The allocator object that will be used for allocating output tensors. + TRITONSERVER_ResponseAllocator* allocator_; +}; + +} // namespace scorer +} // namespace backend +} // namespace triton diff --git a/triton/transducer-scorer-backend/src/libtriton_scorer.ldscript b/triton/transducer-scorer-backend/src/libtriton_scorer.ldscript new file mode 100644 index 000000000..5e12be0b4 --- /dev/null +++ b/triton/transducer-scorer-backend/src/libtriton_scorer.ldscript @@ -0,0 +1,30 @@ +#Copyright 2021, NVIDIA CORPORATION &AFFILIATES.All rights reserved. +# +#Redistribution and use in source and binary forms, with or without +#modification, are permitted provided that the following conditions +#are met: +#* Redistributions of source code must retain the above copyright +#notice, this list of conditions and the following disclaimer. +#* Redistributions in binary form must reproduce the above copyright +#notice, this list of conditions and the following disclaimer in the +#documentation and / or other materials provided with the distribution. +#* Neither the name of NVIDIA CORPORATION nor the names of its +#contributors may be used to endorse or promote products derived +#from this software without specific prior written permission. 
+# +#THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +#EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +#IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +#PURPOSE ARE DISCLAIMED.IN NO EVENT SHALL THE COPYRIGHT OWNER OR +#CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +#EXEMPLARY, OR CONSEQUENTIAL DAMAGES(INCLUDING, BUT NOT LIMITED TO, +#PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +#PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +#OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +#(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +#OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +{ + global: + TRITONBACKEND_*; + local: *; +}; diff --git a/triton/transducer-scorer-backend/src/scorer.cc b/triton/transducer-scorer-backend/src/scorer.cc new file mode 100644 index 000000000..9f0859ba1 --- /dev/null +++ b/triton/transducer-scorer-backend/src/scorer.cc @@ -0,0 +1,825 @@ +// Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + +#include "bls.h" +#include "scorer_utils.h" +#include "symbol-table.h" +#include "torch/all.h" +#include "torch/script.h" +#include "triton/backend/backend_common.h" +#include "triton/backend/backend_input_collector.h" +#include "triton/backend/backend_model.h" +#include "triton/backend/backend_model_instance.h" +#include "triton/backend/backend_output_responder.h" +#include "triton/common/nvtx.h" +#include "triton/core/tritonbackend.h" + +namespace triton { +namespace backend { +namespace scorer { + +struct ModelParams { + std::string decoding_method; + std::string tokenizer_file; + int context_size; +}; + +///////////// + +// +// ModelState +// +// State associated with a model that is using this backend. An object +// of this class is created and associated with each +// TRITONBACKEND_Model. ModelState is derived from BackendModel class +// provided in the backend utilities that provides many common +// functions. 
+// +class ModelState : public BackendModel { + public: + static TRITONSERVER_Error* Create(TRITONBACKEND_Model* triton_model, + ModelState** state); + virtual ~ModelState() = default; + + // Validate and parse the model configuration + TRITONSERVER_Error* ValidateModelConfig(); + + // Obtain the parameters parsed from the model configuration + const ModelParams* Parameters() { return &model_params_; } + const sherpa::SymbolTable* getSymbolTable() { return &symbol_table_; } + + private: + ModelState(TRITONBACKEND_Model* triton_model); + ModelParams model_params_; + sherpa::SymbolTable symbol_table_; +}; + +ModelState::ModelState(TRITONBACKEND_Model* triton_model) + : BackendModel(triton_model) { + THROW_IF_BACKEND_MODEL_ERROR(ValidateModelConfig()); + symbol_table_ = sherpa::SymbolTable(model_params_.tokenizer_file); +} + +TRITONSERVER_Error* ModelState::Create(TRITONBACKEND_Model* triton_model, + ModelState** state) { + try { + *state = new ModelState(triton_model); + } catch (const BackendModelException& ex) { + RETURN_ERROR_IF_TRUE( + ex.err_ == nullptr, TRITONSERVER_ERROR_INTERNAL, + std::string("unexpected nullptr in BackendModelException")); + RETURN_IF_ERROR(ex.err_); + } + + return nullptr; // success +} + +TRITONSERVER_Error* ModelState::ValidateModelConfig() { + if (TRITONSERVER_LogIsEnabled(TRITONSERVER_LOG_VERBOSE)) { + common::TritonJson::WriteBuffer buffer; + RETURN_IF_ERROR(ModelConfig().PrettyWrite(&buffer)); + LOG_MESSAGE( + TRITONSERVER_LOG_VERBOSE, + (std::string("model configuration:\n") + buffer.Contents()).c_str()); + } + + // ModelConfig is the model configuration as a TritonJson + // object. Use the TritonJson utilities to parse the JSON and + // determine if the configuration is supported by this backend. + common::TritonJson::Value inputs, outputs; + RETURN_IF_ERROR(ModelConfig().MemberAsArray("input", &inputs)); + RETURN_IF_ERROR(ModelConfig().MemberAsArray("output", &outputs)); + + // The model must have exactly 1 input and 1 output. + RETURN_ERROR_IF_FALSE(inputs.ArraySize() == 2, TRITONSERVER_ERROR_INVALID_ARG, + std::string("model configuration must have 2 input")); + RETURN_ERROR_IF_FALSE(outputs.ArraySize() == 1, + TRITONSERVER_ERROR_INVALID_ARG, + std::string("model configuration must have 1 output")); + + // Validate and set parameters + common::TritonJson::Value params; + RETURN_ERROR_IF_FALSE( + (ModelConfig().Find("parameters", ¶ms)), + TRITONSERVER_ERROR_INVALID_ARG, + std::string("missing parameters in the model configuration")); + RETURN_IF_ERROR( + ReadParameter(params, "context_size", &(model_params_.context_size))); + RETURN_IF_ERROR( + ReadParameter(params, "tokenizer_file", &(model_params_.tokenizer_file))); + RETURN_IF_ERROR(ReadParameter(params, "decoding_method", + &(model_params_.decoding_method))); + return nullptr; // success +} + +///////////// +// +// ModelInstanceState +// +// State associated with a model instance. An object of this class is +// created and associated with each +// TRITONBACKEND_ModelInstance. ModelInstanceState is derived from +// BackendModelInstance class provided in the backend utilities that +// provides many common functions. +// +class ModelInstanceState : public BackendModelInstance { + public: + static TRITONSERVER_Error* Create( + ModelState* model_state, + TRITONBACKEND_ModelInstance* triton_model_instance, + ModelInstanceState** state); + virtual ~ModelInstanceState() = default; + + // Get the state of the model that corresponds to this instance. 
+ ModelState* StateForModel() const { return model_state_; } + void ProcessRequests(TRITONBACKEND_Request** requests, + const uint32_t request_count); + + private: + ModelInstanceState(ModelState* model_state, + TRITONBACKEND_ModelInstance* triton_model_instance) + : BackendModelInstance(model_state, triton_model_instance), + model_state_(model_state), + bls_executor_(model_state->TritonServer()), + device_(torch::kCPU) { +#ifdef TRITON_ENABLE_GPU + device_ = torch::Device(torch::kCUDA, DeviceId()); + // Need to set the CUDA context so that the context that events are + // created on match with contexts that events are recorded with. + THROW_IF_BACKEND_INSTANCE_ERROR(ConvertCUDAStatusToTritonError( + cudaSetDevice(DeviceId()), TRITONSERVER_ERROR_INTERNAL, + "Failed to set the device")); + THROW_IF_BACKEND_INSTANCE_ERROR(ConvertCUDAStatusToTritonError( + cudaEventCreate(&compute_input_start_event_), + TRITONSERVER_ERROR_INTERNAL, "Failed to create cuda event")); + THROW_IF_BACKEND_INSTANCE_ERROR(ConvertCUDAStatusToTritonError( + cudaEventCreate(&compute_infer_start_event_), + TRITONSERVER_ERROR_INTERNAL, "Failed to create cuda event")); + THROW_IF_BACKEND_INSTANCE_ERROR(ConvertCUDAStatusToTritonError( + cudaEventCreate(&compute_output_start_event_), + TRITONSERVER_ERROR_INTERNAL, "Failed to create cuda event")); +#endif + // TODO: FIX this hard code + input_index_map_["encoder_out"] = 0; + input_index_map_["encoder_out_lens"] = 1; + } + + TRITONSERVER_Error* SetInputTensors( + size_t total_batch_size, TRITONBACKEND_Request** requests, + const uint32_t request_count, + std::vector* responses, + BackendInputCollector* collector, std::vector* input_names, + std::vector* input_tensors, + std::vector* input_memories, bool* cuda_copy); + void SetOutputBuffer(const std::string& out_bytes, + TRITONBACKEND_Response* response, + TRITONBACKEND_Output* response_output); + TRITONSERVER_Error* RecordBackendTimestamp(uint64_t* timestamp, + void* cuda_event); + std::vector> Search( + std::vector* input_tensors); + + ModelState* model_state_; + BLSExecutor bls_executor_; + torch::Device device_; + std::unordered_map input_index_map_; + + cudaEvent_t compute_input_start_event_; + cudaEvent_t compute_infer_start_event_; + cudaEvent_t compute_output_start_event_; +}; + +TRITONSERVER_Error* ModelInstanceState::Create( + ModelState* model_state, TRITONBACKEND_ModelInstance* triton_model_instance, + ModelInstanceState** state) { + try { + *state = new ModelInstanceState(model_state, triton_model_instance); + } catch (const BackendModelInstanceException& ex) { + RETURN_ERROR_IF_TRUE( + ex.err_ == nullptr, TRITONSERVER_ERROR_INTERNAL, + std::string("unexpected nullptr in BackendModelInstanceException")); + RETURN_IF_ERROR(ex.err_); + } + + return nullptr; // success +} + +TRITONSERVER_Error* ModelInstanceState::RecordBackendTimestamp( + uint64_t* timestamp, void* cuda_event) { + if (Kind() == TRITONSERVER_INSTANCEGROUPKIND_GPU) { +#ifdef TRITON_ENABLE_GPU + cudaEvent_t* lcuda_event = reinterpret_cast(cuda_event); + RETURN_IF_ERROR(ConvertCUDAStatusToTritonError( + cudaEventRecord(*lcuda_event, stream_), TRITONSERVER_ERROR_INTERNAL, + "Failed to record the event.")); +#endif + } else { + SET_TIMESTAMP(*timestamp); + } + return nullptr; +} + +void ModelInstanceState::ProcessRequests(TRITONBACKEND_Request** requests, + const uint32_t request_count) { + const int max_batch_size = model_state_->MaxBatchSize(); + + NVTX_RANGE(nvtx_, "ProcessRequests " + Name()); + + uint64_t exec_start_ns = 0; + 
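+  // Four timestamps are collected per execution (exec start, compute start,
+  // compute end, exec end) and passed to
+  // TRITONBACKEND_ModelInstanceReportStatistics below so Triton can report
+  // queue and compute latencies for this backend. On GPU instances the
+  // compute boundaries are derived from CUDA events rather than host clocks.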
SET_TIMESTAMP(exec_start_ns); + + // For each request collect the total batch size for this inference + // execution. The batch-size, number of inputs, and size of each + // input has already been checked so don't need to do that here. + size_t total_batch_size = 0; + for (size_t i = 0; i < request_count; i++) { + // If we get a nullptr request then something is badly wrong. Fail + // and release all requests. + if (requests[i] == nullptr) { + RequestsRespondWithError( + requests, request_count, + TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + std::string("null request given to PyTorch backend for '" + + Name() + "'") + .c_str())); + return; + } + } + + // At this point we are committed to running inference with all + // 'requests'. Create a response for each request. During input + // processing if there is an error with any request that error will + // be sent immediately with the corresponding response (and the + // response unique_ptr will then be nullptr). The request object + // itself will not be released until after all inferencing is done + // (below) as we may need to access the request object when + // determine how to process outputs (for example, even if we don't + // need the outputs for a request that has an error, we do need to + // know the size of those outputs associated with the request so we + // can skip them in the output tensors). + std::vector responses; + responses.reserve(request_count); + bool all_response_failed = false; + + for (size_t i = 0; i < request_count; i++) { + TRITONBACKEND_Response* response; + auto err = TRITONBACKEND_ResponseNew(&response, requests[i]); + if (err == nullptr) { + responses.emplace_back(response); + } else { + responses.emplace_back(nullptr); + LOG_MESSAGE(TRITONSERVER_LOG_ERROR, "Fail to create response"); + TRITONSERVER_ErrorDelete(err); + } + } + + for (size_t i = 0; i < request_count; i++) { + if (max_batch_size > 0) { + // Retrieve the batch size from one of the inputs, if the model + // supports batching, the first dimension size is batch size. + TRITONBACKEND_Input* input; + TRITONSERVER_Error* err = + TRITONBACKEND_RequestInputByIndex(requests[i], 0 /* index */, &input); + if (err == nullptr) { + const int64_t* shape; + err = TRITONBACKEND_InputProperties(input, nullptr, nullptr, &shape, + nullptr, nullptr, nullptr); + total_batch_size += shape[0]; + } + if (err != nullptr) { + RESPOND_ALL_AND_SET_TRUE_IF_ERROR(responses, request_count, + all_response_failed, err); + } + } else { + total_batch_size += 1; + } + } + + // If there are no valid payloads then no need to run the inference. + if (total_batch_size == 0) { + return; + } + + // Make sure the maximum batch size is not exceeded. The + // total_batch_size must be 1 for models that don't support batching + // (i.e. max_batch_size == 0). If max_batch_size is exceeded then + // scheduler has done something badly wrong so fail and release all + // requests. 
+ if (!all_response_failed) { + if ((total_batch_size != 1) && + (total_batch_size > (size_t)max_batch_size)) { + RESPOND_ALL_AND_SET_TRUE_IF_ERROR( + responses, request_count, all_response_failed, + TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + std::string("batch size " + std::to_string(total_batch_size) + + " for '" + Name() + "', max allowed is " + + std::to_string(max_batch_size)) + .c_str())); + } + } + + std::vector input_names; + std::vector input_tensors; + std::vector input_memories; + bool cuda_copy = false; + std::unique_ptr collector; + if (Kind() == TRITONSERVER_INSTANCEGROUPKIND_GPU) { +#ifdef TRITON_ENABLE_GPU + RESPOND_ALL_AND_SET_TRUE_IF_ERROR( + responses, request_count, all_response_failed, + ConvertCUDAStatusToTritonError( + cudaEventRecord(compute_input_start_event_, stream_), + TRITONSERVER_ERROR_INTERNAL, "Failed to record the event.")); +#endif + } + + if (!all_response_failed) { + collector.reset(new BackendInputCollector( + requests, request_count, &responses, + model_state_->TritonMemoryManager(), model_state_->EnablePinnedInput(), + CudaStream(), nullptr, nullptr, 0, HostPolicyName().c_str())); + + RESPOND_ALL_AND_SET_TRUE_IF_ERROR( + responses, request_count, all_response_failed, + SetInputTensors(total_batch_size, requests, request_count, &responses, + collector.get(), &input_names, &input_tensors, + &input_memories, &cuda_copy)); + } + +#ifdef TRITON_ENABLE_GPU + if (cuda_copy) { + cudaStreamSynchronize(stream_); + cuda_copy = false; + } +#endif + + uint64_t compute_start_ns = 0; + RESPOND_ALL_AND_SET_TRUE_IF_ERROR( + responses, request_count, all_response_failed, + RecordBackendTimestamp( + &compute_start_ns, + reinterpret_cast(&compute_infer_start_event_))); + + // Run... + std::vector> ans; + + if (!all_response_failed) { + ans = Search(&input_tensors); + } + + std::vector ans_str; + const sherpa::SymbolTable* symbol_table = model_state_->getSymbolTable(); + + for (auto& utt : ans) { + ans_str.push_back(Convert(utt, symbol_table)); + } + + uint64_t compute_end_ns = 0; + RESPOND_ALL_AND_SET_TRUE_IF_ERROR( + responses, request_count, all_response_failed, + RecordBackendTimestamp( + &compute_end_ns, + reinterpret_cast(&compute_output_start_event_))); + + std::vector output_shape{1, 1}; + int dims_count = 2; + int i = 0; + for (auto& response : responses) { + if (response != nullptr) { + TRITONBACKEND_Output* response_output; + RESPOND_AND_SET_NULL_IF_ERROR( + &response, + TRITONBACKEND_ResponseOutput(response, &response_output, "OUTPUT0", + TRITONSERVER_TYPE_BYTES, + &output_shape[0], dims_count)); + SetOutputBuffer(ans_str[i], response, response_output); + + LOG_IF_ERROR(TRITONBACKEND_ResponseSend( + response, TRITONSERVER_RESPONSE_COMPLETE_FINAL, nullptr), + "failed to send BLS backend response"); + } + i++; + } + + uint64_t exec_end_ns = 0; + SET_TIMESTAMP(exec_end_ns); + +#ifdef TRITON_ENABLE_GPU + // We have to always synchronize the stream. This is to make sure that + // the events on the cuda stream are synchronized. Otherwise, the events + // are only guaranteed to be synchronized if the model provides the output + // on GPU. 
+ cudaStreamSynchronize(stream_); +#endif + + if (Kind() == TRITONSERVER_INSTANCEGROUPKIND_GPU) { +#ifdef TRITON_ENABLE_GPU + // [FIXME] in the case of cudaEventElapsedTime failure, should handle + // stats reporting more gracefully as the durations are inaccurate + float compute_input_duration = 0; + float compute_infer_duration = 0; + LOG_IF_ERROR( + ConvertCUDAStatusToTritonError( + cudaEventElapsedTime(&compute_input_duration, + compute_input_start_event_, + compute_infer_start_event_), + TRITONSERVER_ERROR_INTERNAL, "Failed to capture elapsed time"), + "Failed to capture elapsed time"); + + LOG_IF_ERROR( + ConvertCUDAStatusToTritonError( + cudaEventElapsedTime(&compute_infer_duration, + compute_infer_start_event_, + compute_output_start_event_), + TRITONSERVER_ERROR_INTERNAL, "Failed to capture elapsed time"), + "Failed to capture elapsed time"); + + compute_start_ns = exec_start_ns + (compute_input_duration * 1e6); + compute_end_ns = compute_start_ns + (compute_infer_duration * 1e6); +#endif + } + + // Report statistics for each request. + for (uint32_t r = 0; r < request_count; ++r) { + auto& request = requests[r]; + LOG_IF_ERROR(TRITONBACKEND_ModelInstanceReportStatistics( + TritonModelInstance(), request, + (responses[r] != nullptr) /* success */, exec_start_ns, + compute_start_ns, compute_end_ns, exec_end_ns), + "failed reporting request statistics"); + + LOG_IF_ERROR( + TRITONBACKEND_RequestRelease(request, TRITONSERVER_REQUEST_RELEASE_ALL), + "failed releasing request"); + } + + // Report the entire batch statistics. + LOG_IF_ERROR(TRITONBACKEND_ModelInstanceReportBatchStatistics( + TritonModelInstance(), total_batch_size, exec_start_ns, + compute_start_ns, compute_end_ns, exec_end_ns), + "failed reporting batch request statistics"); + + LOG_MESSAGE(TRITONSERVER_LOG_VERBOSE, + (std::string("TRITONBACKEND_ModelExecute: model ") + Name() + + " released " + std::to_string(request_count) + " requests") + .c_str()); +} + +void ModelInstanceState::SetOutputBuffer( + const std::string& out_bytes, TRITONBACKEND_Response* response, + TRITONBACKEND_Output* response_output) { + TRITONSERVER_MemoryType actual_memory_type = TRITONSERVER_MEMORY_CPU; + int64_t actual_memory_type_id = 0; + uint32_t byte_size_with_size_int = out_bytes.size() + sizeof(int32_t); + void* obuffer; + auto err = TRITONBACKEND_OutputBuffer( + response_output, &obuffer, byte_size_with_size_int, &actual_memory_type, + &actual_memory_type_id); + if (err != nullptr) { + RESPOND_AND_SET_NULL_IF_ERROR(&response, err); + } + + int32_t* buffer_as_int = reinterpret_cast(obuffer); + buffer_as_int[0] = out_bytes.size(); + memcpy(&buffer_as_int[1], out_bytes.data(), out_bytes.size()); +} + +///////////// +std::vector> ModelInstanceState::Search( + std::vector* input_tensors) { + NVTX_RANGE(nvtx_, "greedy search " + Name()); + torch::Tensor encoder_out, encoder_out_length; + encoder_out = (*input_tensors)[0].toTensor(); + encoder_out_length = (*input_tensors)[1].toTensor(); + encoder_out_length = encoder_out_length.to(torch::kCPU); + + TORCH_CHECK(encoder_out.dim() == 3, "encoder_out.dim() is ", + encoder_out.dim(), "Expected value is 3"); + // Only support fp16 for now + TORCH_CHECK(encoder_out.scalar_type() == torch::kHalf, + "encoder_out.scalar_type() is ", encoder_out.scalar_type()); + + TORCH_CHECK(encoder_out_length.dim() == 1, "encoder_out_length.dim() is", + encoder_out_length.dim()); + TORCH_CHECK(encoder_out_length.scalar_type() == torch::kLong, + "encoder_out_length.scalar_type() is ", + 
encoder_out_length.scalar_type()); + + TORCH_CHECK(encoder_out_length.device().is_cpu()); + + torch::nn::utils::rnn::PackedSequence packed_seq = + torch::nn::utils::rnn::pack_padded_sequence(encoder_out, + encoder_out_length, + /*batch_first*/ true, + /*enforce_sorted*/ false); + + int32_t blank_id = 0; // hard-code for now , TOOD: yuekai + int32_t context_size = model_state_->Parameters()->context_size; + + int32_t N = encoder_out_length.size(0); + + std::vector padding(context_size, blank_id); + std::vector> results(N, padding); + + auto decoder_input = + torch::full({N, context_size}, blank_id, + torch::dtype(torch::kLong) + .memory_format(torch::MemoryFormat::Contiguous)); + + std::string decoder_name = "decoder"; + std::vector decoder_input_name{"y"}; + std::vector decoder_output_name{"decoder_out"}; + std::vector decoder_input_tensors{decoder_input.to(device_)}; + + auto decoder_out = + bls_executor_.Execute(decoder_input_tensors, decoder_input_name, + decoder_output_name, decoder_name); + + std::string joiner_name = "joiner"; + std::vector joiner_input_name{"encoder_out", "decoder_out"}; + std::vector joiner_output_name{"logit"}; + + using torch::indexing::Slice; + auto batch_sizes_accessor = packed_seq.batch_sizes().accessor(); + + int32_t max_T = packed_seq.batch_sizes().numel(); + + int32_t offset = 0; + for (int32_t t = 0; t != max_T; ++t) { + int32_t cur_batch_size = batch_sizes_accessor[t]; + int32_t start = offset; + int32_t end = start + cur_batch_size; + auto cur_encoder_out = packed_seq.data().index({Slice(start, end)}); + // Now cur_encoder_out is of shape (cur_batch_size, joiner_dim) + offset = end; + + if (cur_batch_size < decoder_out.size(0)) { + decoder_out = decoder_out.index({Slice(0, cur_batch_size)}); + } + std::vector joiner_input_tensors{ + cur_encoder_out, decoder_out.squeeze(1).to(device_)}; + + auto logits = bls_executor_.Execute(joiner_input_tensors, joiner_input_name, + joiner_output_name, joiner_name); + + // logits' shape is (cur_batch_size, vocab_size) + // logits is the output of nn.Linear. 
Since we are using greedy search + // and only the magnitude matters, we don't invoke log_softmax here + auto max_indices = logits.argmax(/*dim*/ -1).cpu(); + auto max_indices_accessor = max_indices.accessor(); + bool emitted = false; + for (int32_t k = 0; k != cur_batch_size; ++k) { + auto index = max_indices_accessor[k]; + if (index != blank_id) { + emitted = true; + results[k].push_back(index); + // TODO: add timestamps here + // results[k].tokens.push_back(index); + // results[k].timestamps.push_back(t); + } + } + + if (emitted) { + BuildDecoderInput(results, &decoder_input); + std::vector decoder_input_tensors{ + decoder_input.to(device_)}; + decoder_out = + bls_executor_.Execute(decoder_input_tensors, decoder_input_name, + decoder_output_name, decoder_name); + } + } // for (int32_t t = 0; t != max_T; ++t) { + + auto unsorted_indices = packed_seq.unsorted_indices().cpu(); + auto unsorted_indices_accessor = unsorted_indices.accessor(); + + // std::vector ans(N); + std::vector> ans(N); + + for (int32_t i = 0; i != N; ++i) { + int32_t k = unsorted_indices_accessor[i]; + torch::ArrayRef arr(results[k]); + // torch::ArrayRef arr(results[k].tokens); + ans[i] = arr.slice(context_size).vec(); + // ans[i].tokens = arr.slice(context_size).vec(); + // ans[i].timestamps = std::move(results[k].timestamps); + } + + return ans; +} + +TRITONSERVER_Error* ModelInstanceState::SetInputTensors( + size_t total_batch_size, TRITONBACKEND_Request** requests, + const uint32_t request_count, + std::vector* responses, + BackendInputCollector* collector, std::vector* input_names, + std::vector* input_tensors, + std::vector* input_memories, bool* cuda_copy) { + // All requests must have equally-sized input tensors so use any + // request as the representative for the input tensors. + + uint32_t input_count; + + RETURN_IF_ERROR(TRITONBACKEND_RequestInputCount(requests[0], &input_count)); + input_tensors->resize(input_count); + + for (uint32_t input_idx = 0; input_idx < input_count; input_idx++) { + TRITONBACKEND_Input* input; + RETURN_IF_ERROR( + TRITONBACKEND_RequestInputByIndex(requests[0], input_idx, &input)); + + const char* input_name; + TRITONSERVER_DataType input_datatype; + const int64_t* input_shape; + uint32_t input_dims_count; + RETURN_IF_ERROR(TRITONBACKEND_InputProperties( + input, &input_name, &input_datatype, &input_shape, &input_dims_count, + nullptr, nullptr)); + + input_names->emplace_back(input_name); + + // The shape for the entire input patch, [total_batch_size, ...] + std::vector batchn_shape(input_shape, + input_shape + input_dims_count); + + batchn_shape[0] = total_batch_size; + + // The input must be in contiguous CPU/GPU memory. + std::vector> alloc_perference; + if (device_.is_cpu()) { + alloc_perference = {{TRITONSERVER_MEMORY_CPU_PINNED, 0}, + {TRITONSERVER_MEMORY_CPU, 0}}; + LOG_MESSAGE(TRITONSERVER_LOG_INFO, + (std::string("device is cpu")).c_str()); + } else { + alloc_perference = {{TRITONSERVER_MEMORY_GPU, device_.index()}}; + } + + const char* input_buffer; + size_t batchn_byte_size; + TRITONSERVER_MemoryType memory_type; + int64_t memory_type_id; + + RETURN_IF_ERROR(collector->ProcessTensor( + input_name, nullptr, 0, alloc_perference, &input_buffer, + &batchn_byte_size, &memory_type, &memory_type_id)); + + // Create Torch tensor + const auto torch_dtype = ConvertDataTypeToTorchType(input_datatype); + torch::TensorOptions options{torch_dtype.second}; + auto updated_options = (memory_type == TRITONSERVER_MEMORY_GPU) + ? 
options.device(torch::kCUDA, device_.index()) + : options.device(torch::kCPU); + + torch::Tensor input_tensor = torch::from_blob( + const_cast(input_buffer), batchn_shape, updated_options); + (*input_tensors)[input_index_map_[input_name]] = input_tensor; + } + + // Finalize... + *cuda_copy |= collector->Finalize(); + + return nullptr; +} + +extern "C" { + +// Triton calls TRITONBACKEND_ModelInitialize when a model is loaded +// to allow the backend to create any state associated with the model, +// and to also examine the model configuration to determine if the +// configuration is suitable for the backend. Any errors reported by +// this function will prevent the model from loading. +// +TRITONSERVER_Error* TRITONBACKEND_ModelInitialize(TRITONBACKEND_Model* model) { + const char* cname; + RETURN_IF_ERROR(TRITONBACKEND_ModelName(model, &cname)); + std::string name(cname); + + uint64_t version; + RETURN_IF_ERROR(TRITONBACKEND_ModelVersion(model, &version)); + + LOG_MESSAGE(TRITONSERVER_LOG_INFO, + (std::string("TRITONBACKEND_ModelInitialize: ") + name + + " (version " + std::to_string(version) + ")") + .c_str()); + // Create a ModelState object and associate it with the + // TRITONBACKEND_Model. If anything goes wrong with initialization + // of the model state then an error is returned and Triton will fail + // to load the model. + ModelState* model_state; + RETURN_IF_ERROR(ModelState::Create(model, &model_state)); + RETURN_IF_ERROR( + TRITONBACKEND_ModelSetState(model, reinterpret_cast(model_state))); + + return nullptr; // success +} + +// Triton calls TRITONBACKEND_ModelFinalize when a model is no longer +// needed. The backend should cleanup any state associated with the +// model. This function will not be called until all model instances +// of the model have been finalized. +// +TRITONSERVER_Error* TRITONBACKEND_ModelFinalize(TRITONBACKEND_Model* model) { + void* vstate; + RETURN_IF_ERROR(TRITONBACKEND_ModelState(model, &vstate)); + ModelState* model_state = reinterpret_cast(vstate); + delete model_state; + + return nullptr; // success +} + +} // extern "C" + +extern "C" { + +// Triton calls TRITONBACKEND_ModelInstanceInitialize when a model +// instance is created to allow the backend to initialize any state +// associated with the instance. +// +TRITONSERVER_Error* TRITONBACKEND_ModelInstanceInitialize( + TRITONBACKEND_ModelInstance* instance) { + const char* cname; + RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceName(instance, &cname)); + std::string name(cname); + + // Get the model state associated with this instance's model. + TRITONBACKEND_Model* model; + RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceModel(instance, &model)); + + int32_t device_id; + RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceDeviceId(instance, &device_id)); + + TRITONSERVER_InstanceGroupKind kind; + RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceKind(instance, &kind)); + + LOG_MESSAGE(TRITONSERVER_LOG_INFO, + (std::string("TRITONBACKEND_ModelInstanceInitialize: ") + name + + " (" + TRITONSERVER_InstanceGroupKindString(kind) + " device " + + std::to_string(device_id) + ")") + .c_str()); + + void* vmodelstate; + RETURN_IF_ERROR(TRITONBACKEND_ModelState(model, &vmodelstate)); + ModelState* model_state = reinterpret_cast(vmodelstate); + + // Create a ModelInstanceState object and associate it with the + // TRITONBACKEND_ModelInstance. 
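+  // If the constructor throws (for example, when CUDA event creation fails),
+  // ModelInstanceState::Create() converts the exception into a
+  // TRITONSERVER_Error, so instance initialization fails cleanly.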
+ ModelInstanceState* instance_state; + RETURN_IF_ERROR( + ModelInstanceState::Create(model_state, instance, &instance_state)); + RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceSetState( + instance, reinterpret_cast(instance_state))); + + return nullptr; // success +} + +// Triton calls TRITONBACKEND_ModelInstanceFinalize when a model +// instance is no longer needed. The backend should cleanup any state +// associated with the model instance. +// +TRITONSERVER_Error* TRITONBACKEND_ModelInstanceFinalize( + TRITONBACKEND_ModelInstance* instance) { + void* vstate; + RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceState(instance, &vstate)); + ModelInstanceState* instance_state = + reinterpret_cast(vstate); + delete instance_state; + + return nullptr; // success +} + +} // extern "C" + +///////////// + +extern "C" { + +// When Triton calls TRITONBACKEND_ModelInstanceExecute it is required +// that a backend create a response for each request in the batch. A +// response may be the output tensors required for that request or may +// be an error that is returned in the response. +// +TRITONSERVER_Error* TRITONBACKEND_ModelInstanceExecute( + TRITONBACKEND_ModelInstance* instance, TRITONBACKEND_Request** requests, + const uint32_t request_count) { + // Triton will not call this function simultaneously for the same + // 'instance'. But since this backend could be used by multiple + // instances from multiple models the implementation needs to handle + // multiple calls to this function at the same time (with different + // 'instance' objects). Best practice for a high-performance + // implementation is to avoid introducing mutex/lock and instead use + // only function-local and model-instance-specific state. + ModelInstanceState* instance_state; + RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceState( + instance, reinterpret_cast(&instance_state))); + + instance_state->ProcessRequests(requests, request_count); + + return nullptr; // success +} + +} // extern "C" + +} // namespace scorer +} // namespace backend +} // namespace triton diff --git a/triton/transducer-scorer-backend/src/scorer_utils.h b/triton/transducer-scorer-backend/src/scorer_utils.h new file mode 100644 index 000000000..f0cf3f4d0 --- /dev/null +++ b/triton/transducer-scorer-backend/src/scorer_utils.h @@ -0,0 +1,121 @@ + +#pragma once + +#include + +#include "symbol-table.h" +#include "torch/script.h" + +using triton::common::TritonJson; + +namespace triton { +namespace backend { +namespace scorer { + +static std::string Convert(const std::vector& src, + const sherpa::SymbolTable* sym_table) { + std::string text; + for (auto i : src) { + auto sym = (*sym_table)[i]; + text.append(sym); + } + return text; +} + +static void BuildDecoderInput(const std::vector>& r, + torch::Tensor* decoder_input) { + int32_t batch_size = decoder_input->size(0); + int32_t context_size = decoder_input->size(1); + int64_t* p = decoder_input->data_ptr(); + for (int32_t i = 0; i != batch_size; ++i) { + auto start = r[i].end() - context_size; + auto end = r[i].end(); + std::copy(start, end, p); + p += context_size; + } +} + +std::pair ConvertDataTypeToTorchType( + const TRITONSERVER_DataType dtype) { + torch::ScalarType type = torch::kInt; + switch (dtype) { + case TRITONSERVER_TYPE_BOOL: + type = torch::kBool; + break; + case TRITONSERVER_TYPE_UINT8: + type = torch::kByte; + break; + case TRITONSERVER_TYPE_INT8: + type = torch::kChar; + break; + case TRITONSERVER_TYPE_INT16: + type = torch::kShort; + break; + case TRITONSERVER_TYPE_INT32: + type = torch::kInt; + break; + 
case TRITONSERVER_TYPE_INT64: + type = torch::kLong; + break; + case TRITONSERVER_TYPE_FP16: + type = torch::kHalf; + break; + case TRITONSERVER_TYPE_FP32: + type = torch::kFloat; + break; + case TRITONSERVER_TYPE_FP64: + type = torch::kDouble; + break; + case TRITONSERVER_TYPE_UINT16: + case TRITONSERVER_TYPE_UINT32: + case TRITONSERVER_TYPE_UINT64: + case TRITONSERVER_TYPE_BYTES: + default: + return std::make_pair(false, type); + } + + return std::make_pair(true, type); +} + +TRITONSERVER_Error* ReadParameter(TritonJson::Value& params, + const std::string& key, std::string* param) { + TritonJson::Value value; + RETURN_ERROR_IF_FALSE( + params.Find(key.c_str(), &value), TRITONSERVER_ERROR_INVALID_ARG, + std::string("model configuration is missing the parameter ") + key); + RETURN_IF_ERROR(value.MemberAsString("string_value", param)); + return nullptr; // success +} + +TRITONSERVER_Error* ReadParameter(TritonJson::Value& params, + const std::string& key, int* param) { + std::string tmp; + RETURN_IF_ERROR(ReadParameter(params, key, &tmp)); + *param = std::stoi(tmp); + return nullptr; // success +} + +TRITONSERVER_Error* ReadParameter(TritonJson::Value& params, + const std::string& key, float* param) { + std::string tmp; + RETURN_IF_ERROR(ReadParameter(params, key, &tmp)); + *param = std::stof(tmp); + return nullptr; // success +} + +#ifdef TRITON_ENABLE_GPU +TRITONSERVER_Error* ConvertCUDAStatusToTritonError(cudaError_t cuda_error, + TRITONSERVER_Error_Code code, + const char* msg) { + if (cuda_error != cudaSuccess) { + return TRITONSERVER_ErrorNew( + code, + (std::string(msg) + ": " + cudaGetErrorString(cuda_error)).c_str()); + } + return nullptr; // success +} +#endif + +} // namespace scorer +} // namespace backend +} // namespace triton \ No newline at end of file diff --git a/sherpa/csrc/symbol_table.cc b/triton/transducer-scorer-backend/src/symbol-table.cc similarity index 88% rename from sherpa/csrc/symbol_table.cc rename to triton/transducer-scorer-backend/src/symbol-table.cc index 822979986..9abb3649e 100644 --- a/sherpa/csrc/symbol_table.cc +++ b/triton/transducer-scorer-backend/src/symbol-table.cc @@ -16,13 +16,11 @@ * limitations under the License. */ -#include "sherpa/csrc/symbol_table.h" +#include "symbol-table.h" #include #include -#include "sherpa/csrc/log.h" - namespace sherpa { SymbolTable::SymbolTable(const std::string &filename) { @@ -39,14 +37,9 @@ SymbolTable::SymbolTable(const std::string &filename) { } } - SHERPA_CHECK(!sym.empty()); - SHERPA_CHECK_EQ(sym2id_.count(sym), 0) << "Duplicated symbol: " << sym; - SHERPA_CHECK_EQ(id2sym_.count(id), 0) << "Duplicated ID: " << id; - sym2id_.insert({sym, id}); id2sym_.insert({id, sym}); } - SHERPA_CHECK(is.eof()); } std::string SymbolTable::ToString() const { @@ -76,4 +69,12 @@ std::ostream &operator<<(std::ostream &os, const SymbolTable &symbol_table) { return os << symbol_table.ToString(); } +void SymbolTable::Replace(int32_t id, const std::string &new_sym, + const std::string &old_sym) { + sym2id_.erase(old_sym); + + id2sym_.at(id) = new_sym; + sym2id_[new_sym] = id; +} + } // namespace sherpa diff --git a/sherpa/csrc/symbol_table.h b/triton/transducer-scorer-backend/src/symbol-table.h similarity index 94% rename from sherpa/csrc/symbol_table.h rename to triton/transducer-scorer-backend/src/symbol-table.h index a03327f04..db6ab3293 100644 --- a/sherpa/csrc/symbol_table.h +++ b/triton/transducer-scorer-backend/src/symbol-table.h @@ -44,6 +44,10 @@ class SymbolTable { /// Return the ID corresponding to the given symbol. 
int32_t operator[](const std::string &sym) const; + // self[id] = sym + void Replace(int32_t id, const std::string &new_sym, + const std::string &old_sym); + /// Return true if there is a symbol with the given ID. bool contains(int32_t id) const; diff --git a/triton/whisper/Dockerfile.server b/triton/whisper/Dockerfile.server new file mode 100644 index 000000000..650e1039f --- /dev/null +++ b/triton/whisper/Dockerfile.server @@ -0,0 +1,10 @@ +FROM nvcr.io/nvidia/tritonserver:24.09-trtllm-python-py3 + +WORKDIR /workspace +COPY requirements.txt . +COPY prepare.sh . +COPY fill_template.py . +COPY convert_checkpoint.py . +COPY model_repo_whisper_trtllm model_repo_whisper_trtllm + +RUN python3 -m pip install -r requirements.txt \ No newline at end of file diff --git a/triton/whisper/README.md b/triton/whisper/README.md new file mode 100644 index 000000000..d646b9255 --- /dev/null +++ b/triton/whisper/README.md @@ -0,0 +1,150 @@ +## Triton Inference Serving Best Practice for Whisper TensorRT-LLM + +### Quick Start +Directly launch the service using docker compose. +```sh +# MODEL_IDs=("large-v3-turbo" "large-v3" "large-v2-turbo-multi-hans" "large-v2-multi-hans") +MODEL_ID=large-v3-turbo docker compose up +``` + +### Build Image +Build the docker image from scratch. +```sh +# build from scratch, cd to the parent dir of Dockerfile.server +docker build . -f Dockerfile.server -t soar97/triton-whisper:24.09 +``` + +### Create Docker Container +```sh +your_mount_dir=/mnt:/mnt +docker run -it --name "whisper-server" --gpus all --net host -v $your_mount_dir --shm-size=2g soar97/triton-whisper:24.09 +``` + +### Export Whisper Model to TensorRT-LLM +Inside docker container, we would follow the official guide of TensorRT-LLM to build whisper TensorRT-LLM engines. See [here](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/whisper). + +```sh +# We already have a clone of TensorRT-LLM inside container, so no need to clone it. +cd TensorRT-LLM/examples/whisper + +# take large-v3 model as an example +wget --directory-prefix=assets https://openaipublic.azureedge.net/main/whisper/models/e5b1a55b89c1367dacf97e3e19bfd829a01529dbfdeefa8caeb59b3f1b81dadb/large-v3.pt + +INFERENCE_PRECISION=float16 +MAX_BEAM_WIDTH=4 +MAX_BATCH_SIZE=8 +checkpoint_dir=tllm_checkpoint +output_dir=whisper_large_v3 + +# Convert the large-v3 openai model into trtllm compatible checkpoint. 
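+# convert_checkpoint.py reads assets/large-v3.pt by default (--model_path);
+# pass --model_path explicitly if your checkpoint lives elsewhere.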
+python3 convert_checkpoint.py \
+                --output_dir $checkpoint_dir
+
+# Build the large-v3 TensorRT-LLM engines
+trtllm-build  --checkpoint_dir ${checkpoint_dir}/encoder \
+              --output_dir ${output_dir}/encoder \
+              --moe_plugin disable \
+              --enable_xqa disable \
+              --max_batch_size ${MAX_BATCH_SIZE} \
+              --gemm_plugin disable \
+              --bert_attention_plugin ${INFERENCE_PRECISION} \
+              --max_input_len 3000 --max_seq_len=3000
+
+trtllm-build  --checkpoint_dir ${checkpoint_dir}/decoder \
+              --output_dir ${output_dir}/decoder \
+              --moe_plugin disable \
+              --enable_xqa disable \
+              --max_beam_width ${MAX_BEAM_WIDTH} \
+              --max_batch_size ${MAX_BATCH_SIZE} \
+              --max_seq_len 114 \
+              --max_input_len 14 \
+              --max_encoder_input_len 3000 \
+              --gemm_plugin ${INFERENCE_PRECISION} \
+              --bert_attention_plugin ${INFERENCE_PRECISION} \
+              --gpt_attention_plugin ${INFERENCE_PRECISION}
+
+# Prepare the Triton model repository
+cd sherpa/triton/whisper
+model_repo=model_repo_whisper
+rm -rf $model_repo
+cp model_repo_whisper_trtllm $model_repo -r
+wget --directory-prefix=$model_repo/infer_bls/1 https://raw.githubusercontent.com/openai/whisper/main/whisper/assets/multilingual.tiktoken
+wget --directory-prefix=$model_repo/whisper/1 https://raw.githubusercontent.com/openai/whisper/main/whisper/assets/mel_filters.npz
+
+# Point engine_dir at the engines built above (whisper_large_v3 in this example)
+output_dir=/workspace/TensorRT-LLM/examples/whisper/whisper_large_v3
+n_mels=128
+zero_pad=false
+
+TRITON_MAX_BATCH_SIZE=64
+MAX_QUEUE_DELAY_MICROSECONDS=100
+python3 fill_template.py -i $model_repo/whisper/config.pbtxt engine_dir:${output_dir},n_mels:$n_mels,zero_pad:$zero_pad,triton_max_batch_size:${TRITON_MAX_BATCH_SIZE},max_queue_delay_microseconds:${MAX_QUEUE_DELAY_MICROSECONDS}
+python3 fill_template.py -i $model_repo/infer_bls/config.pbtxt engine_dir:${output_dir},triton_max_batch_size:${TRITON_MAX_BATCH_SIZE},max_queue_delay_microseconds:${MAX_QUEUE_DELAY_MICROSECONDS}
+```
+### Using Fine-tuned Whisper
+Official Whisper models only accept 30-second audio inputs. To improve throughput, you can fine-tune the model to remove the 30-second restriction. See [examples](https://github.com/k2-fsa/icefall/blob/master/egs/aishell/ASR/whisper/whisper_encoder_forward_monkey_patch.py#L15).
+
+We have prepared TensorRT-LLM checkpoints for two [Chinese fine-tuned Whisper](https://github.com/k2-fsa/icefall/blob/master/egs/multi_zh-hans/ASR/RESULTS.md#multi-chinese-datasets-without-datatang-200h-finetuning-results-on-whisper-large-v2) models.
They could be directly used from [here.](https://huggingface.co/yuekai/whisper_multi_zh_tllm_checkpoint/tree/main) + +### Launch Server +Log of directory tree: +```sh +model_repo_whisper_trtllm +├── infer_bls +│   ├── 1 +│   │   ├── model.py +│   │   ├── multilingual.tiktoken +│   │   └── tokenizer.py +│   └── config.pbtxt +└── whisper + ├── 1 + │   ├── fbank.py + │   ├── mel_filters.npz + │   └── model.py + └── config.pbtxt + +4 directories, 8 files +``` +```sh +# launch the server +tritonserver --model-repository=$model_repo/ +``` + + + +### Benchmark using Dataset +```sh +git clone https://github.com/yuekaizhang/Triton-ASR-Client.git +cd Triton-ASR-Client +num_task=16 +dataset=aishell1_test +python3 client.py \ + --server-addr localhost \ + --model-name infer_bls \ + --num-tasks $num_task \ + --text-prompt "<|startoftranscript|><|zh|><|transcribe|><|notimestamps|>" \ + --manifest-dir ./datasets/$dataset \ + --log-dir ./log_sherpa_multi_hans_whisper_large_ifb_$num_task \ + --compute-cer +``` + + \ No newline at end of file diff --git a/triton/whisper/convert_checkpoint.py b/triton/whisper/convert_checkpoint.py new file mode 100644 index 000000000..201da945c --- /dev/null +++ b/triton/whisper/convert_checkpoint.py @@ -0,0 +1,426 @@ +# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import argparse +import json +import os +import time + +import numpy as np +import torch +from safetensors.torch import save_file + +import tensorrt_llm +from tensorrt_llm.functional import LayerNormPositionType, LayerNormType +from tensorrt_llm.models.convert_utils import weight_only_quantize_dict +from tensorrt_llm.quantization import QuantAlgo + + +def parse_arguments(): + parser = argparse.ArgumentParser() + parser.add_argument('--model_path', type=str, default="assets/large-v3.pt") + parser.add_argument('--quant_ckpt_path', type=str, default=None) + parser.add_argument('--dtype', + type=str, + default='float16', + choices=['float32', 'bfloat16', 'float16']) + parser.add_argument('--logits_dtype', + type=str, + default='float16', + choices=['float16', 'float32']) + parser.add_argument('--output_dir', + type=str, + default='tllm_checkpoint', + help='The path to save the TensorRT-LLM checkpoint') + parser.add_argument( + '--use_weight_only', + default=False, + action="store_true", + help='Quantize weights for the various GEMMs to INT4/INT8.' + 'See --weight_only_precision to set the precision') + parser.add_argument( + '--weight_only_precision', + const='int8', + type=str, + nargs='?', + default='int8', + choices=['int8', 'int4'], + help= + 'Define the precision for the weights when using weight-only quantization.' + 'You must also use --use_weight_only for that argument to have an impact.' 
+ ) + args = parser.parse_args() + return args + + +def get_encoder_config(model_metadata: dict, dtype: str, + quant_algo: QuantAlgo) -> dict: + model_is_multilingual = (model_metadata['n_vocab'] >= 51865) + num_languages = model_metadata['n_vocab'] - 51765 - int( + model_is_multilingual) + return { + 'architecture': "WhisperEncoder", + 'dtype': dtype, + 'num_hidden_layers': model_metadata['n_audio_layer'], + 'num_attention_heads': model_metadata['n_audio_head'], + 'hidden_size': model_metadata['n_audio_state'], + 'max_position_embeddings': model_metadata['n_audio_ctx'], + 'has_position_embedding': True, + 'n_mels': model_metadata['n_mels'], + 'vocab_size': model_metadata['n_vocab'], + 'hidden_act': "gelu", + 'num_languages': num_languages, + 'quantization': { + 'quant_algo': quant_algo + }, + } + + +def get_decoder_config(model_metadata: dict, dtype: str, logits_dtype: str, + quant_algo: QuantAlgo) -> dict: + return { + 'architecture': "DecoderModel", + 'dtype': dtype, + 'logits_dtype': logits_dtype, + 'num_hidden_layers': model_metadata['n_text_layer'], + 'num_attention_heads': model_metadata['n_text_head'], + 'hidden_size': model_metadata['n_text_state'], + 'norm_epsilon': 1e-5, + 'vocab_size': model_metadata['n_vocab'], + 'hidden_act': "gelu", + 'use_parallel_embedding': False, + 'embedding_sharding_dim': 0, + 'max_position_embeddings': model_metadata['n_text_ctx'], + 'use_prompt_tuning': False, + 'head_size': + model_metadata['n_text_state'] // model_metadata['n_text_head'], + 'has_position_embedding': True, + 'layernorm_type': LayerNormType.LayerNorm, + 'has_attention_qkvo_bias': True, + 'has_mlp_bias': True, + 'has_model_final_layernorm': True, + 'has_embedding_layernorm': False, + 'has_embedding_scale': False, + 'ffn_hidden_size': 4 * model_metadata['n_text_state'], + 'q_scaling': 1.0, + 'layernorm_position': LayerNormPositionType.pre_layernorm, + 'relative_attention': False, + 'max_distance': 0, + 'num_buckets': 0, + 'model_type': 'whisper', + 'rescale_before_lm_head': False, + 'encoder_hidden_size': model_metadata['n_text_state'], + 'encoder_num_heads': model_metadata['n_text_head'], + 'encoder_head_size': None, + 'skip_cross_kv': False, + 'quantization': { + 'quant_algo': quant_algo + }, + } + + +def convert_openai_whisper_encoder( + model_metadata: dict, + model_params: dict, + quant_algo: str = None, +): + weights = {} + + def sinusoids(length, channels, max_timescale=10000): + """Returns sinusoids for positional embedding""" + assert channels % 2 == 0 + log_timescale_increment = np.log(max_timescale) / (channels // 2 - 1) + inv_timescales = torch.exp(-log_timescale_increment * + torch.arange(channels // 2)) + scaled_time = torch.arange(length)[:, np.newaxis] * inv_timescales[ + np.newaxis, :] + return torch.cat([torch.sin(scaled_time), + torch.cos(scaled_time)], + dim=1) + + weights['position_embedding.weight'] = sinusoids( + model_metadata['n_audio_ctx'], + model_metadata['n_audio_state']).contiguous() + + weights['conv1.weight'] = torch.unsqueeze( + model_params['encoder.conv1.weight'], -1).contiguous() + weights['conv1.bias'] = model_params['encoder.conv1.bias'].contiguous() + weights['conv2.weight'] = torch.unsqueeze( + model_params['encoder.conv2.weight'], -1).contiguous() + weights['conv2.bias'] = model_params['encoder.conv2.bias'].contiguous() + + # Encoder conv needs to run in fp32 on Volta/Turing + major, minor = torch.cuda.get_device_capability() + if not major >= 8: + weights['conv1.weight'] = weights['conv1.weight'].float() + weights['conv1.bias'] = 
weights['conv1.bias'].float() + weights['conv2.weight'] = weights['conv2.weight'].float() + weights['conv2.bias'] = weights['conv2.bias'].float() + + for i in range(model_metadata['n_audio_layer']): + weights[ + f'encoder_layers.{i}.attention_layernorm.weight'] = model_params[ + 'encoder.blocks.' + str(i) + '.attn_ln.weight'].contiguous() + weights[f'encoder_layers.{i}.attention_layernorm.bias'] = model_params[ + 'encoder.blocks.' + str(i) + '.attn_ln.bias'].contiguous() + + t = torch.cat([ + model_params['encoder.blocks.' + str(i) + '.attn.query.weight'], + model_params['encoder.blocks.' + str(i) + '.attn.key.weight'], + model_params['encoder.blocks.' + str(i) + '.attn.value.weight'] + ], + dim=0).contiguous() + + weights[f'encoder_layers.{i}.attention.qkv.weight'] = t + + bias_shape = model_params['encoder.blocks.' + str(i) + + '.attn.query.bias'].shape + dtype = model_params['encoder.blocks.' + str(i) + + '.attn.query.bias'].dtype + fused_bias = torch.cat([ + model_params['encoder.blocks.' + str(i) + '.attn.query.bias'], + torch.zeros([*bias_shape], dtype=dtype), + model_params['encoder.blocks.' + str(i) + '.attn.value.bias'] + ], + dim=0).contiguous() + + weights[f'encoder_layers.{i}.attention.qkv.bias'] = fused_bias + + t = model_params['encoder.blocks.' + str(i) + + '.attn.out.weight'].contiguous() + + weights[f'encoder_layers.{i}.attention.dense.weight'] = t + weights[f'encoder_layers.{i}.attention.dense.bias'] = model_params[ + 'encoder.blocks.' + str(i) + '.attn.out.bias'].contiguous() + + weights[f'encoder_layers.{i}.mlp_layernorm.weight'] = model_params[ + 'encoder.blocks.' + str(i) + '.mlp_ln.weight'].contiguous() + weights[f'encoder_layers.{i}.mlp_layernorm.bias'] = model_params[ + 'encoder.blocks.' + str(i) + '.mlp_ln.bias'].contiguous() + + t = model_params['encoder.blocks.' + str(i) + + '.mlp.0.weight'].contiguous() + weights[f'encoder_layers.{i}.mlp.fc.weight'] = t + + weights[f'encoder_layers.{i}.mlp.fc.bias'] = model_params[ + 'encoder.blocks.' + str(i) + '.mlp.0.bias'].contiguous() + + t = model_params['encoder.blocks.' + str(i) + + '.mlp.2.weight'].contiguous() + weights[f'encoder_layers.{i}.mlp.proj.weight'] = t + + weights[f'encoder_layers.{i}.mlp.proj.bias'] = model_params[ + 'encoder.blocks.' + str(i) + '.mlp.2.bias'].contiguous() + + weights['ln_post.weight'] = model_params[ + 'encoder.ln_post.weight'].contiguous() + weights['ln_post.bias'] = model_params['encoder.ln_post.bias'].contiguous() + + return weight_only_quantize_dict(weights, + quant_algo=quant_algo, + plugin=True) + + +def convert_openai_whisper_decoder(model_metadata: dict, + model_params: dict, + quant_algo: str = None, + model_path: str = None): + + weights = {} + + weights['embedding.vocab_embedding.weight'] = model_params[ + 'decoder.token_embedding.weight'] + weights['lm_head.weight'] = model_params[ + 'decoder.token_embedding.weight'].clone() + weights['embedding.position_embedding.weight'] = model_params[ + 'decoder.positional_embedding'] + if 'distill-whisper-large-v2-multi-hans' in model_path: + # WARNING: This is a hack to fix the position embedding for the large-v2-turbo-multi-hans pretrained model + weights['embedding.position_embedding.weight'] *= 2 + + for i in range(model_metadata['n_text_layer']): + t = torch.cat([ + model_params['decoder.blocks.' + str(i) + '.attn.query.weight'], + model_params['decoder.blocks.' + str(i) + '.attn.key.weight'], + model_params['decoder.blocks.' 
+ str(i) + '.attn.value.weight'] + ], + dim=0) + dst = weights[f'decoder_layers.{i}.self_attention.qkv.weight'] = t + + t = model_params['decoder.blocks.' + str(i) + + '.attn.out.weight'].contiguous() + dst = weights[f'decoder_layers.{i}.self_attention.dense.weight'] = t + + bias_shape = model_params['decoder.blocks.' + str(i) + + '.attn.query.bias'].shape + dtype = model_params['decoder.blocks.' + str(i) + + '.attn.query.bias'].dtype + weights[f'decoder_layers.{i}.self_attention.qkv.bias'] = torch.cat( + [ + model_params['decoder.blocks.' + str(i) + '.attn.query.bias'], + torch.zeros([*bias_shape], dtype=dtype), + model_params['decoder.blocks.' + str(i) + '.attn.value.bias'] + ], + dim=0) + weights[f'decoder_layers.{i}.self_attention.dense.bias'] = model_params[ + 'decoder.blocks.' + str(i) + '.attn.out.bias'] + + weights[ + f'decoder_layers.{i}.self_attention_layernorm.weight'] = model_params[ + 'decoder.blocks.' + str(i) + '.attn_ln.weight'] + weights[ + f'decoder_layers.{i}.self_attention_layernorm.bias'] = model_params[ + 'decoder.blocks.' + str(i) + '.attn_ln.bias'] + + t = torch.cat([ + model_params['decoder.blocks.' + str(i) + + '.cross_attn.query.weight'], + model_params['decoder.blocks.' + str(i) + '.cross_attn.key.weight'], + model_params['decoder.blocks.' + str(i) + + '.cross_attn.value.weight'] + ], + dim=0) + dst = weights[f'decoder_layers.{i}.cross_attention.qkv.weight'] = t + + t = model_params['decoder.blocks.' + str(i) + + '.cross_attn.out.weight'].contiguous() + dst = weights[f'decoder_layers.{i}.cross_attention.dense.weight'] = t + + bias_shape = model_params['decoder.blocks.' + str(i) + + '.cross_attn.query.bias'].shape + dtype = model_params['decoder.blocks.' + str(i) + + '.cross_attn.query.bias'].dtype + cross_attn_qkv_bias = torch.cat([ + model_params['decoder.blocks.' + str(i) + '.cross_attn.query.bias'], + torch.zeros([*bias_shape], dtype=dtype), + model_params['decoder.blocks.' + str(i) + '.cross_attn.value.bias'] + ], + dim=0) + + weights[ + f'decoder_layers.{i}.cross_attention.qkv.bias'] = cross_attn_qkv_bias + + weights[ + f'decoder_layers.{i}.cross_attention.dense.bias'] = model_params[ + 'decoder.blocks.' + str(i) + '.cross_attn.out.bias'] + + weights[ + f'decoder_layers.{i}.cross_attention_layernorm.weight'] = model_params[ + 'decoder.blocks.' + str(i) + '.cross_attn_ln.weight'] + weights[ + f'decoder_layers.{i}.cross_attention_layernorm.bias'] = model_params[ + 'decoder.blocks.' + str(i) + '.cross_attn_ln.bias'] + + t = model_params['decoder.blocks.' + str(i) + + '.mlp.0.weight'].contiguous() + weights[f'decoder_layers.{i}.mlp.fc.weight'] = t + + t = model_params['decoder.blocks.' + str(i) + + '.mlp.2.weight'].contiguous() + weights[f'decoder_layers.{i}.mlp.proj.weight'] = t + + weights[f'decoder_layers.{i}.mlp.fc.bias'] = model_params[ + 'decoder.blocks.' + str(i) + '.mlp.0.bias'] + weights[f'decoder_layers.{i}.mlp.proj.bias'] = model_params[ + 'decoder.blocks.' + str(i) + '.mlp.2.bias'] + + weights[f'decoder_layers.{i}.mlp_layernorm.weight'] = model_params[ + 'decoder.blocks.' + str(i) + '.mlp_ln.weight'] + weights[f'decoder_layers.{i}.mlp_layernorm.bias'] = model_params[ + 'decoder.blocks.' 
+ str(i) + '.mlp_ln.bias'] + + weights['final_layernorm.weight'] = model_params['decoder.ln.weight'] + weights['final_layernorm.bias'] = model_params['decoder.ln.bias'] + + return weight_only_quantize_dict(weights, + quant_algo=quant_algo, + plugin=True) + + +if __name__ == '__main__': + # TODO(qijun): Currently, the convert script depends on a torch op: + # torch.ops.fastertransformer.symmetric_quantize_last_axis_of_batched_matrix, + # which is included in tensorrt_llm Python package. Otherwise, the convert + # script does not need to import tensorrt_llm. Will remove it after reimplementing + # the op with PyTorch. + print(tensorrt_llm.__version__) + args = parse_arguments() + tik = time.time() + + if not os.path.exists(args.output_dir): + os.makedirs(args.output_dir) + + quant_algo = None + plugin_weight_only_quant_type = None + if args.use_weight_only and args.weight_only_precision == 'int8': + plugin_weight_only_quant_type = torch.int8 + quant_algo = QuantAlgo.W8A16 + elif args.use_weight_only and args.weight_only_precision == 'int4': + plugin_weight_only_quant_type = torch.quint4x2 + quant_algo = QuantAlgo.W4A16 + elif args.use_weight_only and args.weight_only_precision == 'int4_gptq': + quant_algo = QuantAlgo.W4A16_GPTQ + + model_path = args.model_path + assert os.path.exists(model_path), f"Model {model_path} does not exist." + + model = torch.load(model_path, map_location='cpu') + print(f"Loaded model from {model_path}") + model_metadata = model['dims'] + model_state_dict = model['model_state_dict'] + for param_tensor in model_state_dict: + model_state_dict[param_tensor] = model_state_dict[param_tensor].half() + + def convert_and_save(component: str = "encoder", model_path: str = None): + # call get_encoder_config or get_decoder_config according to component + if component == "encoder": + config = get_encoder_config(model_metadata, args.dtype, quant_algo) + else: + config = get_decoder_config(model_metadata, args.dtype, + args.logits_dtype, quant_algo) + + if args.use_weight_only and args.weight_only_precision == 'int4_gptq': + config['quantization'].update({ + 'has_zero_point': True, + }) + + component_save_dir = os.path.join(args.output_dir, component) + if not os.path.exists(component_save_dir): + os.makedirs(component_save_dir) + + with open(os.path.join(component_save_dir, 'config.json'), 'w') as f: + json.dump(config, f, indent=4) + + if component == "encoder": + weights = convert_openai_whisper_encoder(model_metadata, + model_state_dict, + quant_algo=quant_algo) + else: + assert component == "decoder" + weights = convert_openai_whisper_decoder(model_metadata, + model_state_dict, + quant_algo=quant_algo, + model_path=model_path) + + save_file(weights, os.path.join(component_save_dir, + f'rank0.safetensors')) + + print("Converting encoder checkpoints...") + convert_and_save("encoder") + print("Converting decoder checkpoints...") + convert_and_save("decoder", model_path) + + tok = time.time() + t = time.strftime('%H:%M:%S', time.gmtime(tok - tik)) + print(f'Total time of converting checkpoints: {t}') \ No newline at end of file diff --git a/triton/whisper/docker-compose.yml b/triton/whisper/docker-compose.yml new file mode 100644 index 000000000..392eb6090 --- /dev/null +++ b/triton/whisper/docker-compose.yml @@ -0,0 +1,20 @@ +services: + asr: + image: soar97/triton-whisper:24.09 + shm_size: '1gb' + ports: + - "8000:8000" + - "8001:8001" + - "8002:8002" + environment: + - PYTHONIOENCODING=utf-8 + - MODEL_ID=${MODEL_ID} + deploy: + resources: + reservations: + devices: + - 
driver: nvidia + device_ids: ['0'] + capabilities: [gpu] + command: > + /bin/bash -c "bash prepare.sh $MODEL_ID" diff --git a/triton/whisper/fill_template.py b/triton/whisper/fill_template.py new file mode 100644 index 000000000..584a9f420 --- /dev/null +++ b/triton/whisper/fill_template.py @@ -0,0 +1,42 @@ +#! /usr/bin/env python3 +from argparse import ArgumentParser +from string import Template + + +def main(file_path, substitutions, in_place, participant_ids): + with open(file_path) as f: + pbtxt = Template(f.read()) + + sub_dict = {"max_queue_size": 0} + sub_dict["participant_ids"] = participant_ids + for sub in substitutions.split(","): + key, value = sub.split(":") + sub_dict[key] = value + + pbtxt = pbtxt.safe_substitute(sub_dict) + + if in_place: + with open(file_path, "w") as f: + f.write(pbtxt) + else: + print(pbtxt) + + +if __name__ == "__main__": + parser = ArgumentParser() + parser.add_argument("file_path", help="path of the .pbtxt to modify") + parser.add_argument( + "substitutions", + help= + "substitutions to perform, in the format variable_name_1:value_1,variable_name_2:value_2..." + ) + parser.add_argument("--in_place", + "-i", + action="store_true", + help="do the operation in-place") + parser.add_argument("--participant_ids", + help="Participant IDs for the model", + default="") + args = parser.parse_args() + + main(**vars(args)) diff --git a/triton/whisper/media/Screenshot.jpg b/triton/whisper/media/Screenshot.jpg new file mode 100644 index 000000000..2e53cd6ce Binary files /dev/null and b/triton/whisper/media/Screenshot.jpg differ diff --git a/triton/whisper/model_repo_whisper_trtllm/infer_bls/1/model.py b/triton/whisper/model_repo_whisper_trtllm/infer_bls/1/model.py new file mode 100644 index 000000000..1c9c1e2a3 --- /dev/null +++ b/triton/whisper/model_repo_whisper_trtllm/infer_bls/1/model.py @@ -0,0 +1,135 @@ +# -*- coding: utf-8 -*- + +import json +import re +from collections import OrderedDict +from pathlib import Path + +import numpy as np +import torch +import triton_python_backend_utils as pb_utils +from torch.utils.dlpack import to_dlpack + +from .tokenizer import get_tokenizer + + +def read_config(component, engine_dir): + config_path = engine_dir / component / 'config.json' + with open(config_path, 'r') as f: + config = json.load(f) + model_config = OrderedDict() + model_config.update(config['pretrained_config']) + model_config.update(config['build_config']) + return model_config + + +class TritonPythonModel: + def initialize(self, args): + self.model_config = model_config = json.loads(args['model_config']) + + output0_config = pb_utils.get_output_config_by_name( + model_config, "TRANSCRIPTS") + self.out0_dtype = pb_utils.triton_string_to_numpy( + output0_config['data_type']) + + engine_dir = Path( + self.model_config['parameters']['engine_dir']["string_value"]) + encoder_config = read_config('encoder', engine_dir) + self.tokenizer = get_tokenizer( + num_languages=encoder_config['num_languages'] + ) + self.blank = self.tokenizer.encode( + " ", + allowed_special=self.tokenizer.special_tokens_set + )[0] + self.device = torch.device("cuda") + + def process_batch(self, wav_batch, wav_lens, prompt_id): + # Convert numpy arrays to torch tensors + wav_batch = torch.from_numpy(wav_batch).to(self.device) + wav_tensor = pb_utils.Tensor.from_dlpack( + "WAV", + to_dlpack(wav_batch) + ) + wav_len_tensor = pb_utils.Tensor( + "WAV_LENS", + wav_lens.astype(np.int32) + ) + + # Replicate prompt_id for batch size + batch_size = wav_batch.shape[0] + prompt_ids = 
np.tile(prompt_id, (batch_size, 1)) + prompt_ids_tensor = pb_utils.Tensor( + "DECODER_INPUT_IDS", + prompt_ids.astype(np.int32) + ) + + infer_request = pb_utils.InferenceRequest( + model_name="whisper", + requested_output_names=["OUTPUT_IDS"], + inputs=[wav_tensor, wav_len_tensor, prompt_ids_tensor] + ) + + inference_response = infer_request.exec() + if inference_response.has_error(): + raise pb_utils.TritonModelException( + inference_response.error().message()) + + output_ids = pb_utils.get_output_tensor_by_name( + inference_response, "OUTPUT_IDS") + return output_ids.as_numpy() + + def execute(self, requests): + responses = [] + + for request in requests: + # Get batch inputs + text_prefix = pb_utils.get_input_tensor_by_name( + request, "TEXT_PREFIX").as_numpy() + wav_batch = pb_utils.get_input_tensor_by_name( + request, "WAV").as_numpy() + wav_lens = pb_utils.get_input_tensor_by_name( + request, "WAV_LENS").as_numpy() + + # Use the same text_prefix for all items in the request + prefix = text_prefix[0][0].decode('utf-8') + if prefix == "": + prefix = ( + "<|startoftranscript|><|ko|><|transcribe|><|notimestamps|>" + ) + prompt_id = self.tokenizer.encode( + prefix, + allowed_special=self.tokenizer.special_tokens_set + ) + + # Process the entire batch + output_ids = self.process_batch(wav_batch, wav_lens, prompt_id) + + # Decode outputs for each item in batch + transcripts = [] + + # Handle case where output_ids is 3-dimensional + # ([batch_size, beam_size, seq_len]) + if len(output_ids.shape) == 3: + output_ids = output_ids[:, 0, :] # Remove beam_size dimension + + for output_id in output_ids: + token_list = output_id.tolist() + s = self.tokenizer.decode(token_list) + s = re.sub(r'<\|.*?\|>', '', s) + transcripts.append(s) + + # Create response tensor + out0 = pb_utils.Tensor( + "TRANSCRIPTS", + np.array(transcripts).astype(self.out0_dtype) + ) + inference_response = pb_utils.InferenceResponse( + output_tensors=[out0] + ) + responses.append(inference_response) + + return responses + + def finalize(self): + print('Cleaning up...') diff --git a/triton/whisper/model_repo_whisper_trtllm/infer_bls/1/tokenizer.py b/triton/whisper/model_repo_whisper_trtllm/infer_bls/1/tokenizer.py new file mode 100644 index 000000000..a24e950f9 --- /dev/null +++ b/triton/whisper/model_repo_whisper_trtllm/infer_bls/1/tokenizer.py @@ -0,0 +1,184 @@ +# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
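+#
+# This module builds the tiktoken-based Whisper tokenizer used by the BLS model:
+# infer_bls/1/model.py calls get_tokenizer() to encode the TEXT_PREFIX prompt
+# into decoder input ids and to decode OUTPUT_IDS back into transcripts.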
+# Modified from https://github.com/openai/whisper/blob/main/whisper/tokenizer.py +import base64 +import os + +import tiktoken + +LANGUAGES = { + "en": "english", + "zh": "chinese", + "de": "german", + "es": "spanish", + "ru": "russian", + "ko": "korean", + "fr": "french", + "ja": "japanese", + "pt": "portuguese", + "tr": "turkish", + "pl": "polish", + "ca": "catalan", + "nl": "dutch", + "ar": "arabic", + "sv": "swedish", + "it": "italian", + "id": "indonesian", + "hi": "hindi", + "fi": "finnish", + "vi": "vietnamese", + "he": "hebrew", + "uk": "ukrainian", + "el": "greek", + "ms": "malay", + "cs": "czech", + "ro": "romanian", + "da": "danish", + "hu": "hungarian", + "ta": "tamil", + "no": "norwegian", + "th": "thai", + "ur": "urdu", + "hr": "croatian", + "bg": "bulgarian", + "lt": "lithuanian", + "la": "latin", + "mi": "maori", + "ml": "malayalam", + "cy": "welsh", + "sk": "slovak", + "te": "telugu", + "fa": "persian", + "lv": "latvian", + "bn": "bengali", + "sr": "serbian", + "az": "azerbaijani", + "sl": "slovenian", + "kn": "kannada", + "et": "estonian", + "mk": "macedonian", + "br": "breton", + "eu": "basque", + "is": "icelandic", + "hy": "armenian", + "ne": "nepali", + "mn": "mongolian", + "bs": "bosnian", + "kk": "kazakh", + "sq": "albanian", + "sw": "swahili", + "gl": "galician", + "mr": "marathi", + "pa": "punjabi", + "si": "sinhala", + "km": "khmer", + "sn": "shona", + "yo": "yoruba", + "so": "somali", + "af": "afrikaans", + "oc": "occitan", + "ka": "georgian", + "be": "belarusian", + "tg": "tajik", + "sd": "sindhi", + "gu": "gujarati", + "am": "amharic", + "yi": "yiddish", + "lo": "lao", + "uz": "uzbek", + "fo": "faroese", + "ht": "haitian creole", + "ps": "pashto", + "tk": "turkmen", + "nn": "nynorsk", + "mt": "maltese", + "sa": "sanskrit", + "lb": "luxembourgish", + "my": "myanmar", + "bo": "tibetan", + "tl": "tagalog", + "mg": "malagasy", + "as": "assamese", + "tt": "tatar", + "haw": "hawaiian", + "ln": "lingala", + "ha": "hausa", + "ba": "bashkir", + "jw": "javanese", + "su": "sundanese", + "yue": "cantonese", +} + + +def get_tokenizer(name: str = "multilingual", + num_languages: int = 99, + tokenizer_dir: str = None): + if tokenizer_dir is None: + vocab_path = os.path.join(os.path.dirname(__file__), + f"./{name}.tiktoken") + else: + vocab_path = os.path.join(tokenizer_dir, f"{name}.tiktoken") + ranks = { + base64.b64decode(token): int(rank) + for token, rank in (line.split() for line in open(vocab_path) if line) + } + n_vocab = len(ranks) + special_tokens = {} + + specials = [ + "<|endoftext|>", + "<|startoftranscript|>", + *[f"<|{lang}|>" for lang in list(LANGUAGES.keys())[:num_languages]], + "<|translate|>", + "<|transcribe|>", + "<|startoflm|>", + "<|startofprev|>", + "<|nospeech|>", + "<|notimestamps|>", + *[f"<|{i * 0.02:.2f}|>" for i in range(1501)], + ] + + for token in specials: + special_tokens[token] = n_vocab + n_vocab += 1 + + return tiktoken.Encoding( + name=os.path.basename(vocab_path), + explicit_n_vocab=n_vocab, + pat_str= + r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""", + mergeable_ranks=ranks, + special_tokens=special_tokens, + ) + + +if __name__ == "__main__": + enc = get_tokenizer() + mytest_str = "<|startofprev|> Nvidia<|startoftranscript|><|en|><|transcribe|>" + encoding = enc.encode(mytest_str, allowed_special=enc.special_tokens_set) + mystr = enc.decode([50361, 45, 43021, 50258, 50259, 50359]) + mystr2 = enc.decode([50361, 46284, 50258, 50259, 50359]) + #print(encoding, mystr, mystr2) + print( + 
enc.encode("<|startoftranscript|>", + allowed_special=enc.special_tokens_set)[0]) + print( + enc.encode("<|endoftext|>", + allowed_special=enc.special_tokens_set)[0]) + my_zh_str = "好好学习" + encoding = enc.encode(my_zh_str, allowed_special=enc.special_tokens_set) + decoding = enc.decode(encoding) + print(type(decoding)) + #print(encoding, decoding) diff --git a/triton/whisper/model_repo_whisper_trtllm/infer_bls/config.pbtxt b/triton/whisper/model_repo_whisper_trtllm/infer_bls/config.pbtxt new file mode 100644 index 000000000..93df72f4c --- /dev/null +++ b/triton/whisper/model_repo_whisper_trtllm/infer_bls/config.pbtxt @@ -0,0 +1,61 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +name: "infer_bls" +backend: "python" +max_batch_size: ${triton_max_batch_size} + +parameters [ + { + key: "engine_dir" + value: { string_value: "${engine_dir}"} + } +] + +input [ + { + name: "TEXT_PREFIX" + data_type: TYPE_STRING + dims: [1] + }, + { + name: "WAV" + data_type: TYPE_FP32 + dims: [-1] + }, + { + name: "WAV_LENS" + data_type: TYPE_INT32 + dims: [1] + optional: True + } +] + +output [ + { + name: "TRANSCRIPTS" + data_type: TYPE_STRING + dims: [1] + } +] + +dynamic_batching { + max_queue_delay_microseconds: ${max_queue_delay_microseconds} + } +instance_group [ + { + count: 8 + kind: KIND_CPU + } + ] diff --git a/triton/whisper/model_repo_whisper_trtllm/whisper/1/fbank.py b/triton/whisper/model_repo_whisper_trtllm/whisper/1/fbank.py new file mode 100755 index 000000000..f60927fd9 --- /dev/null +++ b/triton/whisper/model_repo_whisper_trtllm/whisper/1/fbank.py @@ -0,0 +1,98 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# Reference: https://github.com/openai/whisper/blob/main/whisper/audio.py +import numpy as np +import torch +import torch.nn.functional as F +from typing import Union +import os + +def mel_filters(device, n_mels: int =128) -> torch.Tensor: + """ + load the mel filterbank matrix for projecting STFT into a Mel spectrogram. 
+    Allows decoupling librosa dependency; saved using:
+
+        np.savez_compressed(
+            "mel_filters.npz",
+            mel_128=librosa.filters.mel(sr=16000, n_fft=400, n_mels=128),
+        )
+    """
+    assert n_mels == 80 or n_mels == 128, f"Unsupported n_mels: {n_mels}"
+    with np.load(
+        os.path.join(os.path.dirname(__file__), "mel_filters.npz")
+    ) as f:
+        return torch.from_numpy(f[f"mel_{n_mels}"]).to(device)
+
+
+def log_mel_spectrogram(
+    audio: torch.Tensor,
+    filters: torch.Tensor,
+    n_mels: int = 128,
+    n_fft: int = 400,
+    hop_length: int = 160,
+):
+    """
+    Compute the log-Mel spectrogram of the given audio waveform.
+
+    Parameters
+    ----------
+    audio: torch.Tensor, shape = (*)
+        A Tensor containing the audio waveform, sampled at 16 kHz
+
+    filters: torch.Tensor
+        The Mel filterbank matrix returned by `mel_filters`
+
+    n_mels: int
+        The number of Mel-frequency filters, only 80 or 128 is supported
+
+    Returns
+    -------
+    torch.Tensor, shape = (n_mels, n_frames)
+        A Tensor that contains the log-Mel spectrogram
+    """
+    window = torch.hann_window(n_fft).to(audio.device)
+    stft = torch.stft(audio, n_fft, hop_length, window=window, return_complex=True)
+    magnitudes = stft[..., :-1].abs() ** 2
+
+    mel_spec = filters @ magnitudes
+    log_spec = torch.clamp(mel_spec, min=1e-10).log10()
+    log_spec = torch.maximum(log_spec, log_spec.max() - 8.0)
+    log_spec = (log_spec + 4.0) / 4.0
+    # cast to float16
+    log_spec = log_spec.half()
+    return log_spec
+
+class FeatureExtractor(torch.nn.Module):
+    """Computes the log-Mel spectrogram features expected by the Whisper
+    encoder on the GPU.
+    """
+
+    def __init__(self, n_mels: int = 128):
+        super().__init__()
+        self.device = torch.device("cuda")
+        self.n_mels = n_mels
+        self.filters = mel_filters(self.device, n_mels=self.n_mels)
+
+    def compute_feature(self, wav, padding_target_len: int = 3000):
+        """
+        Compute the log-Mel spectrogram of the input audio waveform.
+        mel: [1, feature_dim, seq_len]
+        """
+        mel = log_mel_spectrogram(wav, self.filters)
+        assert padding_target_len <= 3000, f"padding_target_len must be <= 3000, got {padding_target_len}"
+        if mel.shape[1] < padding_target_len:
+            mel = F.pad(mel, (0, padding_target_len - mel.shape[1]), mode='constant')
+        if mel.shape[1] % 2:
+            # pad to even length for the remove_padding case, since conv1d requires an even length
+            mel = torch.nn.functional.pad(mel, (0, 1))
+        mel = mel.unsqueeze(0)
+        return mel
\ No newline at end of file
diff --git a/triton/whisper/model_repo_whisper_trtllm/whisper/1/model.py b/triton/whisper/model_repo_whisper_trtllm/whisper/1/model.py
new file mode 100644
index 000000000..689c9e886
--- /dev/null
+++ b/triton/whisper/model_repo_whisper_trtllm/whisper/1/model.py
@@ -0,0 +1,125 @@
+# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#  * Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+#  * Neither the name of NVIDIA CORPORATION nor the names of its
+#    contributors may be used to endorse or promote products derived
+#    from this software without specific prior written permission.
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import json +from pathlib import Path + +import torch +import triton_python_backend_utils as pb_utils +from tensorrt_llm.bindings import GptJsonConfig +from tensorrt_llm.runtime import ModelRunnerCpp +from torch.utils.dlpack import from_dlpack + +from .fbank import FeatureExtractor + + +class TritonPythonModel: + def initialize(self, args): + parameters = json.loads(args['model_config'])['parameters'] + for key, value in parameters.items(): + parameters[key] = value["string_value"] + engine_dir = parameters["engine_dir"] + config_path = Path(engine_dir) / 'decoder' / 'config.json' + json_config = GptJsonConfig.parse_file(config_path) + assert json_config.model_config.supports_inflight_batching + runner_kwargs = dict( + engine_dir=engine_dir, + is_enc_dec=True, + max_batch_size=64, + max_input_len=3000, + max_output_len=96, + max_beam_width=1, + debug_mode=False, + kv_cache_free_gpu_memory_fraction=0.5, + ) + self.model_runner_cpp = ModelRunnerCpp.from_dir(**runner_kwargs) + self.feature_extractor = FeatureExtractor( + n_mels=int(parameters["n_mels"]) + ) + self.zero_pad = parameters["zero_pad"] == "true" + self.eot_id = 50257 + + def execute(self, requests): + responses = [] + + for request in requests: + wav_tensor = pb_utils.get_input_tensor_by_name(request, "WAV") + wav_lens = pb_utils.get_input_tensor_by_name( + request, "WAV_LENS").as_numpy() + prompt_ids = pb_utils.get_input_tensor_by_name( + request, "DECODER_INPUT_IDS").as_numpy() + + # Move WAV data to GPU + wav = from_dlpack(wav_tensor.to_dlpack()) + batch_size = wav.shape[0] + + padding = 0 if self.zero_pad else 3000 + batch_mel_list = [] + + # Batch processing for each sample in the batch + for i in range(batch_size): + wav_i = wav[i:i+1, :int(wav_lens[i].item())] + mel = self.feature_extractor.compute_feature( + wav_i[0].to('cuda'), + padding_target_len=padding + ).transpose(1, 2) + batch_mel_list.append(mel.squeeze(0)) + + # Move prompt IDs to GPU + decoder_input_ids = torch.tensor( + prompt_ids, dtype=torch.int32, device='cuda') + + # Calculate mel lengths + mel_input_lengths = torch.tensor( + [mel.shape[0] for mel in batch_mel_list], + dtype=torch.int32, + device='cuda' + ) + + # Run batch inference + outputs = self.model_runner_cpp.generate( + batch_input_ids=decoder_input_ids, + encoder_input_features=batch_mel_list, + encoder_output_lengths=mel_input_lengths // 2, + max_new_tokens=96, + end_id=self.eot_id, + pad_id=self.eot_id, + num_beams=1, + output_sequence_lengths=True, + return_dict=True + ) + torch.cuda.synchronize() + + # Process outputs + output_ids = outputs['output_ids'].cpu().numpy() + + # Create response for the request + response = pb_utils.InferenceResponse(output_tensors=[ + pb_utils.Tensor("OUTPUT_IDS", output_ids) + ]) + 
responses.append(response) + + return responses diff --git a/triton/whisper/model_repo_whisper_trtllm/whisper/config.pbtxt b/triton/whisper/model_repo_whisper_trtllm/whisper/config.pbtxt new file mode 100755 index 000000000..b50d19958 --- /dev/null +++ b/triton/whisper/model_repo_whisper_trtllm/whisper/config.pbtxt @@ -0,0 +1,71 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +name: "whisper" +backend: "python" +max_batch_size: ${triton_max_batch_size} + +dynamic_batching { + max_queue_delay_microseconds: ${max_queue_delay_microseconds} +} + +parameters [ + { + key: "engine_dir" + value: { string_value: "${engine_dir}"} + }, + { + key: "n_mels", + value: {string_value:"${n_mels}"} # 128 dim for large-v3, 80 dim for large-v2 + }, + { + key: "zero_pad" + value: {string_value: "${zero_pad}"} + } +] + +input [ + { + name: "WAV" + data_type: TYPE_FP32 + dims: [-1] + optional: True + }, + { + name: "WAV_LENS" + data_type: TYPE_INT32 + dims: [1] + optional: True + }, + { + name: "DECODER_INPUT_IDS" + data_type: TYPE_INT32 + dims: [-1] + optional: True + } +] +output [ + { + name: "OUTPUT_IDS" + data_type: TYPE_INT32 + dims: [-1] + } +] + +instance_group [ + { + count: 1 + kind: KIND_GPU + } +] \ No newline at end of file diff --git a/triton/whisper/prepare.sh b/triton/whisper/prepare.sh new file mode 100644 index 000000000..18a0f2bb4 --- /dev/null +++ b/triton/whisper/prepare.sh @@ -0,0 +1,93 @@ +# Download Models https://github.com/openai/whisper/blob/main/whisper/__init__.py#L17 +declare -A MODELS=( + ["large-v3"]="https://openaipublic.azureedge.net/main/whisper/models/e5b1a55b89c1367dacf97e3e19bfd829a01529dbfdeefa8caeb59b3f1b81dadb/large-v3.pt" + ["large-v3-turbo"]="https://openaipublic.azureedge.net/main/whisper/models/aff26ae408abcba5fbf8813c21e62b0941638c5f6eebfb145be0c9839262a19a/large-v3-turbo.pt" + ["large-v2-multi-hans"]="https://huggingface.co/yuekai/icefall_asr_multi-hans-zh_whisper/resolve/main/v1.1/whisper-large-v2-multi-hans-zh-epoch-3-avg-10.pt" + ["large-v2-turbo-multi-hans"]="https://huggingface.co/yuekai/icefall_asr_multi-hans-zh_whisper/resolve/main/v1-distill/distill-whisper-large-v2-multi-hans-epoch-6-avg-8.pt" +) + +build_model() { + local model_id=$1 + local checkpoint_dir=$2 + local output_dir=$3 + + local URL=${MODELS[$model_id]} + + echo "Downloading $MODEL_ID from $URL..." 
+ wget -nc "$URL" + + echo "Converting checkpoint for model: $model_id" + python3 convert_checkpoint.py \ + --output_dir "$checkpoint_dir" \ + --model_path "$(basename $URL)" + + local INFERENCE_PRECISION=float16 + local MAX_BEAM_WIDTH=4 + local MAX_BATCH_SIZE=64 + + echo "Building encoder for model: $model_id" + trtllm-build --checkpoint_dir "${checkpoint_dir}/encoder" \ + --output_dir "${output_dir}/encoder" \ + --moe_plugin disable \ + --enable_xqa disable \ + --max_batch_size "$MAX_BATCH_SIZE" \ + --gemm_plugin disable \ + --bert_attention_plugin "$INFERENCE_PRECISION" \ + --max_input_len 3000 --max_seq_len 3000 + + echo "Building decoder for model: $model_id" + trtllm-build --checkpoint_dir "${checkpoint_dir}/decoder" \ + --output_dir "${output_dir}/decoder" \ + --moe_plugin disable \ + --enable_xqa disable \ + --max_beam_width "$MAX_BEAM_WIDTH" \ + --max_batch_size "$MAX_BATCH_SIZE" \ + --max_seq_len 114 \ + --max_input_len 14 \ + --max_encoder_input_len 3000 \ + --gemm_plugin "$INFERENCE_PRECISION" \ + --bert_attention_plugin "$INFERENCE_PRECISION" \ + --gpt_attention_plugin "$INFERENCE_PRECISION" +} + +launch_triton_repo_python_backend() { + local output_dir=$1 + n_mels=$(cat ${output_dir}/encoder/config.json | grep n_mels | awk -F': ' '{print $2}' | tr -d ',') + if [[ "$output_dir" == *"multi-hans"* ]]; then + zero_pad=true # fine-tuned model could remove 30s padding, so set pad to none + else + zero_pad=false + fi + + echo "output_dir: $output_dir", "n_mels: $n_mels", "zero_pad: $zero_pad" + + model_repo=model_repo_whisper + rm -rf $model_repo + cp model_repo_whisper_trtllm $model_repo -r + wget -nc --directory-prefix=$model_repo/infer_bls/1 https://raw.githubusercontent.com/openai/whisper/main/whisper/assets/multilingual.tiktoken + wget -nc --directory-prefix=$model_repo/whisper/1 https://raw.githubusercontent.com/openai/whisper/main/whisper/assets/mel_filters.npz + + TRITON_MAX_BATCH_SIZE=64 + MAX_QUEUE_DELAY_MICROSECONDS=100 + python3 fill_template.py -i $model_repo/whisper/config.pbtxt engine_dir:${output_dir},n_mels:$n_mels,zero_pad:$zero_pad,triton_max_batch_size:${TRITON_MAX_BATCH_SIZE},max_queue_delay_microseconds:${MAX_QUEUE_DELAY_MICROSECONDS} + python3 fill_template.py -i $model_repo/infer_bls/config.pbtxt engine_dir:${output_dir},triton_max_batch_size:${TRITON_MAX_BATCH_SIZE},max_queue_delay_microseconds:${MAX_QUEUE_DELAY_MICROSECONDS} + echo "Launching triton server with model_repo: $model_repo" + tritonserver --model-repository=$model_repo +} + + +MODEL_IDs=("large-v3-turbo" "large-v3" "large-v2-turbo-multi-hans" "large-v2-multi-hans") +CUDA_VISIBLE_DEVICES=0 + +model_id=$1 +checkpoint_dir="${model_id}_tllm_checkpoint" +output_dir="whisper_${model_id}" + +if printf '%s\n' "${MODEL_IDs[@]}" | grep -q "^$model_id$"; then + build_model $model_id "$checkpoint_dir" "$output_dir" || exit 1 + launch_triton_repo_python_backend "$output_dir" || exit 1 +else + echo "$model_id is NOT in the MODEL_IDs array." 
+ exit 1 +fi + diff --git a/triton/whisper/requirements.txt b/triton/whisper/requirements.txt new file mode 100644 index 000000000..c55c1155f --- /dev/null +++ b/triton/whisper/requirements.txt @@ -0,0 +1,14 @@ +tiktoken +datasets +kaldialign +openai-whisper +librosa +soundfile +safetensors +transformers +janus +tritonclient[grpc]==2.31 +scipy +numpy +huggingface_hub[cli] +tensorrt-llm==0.15.0.dev2024101500 \ No newline at end of file diff --git a/triton/zipformer/model_repo_offline_bs/ctc_model/config.pbtxt b/triton/zipformer/model_repo_offline_bs/ctc_model/config.pbtxt new file mode 100755 index 000000000..db2dc4b51 --- /dev/null +++ b/triton/zipformer/model_repo_offline_bs/ctc_model/config.pbtxt @@ -0,0 +1,43 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +name: "ctc_model" +backend: "onnxruntime" +default_model_filename: "ctc_output.onnx" + +max_batch_size: 512 +input [ + { + name: "encoder_out" + data_type: TYPE_FP32 + dims: [-1, 384] + } +] +output [ + { + name: "ctc_output" + data_type: TYPE_FP32 + dims: [-1, 500] + } +] + +dynamic_batching { + } + +instance_group [ + { + count: 2 + kind: KIND_GPU + } +] diff --git a/triton/zipformer/model_repo_offline_bs/decoder/1/.gitkeep b/triton/zipformer/model_repo_offline_bs/decoder/1/.gitkeep new file mode 100755 index 000000000..e69de29bb diff --git a/triton/scripts/start_server.sh b/triton/zipformer/model_repo_offline_bs/decoder/config.pbtxt similarity index 56% rename from triton/scripts/start_server.sh rename to triton/zipformer/model_repo_offline_bs/decoder/config.pbtxt index 6fec73ad4..e8d323f09 100755 --- a/triton/scripts/start_server.sh +++ b/triton/zipformer/model_repo_offline_bs/decoder/config.pbtxt @@ -1,6 +1,4 @@ -#!/bin/bash -# -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,15 +11,34 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-# -jit_model_dir=/ws/jit_model -model_repo=/ws/model_repo +name: "decoder" +backend: "onnxruntime" +default_model_filename: "decoder.onnx" + +max_batch_size: 512 +input [ + { + name: "y" + data_type: TYPE_INT64 + dims: [2] + } +] + +output [ + { + name: "decoder_out" + data_type: TYPE_FP32 + dims: [ 1, 512 ] + } +] -cp $jit_model_dir/encoder_jit.pt $model_repo/encoder/1 -cp $jit_model_dir/decoder_jit.pt $model_repo/decoder/1 -cp $jit_model_dir/joiner_jit.pt $model_repo/joiner/1 -cp $jit_model_dir/bpe.model /ws +dynamic_batching { + } -# Start server -tritonserver --model-repository=$model_repo \ No newline at end of file +instance_group [ + { + count: 1 + kind: KIND_GPU + } +] diff --git a/triton/zipformer/model_repo_offline_bs/encoder/1/.gitkeep b/triton/zipformer/model_repo_offline_bs/encoder/1/.gitkeep new file mode 100755 index 000000000..e69de29bb diff --git a/triton/model_repo/encoder/config.pbtxt b/triton/zipformer/model_repo_offline_bs/encoder/config.pbtxt similarity index 59% rename from triton/model_repo/encoder/config.pbtxt rename to triton/zipformer/model_repo_offline_bs/encoder/config.pbtxt index 021505452..62678cfff 100755 --- a/triton/model_repo/encoder/config.pbtxt +++ b/triton/zipformer/model_repo_offline_bs/encoder/config.pbtxt @@ -1,4 +1,4 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,57 +13,43 @@ # limitations under the License. name: "encoder" -backend: "pytorch" -default_model_filename: "encoder_jit.pt" +backend: "onnxruntime" +default_model_filename: "encoder.onnx" max_batch_size: 512 input [ { - name: "speech__0" + name: "x" data_type: TYPE_FP32 - dims: [-1, 80] # 80 + dims: [-1, 80] }, { - name: "speech_lengths__1" - data_type: TYPE_INT32 + name: "x_lens" + data_type: TYPE_INT64 dims: [1] reshape: { shape: [ ] } } ] output [ { - name: "encoder_out__0" + name: "encoder_out" data_type: TYPE_FP32 - dims: [-1, 512] # [-1, encoder_dim] + dims: [-1, -1 ] }, { - name: "encoder_out_lens__1" - data_type: TYPE_INT32 + name: "encoder_out_lens" + data_type: TYPE_INT64 dims: [1] reshape: { shape: [ ] } } ] dynamic_batching { - preferred_batch_size: [ 16, 32 ] } instance_group [ { - count: 1 + count: 2 kind: KIND_GPU } ] - -parameters: { -key: "INFERENCE_MODE" - value: { - string_value:"true" - } -} -parameters: { -key: "DISABLE_OPTIMIZED_EXECUTION" - value: { - string_value:"true" - } -} \ No newline at end of file diff --git a/triton/model_repo/feature_extractor/1/model.py b/triton/zipformer/model_repo_offline_bs/feature_extractor/1/model.py similarity index 94% rename from triton/model_repo/feature_extractor/1/model.py rename to triton/zipformer/model_repo_offline_bs/feature_extractor/1/model.py index 6dbc58424..917536888 100755 --- a/triton/model_repo/feature_extractor/1/model.py +++ b/triton/zipformer/model_repo_offline_bs/feature_extractor/1/model.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -61,7 +61,7 @@ def initialize(self, args): # Get OUTPUT0 configuration output0_config = pb_utils.get_output_config_by_name( - model_config, "speech__0") + model_config, "speech") # Convert Triton types to numpy types output0_dtype = pb_utils.triton_string_to_numpy( output0_config['data_type']) @@ -72,7 +72,7 @@ def initialize(self, args): # Get OUTPUT1 configuration output1_config = pb_utils.get_output_config_by_name( - model_config, "speech_lengths__1") + model_config, "speech_lengths") # Convert Triton types to numpy types self.output1_dtype = pb_utils.triton_string_to_numpy( output1_config['data_type']) @@ -135,19 +135,20 @@ def execute(self, requests): total_waves.append(wav) features = self.feature_extractor(total_waves) + print(features) for b, l in zip(batch_count, batch_len): expect_feat_len = _kaldifeat.num_frames(l, self.opts.frame_opts) speech = torch.zeros((b, expect_feat_len, self.feature_size), dtype=self.output0_dtype, device=self.device) - speech_lengths = torch.zeros((b, 1), dtype=torch.int32, device=self.device) + speech_lengths = torch.zeros((b, 1), dtype=torch.int64, device=self.device) for i in range(b): f = features.pop(0) f_l = f.shape[0] speech[i, 0: f_l, :] = f.to(self.output0_dtype) speech_lengths[i][0] = f_l - - out0 = pb_utils.Tensor.from_dlpack("speech__0", to_dlpack(speech)) - out1 = pb_utils.Tensor.from_dlpack("speech_lengths__1", + print(f'speech {speech}') + out0 = pb_utils.Tensor.from_dlpack("speech", to_dlpack(speech)) + out1 = pb_utils.Tensor.from_dlpack("speech_lengths", to_dlpack(speech_lengths)) inference_response = pb_utils.InferenceResponse(output_tensors=[out0, out1]) responses.append(inference_response) diff --git a/triton/model_repo/feature_extractor/config.pbtxt b/triton/zipformer/model_repo_offline_bs/feature_extractor/config.pbtxt similarity index 84% rename from triton/model_repo/feature_extractor/config.pbtxt rename to triton/zipformer/model_repo_offline_bs/feature_extractor/config.pbtxt index 5f954ef9b..e4e41c2a2 100755 --- a/triton/model_repo/feature_extractor/config.pbtxt +++ b/triton/zipformer/model_repo_offline_bs/feature_extractor/config.pbtxt @@ -1,4 +1,4 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -51,23 +51,22 @@ input [ output [ { - name: "speech__0" + name: "speech" data_type: TYPE_FP32 - dims: [-1, 80] # 80 + dims: [-1, 80] }, { - name: "speech_lengths__1" - data_type: TYPE_INT32 + name: "speech_lengths" + data_type: TYPE_INT64 dims: [1] } ] dynamic_batching { - preferred_batch_size: [ 16, 32 ] } instance_group [ { - count: 1 + count: 2 kind: KIND_GPU } ] diff --git a/triton/zipformer/model_repo_offline_bs/frame_reducer/1/model.py b/triton/zipformer/model_repo_offline_bs/frame_reducer/1/model.py new file mode 100755 index 000000000..17834e107 --- /dev/null +++ b/triton/zipformer/model_repo_offline_bs/frame_reducer/1/model.py @@ -0,0 +1,280 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import triton_python_backend_utils as pb_utils +from torch.utils.dlpack import to_dlpack, from_dlpack +import torch +import torch.nn as nn +import torch.nn.functional as F +import numpy as np +import kaldifeat +import _kaldifeat +from typing import List +import json +import math +from typing import Optional, Tuple + +def make_pad_mask(lengths: torch.Tensor, max_len: int = 0) -> torch.Tensor: + """ + Args: + lengths: + A 1-D tensor containing sentence lengths. + max_len: + The length of masks. + Returns: + Return a 2-D bool tensor, where masked positions + are filled with `True` and non-masked positions are + filled with `False`. + >>> lengths = torch.tensor([1, 3, 2, 5]) + >>> make_pad_mask(lengths) + tensor([[False, True, True, True, True], + [False, False, False, True, True], + [False, False, True, True, True], + [False, False, False, False, False]]) + """ + assert lengths.ndim == 1, lengths.ndim + max_len = max(max_len, lengths.max()) + n = lengths.size(0) + seq_range = torch.arange(0, max_len, device=lengths.device) + expaned_lengths = seq_range.unsqueeze(0).expand(n, max_len) + + return expaned_lengths >= lengths.unsqueeze(-1) + + +class FrameReducer(torch.nn.Module): + """The encoder output is first used to calculate + the CTC posterior probability; then for each output frame, + if its blank posterior is bigger than some thresholds, + it will be simply discarded from the encoder output. + """ + + def __init__( + self, + ): + super().__init__() + + def forward( + self, + x: torch.Tensor, + x_lens: torch.Tensor, + ctc_output: torch.Tensor, + y_lens: Optional[torch.Tensor] = None, + blank_id: int = 0, + ) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Args: + x: + The shared encoder output with shape [N, T, C]. + x_lens: + A tensor of shape (batch_size,) containing the number of frames in + `x` before padding. + ctc_output: + The CTC output with shape [N, T, vocab_size]. + y_lens: + A tensor of shape (batch_size,) containing the number of frames in + `y` before padding. + blank_id: + The blank id of ctc_output. + Returns: + out: + The frame reduced encoder output with shape [N, T', C]. + out_lens: + A tensor of shape (batch_size,) containing the number of frames in + `out` before padding. + """ + N, T, C = x.size() + + padding_mask = make_pad_mask(x_lens) + left = ctc_output[:, :, blank_id] < math.log(0.9) + non_blank_mask = torch.logical_and(left.to(x.device), (~padding_mask)) + #non_blank_mask = left * (~padding_mask) + + out_lens = non_blank_mask.sum(dim=1).to(x.device) + max_len = out_lens.max() + + pad_lens_list = ( + torch.full_like( + out_lens, + max_len.item(), + device=x.device, + ) + - out_lens + ) + max_pad_len = pad_lens_list.max() + + out = F.pad(x, (0, 0, 0, max_pad_len)) + + valid_pad_mask = ~make_pad_mask(pad_lens_list) + total_valid_mask = torch.concat([non_blank_mask, valid_pad_mask], dim=1) + + out = out[total_valid_mask].reshape(N, -1, C) + + return out, out_lens + + +class TritonPythonModel: + """Your Python model must use the same class name. Every Python model + that is created must have "TritonPythonModel" as the class name. 
+ """ + + def initialize(self, args): + """`initialize` is called only once when the model is being loaded. + Implementing `initialize` function is optional. This function allows + the model to initialize any state associated with this model. + + Parameters + ---------- + args : dict + Both keys and values are strings. The dictionary keys and values are: + * model_config: A JSON string containing the model configuration + * model_instance_kind: A string containing model instance kind + * model_instance_device_id: A string containing model instance device ID + * model_repository: Model repository path + * model_version: Model version + * model_name: Model name + """ + self.model_config = model_config = json.loads(args['model_config']) + self.max_batch_size = max(model_config["max_batch_size"], 1) + + if "GPU" in model_config["instance_group"][0]["kind"]: + self.device = "cuda" + else: + self.device = "cpu" + + # Get INPUT configuration + encoder_config = pb_utils.get_input_config_by_name( + model_config, "x") + self.data_type = pb_utils.triton_string_to_numpy( + encoder_config['data_type']) + if self.data_type == np.float32: + self.torch_dtype = torch.float32 + else: + self.torch_dtype = torch.float16 + + # Get OUTPUT0 configuration + output0_config = pb_utils.get_output_config_by_name( + model_config, "out") + # Convert Triton types to numpy types + output0_dtype = pb_utils.triton_string_to_numpy( + output0_config['data_type']) + if output0_dtype == np.float32: + self.output0_dtype = torch.float32 + else: + self.output0_dtype = torch.float16 + + # Get OUTPUT1 configuration + output1_config = pb_utils.get_output_config_by_name( + model_config, "out_lens") + # Convert Triton types to numpy types + self.output1_dtype = pb_utils.triton_string_to_numpy( + output1_config['data_type']) + + params = self.model_config['parameters'] + self.frame_reducer= FrameReducer() + + + def execute(self, requests): + """`execute` must be implemented in every Python model. `execute` + function receives a list of pb_utils.InferenceRequest as the only + argument. This function is called when an inference is requested + for this model. + + Parameters + ---------- + requests : list + A list of pb_utils.InferenceRequest + + Returns + ------- + list + A list of pb_utils.InferenceResponse. The length of this list must + be the same as `requests` + """ + batch_encoder_out_list, batch_encoder_lens_list, batch_ctc_out_list = [], [], [] + batchsize_lists = [] + total_seqs = 0 + encoder_max_len = 0 + batch_masks = [] + + for request in requests: + # Perform inference on the request and append it to responses list... 
+ in_0 = pb_utils.get_input_tensor_by_name(request, "x") + in_1 = pb_utils.get_input_tensor_by_name(request, "x_lens") + + batch_encoder_out_list.append(from_dlpack(in_0.to_dlpack())) + encoder_max_len = max(encoder_max_len, batch_encoder_out_list[-1].shape[1]) + + cur_b_lens = from_dlpack(in_1.to_dlpack()) + + batch_encoder_lens_list.append(cur_b_lens) + cur_batchsize = cur_b_lens.shape[0] + batchsize_lists.append(cur_batchsize) + total_seqs += cur_batchsize + + encoder_out = torch.zeros((total_seqs, encoder_max_len, 384), + dtype=self.torch_dtype, device=self.device) + encoder_out_lens = torch.zeros(total_seqs, dtype=torch.int64, device=self.device) + st = 0 + + + for b in batchsize_lists: + t = batch_encoder_out_list.pop(0) + encoder_out[st:st + b, 0:t.shape[1]] = t + encoder_out_lens[st:st + b] = batch_encoder_lens_list.pop(0) + + st += b + + in_tensor_0 = pb_utils.Tensor.from_dlpack("encoder_out", to_dlpack(encoder_out)) + + inference_request = pb_utils.InferenceRequest( + model_name='ctc_model', + requested_output_names=['ctc_output'], + inputs=[in_tensor_0]) + + inference_response = inference_request.exec() + if inference_response.has_error(): + raise pb_utils.TritonModelException(inference_response.error().message()) + else: + ctc_out = pb_utils.get_output_tensor_by_name(inference_response, 'ctc_output') + ctc_out = from_dlpack(ctc_out.to_dlpack()) + + in_tensor_0 = pb_utils.Tensor.from_dlpack("lconv_input", to_dlpack(encoder_out)) + in_tensor_1 = pb_utils.Tensor.from_dlpack("lconv_input_lens", to_dlpack(encoder_out_lens.unsqueeze(-1))) + + input_tensors = [in_tensor_0, in_tensor_1] + inference_request = pb_utils.InferenceRequest( + model_name='lconv', + requested_output_names=['lconv_out'], + inputs=input_tensors) + + inference_response = inference_request.exec() + if inference_response.has_error(): + raise pb_utils.TritonModelException(inference_response.error().message()) + else: + lconv_out = pb_utils.get_output_tensor_by_name(inference_response, 'lconv_out') + lconv_out = from_dlpack(lconv_out.to_dlpack()) + + out, out_lens = self.frame_reducer(encoder_out, encoder_out_lens, ctc_out) + + st = 0 + responses = [] + for b in batchsize_lists: + speech = out[st:st+b] + speech_lengths = out_lens[st:st+b] + out0 = pb_utils.Tensor.from_dlpack("out", to_dlpack(speech)) + out1 = pb_utils.Tensor.from_dlpack("out_lens", + to_dlpack(speech_lengths)) + inference_response = pb_utils.InferenceResponse(output_tensors=[out0, out1]) + responses.append(inference_response) + return responses diff --git a/triton/zipformer/model_repo_offline_bs/frame_reducer/config.pbtxt b/triton/zipformer/model_repo_offline_bs/frame_reducer/config.pbtxt new file mode 100755 index 000000000..6a0ce7266 --- /dev/null +++ b/triton/zipformer/model_repo_offline_bs/frame_reducer/config.pbtxt @@ -0,0 +1,55 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +name: "frame_reducer" +backend: "python" + +max_batch_size: 512 +input [ + { + name: "x" + data_type: TYPE_FP32 + dims: [-1, -1 ] + }, + { + name: "x_lens" + data_type: TYPE_INT64 + dims: [1] + reshape: { shape: [ ] } + } +] +output [ + { + name: "out" + data_type: TYPE_FP32 + dims: [-1, 384] + }, + { + name: "out_lens" + data_type: TYPE_INT64 + dims: [1] + reshape: { shape: [ ] } + } + +] + +dynamic_batching { + } + +instance_group [ + { + count: 2 + kind: KIND_GPU + } +] diff --git a/triton/zipformer/model_repo_offline_bs/joiner/1/.gitkeep b/triton/zipformer/model_repo_offline_bs/joiner/1/.gitkeep new file mode 100755 index 000000000..e69de29bb diff --git a/triton/zipformer/model_repo_offline_bs/joiner/config.pbtxt b/triton/zipformer/model_repo_offline_bs/joiner/config.pbtxt new file mode 100755 index 000000000..5ccd50c35 --- /dev/null +++ b/triton/zipformer/model_repo_offline_bs/joiner/config.pbtxt @@ -0,0 +1,49 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +name: "joiner" +backend: "onnxruntime" +default_model_filename: "joiner.onnx" + +max_batch_size: 512 +input [ + { + name: "encoder_out" + data_type: TYPE_FP32 + dims: [ 512 ] + }, + { + name: "decoder_out" + data_type: TYPE_FP32 + dims: [ 512 ] + } +] + +output [ + { + name: "logit" + data_type: TYPE_FP32 + dims: [ 500 ] + } +] + +dynamic_batching { + } + +instance_group [ + { + count: 1 + kind: KIND_GPU + } +] diff --git a/triton/zipformer/model_repo_offline_bs/joiner_decoder_proj/config.pbtxt b/triton/zipformer/model_repo_offline_bs/joiner_decoder_proj/config.pbtxt new file mode 100755 index 000000000..c2d7672cc --- /dev/null +++ b/triton/zipformer/model_repo_offline_bs/joiner_decoder_proj/config.pbtxt @@ -0,0 +1,43 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +name: "joiner_decoder_proj" +backend: "onnxruntime" +default_model_filename: "joiner_decoder_proj.onnx" + +max_batch_size: 512 +input [ + { + name: "decoder_out" + data_type: TYPE_FP32 + dims: [ 512 ] + } +] +output [ + { + name: "projected_decoder_out" + data_type: TYPE_FP32 + dims: [ 512 ] + } +] + +dynamic_batching { + } + +instance_group [ + { + count: 2 + kind: KIND_GPU + } +] diff --git a/triton/zipformer/model_repo_offline_bs/joiner_encoder_proj/config.pbtxt b/triton/zipformer/model_repo_offline_bs/joiner_encoder_proj/config.pbtxt new file mode 100755 index 000000000..ce017624f --- /dev/null +++ b/triton/zipformer/model_repo_offline_bs/joiner_encoder_proj/config.pbtxt @@ -0,0 +1,43 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +name: "joiner_encoder_proj" +backend: "onnxruntime" +default_model_filename: "joiner_encoder_proj.onnx" + +max_batch_size: 512 +input [ + { + name: "encoder_out" + data_type: TYPE_FP32 + dims: [ 384 ] + } +] +output [ + { + name: "projected_encoder_out" + data_type: TYPE_FP32 + dims: [ 512 ] + } +] + +dynamic_batching { + } + +instance_group [ + { + count: 2 + kind: KIND_GPU + } +] diff --git a/triton/zipformer/model_repo_offline_bs/lconv/config.pbtxt b/triton/zipformer/model_repo_offline_bs/lconv/config.pbtxt new file mode 100755 index 000000000..3eb58ea03 --- /dev/null +++ b/triton/zipformer/model_repo_offline_bs/lconv/config.pbtxt @@ -0,0 +1,49 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +name: "lconv" +backend: "onnxruntime" +default_model_filename: "lconv.onnx" + +max_batch_size: 512 +input [ + { + name: "lconv_input" + data_type: TYPE_FP32 + dims: [-1, 384] + }, + { + name: "lconv_input_lens" + data_type: TYPE_INT64 + dims: [1] + reshape: { shape: [ ] } + } +] +output [ + { + name: "lconv_out" + data_type: TYPE_FP32 + dims: [-1, 384] + } +] + +dynamic_batching { + } + +instance_group [ + { + count: 2 + kind: KIND_GPU + } +] diff --git a/triton/zipformer/model_repo_offline_bs/scorer/1/model.py b/triton/zipformer/model_repo_offline_bs/scorer/1/model.py new file mode 100755 index 000000000..a531d39ef --- /dev/null +++ b/triton/zipformer/model_repo_offline_bs/scorer/1/model.py @@ -0,0 +1,181 @@ +# -*- coding: utf-8 -*- +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import triton_python_backend_utils as pb_utils
+import numpy as np
+
+import json
+
+import torch
+from torch.utils.dlpack import from_dlpack, to_dlpack
+import sentencepiece as spm
+from icefall.lexicon import Lexicon
+
+from search import greedy_search
+
+class TritonPythonModel:
+    """Your Python model must use the same class name. Every Python model
+    that is created must have "TritonPythonModel" as the class name.
+    """
+
+    def initialize(self, args):
+        """`initialize` is called only once when the model is being loaded.
+        Implementing `initialize` function is optional. This function allows
+        the model to initialize any state associated with this model.
+
+        Parameters
+        ----------
+        args : dict
+          Both keys and values are strings. The dictionary keys and values are:
+          * model_config: A JSON string containing the model configuration
+          * model_instance_kind: A string containing model instance kind
+          * model_instance_device_id: A string containing model instance device ID
+          * model_repository: Model repository path
+          * model_version: Model version
+          * model_name: Model name
+        """
+        self.model_config = model_config = json.loads(args['model_config'])
+        self.max_batch_size = max(model_config["max_batch_size"], 1)
+
+        # Get OUTPUT0 configuration
+        output0_config = pb_utils.get_output_config_by_name(
+            model_config, "OUTPUT0")
+        # Convert Triton types to numpy types
+        self.out0_dtype = pb_utils.triton_string_to_numpy(
+            output0_config['data_type'])
+
+        model_instance_kind = args['model_instance_kind']
+        model_instance_device_id = args['model_instance_device_id']
+        if model_instance_kind == 'GPU':
+            self.device = f'cuda:{model_instance_device_id}'
+        else:
+            self.device = 'cpu'
+
+        # Get INPUT configuration
+        encoder_config = pb_utils.get_input_config_by_name(
+            model_config, "encoder_out")
+        self.data_type = pb_utils.triton_string_to_numpy(
+            encoder_config['data_type'])
+        if self.data_type == np.float32:
+            self.torch_dtype = torch.float32
+        else:
+            assert self.data_type == np.float16
+            self.torch_dtype = torch.float16
+
+        self.encoder_dim = encoder_config['dims'][-1]
+
+        self.init_parameters(self.model_config['parameters'])
+
+    def init_parameters(self, parameters):
+        for key, value in parameters.items():
+            parameters[key] = value["string_value"]
+        self.context_size = int(parameters['context_size'])
+        self.decoding_method = parameters['decoding_method']
+        if 'bpe' in parameters['tokenizer_file']:
+            sp = spm.SentencePieceProcessor()
+            sp.load(parameters['tokenizer_file'])
+            self.blank_id = sp.piece_to_id("<blk>")
+            self.unk_id = sp.piece_to_id("<unk>")
+            self.vocab_size = sp.get_piece_size()
+            self.tokenizer = sp
+        else:
+            assert 'char' in parameters['tokenizer_file']
+            lexicon = Lexicon(parameters['tokenizer_file'])
+            self.unk_id = lexicon.token_table["<unk>"]
+            self.blank_id = lexicon.token_table["<blk>"]
+            self.vocab_size = max(lexicon.tokens) + 1
+            self.tokenizer = lexicon
+
+    def execute(self, requests):
+        """`execute` must be implemented in every Python model. `execute`
+        function receives a list of pb_utils.InferenceRequest as the only
+        argument.
This function is called when an inference is requested + for this model. + + Parameters + ---------- + requests : list + A list of pb_utils.InferenceRequest + + Returns + ------- + list + A list of pb_utils.InferenceResponse. The length of this list must + be the same as `requests` + """ + # Every Python backend must iterate through list of requests and create + # an instance of pb_utils.InferenceResponse class for each of them. You + # should avoid storing any of the input Tensors in the class attributes + # as they will be overridden in subsequent inference requests. You can + # make a copy of the underlying NumPy array and store it if it is + # required. + + batch_encoder_out_list, batch_encoder_lens_list = [], [] + batchsize_lists = [] + total_seqs = 0 + encoder_max_len = 0 + + for request in requests: + # Perform inference on the request and append it to responses list... + in_0 = pb_utils.get_input_tensor_by_name(request, "encoder_out") + in_1 = pb_utils.get_input_tensor_by_name(request, "encoder_out_lens") + assert not in_0.is_cpu() + batch_encoder_out_list.append(from_dlpack(in_0.to_dlpack())) + encoder_max_len = max(encoder_max_len, batch_encoder_out_list[-1].shape[1]) + cur_b_lens = from_dlpack(in_1.to_dlpack()) + batch_encoder_lens_list.append(cur_b_lens) + cur_batchsize = cur_b_lens.shape[0] + batchsize_lists.append(cur_batchsize) + total_seqs += cur_batchsize + + encoder_out = torch.zeros((total_seqs, encoder_max_len, self.encoder_dim), + dtype=self.torch_dtype, device=self.device) + encoder_out_lens = torch.zeros(total_seqs, dtype=torch.int64) + st = 0 + + for b in batchsize_lists: + t = batch_encoder_out_list.pop(0) + encoder_out[st:st + b, 0:t.shape[1]] = t + encoder_out_lens[st:st + b] = batch_encoder_lens_list.pop(0) + st += b + + if self.decoding_method == 'greedy_search': + ans = greedy_search(encoder_out, encoder_out_lens, self.context_size, self.unk_id, self.blank_id) + else: + raise NotImplementedError + results = [] + if hasattr(self.tokenizer, 'token_table'): + for i in range(len(ans)): + results.append([self.tokenizer.token_table[idx] for idx in ans[i]]) + else: + for hyp in self.tokenizer.decode(ans): + results.append(hyp.split()) + st = 0 + responses = [] + for b in batchsize_lists: + sents = np.array(results[st:st + b]) + out0 = pb_utils.Tensor("OUTPUT0", sents.astype(self.out0_dtype)) + inference_response = pb_utils.InferenceResponse(output_tensors=[out0]) + responses.append(inference_response) + st += b + return responses + + def finalize(self): + """`finalize` is called only once when the model is being unloaded. + Implementing `finalize` function is optional. This function allows + the model to perform any necessary clean ups before exit. + """ + print('Cleaning up...') diff --git a/triton/zipformer/model_repo_offline_bs/scorer/1/search.py b/triton/zipformer/model_repo_offline_bs/scorer/1/search.py new file mode 100644 index 000000000..2e758b3b8 --- /dev/null +++ b/triton/zipformer/model_repo_offline_bs/scorer/1/search.py @@ -0,0 +1,139 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import triton_python_backend_utils as pb_utils +import numpy as np + +import torch +from torch.utils.dlpack import from_dlpack, to_dlpack + +def forward_joiner(cur_encoder_out, decoder_out): + in_joiner_tensor_0 = pb_utils.Tensor.from_dlpack("encoder_out", to_dlpack(cur_encoder_out)) + in_joiner_tensor_1 = pb_utils.Tensor.from_dlpack("decoder_out", to_dlpack(decoder_out.squeeze(1))) + + inference_request = pb_utils.InferenceRequest( + model_name='joiner_encoder_proj', + requested_output_names=['projected_encoder_out'], + inputs=[in_joiner_tensor_0]) + inference_response = inference_request.exec() + if inference_response.has_error(): + raise pb_utils.TritonModelException(inference_response.error().message()) + else: + # Extract the output tensors from the inference response. + proj_encoder_out = pb_utils.get_output_tensor_by_name(inference_response, + 'projected_encoder_out') + proj_encoder_out = from_dlpack(proj_encoder_out.to_dlpack()) + + inference_request = pb_utils.InferenceRequest( + model_name='joiner_decoder_proj', + requested_output_names=['projected_decoder_out'], + inputs=[in_joiner_tensor_1]) + inference_response = inference_request.exec() + if inference_response.has_error(): + raise pb_utils.TritonModelException(inference_response.error().message()) + else: + # Extract the output tensors from the inference response. + proj_decoder_out = pb_utils.get_output_tensor_by_name(inference_response, + 'projected_decoder_out') + proj_decoder_out = from_dlpack(proj_decoder_out.to_dlpack()) + + + proj_encoder = pb_utils.Tensor.from_dlpack("encoder_out", to_dlpack(proj_encoder_out)) + proj_decoder = pb_utils.Tensor.from_dlpack("decoder_out", to_dlpack(proj_decoder_out)) + + inference_request = pb_utils.InferenceRequest( + model_name='joiner', + requested_output_names=['logit'], + inputs=[proj_encoder, proj_decoder]) + inference_response = inference_request.exec() + + if inference_response.has_error(): + raise pb_utils.TritonModelException(inference_response.error().message()) + else: + # Extract the output tensors from the inference response. + logits = pb_utils.get_output_tensor_by_name(inference_response, + 'logit') + logits = torch.utils.dlpack.from_dlpack(logits.to_dlpack()).cpu() + assert len(logits.shape) == 2, logits.shape + return logits + +def forward_decoder(hyps, context_size): + decoder_input = [h[-context_size:] for h in hyps] + + decoder_input = np.asarray(decoder_input,dtype=np.int64) + + in_decoder_input_tensor = pb_utils.Tensor("y", decoder_input) + + inference_request = pb_utils.InferenceRequest( + model_name='decoder', + requested_output_names=['decoder_out'], + inputs=[in_decoder_input_tensor]) + + inference_response = inference_request.exec() + if inference_response.has_error(): + raise pb_utils.TritonModelException(inference_response.error().message()) + else: + # Extract the output tensors from the inference response. 
+ decoder_out = pb_utils.get_output_tensor_by_name(inference_response, + 'decoder_out') + decoder_out = from_dlpack(decoder_out.to_dlpack()) + return decoder_out + + +def greedy_search(encoder_out, encoder_out_lens, context_size, unk_id, blank_id): + + packed_encoder_out = torch.nn.utils.rnn.pack_padded_sequence( + input=encoder_out, + lengths=encoder_out_lens.cpu(), + batch_first=True, + enforce_sorted=False + ) + + pack_batch_size_list = packed_encoder_out.batch_sizes.tolist() + + hyps = [[blank_id] * context_size for _ in range(encoder_out.shape[0])] + decoder_out = forward_decoder(hyps, context_size) + + offset = 0 + for batch_size in pack_batch_size_list: + start = offset + end = offset + batch_size + current_encoder_out = packed_encoder_out.data[start:end] + + offset = end + + decoder_out = decoder_out[:batch_size] + + logits = forward_joiner(current_encoder_out, decoder_out) + + assert logits.ndim == 2, logits.shape + y = logits.argmax(dim=1).tolist() + + emitted = False + for i, v in enumerate(y): + if v not in (blank_id, unk_id): + hyps[i].append(v) + emitted = True + if emitted: + decoder_out = forward_decoder(hyps[:batch_size], context_size) + + + sorted_ans = [h[context_size:] for h in hyps] + + ans = [] + unsorted_indices = packed_encoder_out.unsorted_indices.tolist() + for i in range(encoder_out.shape[0]): + ans.append(sorted_ans[unsorted_indices[i]]) + + return ans diff --git a/triton/model_repo/greedy_search/config.pbtxt b/triton/zipformer/model_repo_offline_bs/scorer/config.pbtxt similarity index 72% rename from triton/model_repo/greedy_search/config.pbtxt rename to triton/zipformer/model_repo_offline_bs/scorer/config.pbtxt index 34e83da1f..cc2c5f552 100755 --- a/triton/model_repo/greedy_search/config.pbtxt +++ b/triton/zipformer/model_repo_offline_bs/scorer/config.pbtxt @@ -1,4 +1,4 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-name: "greedy_search" +name: "scorer" backend: "python" max_batch_size: 512 @@ -22,25 +22,29 @@ parameters [ value: { string_value: "2"} }, { - key: "bpe_model", - value: { string_value: "/ws/bpe.model"} + key: "tokenizer_file", + value: { string_value: "/workspace/bpe.model"} }, { key: "FORCE_CPU_ONLY_INPUT_TENSORS", - value: {string_value:"yes"} + value: {string_value:"no"} + }, + { + key: "decoding_method", + value: { string_value: "greedy_search"} } ] input [ { - name: "encoder_out__0" + name: "encoder_out" data_type: TYPE_FP32 - dims: [-1, 512] # [-1, encoder_out_dim] + dims: [-1, 384] }, { - name: "encoder_out_lens__1" - data_type: TYPE_INT32 + name: "encoder_out_lens" + data_type: TYPE_INT64 dims: [1] reshape: { shape: [ ] } } @@ -55,7 +59,6 @@ output [ ] dynamic_batching { - preferred_batch_size: [ 16, 32 ] } instance_group [ { diff --git a/triton/zipformer/model_repo_offline_bs/transducer/1/.gitkeep b/triton/zipformer/model_repo_offline_bs/transducer/1/.gitkeep new file mode 100755 index 000000000..e69de29bb diff --git a/triton/zipformer/model_repo_offline_bs/transducer/config.pbtxt b/triton/zipformer/model_repo_offline_bs/transducer/config.pbtxt new file mode 100644 index 000000000..f9a07661f --- /dev/null +++ b/triton/zipformer/model_repo_offline_bs/transducer/config.pbtxt @@ -0,0 +1,119 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +name: "transducer" +platform: "ensemble" +max_batch_size: 512 + +input [ + { + name: "WAV" + data_type: TYPE_FP32 + dims: [-1] + }, + { + name: "WAV_LENS" + data_type: TYPE_INT32 + dims: [1] + } +] + +output [ + { + name: "TRANSCRIPTS" + data_type: TYPE_STRING + dims: [1] + } +] + +ensemble_scheduling { + step [ + { + model_name: "feature_extractor" + model_version: -1 + input_map { + key: "wav" + value: "WAV" + } + input_map { + key: "wav_lens" + value: "WAV_LENS" + } + output_map { + key: "speech" + value: "SPEECH" + } + output_map { + key: "speech_lengths" + value: "SPEECH_LENGTHS" + } + }, + { + model_name: "encoder" + model_version: -1 + input_map { + key: "x" + value: "SPEECH" + } + input_map { + key: "x_lens" + value: "SPEECH_LENGTHS" + } + output_map { + key: "encoder_out" + value: "encoder_out" + } + output_map { + key: "encoder_out_lens" + value: "encoder_out_lens" + } + }, + { + model_name: "frame_reducer" + model_version: -1 + input_map { + key: "x" + value: "encoder_out" + } + input_map { + key: "x_lens" + value: "encoder_out_lens" + } + output_map { + key: "out" + value: "out" + } + output_map { + key: "out_lens" + value: "out_lens" + } + }, + { + model_name: "scorer" + model_version: -1 + input_map { + key: "encoder_out" + value: "out" + } + input_map { + key: "encoder_out_lens" + value: "out_lens" + } + output_map { + key: "OUTPUT0" + value: "TRANSCRIPTS" + } + } + ] +}
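
A note on the blank-skipping step above: FrameReducer in triton/zipformer/model_repo_offline_bs/frame_reducer/1/model.py drops every frame whose CTC blank log-probability is at or above log(0.9) before the scorer runs. The toy below is only a standalone sketch with made-up tensors (none of these variable names or values come from this change); it is not a reference implementation.

import math
import torch

# Toy illustration of the FrameReducer idea (sketch, not part of the PR):
# frames whose CTC blank log-probability is >= log(0.9) are discarded.
N, T, C, vocab_size, blank_id = 1, 6, 4, 3, 0
x = torch.randn(N, T, C)                          # stand-in for the encoder output
ctc_output = torch.full((N, T, vocab_size), math.log(0.05))
ctc_output[:, :, blank_id] = math.log(0.95)       # every frame looks like blank ...
ctc_output[0, [1, 4], blank_id] = math.log(0.2)   # ... except frames 1 and 4

keep = ctc_output[:, :, blank_id] < math.log(0.9)
print(keep.sum(dim=1))                            # tensor([2]), i.e. out_lens
print(x[keep].reshape(N, -1, C).shape)            # torch.Size([1, 2, 4])
# The real FrameReducer also masks padded frames and re-pads each sequence to the
# longest reduced length so the output stays a dense [N, T', C] tensor.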
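For the new Whisper TensorRT-LLM deployment, a minimal Python client sketch against the infer_bls entry point defined in triton/whisper/model_repo_whisper_trtllm/infer_bls/config.pbtxt could look as follows. The gRPC port 8001, the file test.wav, and the decoding prompt are illustrative assumptions; only the model name, tensor names, and dtypes come from the config above.

import numpy as np
import soundfile as sf
import tritonclient.grpc as grpcclient
from tritonclient.utils import np_to_triton_dtype

# Assumptions: a server launched by prepare.sh is listening on localhost:8001
# and test.wav is a 16 kHz mono recording; both are placeholders.
client = grpcclient.InferenceServerClient(url="localhost:8001")

samples, _sr = sf.read("test.wav", dtype="float32")
wav = samples[np.newaxis, :]                           # [1, num_samples]
wav_lens = np.array([[wav.shape[1]]], dtype=np.int32)  # [1, 1]
text_prefix = np.array(
    [["<|startoftranscript|><|en|><|transcribe|><|notimestamps|>".encode()]],
    dtype=object)

inputs = [
    grpcclient.InferInput("TEXT_PREFIX", list(text_prefix.shape), "BYTES"),
    grpcclient.InferInput("WAV", list(wav.shape), np_to_triton_dtype(wav.dtype)),
    grpcclient.InferInput("WAV_LENS", list(wav_lens.shape), np_to_triton_dtype(wav_lens.dtype)),
]
inputs[0].set_data_from_numpy(text_prefix)
inputs[1].set_data_from_numpy(wav)
inputs[2].set_data_from_numpy(wav_lens)

result = client.infer(
    model_name="infer_bls",
    inputs=inputs,
    outputs=[grpcclient.InferRequestedOutput("TRANSCRIPTS")],
)
transcript = result.as_numpy("TRANSCRIPTS").reshape(-1)[0]
print(transcript.decode("utf-8") if isinstance(transcript, bytes) else transcript)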
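Similarly, a small shape sanity check for the log-Mel front end in triton/whisper/model_repo_whisper_trtllm/whisper/1/fbank.py: with n_fft=400 and hop_length=160, one second of 16 kHz audio yields 100 frames, which compute_feature then zero-pads to 3000 unless zero_pad is enabled. Only those constants come from fbank.py; the waveform and filterbank below are random stand-ins.

import torch

# Shape check for the log-Mel front end (random stand-ins, not real data).
n_fft, hop_length, n_mels = 400, 160, 128
audio = torch.randn(16000)                       # one second of "audio" at 16 kHz
filters = torch.randn(n_mels, n_fft // 2 + 1)    # stand-in for mel_filters.npz

window = torch.hann_window(n_fft)
stft = torch.stft(audio, n_fft, hop_length, window=window, return_complex=True)
magnitudes = stft[..., :-1].abs() ** 2           # drop the last frame, as in fbank.py
log_spec = torch.clamp(filters @ magnitudes, min=1e-10).log10()
print(log_spec.shape)                            # torch.Size([128, 100])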