Skip to content

Commit

Permalink
Add spoken language identification for node-addon-api (#872)
Browse files Browse the repository at this point in the history
  • Loading branch information
csukuangfj authored May 13, 2024
1 parent 031134b commit 939fdd9
Show file tree
Hide file tree
Showing 13 changed files with 445 additions and 1 deletion.
2 changes: 2 additions & 0 deletions .github/scripts/node-addon/run.sh
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@ fi
SHERPA_ONNX_VERSION=$(grep "SHERPA_ONNX_VERSION" ./CMakeLists.txt | cut -d " " -f 2 | cut -d '"' -f 2)
echo "SHERPA_ONNX_VERSION $SHERPA_ONNX_VERSION"

# SHERPA_ONNX_VERSION=1.0.20

# Default to the upstream k2-fsa organization when no owner is given.
# $owner is quoted: unquoted, an empty value leaves `[ -z ]`, which only
# passes by accident, and a value with spaces breaks the test entirely.
if [ -z "$owner" ]; then
  owner=k2-fsa
fi
Expand Down
14 changes: 14 additions & 0 deletions .github/scripts/test-nodejs-addon-npm.sh
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,20 @@ d=nodejs-addon-examples
echo "dir: $d"
cd $d

echo "----------spoken language identification----------"

# Multi-lingual Whisper tiny model (encoder/decoder) used by the
# spoken-language-identification example below.
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-whisper-tiny.tar.bz2
tar xvf sherpa-onnx-whisper-tiny.tar.bz2
rm sherpa-onnx-whisper-tiny.tar.bz2

# Small set of wave files in several languages used as test input.
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/spoken-language-identification-test-wavs.tar.bz2
tar xvf spoken-language-identification-test-wavs.tar.bz2
rm spoken-language-identification-test-wavs.tar.bz2

node ./test_spoken_language_identification.js
# Clean up so later sections start from a pristine directory.
rm -rf sherpa-onnx-whisper-tiny
rm -rf spoken-language-identification-test-wavs

echo "----------streaming asr----------"

curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20.tar.bz2
Expand Down
1 change: 1 addition & 0 deletions .github/workflows/npm-addon.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ jobs:
SHERPA_ONNX_VERSION=$(grep "SHERPA_ONNX_VERSION" ./CMakeLists.txt | cut -d " " -f 2 | cut -d '"' -f 2)
echo "SHERPA_ONNX_VERSION $SHERPA_ONNX_VERSION"
# SHERPA_ONNX_VERSION=1.0.20
src_dir=.github/scripts/node-addon
sed -i.bak s/SHERPA_ONNX_VERSION/$SHERPA_ONNX_VERSION/g $src_dir/package.json
Expand Down
18 changes: 18 additions & 0 deletions nodejs-addon-examples/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -183,3 +183,21 @@ rm vits-icefall-zh-aishell3.tar.bz2

node ./test_tts_non_streaming_vits_zh_aishell3.js
```

## Spoken language identification with Whisper multi-lingual models

```bash
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-whisper-tiny.tar.bz2
tar xvf sherpa-onnx-whisper-tiny.tar.bz2
rm sherpa-onnx-whisper-tiny.tar.bz2

wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/spoken-language-identification-test-wavs.tar.bz2
tar xvf spoken-language-identification-test-wavs.tar.bz2
rm spoken-language-identification-test-wavs.tar.bz2

node ./test_spoken_language_identification.js

# To run VAD + spoken language identification using a microphone
npm install naudiodon2
node ./test_vad_spoken_language_identification_microphone.js
```
40 changes: 40 additions & 0 deletions nodejs-addon-examples/test_spoken_language_identification.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
// Copyright (c) 2023-2024 Xiaomi Corporation (authors: Fangjun Kuang)

const sherpa_onnx = require('sherpa-onnx-node');

/**
 * Build a SpokenLanguageIdentification instance backed by the int8-quantized
 * multi-lingual Whisper tiny model (paths are relative to the working dir).
 * @returns {sherpa_onnx.SpokenLanguageIdentification}
 */
function createSpokenLanguageID() {
  const whisper = {
    encoder: './sherpa-onnx-whisper-tiny/tiny-encoder.int8.onnx',
    decoder: './sherpa-onnx-whisper-tiny/tiny-decoder.int8.onnx',
  };
  return new sherpa_onnx.SpokenLanguageIdentification({
    whisper,
    debug: true,
    numThreads: 1,
    provider: 'cpu',
  });
}

// Identify the spoken language of each test wave and print a line of the
// form "<file> <code> <English name>", e.g. "en-english.wav en English".
const slid = createSpokenLanguageID();

const testWaves = [
  './spoken-language-identification-test-wavs/ar-arabic.wav',
  './spoken-language-identification-test-wavs/de-german.wav',
  './spoken-language-identification-test-wavs/en-english.wav',
  './spoken-language-identification-test-wavs/fr-french.wav',
  './spoken-language-identification-test-wavs/pt-portuguese.wav',
  './spoken-language-identification-test-wavs/es-spanish.wav',
  './spoken-language-identification-test-wavs/zh-chinese.wav',
];

// Maps an ISO language code (e.g. 'en') to its English display name.
const display = new Intl.DisplayNames(['en'], {type: 'language'});

for (const f of testWaves) {
  const stream = slid.createStream();

  const wave = sherpa_onnx.readWave(f);
  stream.acceptWaveform({sampleRate: wave.sampleRate, samples: wave.samples});

  const lang = slid.compute(stream);
  // Print the basename; .pop() makes this independent of the path depth,
  // unlike the previous hard-coded split('/')[2].
  console.log(f.split('/').pop(), lang, display.of(lang));
}
2 changes: 1 addition & 1 deletion nodejs-addon-examples/test_vad_microphone.js
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ function createVad() {
return new sherpa_onnx.Vad(config, bufferSizeInSeconds);
}

vad = createVad();
const vad = createVad();

const bufferSizeInSeconds = 30;
const buffer =
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
// Copyright (c) 2023-2024 Xiaomi Corporation (authors: Fangjun Kuang)

const portAudio = require('naudiodon2');
// console.log(portAudio.getDevices());

const sherpa_onnx = require('sherpa-onnx-node');

/**
 * Build a silero-based voice activity detector.
 * Please download silero_vad.onnx from
 * https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx
 * @returns {sherpa_onnx.Vad}
 */
function createVad() {
  const sileroVad = {
    model: './silero_vad.onnx',
    threshold: 0.5,
    minSpeechDuration: 0.25,
    minSilenceDuration: 0.5,
    windowSize: 512,
  };

  const config = {
    sileroVad,
    sampleRate: 16000,
    debug: true,
    numThreads: 1,
  };

  // Seconds of audio the VAD's own internal buffer can hold.
  const bufferSizeInSeconds = 60;

  return new sherpa_onnx.Vad(config, bufferSizeInSeconds);
}

// Please download test files from
// https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models
/**
 * Build a SpokenLanguageIdentification instance from the int8-quantized
 * multi-lingual Whisper tiny model.
 * @returns {sherpa_onnx.SpokenLanguageIdentification}
 */
function createSpokenLanguageID() {
  const encoder = './sherpa-onnx-whisper-tiny/tiny-encoder.int8.onnx';
  const decoder = './sherpa-onnx-whisper-tiny/tiny-decoder.int8.onnx';

  const config = {
    whisper: {encoder, decoder},
    debug: true,
    numThreads: 1,
    provider: 'cpu',
  };

  return new sherpa_onnx.SpokenLanguageIdentification(config);
}

const slid = createSpokenLanguageID();
const vad = createVad();

// Maps an ISO language code (e.g. 'en') to its English display name.
const display = new Intl.DisplayNames(['en'], {type: 'language'});

// Circular buffer holding raw microphone samples until they can be fed
// to the VAD in windowSize-sized chunks.
const bufferSizeInSeconds = 30;
const buffer =
    new sherpa_onnx.CircularBuffer(bufferSizeInSeconds * vad.config.sampleRate);

const ai = new portAudio.AudioIO({
  inOptions: {
    channelCount: 1,
    closeOnError: true,  // Close the stream if an audio error is detected, if
                         // set false then just log the error
    deviceId: -1,  // Use -1 or omit the deviceId to select the default device
    sampleFormat: portAudio.SampleFormatFloat32,
    sampleRate: vad.config.sampleRate,
  }
});

let printed = false;  // avoid logging "Detected speech" repeatedly
let index = 0;        // running counter used in log lines and file names
ai.on('data', data => {
  const windowSize = vad.config.sileroVad.windowSize;
  buffer.push(new Float32Array(data.buffer));
  while (buffer.size() > windowSize) {
    const samples = buffer.get(buffer.head(), windowSize);
    buffer.pop(windowSize);
    vad.acceptWaveform(samples);
    if (vad.isDetected() && !printed) {
      console.log(`${index}: Detected speech`);
      printed = true;
    }

    if (!vad.isDetected()) {
      printed = false;
    }

    // Drain every finished speech segment: identify its language and save
    // the audio to a wave file named after the detected language.
    while (!vad.isEmpty()) {
      const segment = vad.front();
      vad.pop();

      const stream = slid.createStream();
      stream.acceptWaveform(
          {samples: segment.samples, sampleRate: vad.config.sampleRate});
      const lang = slid.compute(stream);
      const fullLang = display.of(lang);

      const filename = `${index}-${fullLang}-${
          new Date()
              .toLocaleTimeString('en-US', {hour12: false})
              .split(' ')[0]}.wav`;
      sherpa_onnx.writeWave(
          filename,
          {samples: segment.samples, sampleRate: vad.config.sampleRate});
      const duration = segment.samples.length / vad.config.sampleRate;
      console.log(`${index} End of speech. Duration: ${
          duration} seconds.\n Detected language: ${fullLang}`);
      // Fixed: previously logged a broken placeholder instead of the
      // actual output file name.
      console.log(`Saved to ${filename}`);
      index += 1;
    }
  }
});

ai.on('close', () => {
  console.log('Free resources');
});

ai.start();
console.log('Started! Please speak');
1 change: 1 addition & 0 deletions scripts/node-addon-api/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ set(srcs
src/non-streaming-asr.cc
src/non-streaming-tts.cc
src/sherpa-onnx-node-addon-api.cc
src/spoken-language-identification.cc
src/streaming-asr.cc
src/vad.cc
src/wave-reader.cc
Expand Down
1 change: 1 addition & 0 deletions scripts/node-addon-api/lib/non-streaming-asr.js
Original file line number Diff line number Diff line change
Expand Up @@ -37,4 +37,5 @@ class OfflineRecognizer {

module.exports = {
OfflineRecognizer,
OfflineStream,
}
2 changes: 2 additions & 0 deletions scripts/node-addon-api/lib/sherpa-onnx.js
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ const streaming_asr = require('./streaming-asr.js');
const non_streaming_asr = require('./non-streaming-asr.js');
const non_streaming_tts = require('./non-streaming-tts.js');
const vad = require('./vad.js');
const slid = require('./spoken-language-identification.js');

module.exports = {
OnlineRecognizer: streaming_asr.OnlineRecognizer,
Expand All @@ -13,4 +14,5 @@ module.exports = {
Display: streaming_asr.Display,
Vad: vad.Vad,
CircularBuffer: vad.CircularBuffer,
SpokenLanguageIdentification: slid.SpokenLanguageIdentification,
}
30 changes: 30 additions & 0 deletions scripts/node-addon-api/lib/spoken-language-identification.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
const addon = require('./addon.js');
const non_streaming_asr = require('./non-streaming-asr.js');

class SpokenLanguageIdentification {
constructor(config) {
this.handle = addon.createSpokenLanguageIdentification(config);
this.config = config;
}

createStream() {
return new non_streaming_asr.OfflineStream(
addon.createSpokenLanguageIdentificationOfflineStream(this.handle));
}

// return a string containing the language code (2 characters),
// e.g., en, de, fr, es, zh
// en -> English
// de -> German
// fr -> French
// es -> Spanish
// zh -> Chinese
compute(stream) {
return addon.spokenLanguageIdentificationCompute(
this.handle, stream.handle);
}
}

module.exports = {
SpokenLanguageIdentification,
}
3 changes: 3 additions & 0 deletions scripts/node-addon-api/src/sherpa-onnx-node-addon-api.cc
Original file line number Diff line number Diff line change
Expand Up @@ -15,13 +15,16 @@ void InitWaveReader(Napi::Env env, Napi::Object exports);

void InitWaveWriter(Napi::Env env, Napi::Object exports);

void InitSpokenLanguageID(Napi::Env env, Napi::Object exports);

// Addon entry point: registers every native sub-module on the addon's
// exports object. Each Init* helper is declared above and defined in its
// own translation unit.
Napi::Object Init(Napi::Env env, Napi::Object exports) {
InitStreamingAsr(env, exports);
InitNonStreamingAsr(env, exports);
InitNonStreamingTts(env, exports);
InitVad(env, exports);
InitWaveReader(env, exports);
InitWaveWriter(env, exports);
// Added in this commit: spoken language identification bindings.
InitSpokenLanguageID(env, exports);

return exports;
}
Expand Down
Loading

0 comments on commit 939fdd9

Please sign in to comment.