Skip to content

Commit

Permalink
Add speaker identification APIs for node-addon-api (#874)
Browse files Browse the repository at this point in the history
  • Loading branch information
csukuangfj authored May 14, 2024
1 parent 0895b64 commit 388e6a9
Show file tree
Hide file tree
Showing 16 changed files with 1,034 additions and 3 deletions.
2 changes: 1 addition & 1 deletion .github/scripts/node-addon/run.sh
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ fi
SHERPA_ONNX_VERSION=$(grep "SHERPA_ONNX_VERSION" ./CMakeLists.txt | cut -d " " -f 2 | cut -d '"' -f 2)
echo "SHERPA_ONNX_VERSION $SHERPA_ONNX_VERSION"

# SHERPA_ONNX_VERSION=1.0.20
# SHERPA_ONNX_VERSION=1.0.21

if [ -z $owner ]; then
owner=k2-fsa
Expand Down
10 changes: 10 additions & 0 deletions .github/scripts/test-nodejs-addon-npm.sh
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,16 @@ d=nodejs-addon-examples
echo "dir: $d"
cd $d

echo "----------speaker identification----------"
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-recongition-models/3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx

git clone https://github.com/csukuangfj/sr-data

node ./test_speaker_identification.js

rm *.onnx
rm -rf sr-data

echo "----------spoken language identification----------"

curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-whisper-tiny.tar.bz2
Expand Down
3 changes: 3 additions & 0 deletions .github/workflows/npm-addon-linux-aarch64.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
name: npm-addon-linux-aarch64

on:
push:
branches:
- node-addon
workflow_dispatch:

concurrency:
Expand Down
3 changes: 3 additions & 0 deletions .github/workflows/npm-addon-linux-x64.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
name: npm-addon-linux-x64

on:
push:
branches:
- node-addon
workflow_dispatch:

concurrency:
Expand Down
3 changes: 3 additions & 0 deletions .github/workflows/npm-addon-macos.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
name: npm-addon-macos

on:
push:
branches:
- node-addon
workflow_dispatch:

concurrency:
Expand Down
3 changes: 3 additions & 0 deletions .github/workflows/npm-addon-win-x64.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
name: npm-addon-win-x64

on:
push:
branches:
- node-addon
workflow_dispatch:

concurrency:
Expand Down
5 changes: 4 additions & 1 deletion .github/workflows/npm-addon.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
name: npm-addon

on:
push:
branches:
- node-addon
workflow_dispatch:

concurrency:
Expand Down Expand Up @@ -52,7 +55,7 @@ jobs:
SHERPA_ONNX_VERSION=$(grep "SHERPA_ONNX_VERSION" ./CMakeLists.txt | cut -d " " -f 2 | cut -d '"' -f 2)
echo "SHERPA_ONNX_VERSION $SHERPA_ONNX_VERSION"
# SHERPA_ONNX_VERSION=1.0.20
# SHERPA_ONNX_VERSION=1.0.21
src_dir=.github/scripts/node-addon
sed -i.bak s/SHERPA_ONNX_VERSION/$SHERPA_ONNX_VERSION/g $src_dir/package.json
Expand Down
13 changes: 13 additions & 0 deletions nodejs-addon-examples/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -201,3 +201,16 @@ node ./test_spoken_language_identification.js
npm install naudiodon2
node ./test_vad_spoken_language_identification_microphone.js
```

## Speaker identification

You can find more models at
<https://github.com/k2-fsa/sherpa-onnx/releases/tag/speaker-recongition-models>

```bash
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-recongition-models/3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx

git clone https://github.com/csukuangfj/sr-data

node ./test_speaker_identification.js
```
1 change: 0 additions & 1 deletion nodejs-addon-examples/test_asr_non_streaming_nemo_ctc.js
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
const sherpa_onnx = require('sherpa-onnx-node');
const performance = require('perf_hooks').performance;


// Please download test files from
// https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models
const config = {
Expand Down
102 changes: 102 additions & 0 deletions nodejs-addon-examples/test_speaker_identification.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
// Copyright (c) 2024 Xiaomi Corporation
const sherpa_onnx = require('sherpa-onnx-node');
const assert = require('node:assert');

// Please download model files from
// https://github.com/k2-fsa/sherpa-onnx/releases/tag/speaker-recongition-models
/**
 * Build the speaker embedding extractor used by this example.
 *
 * The model path is relative to the current working directory, so the
 * .onnx file must be downloaded there before running the script.
 *
 * @returns {object} A new sherpa_onnx.SpeakerEmbeddingExtractor instance.
 */
function createSpeakerEmbeddingExtractor() {
  const modelPath =
      './3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx';

  return new sherpa_onnx.SpeakerEmbeddingExtractor({
    model: modelPath,
    numThreads: 1,
    debug: true,
  });
}

/**
 * Compute the speaker embedding of one wave file.
 *
 * A fresh stream is created for every call, the whole file is fed into it
 * in one go, and the extractor is then asked for the embedding.
 *
 * @param {object} extractor - Object exposing createStream() and compute().
 * @param {string} filename - Path handed to sherpa_onnx.readWave().
 * @returns {*} Whatever extractor.compute() returns for the stream.
 */
function computeEmbedding(extractor, filename) {
  const audioStream = extractor.createStream();

  // Only the sample rate and the samples are forwarded to the stream.
  const {sampleRate, samples} = sherpa_onnx.readWave(filename);
  audioStream.acceptWaveform({sampleRate, samples});

  return extractor.compute(audioStream);
}

// End-to-end example: enroll two speakers, identify three test utterances,
// then check that verification stops succeeding once a speaker is removed.

const extractor = createSpeakerEmbeddingExtractor();
// The manager is created with the extractor's embedding dimension.
const manager = new sherpa_onnx.SpeakerEmbeddingManager(extractor.dim);

// Please download test files from
// https://github.com/csukuangfj/sr-data
const spk1Files = [
  './sr-data/enroll/fangjun-sr-1.wav',
  './sr-data/enroll/fangjun-sr-2.wav',
  './sr-data/enroll/fangjun-sr-3.wav',
];

// One embedding per enrollment file.
const spk1Vec = spk1Files.map((f) => computeEmbedding(extractor, f));

const spk2Files = [
  './sr-data/enroll/leijun-sr-1.wav',
  './sr-data/enroll/leijun-sr-2.wav',
];

const spk2Vec = spk2Files.map((f) => computeEmbedding(extractor, f));

// `ok` is reused for the result of every manager call checked below.
let ok = manager.addMulti({name: 'fangjun', v: spk1Vec});
assert.equal(ok, true);

ok = manager.addMulti({name: 'leijun', v: spk2Vec});
assert.equal(ok, true);

assert.equal(manager.getNumSpeakers(), 2);

assert.equal(manager.contains('fangjun'), true);
assert.equal(manager.contains('leijun'), true);

console.log('---All speakers---');

console.log(manager.getAllSpeakerNames());
console.log('------------');

const testFiles = [
  './sr-data/test/fangjun-test-sr-1.wav',
  './sr-data/test/leijun-test-sr-1.wav',
  './sr-data/test/liudehua-test-sr-1.wav',
];

const threshold = 0.6;

for (const f of testFiles) {
  const embedding = computeEmbedding(extractor, f);

  // An empty string from search() is reported as an unknown speaker.
  const found = manager.search({v: embedding, threshold});
  const name = found === '' ? '<Unknown>' : found;
  console.log(`${f}: ${name}`);
}

// The first test file is expected to verify as 'fangjun' while enrolled.
ok = manager.verify({
  name: 'fangjun',
  v: computeEmbedding(extractor, testFiles[0]),
  threshold,
});
assert.equal(ok, true);

ok = manager.remove('fangjun');
assert.equal(ok, true);

// After removal the same utterance must no longer verify.
ok = manager.verify({
  name: 'fangjun',
  v: computeEmbedding(extractor, testFiles[0]),
  threshold,
});
assert.equal(ok, false);

assert.equal(manager.getNumSpeakers(), 1);
1 change: 1 addition & 0 deletions scripts/node-addon-api/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ set(srcs
src/non-streaming-asr.cc
src/non-streaming-tts.cc
src/sherpa-onnx-node-addon-api.cc
src/speaker-identification.cc
src/spoken-language-identification.cc
src/streaming-asr.cc
src/vad.cc
Expand Down
3 changes: 3 additions & 0 deletions scripts/node-addon-api/lib/sherpa-onnx.js
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ const non_streaming_asr = require('./non-streaming-asr.js');
const non_streaming_tts = require('./non-streaming-tts.js');
const vad = require('./vad.js');
const slid = require('./spoken-language-identification.js');
const sid = require('./speaker-identification.js');

module.exports = {
OnlineRecognizer: streaming_asr.OnlineRecognizer,
Expand All @@ -15,4 +16,6 @@ module.exports = {
Vad: vad.Vad,
CircularBuffer: vad.CircularBuffer,
SpokenLanguageIdentification: slid.SpokenLanguageIdentification,
SpeakerEmbeddingExtractor: sid.SpeakerEmbeddingExtractor,
SpeakerEmbeddingManager: sid.SpeakerEmbeddingManager,
}
102 changes: 102 additions & 0 deletions scripts/node-addon-api/lib/speaker-identification.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
const addon = require('./addon.js');
const streaming_asr = require('./streaming-asr.js');

/**
 * JavaScript wrapper around the native speaker embedding extractor.
 *
 * Every method forwards to the corresponding `addon` function, passing the
 * native handle created in the constructor.
 */
class SpeakerEmbeddingExtractor {
  /**
   * @param {object} config - Passed unchanged to
   *     addon.createSpeakerEmbeddingExtractor() and kept as `this.config`.
   */
  constructor(config) {
    const nativeHandle = addon.createSpeakerEmbeddingExtractor(config);

    this.handle = nativeHandle;
    this.config = config;
    // Embedding dimension as reported by the native extractor.
    this.dim = addon.speakerEmbeddingExtractorDim(nativeHandle);
  }

  /**
   * Create a new stream for feeding audio to this extractor.
   *
   * @returns {object} A streaming_asr.OnlineStream wrapping the native stream.
   */
  createStream() {
    const StreamClass = streaming_asr.OnlineStream;
    const streamHandle =
        addon.speakerEmbeddingExtractorCreateStream(this.handle);
    return new StreamClass(streamHandle);
  }

  /**
   * @param {object} stream - A stream obtained from createStream().
   * @returns {*} Result of the native readiness check for that stream.
   */
  isReady(stream) {
    const {handle: streamHandle} = stream;
    return addon.speakerEmbeddingExtractorIsReady(this.handle, streamHandle);
  }

  /**
   * Compute the embedding for the audio accepted by `stream`.
   *
   * @param {object} stream - A stream obtained from createStream().
   * @returns {*} The embedding as a float32 array.
   */
  compute(stream) {
    const {handle: streamHandle} = stream;
    return addon.speakerEmbeddingExtractorComputeEmbedding(
        this.handle, streamHandle);
  }
}

/**
 * Concatenate a list of numeric arrays into a single Float32Array.
 *
 * @param {Array<ArrayLike<number>>} arrayList - Arrays to join, in order.
 * @returns {Float32Array} All elements of every input array, back to back.
 */
function flatten(arrayList) {
  // First pass: total number of elements, so the result is allocated once.
  let total = 0;
  for (const chunk of arrayList) {
    total += chunk.length;
  }

  const ans = new Float32Array(total);

  // Second pass: copy each chunk to its position in the result.
  let writePos = 0;
  for (const chunk of arrayList) {
    ans.set(chunk, writePos);
    writePos += chunk.length;
  }

  return ans;
}

/**
 * JavaScript wrapper around the native speaker embedding manager.
 *
 * Each method forwards to the matching `addon` function with the native
 * handle created in the constructor and returns the native result unchanged.
 */
class SpeakerEmbeddingManager {
  /**
   * @param {number} dim - Embedding dimension, forwarded to the native
   *     constructor and kept as `this.dim`.
   */
  constructor(dim) {
    const nativeHandle = addon.createSpeakerEmbeddingManager(dim);

    this.handle = nativeHandle;
    this.dim = dim;
  }

  /**
   * Register a single embedding for a speaker.
   *
   * @param {object} obj - {name: "xxx", v: a-float32-array}
   */
  add(obj) {
    const {handle} = this;
    return addon.speakerEmbeddingManagerAdd(handle, obj);
  }

  /**
   * Register several embeddings for one speaker.
   *
   * The embeddings are concatenated into one flat Float32Array before being
   * handed to the native layer together with their count.
   *
   * @param {object} obj -
   *     {name: "xxx", v: [float32_array1, float32_array2, ..., float32_arrayn]}
   */
  addMulti(obj) {
    const speakerName = obj.name;
    const flattened = flatten(obj.v);
    const count = obj.v.length;

    return addon.speakerEmbeddingManagerAddListFlattened(
        this.handle, {name: speakerName, vv: flattened, n: count});
  }

  /**
   * @param {string} name - Speaker name to remove.
   */
  remove(name) {
    const {handle} = this;
    return addon.speakerEmbeddingManagerRemove(handle, name);
  }

  /**
   * Look up the speaker matching an embedding.
   *
   * @param {object} obj - {v: a-float32-array, threshold: a-float}
   */
  search(obj) {
    const {handle} = this;
    return addon.speakerEmbeddingManagerSearch(handle, obj);
  }

  /**
   * Check an embedding against one named speaker.
   *
   * @param {object} obj - {name: 'xxx', v: a-float32-array, threshold: a-float}
   */
  verify(obj) {
    const {handle} = this;
    return addon.speakerEmbeddingManagerVerify(handle, obj);
  }

  /**
   * @param {string} name - Speaker name to look for.
   */
  contains(name) {
    const {handle} = this;
    return addon.speakerEmbeddingManagerContains(handle, name);
  }

  /** Number of speakers as reported by the native manager. */
  getNumSpeakers() {
    const {handle} = this;
    return addon.speakerEmbeddingManagerNumSpeakers(handle);
  }

  /** All speaker names as reported by the native manager. */
  getAllSpeakerNames() {
    const {handle} = this;
    return addon.speakerEmbeddingManagerGetAllSpeakers(handle);
  }
}

// Public API of this module: the two wrapper classes defined in this file.
module.exports = {
SpeakerEmbeddingExtractor,
SpeakerEmbeddingManager,
}
1 change: 1 addition & 0 deletions scripts/node-addon-api/lib/streaming-asr.js
Original file line number Diff line number Diff line change
Expand Up @@ -64,5 +64,6 @@ class OnlineRecognizer {

module.exports = {
OnlineRecognizer,
OnlineStream,
Display
}
3 changes: 3 additions & 0 deletions scripts/node-addon-api/src/sherpa-onnx-node-addon-api.cc
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@ void InitWaveWriter(Napi::Env env, Napi::Object exports);

void InitSpokenLanguageID(Napi::Env env, Napi::Object exports);

void InitSpeakerID(Napi::Env env, Napi::Object exports);

Napi::Object Init(Napi::Env env, Napi::Object exports) {
InitStreamingAsr(env, exports);
InitNonStreamingAsr(env, exports);
Expand All @@ -25,6 +27,7 @@ Napi::Object Init(Napi::Env env, Napi::Object exports) {
InitWaveReader(env, exports);
InitWaveWriter(env, exports);
InitSpokenLanguageID(env, exports);
InitSpeakerID(env, exports);

return exports;
}
Expand Down
Loading

0 comments on commit 388e6a9

Please sign in to comment.