Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Dart API for speaker diarization #1418

Merged
merged 5 commits into from
Oct 11, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
first working version
  • Loading branch information
csukuangfj committed Oct 11, 2024
commit fe996e631e52cb30a8fd3155546be761ff1a89b8
5 changes: 5 additions & 0 deletions .github/scripts/test-dart.sh
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,11 @@ set -ex

cd dart-api-examples

pushd speaker-diarization
echo '----------speaker diarization----------'
./run.sh
popd

pushd speaker-identification
echo '----------3d speaker----------'
./run-3d-speaker.sh
Expand Down
1 change: 1 addition & 0 deletions .github/workflows/test-dart.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,7 @@ jobs:
cp scripts/dart/audio-tagging-pubspec.yaml dart-api-examples/audio-tagging/pubspec.yaml
cp scripts/dart/add-punctuations-pubspec.yaml dart-api-examples/add-punctuations/pubspec.yaml
cp scripts/dart/speaker-id-pubspec.yaml dart-api-examples/speaker-identification/pubspec.yaml
cp scripts/dart/speaker-diarization-pubspec.yaml dart-api-examples/speaker-diarization/pubspec.yaml

cp scripts/dart/sherpa-onnx-pubspec.yaml flutter/sherpa_onnx/pubspec.yaml

Expand Down
9 changes: 7 additions & 2 deletions dart-api-examples/speaker-diarization/README.md
Original file line number Diff line number Diff line change
@@ -1,2 +1,7 @@
A sample command-line application with an entrypoint in `bin/`, library code
in `lib/`, and example unit test in `test/`.
# Introduction

This example shows how to use the Dart API from sherpa-onnx for speaker diarization.

# Usage

Please see [./run.sh](./run.sh)
1 change: 1 addition & 0 deletions dart-api-examples/speaker-diarization/bin/init.dart
100 changes: 100 additions & 0 deletions dart-api-examples/speaker-diarization/bin/speaker-diarization.dart
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
// Copyright (c) 2024 Xiaomi Corporation
import 'dart:io';
import 'dart:typed_data';
import 'dart:ffi';

import 'package:sherpa_onnx/sherpa_onnx.dart' as sherpa_onnx;
import './init.dart';

/// Speaker diarization example ("who spoke when") using the sherpa-onnx
/// Dart API.
///
/// It loads a pyannote speaker segmentation model, a speaker embedding
/// extractor model, and a test wave file, then prints one line per detected
/// speech segment with its start/end time and speaker label.
void main(List<String> arguments) async {
  await initSherpaOnnx();

  /* Please use the following commands to download files used in this file
  Step 1: Download a speaker segmentation model

  Please visit https://github.com/k2-fsa/sherpa-onnx/releases/tag/speaker-segmentation-models
  for a list of available models. The following is an example

  wget https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
  tar xvf sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
  rm sherpa-onnx-pyannote-segmentation-3-0.tar.bz2

  Step 2: Download a speaker embedding extractor model

  Please visit https://github.com/k2-fsa/sherpa-onnx/releases/tag/speaker-recongition-models
  for a list of available models. The following is an example

  wget https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-recongition-models/3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx

  Step 3. Download test wave files

  Please visit https://github.com/k2-fsa/sherpa-onnx/releases/tag/speaker-segmentation-models
  for a list of available test wave files. The following is an example

  wget https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/0-four-speakers-zh.wav

  Step 4. Run it
  */

  final segmentationModel =
      "./sherpa-onnx-pyannote-segmentation-3-0/model.onnx";

  final embeddingModel =
      "./3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx";

  final waveFilename = "./0-four-speakers-zh.wav";

  final segmentationConfig = sherpa_onnx.OfflineSpeakerSegmentationModelConfig(
    pyannote: sherpa_onnx.OfflineSpeakerSegmentationPyannoteModelConfig(
        model: segmentationModel),
  );

  final embeddingConfig =
      sherpa_onnx.SpeakerEmbeddingExtractorConfig(model: embeddingModel);

  // Since we know there are 4 speakers in ./0-four-speakers-zh.wav, we set
  // numClusters to 4. If you don't know the exact number, please set it to -1.
  // In that case, you have to set threshold. A larger threshold leads to
  // fewer clusters, i.e., fewer speakers.
  final clusteringConfig =
      sherpa_onnx.FastClusteringConfig(numClusters: 4, threshold: 0.5);

  // `final` rather than `var`: the config is never reassigned.
  final config = sherpa_onnx.OfflineSpeakerDiarizationConfig(
      segmentation: segmentationConfig,
      embedding: embeddingConfig,
      clustering: clusteringConfig,
      minDurationOn: 0.2,
      minDurationOff: 0.5);

  final sd = sherpa_onnx.OfflineSpeakerDiarization(config);
  if (sd.ptr == nullptr) {
    // Creation failed (e.g., a model file is missing). Report it instead of
    // exiting silently so the user knows what went wrong.
    print('Failed to create the offline speaker diarization object. '
        'Please check that the model files exist.');
    return;
  }

  final waveData = sherpa_onnx.readWave(waveFilename);
  if (sd.sampleRate != waveData.sampleRate) {
    print(
        'Expected sample rate: ${sd.sampleRate}, given: ${waveData.sampleRate}');
    // Release the native handle before bailing out; it is not managed by
    // Dart's garbage collector.
    sd.free();
    return;
  }

  print('started');

  // Use the following statement if you don't want to use a callback
  // final segments = sd.process(samples: waveData.samples);

  final segments = sd.processWithCallback(
      samples: waveData.samples,
      callback: (int numProcessedChunk, int numTotalChunks) {
        // The callback reports progress; returning 0 tells the native side
        // to continue processing.
        final progress = 100.0 * numProcessedChunk / numTotalChunks;

        print('Progress ${progress.toStringAsFixed(2)}%');

        return 0;
      });

  for (int i = 0; i < segments.length; ++i) {
    print(
        '${segments[i].start.toStringAsFixed(3)} -- ${segments[i].end.toStringAsFixed(3)} speaker_${segments[i].speaker}');
  }

  // Free the underlying native resources once we are done with them.
  sd.free();
}

This file was deleted.

14 changes: 8 additions & 6 deletions dart-api-examples/speaker-diarization/pubspec.yaml
Original file line number Diff line number Diff line change
@@ -1,15 +1,17 @@
name: speaker_diarization
description: A sample command-line application.
description: >
This example demonstrates how to use the Dart API for speaker diarization.

version: 1.0.0
# repository: https://github.com/my_org/my_repo

environment:
sdk: ^3.4.0
sdk: ">=3.0.0 <4.0.0"

# Add regular dependencies here.
dependencies:
# path: ^1.8.0
sherpa_onnx: ^1.10.27
# sherpa_onnx:
# path: ../../flutter/sherpa_onnx
path: ^1.9.0

dev_dependencies:
lints: ^3.0.0
test: ^1.24.0
21 changes: 21 additions & 0 deletions dart-api-examples/speaker-diarization/run.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
#!/usr/bin/env bash

# Runs the Dart speaker diarization example.
#
# Downloads (once) a pyannote speaker segmentation model, a 3D-Speaker
# embedding extractor model, and a 4-speaker Chinese test wave file, then
# runs bin/speaker-diarization.dart against them.

# -e: exit on first error; -x: echo each command for CI logs.
set -ex

# Fetch Dart package dependencies declared in pubspec.yaml.
dart pub get

# Speaker segmentation model (skipped if already unpacked).
if [ ! -f ./sherpa-onnx-pyannote-segmentation-3-0/model.onnx ]; then
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
tar xvf sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
rm sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
fi

# Speaker embedding extractor model.
# NOTE: "speaker-recongition-models" is the actual (misspelled) release tag
# upstream — do not "correct" it or the download will 404.
if [ ! -f ./3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx ]; then
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-recongition-models/3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx
fi

# Test wave file containing four speakers.
if [ ! -f ./0-four-speakers-zh.wav ]; then
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/0-four-speakers-zh.wav
fi

dart run ./bin/speaker-diarization.dart
1 change: 1 addition & 0 deletions flutter/sherpa_onnx/example/example.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@

| Functions | URL | Supported Platforms|
|---|---|---|
|Speaker diarization| [Address](https://github.com/k2-fsa/sherpa-onnx/tree/master/dart-api-examples/speaker-diarization)| macOS, Windows, Linux|
|Streaming speech recognition| [Address](https://github.com/k2-fsa/sherpa-onnx/tree/master/dart-api-examples/streaming-asr)| macOS, Windows, Linux|
|Non-Streaming speech recognition| [Address](https://github.com/k2-fsa/sherpa-onnx/tree/master/dart-api-examples/non-streaming-asr)| macOS, Windows, Linux|
|Text to speech| [Address](https://github.com/k2-fsa/sherpa-onnx/tree/master/dart-api-examples/tts)| macOS, Windows, Linux|
Expand Down
1 change: 1 addition & 0 deletions flutter/sherpa_onnx/lib/sherpa_onnx.dart
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ export 'src/audio_tagging.dart';
export 'src/feature_config.dart';
export 'src/keyword_spotter.dart';
export 'src/offline_recognizer.dart';
export 'src/offline_speaker_diarization.dart';
export 'src/offline_stream.dart';
export 'src/online_recognizer.dart';
export 'src/online_stream.dart';
Expand Down
81 changes: 66 additions & 15 deletions flutter/sherpa_onnx/lib/src/offline_speaker_diarization.dart
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
// Copyright (c) 2024 Xiaomi Corporation
import 'dart:convert';
import 'dart:ffi';
import 'dart:typed_data';

import 'package:ffi/ffi.dart';

Expand All @@ -9,7 +10,7 @@ import './speaker_identification.dart';
import './utils.dart';

class OfflineSpeakerDiarizationSegment {
OfflineSpeakerSegmentationSegment({
const OfflineSpeakerDiarizationSegment({
required this.start,
required this.end,
required this.speaker,
Expand Down Expand Up @@ -76,7 +77,7 @@ class FastClusteringConfig {
class OfflineSpeakerDiarizationConfig {
const OfflineSpeakerDiarizationConfig({
this.segmentation = const OfflineSpeakerSegmentationModelConfig(),
this.embedding = const SpeakerEmbeddingExtractorConfig(),
this.embedding = const SpeakerEmbeddingExtractorConfig(model: ''),
this.clustering = const FastClusteringConfig(),
this.minDurationOn = 0.2,
this.minDurationOff = 0.5,
Expand All @@ -95,7 +96,8 @@ class OfflineSpeakerDiarizationConfig {
}

class OfflineSpeakerDiarization {
OfflineSpeakerDiarization._({required this.ptr, required this.config});
OfflineSpeakerDiarization._(
{required this.ptr, required this.config, required this.sampleRate});

void free() {
SherpaOnnxBindings.sherpaOnnxDestroyOfflineSpeakerDiarization?.call(ptr);
Expand Down Expand Up @@ -125,25 +127,29 @@ class OfflineSpeakerDiarization {
c.ref.minDurationOff = config.minDurationOff;

final ptr =
SherpaOnnxBindings.SherpaOnnxCreateOfflineSpeakerDiarization?.call(c) ??
SherpaOnnxBindings.sherpaOnnxCreateOfflineSpeakerDiarization?.call(c) ??
nullptr;

calloc.free(c.ref.embedding.provider);
calloc.free(c.ref.embedding.model);
calloc.free(c.ref.segmentation.provider);
calloc.free(c.ref.segmentation.pyannote.model);

final sampleRate = SherpaOnnxBindings
.SherpaOnnxOfflineSpeakerDiarizationGetSampleRate?.call(ptr) ??
0;
int sampleRate = 0;
if (ptr != nullptr) {
sampleRate = SherpaOnnxBindings
.sherpaOnnxOfflineSpeakerDiarizationGetSampleRate
?.call(ptr) ??
0;
}
return OfflineSpeakerDiarization._(
ptr: ptr, config: config, sampleRate: sampleRate);
}

List<OfflineSpeakerDiarizationSegment> process(
{required samples: Float32List}) {
{required Float32List samples}) {
if (ptr == nullptr) {
return <OfflineSpeakerSegmentationSegment>[];
return <OfflineSpeakerDiarizationSegment>[];
}

final n = samples.length;
Expand All @@ -156,8 +162,53 @@ class OfflineSpeakerDiarization {
?.call(ptr, p, n) ??
nullptr;

final ans = _processImpl(r);

SherpaOnnxBindings.sherpaOnnxOfflineSpeakerDiarizationDestroyResult
?.call(r);

return ans;
}

List<OfflineSpeakerDiarizationSegment> processWithCallback({
required Float32List samples,
required int Function(int numProcessedChunks, int numTotalChunks) callback,
}) {
if (ptr == nullptr) {
return <OfflineSpeakerDiarizationSegment>[];
}

final n = samples.length;
final Pointer<Float> p = calloc<Float>(n);

final pList = p.asTypedList(n);
pList.setAll(0, samples);

final wrapper = NativeCallable<
SherpaOnnxOfflineSpeakerDiarizationProgressCallbackNoArgNative>.isolateLocal(
(int numProcessedChunks, int numTotalChunks) {
return callback(numProcessedChunks, numTotalChunks);
}, exceptionalReturn: 0);

final r = SherpaOnnxBindings
.sherpaOnnxOfflineSpeakerDiarizationProcessWithCallbackNoArg
?.call(ptr, p, n, wrapper.nativeFunction) ??
nullptr;

wrapper.close();

final ans = _processImpl(r);

SherpaOnnxBindings.sherpaOnnxOfflineSpeakerDiarizationDestroyResult
?.call(r);

return ans;
}

List<OfflineSpeakerDiarizationSegment> _processImpl(
Pointer<SherpaOnnxOfflineSpeakerDiarizationResult> r) {
if (r == nullptr) {
return <OfflineSpeakerSegmentationSegment>[];
return <OfflineSpeakerDiarizationSegment>[];
}

final numSegments = SherpaOnnxBindings
Expand All @@ -170,20 +221,20 @@ class OfflineSpeakerDiarization {
nullptr;

if (segments == nullptr) {
return <OfflineSpeakerSegmentationSegment>[];
return <OfflineSpeakerDiarizationSegment>[];
}

final ans = <String>[];
final ans = <OfflineSpeakerDiarizationSegment>[];
for (int i = 0; i != numSegments; ++i) {
final s = segments.elementAt(i);

final tmp = OfflineSpeakerSegmentationSegment(
final tmp = OfflineSpeakerDiarizationSegment(
start: s.ref.start, end: s.ref.end, speaker: s.ref.speaker);
ans.add(tmp);
}

SherpaOnnxBindings.sherpaOnnxOfflineSpeakerDiarizationDestroyResult
?.call(r);
SherpaOnnxBindings.sherpaOnnxOfflineSpeakerDiarizationDestroySegment
?.call(segments);

return ans;
}
Expand Down
Loading