Skip to content

Commit

Permalink
Add submitUserSpeechOnPause option (#63)
Browse files Browse the repository at this point in the history
Co-authored-by: antoine.lizee <antoine.lizee@gmail.com>
Co-authored-by: ricky <rickycontact9@gmail.com>
  • Loading branch information
3 people authored Dec 15, 2023
1 parent b9a5cd2 commit d34fcd6
Show file tree
Hide file tree
Showing 12 changed files with 222 additions and 12 deletions.
1 change: 0 additions & 1 deletion package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

14 changes: 13 additions & 1 deletion packages/_common/src/frame-processor.ts
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,11 @@ export interface FrameProcessorOptions {
* it will be discarded and `onVADMisfire` will be run instead of `onSpeechEnd`.
*/
minSpeechFrames: number

/**
* If true, pausing the VAD ends the current speech segment instead of just resetting the processor, which may trigger `onSpeechEnd`.
*/
submitUserSpeechOnPause: boolean
}

export const defaultFrameProcessorOptions: FrameProcessorOptions = {
Expand All @@ -51,6 +56,7 @@ export const defaultFrameProcessorOptions: FrameProcessorOptions = {
redemptionFrames: 8,
frameSamples: 1536,
minSpeechFrames: 3,
submitUserSpeechOnPause: false,
}

export function validateOptions(options: FrameProcessorOptions) {
Expand Down Expand Up @@ -131,7 +137,12 @@ export class FrameProcessor implements FrameProcessorInterface {

pause = () => {
this.active = false
this.reset()
if (this.options.submitUserSpeechOnPause) {
return this.endSegment()
} else {
this.reset()
return {}
}
}

resume = () => {
Expand Down Expand Up @@ -163,6 +174,7 @@ export class FrameProcessor implements FrameProcessorInterface {
if (!this.active) {
return {}
}

const probs = await this.modelProcessFunc(frame)
this.audioBuffer.push({
frame,
Expand Down
1 change: 1 addition & 0 deletions packages/_common/src/non-real-time-vad.ts
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@ export class PlatformAgnosticNonRealTimeVAD {
redemptionFrames: this.options.redemptionFrames,
preSpeechPadFrames: this.options.preSpeechPadFrames,
minSpeechFrames: this.options.minSpeechFrames,
submitUserSpeechOnPause: this.options.submitUserSpeechOnPause,
})
this.frameProcessor.resume()
}
Expand Down
4 changes: 2 additions & 2 deletions packages/react/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -13,15 +13,15 @@
"react"
],
"homepage": "https://github.com/ricky0123/vad",
"version": "0.0.19",
"version": "0.0.20",
"license": "ISC",
"main": "dist/index.js",
"devDependencies": {
"@types/react": "^18.0.28"
},
"dependencies": {
"onnxruntime-web": "^1.14.0",
"@ricky0123/vad-web": "^0.0.14"
"@ricky0123/vad-web": "^0.0.15"
},
"peerDependencies": {
"react": "^18",
Expand Down
2 changes: 1 addition & 1 deletion packages/web/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
"offline-speech-recognition"
],
"homepage": "https://github.com/ricky0123/vad",
"version": "0.0.14",
"version": "0.0.15",
"license": "ISC",
"main": "dist/index.js",
"unpkg": "dist/bundle.min.js",
Expand Down
24 changes: 18 additions & 6 deletions packages/web/src/real-time-vad.ts
Original file line number Diff line number Diff line change
Expand Up @@ -183,6 +183,7 @@ export class AudioNodeVAD {
redemptionFrames: fullOptions.redemptionFrames,
preSpeechPadFrames: fullOptions.preSpeechPadFrames,
minSpeechFrames: fullOptions.minSpeechFrames,
submitUserSpeechOnPause: fullOptions.submitUserSpeechOnPause,
}
)

Expand Down Expand Up @@ -217,7 +218,8 @@ export class AudioNodeVAD {
) {}

pause = () => {
this.frameProcessor.pause()
const ev = this.frameProcessor.pause()
this.handleFrameProcessorEvent(ev)
}

start = () => {
Expand All @@ -229,11 +231,21 @@ export class AudioNodeVAD {
}

processFrame = async (frame: Float32Array) => {
const { probs, msg, audio } = await this.frameProcessor.process(frame)
if (probs !== undefined) {
this.options.onFrameProcessed(probs)
const ev = await this.frameProcessor.process(frame)
this.handleFrameProcessorEvent(ev)
}

handleFrameProcessorEvent = (
ev: Partial<{
probs: SpeechProbabilities
msg: Message
audio: Float32Array
}>
) => {
if (ev.probs !== undefined) {
this.options.onFrameProcessed(ev.probs)
}
switch (msg) {
switch (ev.msg) {
case Message.SpeechStart:
this.options.onSpeechStart()
break
Expand All @@ -243,7 +255,7 @@ export class AudioNodeVAD {
break

case Message.SpeechEnd:
this.options.onSpeechEnd(audio as Float32Array)
this.options.onSpeechEnd(ev.audio as Float32Array)
break

default:
Expand Down
3 changes: 2 additions & 1 deletion scripts/set-example-deps-local.ipy
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,9 @@ sources = {

for example_dir in Path("examples").glob("*"):
for pkg, src in sources.items():
dep_path =example_dir/f"node_modules/{pkg}"
dep_path = example_dir/f"node_modules/{pkg}"
if dep_path.exists():
print("Updating", dep_path)
package_json_src = f"{dep_path}/package.json"
package_json_tgt = f"{src}/package.json"

Expand Down
2 changes: 2 additions & 0 deletions test-site/src/index.html
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@ <h1 class="title">Welcome to the VAD test site</h1>
<li><a href="web-basic">Basic vad-web</a></li>
<li><a href="react-basic">Basic vad-react</a></li>
<li><a href="react-destroy">React destroy</a></li>
<li><a href="submit-user-speech-on-pause">Submit user speech on pause</a></li>
<li><a href="react-submit-user-speech-on-pause">React submit user speech on pause</a></li>
</ul>
</div>
</div>
Expand Down
16 changes: 16 additions & 0 deletions test-site/src/react-submit-user-speech-on-pause/index.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
<!DOCTYPE html>
<html>
<head>
<title>VAD test site</title>
<meta charset="UTF-8" />
<meta name="viewport" content="width=device-width, initial-scale=1" />
<!-- Bulma 0.9.3 stylesheet, loaded from the cdnjs CDN. -->
<link
rel="stylesheet"
href="https://cdnjs.cloudflare.com/ajax/libs/bulma/0.9.3/css/bulma.min.css"
/>
<!-- Page script; `defer` postpones execution until the document has been parsed. -->
<script defer src="index.js"></script>
</head>
<body>
<!-- Empty container. Presumably the mount point for the React app (index.jsx looks up "#root"); confirm that index.js is its build output. -->
<div id="root"></div>
</body>
</html>
73 changes: 73 additions & 0 deletions test-site/src/react-submit-user-speech-on-pause/index.jsx
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
// @ts-nocheck

import React, { useReducer, useState } from "react"
import * as ort from "onnxruntime-web"
import { createRoot } from "react-dom/client"
import { useMicVAD, utils } from "@ricky0123/vad-react"

ort.env.wasm.wasmPaths = {
"ort-wasm-simd-threaded.wasm": "/ort-wasm-simd-threaded.wasm",
"ort-wasm-simd.wasm": "/ort-wasm-simd.wasm",
"ort-wasm.wasm": "/ort-wasm.wasm",
"ort-wasm-threaded.wasm": "/ort-wasm-threaded.wasm",
}

const domContainer = document.querySelector("#root")
const root = createRoot(domContainer)
root.render(<App />)

function App() {
const [audioList, setAudioList] = useState([])
const vad = useMicVAD({
submitUserSpeechOnPause: true,
workletURL: "http://localhost:8080/vad.worklet.bundle.min.js",
modelURL: "http://localhost:8080/silero_vad.onnx",
onVADMisfire: () => {
console.log("Vad misfire")
},
onSpeechStart: () => {
console.log("Speech start")
},
onSpeechEnd: (audio) => {
console.log("Speech end")
const wavBuffer = utils.encodeWAV(audio)
const base64 = utils.arrayBufferToBase64(wavBuffer)
const url = `data:audio/wav;base64,${base64}`
setAudioList((old) => [url, ...old])
},
})
return (
<section className="section">
<div className="container">
<h1 className="title">Basic vad-react functionality</h1>

<div className="block is-inline-flex">
<button
className={
vad.loading ? "button is-primary is-loading" : "button is-primary"
}
onClick={() => {
console.log("run toggle vad")
vad.toggle()
}}
disabled={vad.loading}
>
Toggle VAD
</button>
</div>

<div className="block">
<ul>
{audioList.map((audioURL) => {
return (
<li key={audioURL.substring(-10)}>
<audio controls src={audioURL} />
</li>
)
})}
</ul>
</div>
</div>
</section>
)
}
35 changes: 35 additions & 0 deletions test-site/src/submit-user-speech-on-pause/index.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
<!DOCTYPE html>
<html>
<head>
<title>VAD test site</title>
<meta charset="UTF-8" />
<meta name="viewport" content="width=device-width, initial-scale=1" />
<!-- Bulma 0.9.3 stylesheet, loaded from the cdnjs CDN. -->
<link
rel="stylesheet"
href="https://cdnjs.cloudflare.com/ajax/libs/bulma/0.9.3/css/bulma.min.css"
/>
<!-- Page script, loaded as an ES module. -->
<script type="module" src="index.js"></script>
</head>
<body>
<section class="section">
<div class="container">
<h1 class="title">Basic vad-web functionality</h1>

<div class="block is-inline-flex">
<!-- Starts out disabled and in the loading state. The click handler calls window.toggleVAD, which is expected to be installed by the page script. -->
<button
id="toggleVAD"
class="button is-primary is-loading"
onclick="window.toggleVAD()"
disabled
>
Start VAD
</button>
</div>

<div class="block">
<!-- Initially empty list; presumably filled by the page script with one audio player per captured segment (confirm against index.js). -->
<ul id="audio-list"></ul>
</div>
</div>
</section>
</body>
</html>
59 changes: 59 additions & 0 deletions test-site/src/submit-user-speech-on-pause/index.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
// @ts-nocheck

import * as vad from "@ricky0123/vad-web"

function getToggleButton() {
return document.getElementById("toggleVAD")
}

/**
 * Sets up the test page: creates a MicVAD with `submitUserSpeechOnPause`
 * enabled, exposes it and a toggle function on `window`, and prepends an
 * audio player to the "audio-list" element for every segment passed to
 * `onSpeechEnd`. Any error raised during setup is logged to the console.
 */
async function main() {
  // Builds the <li><audio controls></audio></li> entry for one segment.
  function addAudio(audioUrl) {
    const item = document.createElement("li")
    const player = document.createElement("audio")
    player.controls = true
    player.src = audioUrl
    item.appendChild(player)
    return item
  }

  // Encodes the samples as a WAV data URL and puts a player at the top of the list.
  const onSpeechEnd = (samples) => {
    console.log("Speech end")
    const encoded = vad.utils.arrayBufferToBase64(vad.utils.encodeWAV(samples))
    const item = addAudio(`data:audio/wav;base64,${encoded}`)
    document.getElementById("audio-list").prepend(item)
  }

  try {
    const myvad = await vad.MicVAD.new({
      submitUserSpeechOnPause: true,
      workletURL: "http://localhost:8080/vad.worklet.bundle.min.js",
      modelURL: "http://localhost:8080/silero_vad.onnx",
      onSpeechStart: () => {
        console.log("Speech start")
      },
      onSpeechEnd,
    })

    // Expose the instance on window (e.g. for inspection from the console).
    window.myvad = myvad
    getToggleButton().classList.remove("is-loading")

    // Called by the button's inline onclick handler.
    window.toggleVAD = () => {
      // Decide the direction once, before start()/pause() can change the state.
      const starting = myvad.listening === false
      console.log(starting ? "run start vad" : "run pause vad")
      if (starting) {
        myvad.start()
      } else {
        myvad.pause()
      }
      getToggleButton().textContent = starting ? "Stop VAD" : "Start VAD"
    }

    // Toggle once right away, then let the user take over.
    window.toggleVAD()
    getToggleButton().disabled = false
  } catch (e) {
    console.error("Failed:", e)
  }
}

main()

0 comments on commit d34fcd6

Please sign in to comment.