Skip to content

Commit bba5ea7

Browse files
committed
working resampling mechanism
1 parent 4e93b12 commit bba5ea7

File tree

3 files changed

+44
-91
lines changed

3 files changed

+44
-91
lines changed

DEVELOPMENT.md

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -109,6 +109,31 @@ To see how the agent work open up agents.py
109109
* The LLM uses the VideoForwarder to write the video to a websocket or webrtc connection
110110
* The STS writes the reply on agent.llm.audio_track and the RealtimeTranscriptEvent / RealtimePartialTranscriptEvent
111111

112+
## Audio management
113+
114+
Some important things about audio inside the library:
115+
116+
1. WebRTC uses Opus at 48 kHz stereo, but inside the library audio is always in PCM format
117+
2. Plugins / AI models work with different PCM formats, usually 16 kHz mono
118+
3. PCM data is always passed around using the `PcmData` object which contains information about sample rate, channels and format
119+
4. Text-to-speech plugins automatically return PCM in the format needed by WebRTC. This is exposed via the `set_output_format` method
120+
5. Audio resampling can be done using the `PcmData.resample` method
121+
6. When resampling audio in chunks, it is important to reuse the same `av.AudioResampler` instance across chunks to avoid discontinuities (see `PcmData.resample` and `core.tts.TTS`)
122+
7. Converting from stereo to mono, and vice versa, can also be done using the `PcmData.resample` method
123+
124+
Some ground rules:
125+
126+
1. Do not write your own code to resample / adjust audio unless `PcmData` does not already cover your use case
127+
2. Do not pass PCM around as plain bytes or write code that assumes a specific sample rate or format. Use `PcmData` instead
128+
129+
### Testing audio manually
130+
131+
Sometimes you need to test audio manually. Here are some tips:
132+
133+
1. Do not use earbuds or headphones when testing PCM playback ;)
134+
2. You can use the `PcmData.to_wav_bytes` method to convert PCM into wav bytes (see `manual_tts_to_wav` for an example)
135+
3. If you have `ffplay` installed, you can play back PCM directly to check that the audio is correct
136+
112137
## Dev / Contributor Guidelines
113138

114139
### Light wrapping

agents-core/vision_agents/core/edge/types.py

Lines changed: 14 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -287,14 +287,20 @@ def from_data(
287287
raise TypeError(f"Unsupported data type for PcmData: {type(data)}")
288288

289289
def resample(
290-
self, target_sample_rate: int, target_channels: Optional[int] = None
290+
self,
291+
target_sample_rate: int,
292+
target_channels: Optional[int] = None,
293+
resampler: Optional[Any] = None,
291294
) -> "PcmData":
292295
"""
293296
Resample PcmData to a different sample rate and/or channels using AV library.
294297
295298
Args:
296299
target_sample_rate: Target sample rate in Hz
297300
target_channels: Target number of channels (defaults to current)
301+
resampler: Optional persistent AudioResampler instance to use. If None,
302+
creates a new resampler (for one-off use). Pass a persistent
303+
resampler to avoid discontinuities when resampling streaming chunks.
298304
299305
Returns:
300306
New PcmData object with resampled audio
@@ -336,11 +342,13 @@ def resample(
336342
frame = av.AudioFrame.from_ndarray(cmaj, format="s16p", layout=in_layout)
337343
frame.sample_rate = self.sample_rate
338344

339-
# Create resampler – output packed s16
340-
out_layout = "mono" if target_channels == 1 else "stereo"
341-
resampler = av.AudioResampler(
342-
format="s16", layout=out_layout, rate=target_sample_rate
343-
)
345+
# Use provided resampler or create a new one
346+
if resampler is None:
347+
# Create new resampler for one-off use
348+
out_layout = "mono" if target_channels == 1 else "stereo"
349+
resampler = av.AudioResampler(
350+
format="s16", layout=out_layout, rate=target_sample_rate
351+
)
344352

345353
# Resample the frame
346354
resampled_frames = resampler.resample(frame)

agents-core/vision_agents/core/tts/tts.py

Lines changed: 5 additions & 85 deletions
Original file line numberDiff line numberDiff line change
@@ -182,91 +182,11 @@ def _emit_chunk(
182182
user: Optional[Dict[str, Any]],
183183
) -> tuple[int, float]:
184184
"""Resample, serialize, emit TTSAudioEvent; return (bytes_len, duration_ms)."""
185-
186-
if (
187-
pcm.sample_rate == self._desired_sample_rate
188-
and pcm.channels == self._desired_channels
189-
):
190-
# No resampling needed
191-
pcm_out = pcm
192-
else:
193-
resampler = self._get_resampler(pcm.sample_rate, pcm.channels)
194-
195-
# Prepare input frame in planar format
196-
samples = pcm.samples
197-
if isinstance(samples, np.ndarray):
198-
if samples.ndim == 1:
199-
if pcm.channels > 1:
200-
cmaj = np.tile(samples, (pcm.channels, 1))
201-
else:
202-
cmaj = samples.reshape(1, -1)
203-
elif samples.ndim == 2:
204-
ch = pcm.channels if pcm.channels else 1
205-
if samples.shape[0] == ch:
206-
cmaj = samples
207-
elif samples.shape[1] == ch:
208-
cmaj = samples.T
209-
else:
210-
if samples.shape[1] > samples.shape[0]:
211-
cmaj = samples
212-
else:
213-
cmaj = samples.T
214-
cmaj = np.ascontiguousarray(cmaj)
215-
else:
216-
# Shouldn't happen, but handle it
217-
cmaj = (
218-
samples.reshape(1, -1)
219-
if isinstance(samples, np.ndarray)
220-
else samples
221-
)
222-
223-
in_layout = "mono" if pcm.channels == 1 else "stereo"
224-
frame = av.AudioFrame.from_ndarray(cmaj, format="s16p", layout=in_layout)
225-
frame.sample_rate = pcm.sample_rate
226-
227-
# Resample using persistent resampler
228-
resampled_frames = resampler.resample(frame)
229-
230-
if resampled_frames:
231-
resampled_frame = resampled_frames[0]
232-
raw_array = resampled_frame.to_ndarray()
233-
num_frames = resampled_frame.samples
234-
235-
# Handle PyAV's packed format quirk
236-
ch = self._desired_channels
237-
if raw_array.ndim == 2 and raw_array.shape[0] == 1 and ch > 1:
238-
flat = raw_array.reshape(-1)
239-
if len(flat) == num_frames * ch:
240-
resampled_samples = flat.reshape(-1, ch).T
241-
else:
242-
resampled_samples = flat.reshape(ch, -1)
243-
elif raw_array.ndim == 2:
244-
if raw_array.shape[1] == ch:
245-
resampled_samples = raw_array.T
246-
elif raw_array.shape[0] == ch:
247-
resampled_samples = raw_array
248-
else:
249-
resampled_samples = raw_array.T
250-
elif raw_array.ndim == 1:
251-
if ch == 1:
252-
resampled_samples = raw_array
253-
else:
254-
resampled_samples = np.tile(raw_array, (ch, 1))
255-
else:
256-
resampled_samples = raw_array.reshape(ch, -1)
257-
258-
if resampled_samples.dtype != np.int16:
259-
resampled_samples = resampled_samples.astype(np.int16)
260-
261-
pcm_out = PcmData(
262-
samples=resampled_samples,
263-
sample_rate=self._desired_sample_rate,
264-
format="s16",
265-
channels=self._desired_channels,
266-
)
267-
else:
268-
# Resampling failed, use original
269-
pcm_out = pcm
185+
# Resample using persistent resampler to avoid discontinuities between chunks
186+
resampler = self._get_resampler(pcm.sample_rate, pcm.channels)
187+
pcm_out = pcm.resample(
188+
self._desired_sample_rate, self._desired_channels, resampler=resampler
189+
)
270190

271191
payload = pcm_out.to_bytes()
272192
# Metrics: counters per chunk

0 commit comments

Comments
 (0)