working resampling mechanism

tbarbugli · tbarbugli · commit bba5ea7af750 · 2025-10-24T21:58:00.000+02:00
diff --git a/DEVELOPMENT.md b/DEVELOPMENT.md
@@ -109,6 +109,31 @@ To see how the agent work open up agents.py
 * The LLM uses the VideoForwarder to write the video to a websocket or webrtc connection
 * The STS writes the reply on agent.llm.audio_track and the RealtimeTranscriptEvent / RealtimePartialTranscriptEvent
 
+## Audio management
+
+Some important things about audio inside the library:
+
+1. WebRTC uses Opus at 48 kHz stereo, but inside the library audio is always in PCM format
+2. Plugins / AI models work with different PCM formats, usually 16 kHz mono
+3. PCM data is always passed around using the `PcmData` object which contains information about sample rate, channels and format
+4. Text-to-speech plugins automatically return PCM in the format needed by WebRTC. This is exposed via the `set_output_format` method
+5. Audio resampling can be done using the `PcmData.resample` method
+6. When resampling audio in chunks, it is important to reuse the same `av.AudioResampler` instance for every chunk, otherwise the output has discontinuities at chunk boundaries (see `PcmData.resample` and `core.tts.TTS`)
+7. Converting from stereo to mono and vice versa can be done using the `target_channels` argument of the `PcmData.resample` method
+
+Some ground rules:
+
+1. Do not write your own code to resample / adjust audio; only do so when `PcmData` does not already cover your case
+2. Do not pass PCM around as plain bytes or write code that assumes a specific sample rate or format. Use `PcmData` instead
+
+### Testing audio manually
+
+Sometimes you need to test audio manually. Here are some tips:
+
+1. Do not use earplugs when testing PCM playback ;)
+2. You can use the `PcmData.to_wav_bytes` method to convert PCM into wav bytes (see `manual_tts_to_wav` for an example)
+3. If you have `ffplay` installed, you can play back PCM directly to check if the audio is correct
+
 ## Dev / Contributor Guidelines
 
 ### Light wrapping
diff --git a/agents-core/vision_agents/core/edge/types.py b/agents-core/vision_agents/core/edge/types.py
@@ -287,14 +287,20 @@ def from_data(
         raise TypeError(f"Unsupported data type for PcmData: {type(data)}")
 
     def resample(
-        self, target_sample_rate: int, target_channels: Optional[int] = None
+        self,
+        target_sample_rate: int,
+        target_channels: Optional[int] = None,
+        resampler: Optional[Any] = None,
     ) -> "PcmData":
         """
         Resample PcmData to a different sample rate and/or channels using AV library.
 
         Args:
             target_sample_rate: Target sample rate in Hz
             target_channels: Target number of channels (defaults to current)
+            resampler: Optional persistent AudioResampler instance to use. If None,
+                      creates a new resampler (for one-off use). Pass a persistent
+                      resampler to avoid discontinuities when resampling streaming chunks.
 
         Returns:
             New PcmData object with resampled audio
@@ -336,11 +342,13 @@ def resample(
         frame = av.AudioFrame.from_ndarray(cmaj, format="s16p", layout=in_layout)
         frame.sample_rate = self.sample_rate
 
-        # Create resampler – output packed s16
-        out_layout = "mono" if target_channels == 1 else "stereo"
-        resampler = av.AudioResampler(
-            format="s16", layout=out_layout, rate=target_sample_rate
-        )
+        # Use provided resampler or create a new one
+        if resampler is None:
+            # Create new resampler for one-off use
+            out_layout = "mono" if target_channels == 1 else "stereo"
+            resampler = av.AudioResampler(
+                format="s16", layout=out_layout, rate=target_sample_rate
+            )
 
         # Resample the frame
         resampled_frames = resampler.resample(frame)
diff --git a/agents-core/vision_agents/core/tts/tts.py b/agents-core/vision_agents/core/tts/tts.py
@@ -182,91 +182,11 @@ def _emit_chunk(
         user: Optional[Dict[str, Any]],
     ) -> tuple[int, float]:
         """Resample, serialize, emit TTSAudioEvent; return (bytes_len, duration_ms)."""
-
-        if (
-            pcm.sample_rate == self._desired_sample_rate
-            and pcm.channels == self._desired_channels
-        ):
-            # No resampling needed
-            pcm_out = pcm
-        else:
-            resampler = self._get_resampler(pcm.sample_rate, pcm.channels)
-
-            # Prepare input frame in planar format
-            samples = pcm.samples
-            if isinstance(samples, np.ndarray):
-                if samples.ndim == 1:
-                    if pcm.channels > 1:
-                        cmaj = np.tile(samples, (pcm.channels, 1))
-                    else:
-                        cmaj = samples.reshape(1, -1)
-                elif samples.ndim == 2:
-                    ch = pcm.channels if pcm.channels else 1
-                    if samples.shape[0] == ch:
-                        cmaj = samples
-                    elif samples.shape[1] == ch:
-                        cmaj = samples.T
-                    else:
-                        if samples.shape[1] > samples.shape[0]:
-                            cmaj = samples
-                        else:
-                            cmaj = samples.T
-                cmaj = np.ascontiguousarray(cmaj)
-            else:
-                # Shouldn't happen, but handle it
-                cmaj = (
-                    samples.reshape(1, -1)
-                    if isinstance(samples, np.ndarray)
-                    else samples
-                )
-
-            in_layout = "mono" if pcm.channels == 1 else "stereo"
-            frame = av.AudioFrame.from_ndarray(cmaj, format="s16p", layout=in_layout)
-            frame.sample_rate = pcm.sample_rate
-
-            # Resample using persistent resampler
-            resampled_frames = resampler.resample(frame)
-
-            if resampled_frames:
-                resampled_frame = resampled_frames[0]
-                raw_array = resampled_frame.to_ndarray()
-                num_frames = resampled_frame.samples
-
-                # Handle PyAV's packed format quirk
-                ch = self._desired_channels
-                if raw_array.ndim == 2 and raw_array.shape[0] == 1 and ch > 1:
-                    flat = raw_array.reshape(-1)
-                    if len(flat) == num_frames * ch:
-                        resampled_samples = flat.reshape(-1, ch).T
-                    else:
-                        resampled_samples = flat.reshape(ch, -1)
-                elif raw_array.ndim == 2:
-                    if raw_array.shape[1] == ch:
-                        resampled_samples = raw_array.T
-                    elif raw_array.shape[0] == ch:
-                        resampled_samples = raw_array
-                    else:
-                        resampled_samples = raw_array.T
-                elif raw_array.ndim == 1:
-                    if ch == 1:
-                        resampled_samples = raw_array
-                    else:
-                        resampled_samples = np.tile(raw_array, (ch, 1))
-                else:
-                    resampled_samples = raw_array.reshape(ch, -1)
-
-                if resampled_samples.dtype != np.int16:
-                    resampled_samples = resampled_samples.astype(np.int16)
-
-                pcm_out = PcmData(
-                    samples=resampled_samples,
-                    sample_rate=self._desired_sample_rate,
-                    format="s16",
-                    channels=self._desired_channels,
-                )
-            else:
-                # Resampling failed, use original
-                pcm_out = pcm
+        # Resample using persistent resampler to avoid discontinuities between chunks
+        resampler = self._get_resampler(pcm.sample_rate, pcm.channels)
+        pcm_out = pcm.resample(
+            self._desired_sample_rate, self._desired_channels, resampler=resampler
+        )
 
         payload = pcm_out.to_bytes()
         # Metrics: counters per chunk