Fix 48 kHz audio support in Fish STT

tschellenbach · tschellenbach · commit 6eb827091970 · 2025-10-23T11:23:00.000-06:00
diff --git a/conftest.py b/conftest.py
@@ -129,6 +129,57 @@ def mia_audio_16khz():
     return pcm
 
 
+@pytest.fixture
+def mia_audio_48khz():
+    """Load mia.mp3 and return it as 48 kHz mono 16-bit PCM data.
+
+    Every decoded frame is passed through a PyAV resampler, so the result
+    is always s16/mono/48 kHz regardless of how the asset is encoded.
+
+    Returns:
+        PcmData: int16 samples with ``sample_rate=48000`` and ``format="s16"``.
+    """
+    audio_file_path = os.path.join(get_assets_dir(), "mia.mp3")
+    target_rate = 48000
+
+    # Always resample rather than only when the rates differ: the decoder may
+    # hand back float or stereo frames, and casting those straight to int16
+    # would corrupt the audio even when the source is already 48 kHz.
+    resampler = av.AudioResampler(
+        format="s16",
+        layout="mono",
+        rate=target_rate,
+    )
+
+    chunks = []
+    # The context manager closes the container even if decoding raises.
+    with av.open(audio_file_path) as container:
+        audio_stream = container.streams.audio[0]
+
+        def resampled_frames():
+            # resample() returns a list holding zero or more frames per input
+            # frame, so consume all of them instead of indexing element 0.
+            for decoded in container.decode(audio_stream):
+                yield from resampler.resample(decoded)
+            # Passing None flushes the samples still buffered in the resampler.
+            yield from resampler.resample(None)
+
+        for frame in resampled_frames():
+            # A pass-through resampler may echo the None flush marker; skip it.
+            if frame is not None:
+                # Packed mono frames convert to a 2-D (1, n) array; flatten it.
+                chunks.append(frame.to_ndarray().reshape(-1))
+
+    # Frames are already int16; astype is only a cheap safety net here.
+    samples = np.concatenate(chunks).astype(np.int16, copy=False)
+
+    return PcmData(
+        samples=samples,
+        sample_rate=target_rate,
+        format="s16",
+    )
+
+
 @pytest.fixture
 def golf_swing_image():
     """Load golf_swing.png image and return as bytes."""
diff --git a/plugins/fish/tests/test_fish_stt.py b/plugins/fish/tests/test_fish_stt.py
@@ -35,3 +35,20 @@ async def test_transcribe_mia_audio(self, stt, mia_audio_16khz):
         assert len(session.transcripts) > 0, "Expected at least one transcript"
         transcript_event = session.transcripts[0]
         assert "forgotten treasures" in transcript_event.text.lower()
+
+    @pytest.mark.integration
+    async def test_transcribe_mia_audio_48khz(self, stt, mia_audio_48khz):
+        """Transcribe the 48 kHz mia clip and check the expected phrase."""
+        # The session collects the transcripts and errors for this STT.
+        collector = STTSession(stt)
+
+        await stt.process_audio(mia_audio_48khz)
+
+        # Wait for a result (timeout=30.0) before inspecting the session.
+        await collector.wait_for_result(timeout=30.0)
+        assert not collector.errors
+
+        transcripts = collector.transcripts
+        assert len(transcripts) > 0, "Expected at least one transcript"
+        first_text = transcripts[0].text.lower()
+        assert "forgotten treasures" in first_text
diff --git a/plugins/fish/vision_agents/plugins/fish/stt.py b/plugins/fish/vision_agents/plugins/fish/stt.py
@@ -83,10 +83,12 @@ def _pcm_to_wav_bytes(self, pcm_data: PcmData) -> bytes:
         """
         wav_buffer = io.BytesIO()
 
+        # TODO: resample pcm_data to self.sample_rate here; for now the WAV header uses the input's own rate
+
         with wave.open(wav_buffer, "wb") as wav_file:
             wav_file.setnchannels(1)  # Mono
             wav_file.setsampwidth(2)  # 16-bit
-            wav_file.setframerate(self.sample_rate)
+            wav_file.setframerate(pcm_data.sample_rate)
             
             # Convert numpy array to bytes if needed
             if isinstance(pcm_data.samples, np.ndarray):
@@ -119,6 +121,7 @@ async def _process_audio_impl(
             logger.warning("Fish Audio STT is closed, ignoring audio")
             return None
 
+
         # Store the current user context
         self._current_user = user_metadata
 
@@ -134,8 +137,8 @@ async def _process_audio_impl(
 
         try:
             # Convert PCM to WAV format
-            logger.debug(
-                "Converting PCM to WAV",
+            logger.info(
+                "Converting PCM  %s to WAV ", pcm_data.sample_rate,
                 extra={"sample_rate": self.sample_rate},
             )
             wav_data = self._pcm_to_wav_bytes(pcm_data)