Skip to content

Commit 6eb8270

Browse files
committed
fix 48khz support
1 parent 3b90548 commit 6eb8270

File tree

3 files changed

+74
-3
lines changed

3 files changed

+74
-3
lines changed

conftest.py

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -129,6 +129,57 @@ def mia_audio_16khz():
129129
return pcm
130130

131131

132+
@pytest.fixture
def mia_audio_48khz():
    """Load mia.mp3 and return it as 48 kHz mono signed 16-bit PCM data.

    Every decoded frame is passed through an ``av.AudioResampler`` configured
    for ``s16`` / ``mono`` / 48000 Hz, even when the file is already at the
    target rate, so that sample format and channel layout are normalised too
    (decoders commonly emit planar float, which must not be cast to int16
    directly).

    Returns:
        PcmData holding a 1-D int16 sample array with ``sample_rate=48000``
        and ``format="s16"``.
    """
    audio_file_path = os.path.join(get_assets_dir(), "mia.mp3")
    target_rate = 48000

    chunks = []

    # The context manager closes the container even if decoding fails.
    with av.open(audio_file_path) as container:
        audio_stream = container.streams.audio[0]
        resampler = av.AudioResampler(
            format="s16",
            layout="mono",
            rate=target_rate,
        )

        for frame in container.decode(audio_stream):
            # resample() may return zero, one, or several frames per input
            # frame because the resampler buffers internally; keep them all.
            for resampled in resampler.resample(frame):
                chunks.append(resampled.to_ndarray().reshape(-1))

        # Passing None flushes samples still buffered inside the resampler.
        for resampled in resampler.resample(None):
            chunks.append(resampled.to_ndarray().reshape(-1))

    # Frames are already s16; astype guards the dtype PcmData is told to expect.
    samples = np.concatenate(chunks).astype(np.int16)

    return PcmData(
        samples=samples,
        sample_rate=target_rate,
        format="s16",
    )
181+
182+
132183
@pytest.fixture
133184
def golf_swing_image():
134185
"""Load golf_swing.png image and return as bytes."""

plugins/fish/tests/test_fish_stt.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,3 +35,20 @@ async def test_transcribe_mia_audio(self, stt, mia_audio_16khz):
3535
assert len(session.transcripts) > 0, "Expected at least one transcript"
3636
transcript_event = session.transcripts[0]
3737
assert "forgotten treasures" in transcript_event.text.lower()
38+
39+
@pytest.mark.integration
async def test_transcribe_mia_audio_48khz(self, stt, mia_audio_48khz):
    """Transcribe the 48 kHz mia fixture and check the expected phrase."""
    # The session gathers transcripts and errors emitted by the STT instance.
    collector = STTSession(stt)

    await stt.process_audio(mia_audio_48khz)

    # Block until a result arrives (or the timeout elapses), then make sure
    # nothing went wrong along the way.
    await collector.wait_for_result(timeout=30.0)
    assert not collector.errors

    # At least one transcript must exist, and the first one must contain the
    # known phrase from the recording.
    assert len(collector.transcripts) > 0, "Expected at least one transcript"
    first_transcript = collector.transcripts[0]
    spoken_text = first_transcript.text.lower()
    assert "forgotten treasures" in spoken_text

plugins/fish/vision_agents/plugins/fish/stt.py

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -83,10 +83,12 @@ def _pcm_to_wav_bytes(self, pcm_data: PcmData) -> bytes:
8383
"""
8484
wav_buffer = io.BytesIO()
8585

86+
# TODO: we should resample here
87+
8688
with wave.open(wav_buffer, "wb") as wav_file:
8789
wav_file.setnchannels(1) # Mono
8890
wav_file.setsampwidth(2) # 16-bit
89-
wav_file.setframerate(self.sample_rate)
91+
wav_file.setframerate(pcm_data.sample_rate)
9092

9193
# Convert numpy array to bytes if needed
9294
if isinstance(pcm_data.samples, np.ndarray):
@@ -119,6 +121,7 @@ async def _process_audio_impl(
119121
logger.warning("Fish Audio STT is closed, ignoring audio")
120122
return None
121123

124+
122125
# Store the current user context
123126
self._current_user = user_metadata
124127

@@ -134,8 +137,8 @@ async def _process_audio_impl(
134137

135138
try:
136139
# Convert PCM to WAV format
137-
logger.debug(
138-
"Converting PCM to WAV",
140+
logger.info(
141+
"Converting PCM %s to WAV ", pcm_data.sample_rate,
139142
extra={"sample_rate": self.sample_rate},
140143
)
141144
wav_data = self._pcm_to_wav_bytes(pcm_data)

0 commit comments

Comments
 (0)