9 changes: 9 additions & 0 deletions .cursorignore
@@ -0,0 +1,9 @@
.env*
*.key
*.pem
*.p12
*.pfx
credentials.*
secrets/
config/database.yml
*.db
13 changes: 13 additions & 0 deletions .gitignore
@@ -0,0 +1,13 @@
venv
__pycache__
.mypy_cache
.env
credentials.json
iclab-cumpa-71492aec6bff.json
text.wav
assets
*.db
*.whl
profile/
*.csv
.DS_Store
30 changes: 29 additions & 1 deletion README.md
@@ -12,4 +12,32 @@ python3 -m venv venv
source venv/bin/activate

# Install dependencies
pip install -r requirements.txt
pip install -r requirements.txt
```

## Additional Instructions for Raspberry Pi

On Raspberry Pi, `dearpygui` may not install directly via pip.

Comment out the following line in `requirements.txt`:

```bash
# dearpygui==2.0.0
```

Instead, use the provided or downloaded `.whl` file:

Download `dearpygui-1.11.1-cp311-cp311-linux_aarch64.whl` from the team's shared drive or from PyPI.

Install it manually:

```bash
pip install dearpygui-1.11.1-cp311-cp311-linux_aarch64.whl
```
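
If pip rejects the wheel, its tags must match your interpreter. A quick sanity check (assuming Python 3.11 on 64-bit Raspberry Pi OS):

```bash
# The cp311 / aarch64 tags in the wheel name must match the running interpreter
python3 --version                                          # expect Python 3.11.x
python3 -c "import platform; print(platform.machine())"    # expect aarch64
```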

## For macOS

```bash
brew install libusb
pip install pyusb
```
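
After installing, a minimal sketch to confirm that pyusb can reach the libusb backend and enumerate devices (no specific device is assumed):

```python
# List vendor/product IDs of every USB device pyusb can see
import usb.core

for dev in usb.core.find(find_all=True):
    print(hex(dev.idVendor), hex(dev.idProduct))
```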
Binary file added output.wav
Binary file not shown.
4 changes: 3 additions & 1 deletion requirements.txt
@@ -17,4 +17,6 @@ pandas==2.2.3
openpyxl==3.1.5
pydantic==2.10.6
torch==2.7.0
transformers==4.52.3
transformers==4.52.3

pydub>=0.25.1
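
`pydub` is imported by `src/audio/player.py`. As a minimal sketch of the kind of conversion it enables (file names are illustrative; the 16 kHz rate mirrors the TTS settings in `player.py`):

```python
from pydub import AudioSegment

# Resample a WAV to 16 kHz mono, matching the sampling rate requested from the TTS service
seg = AudioSegment.from_wav("text.wav")
seg = seg.set_frame_rate(16000).set_channels(1)
seg.export("text_16k.wav", format="wav")
```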
143 changes: 113 additions & 30 deletions src/audio/player.py
@@ -1,15 +1,25 @@
import urllib.request
import os
import io
import ssl
import pyaudio
import wave
import asyncio
import threading
from typing import Tuple, IO, TypedDict
from pydub import AudioSegment

from ..lib.time_stamp import get_current_timestamp
from ..lib.DB import addMessage
from ..message_event import MessageListener, MessageBroker, MessageType
from ..async_event import AsyncListener, AsyncBroker, AsyncMessageType
from ..lib.loggable import Loggable

import time
from ..lib.profiler import log_step
from ..lib.respeaker_tuning import get_index
from ..lib.audio_system import AudioSystem

class VoiceSettings(TypedDict):
speaker: str
volume: int
@@ -40,7 +50,7 @@ class ResponsePlayer(Loggable):

def __init__(self):
Loggable.__init__(self)
self.set_tag("response_player")
self.set_tag("🔊 response_player")

self.chat_done_flag = False

@@ -57,7 +67,8 @@ def __init__(self):
emotion=0,
emotion_strength=2,
format="wav",
sampling_rate=48000,
sampling_rate=16000,
bit_depth=24
)

# Ignore SSL certificate errors
@@ -68,11 +79,14 @@ def __init__(self):
self.pa = pyaudio.PyAudio()
self._stream = None

# respeaker setting
self._respeaker_index = AudioSystem().get_respeaker_index()

# Register event handlers
AsyncBroker().subscribe("wait_chat_finish", self._on_wait_chat_finish)
AsyncBroker().subscribe("chat_response", self._on_chat_response)
AsyncBroker().subscribe("wake_up", self._on_wake_up)

AsyncBroker().subscribe("wake_up_audio", self._on_wake_up_audio)
def _on_wait_chat_finish(self, msg: AsyncMessageType):
print("Chat finished, closing the stream.")
self.chat_done_flag = True
@@ -97,15 +111,19 @@ def map_emotion_to_value(self, emotion_label: str = None) -> int:
return emotion_value_map.get(emotion_label, 0)

async def _on_chat_response(self, response: dict):
        emotion_label = response.get('emotion', "중립")  # defaults to "중립" (neutral)
        # emotion_label = response.get('emotion', "중립")  # defaults to "중립" (neutral)

if response.get('type') in [None, "text"]:
            # Check the emotion analysis result
clova_emotion = self.map_emotion_to_value(emotion_label)
self.log(f"Emotion label: {emotion_label}, emotion value: {clova_emotion}")
# clova_emotion = self.map_emotion_to_value(emotion_label)
# self.log(f"Emotion label: {emotion_label}, emotion value: {clova_emotion}")
            # Set the emotion value in the TTS request
self._make_audio(response['msg'], emotion=clova_emotion)
await self._play_audio("text.wav")
# self._make_audio(response['msg'], emotion=clova_emotion)
self._make_audio(response['msg'])

await self._play_audio("text.wav", message=response['msg'])

        # The branch below is currently unused (music is invoked directly with the file name)
elif response.get('type') == "music-card":
music_name = response['msg']['src'] # e.g. "eno1.wav"
music_path = f"src/audio/assets/music/{music_name}" # e.g. "assets/music/eno1.wav"
@@ -122,37 +140,96 @@ async def _on_chat_response(self, response: dict):
self.log(f"Unknown response type {response['type']}")
AsyncBroker().emit(("play_response_end", None))

async def _play_audio(self, f: IO):
async def _play_audio(self, f: str, audio_cue: bool = False, message: str = None):
"""
        play the audio file at the given path f
"""
try:
if self._stream is not None:
self.log(f"WARNING: Previous stream still exists: {self._stream.is_active()}")
if self._respeaker_index is not None:
self.log(f"ReSpeaker OUTPUT stream closed (previous)")
else:
self.log("Default speaker stream closed (previous)")
self._stream.close()
self._stream = None

wf = wave.open(f, 'rb')
self.log(f, wf.getframerate(), wf.getsampwidth(),
self.pa.get_format_from_width(wf.getsampwidth()))
self.log(f"Opening {f} {wf.getframerate()} {wf.getsampwidth()} {self.pa.get_format_from_width(wf.getsampwidth())}")

def callback(in_data, frame_count, time_info, status):
data = wf.readframes(frame_count)
if len(data) == 0 or wf.tell() == wf.getnframes():
wf.close()
                # Emit an event once the stream ends
print("self.chat_done_flag", self.chat_done_flag)
if self.chat_done_flag:
AsyncBroker().emit(("chat_done", None))
play_end_time = get_current_timestamp()
addMessage("CUMPAR", message, play_start_time, play_end_time)
if self._respeaker_index is not None:
self.log(f"ReSpeaker OUTPUT playback completed for file: {f}")
else:
AsyncBroker().emit(("chat_listening_start", None))
self.log(f"Default speaker playback completed for file: {f}")

self._stream_completed = True

def delayed_emit():
time.sleep(0.2)
if self.chat_done_flag:
AsyncBroker().emit(("chat_done", None))
else:
if not audio_cue:
AsyncBroker().emit(("chat_listening_start", None))
else:
pass

import threading
threading.Thread(target=delayed_emit, daemon=True).start()
return (data, pyaudio.paComplete)
return (data, pyaudio.paContinue)

self._stream = self.pa.open(format=self.pa.get_format_from_width(wf.getsampwidth()),
channels=wf.getnchannels(),
rate=wf.getframerate(),
output=True,
frames_per_buffer=2048,
stream_callback=callback)

if self._respeaker_index is not None:
self.log(f"ReSpeaker OUTPUT accessed at index {self._respeaker_index} for file: {f}")
else:
self.log(f"Default speaker accessed for file: {f}")

self._stream_completed = False

play_start_time = get_current_timestamp()
self._stream = self.pa.open(
format=self.pa.get_format_from_width(wf.getsampwidth()),
channels=wf.getnchannels(),
rate=wf.getframerate(),
output=True,
output_device_index=self._respeaker_index,
frames_per_buffer=2048,
stream_callback=callback
)

self.log("Audio stream opened, starting playback.")

while not self._stream_completed and self._stream.is_active():
await asyncio.sleep(0.1)

if self._stream is not None:
if self._stream.is_active():
self._stream.stop_stream()
self._stream.close()
self._stream = None
self.log("Audio stream explicitly closed and cleaned up.")

except (FileNotFoundError, wave.Error) as e:
self.log(f"Failed to open the audio file {f}: {e}")
self.log(f"Failed to open audio file {f}: {e}")
            # Emit the event even on error
AsyncBroker().emit(("play_response_end", None))
except Exception as e:
self.log(f"Audio playback error: {e}")
if hasattr(self, '_stream') and self._stream is not None:
try:
if self._stream.is_active():
self._stream.stop_stream()
self._stream.close()
self._stream = None
except Exception as cleanup_error:
self.log(f"Error during stream cleanup: {cleanup_error}")
AsyncBroker().emit(("play_response_end", None))

def _make_audio(self, text: str, emotion: int = 0, **settings: VoiceSettings) -> None:
"""
@@ -168,7 +245,7 @@ def _make_audio(self, text: str, emotion: int = 0, **settings: VoiceSettings) ->

# Construct query
s = { **self._default_voice_settings }
s["emotion"] = emotion # 감정 값 동적으로 설정
# s["emotion"] = emotion
s.update(settings)
self.log(f"_make_audio emotion: {emotion}")

@@ -179,9 +256,9 @@ def _make_audio(self, text: str, emotion: int = 0, **settings: VoiceSettings) ->
f"&emotion={s['emotion']}" + \
f"&emotion-strength ={s['emotion_strength']}" + \
"&format=wav" + \
"&sampling-rate=48000" + \
"&sampling-rate=16000" + \
f"&text={urllib.parse.quote(text)}"

start_time = time.time()
try:
# Make the request to the API
response = urllib.request.urlopen(request, data=query.encode('utf-8'))
@@ -192,6 +269,8 @@ def _make_audio(self, text: str, emotion: int = 0, **settings: VoiceSettings) ->
response_body = response.read()
with open('text.wav', 'wb') as f:
f.write(response_body)
end_time = time.time()
log_step(text, "clova_tts", start_time, end_time)
return
else:
self.log(f"Failed to synthesize speech. HTTP response code: {rescode}")
@@ -213,12 +292,16 @@ def _make_audio(self, text: str, emotion: int = 0, **settings: VoiceSettings) ->
# TODO: write empty audio to text.wav
return

async def _on_wake_up(self, _: tuple[str, None]):
def _on_wake_up(self, _: tuple[str, None]):
self.log("Wake Up")
self.chat_done_flag = False
# sound_path = f"src/audio/assets/music/ding.wav"
# await self._play_audio(sound_path)

sound_path = f"src/audio/assets/music/ding.wav"
await self._play_audio(sound_path)
async def _on_wake_up_audio(self, _: tuple[str, None]):
self.log("Wake Up Audio")
sound_path = f"src/audio/assets/music/ding.wav"
        await self._play_audio(sound_path, True)  # audio_cue=True: this plays only the wake-up audio cue

def stop(self):
self.pa.terminate()
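
As an aside on `_make_audio`: the query string there is concatenated by hand, which makes stray characters easy to introduce. A hedged sketch of equivalent construction with `urllib.parse.urlencode` (endpoint, speaker, and text are placeholder values, not the project's actual configuration):

```python
import urllib.parse
import urllib.request

# Illustrative values only; the real endpoint, headers, and voice settings live in the project
params = {
    "speaker": "nara",
    "volume": 0,
    "emotion": 0,
    "emotion-strength": 2,
    "format": "wav",
    "sampling-rate": 16000,
    "text": "안녕하세요",
}
query = urllib.parse.urlencode(params)  # percent-encodes every value, including the text field
request = urllib.request.Request("https://tts.example.com/synthesize")
# response = urllib.request.urlopen(request, data=query.encode("utf-8"))
```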