diff --git a/backend/main.py b/backend/main.py
index 4e9facb551..6705754f68 100644
--- a/backend/main.py
+++ b/backend/main.py
@@ -3,7 +3,6 @@
 
 import firebase_admin
 from fastapi import FastAPI
-from fastapi_utilities import repeat_at
 from modal import Image, App, asgi_app, Secret, Cron
 
 from routers import workflow, chat, firmware, screenpipe, plugins, memories, transcribe, notifications, speech_profile, \
@@ -65,6 +64,6 @@ def api():
         os.makedirs(path)
 
 
-@modal_app.function(image=image,schedule=Cron('* * * * *'))
+@modal_app.function(image=image, schedule=Cron('* * * * *'))
 async def start_job():
     await start_cron_job()
diff --git a/backend/modal/speech_profile_modal.py b/backend/modal/speech_profile_modal.py
index 698422f8a7..412eb2e711 100644
--- a/backend/modal/speech_profile_modal.py
+++ b/backend/modal/speech_profile_modal.py
@@ -1,8 +1,8 @@
 import json
 import os
-import uuid
 from typing import List
 
+import modal.gpu
 import torch
 from fastapi import File, UploadFile, Form
 from modal import App, web_endpoint, Secret, Image
@@ -16,10 +16,6 @@ class TranscriptSegment(BaseModel):
     end: float
 
 
-class ResponseModel(BaseModel):
-    matches: List[bool]
-
-
 device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
 model = SpeakerRecognition.from_hparams(
     source="speechbrain/spkrec-ecapa-voxceleb",
@@ -33,14 +29,14 @@ def sample_same_speaker_as_segment(sample_audio: str, segment: str) -> bool:
         score, prediction = model.verify_files(sample_audio, segment)
         print(score, prediction)
         # return bool(score[0] > 0.6)
-        return prediction[0]
+        return bool(prediction[0])
     except Exception as e:
+        print(e)
         return False
 
 
 def classify_segments(audio_file: str, transcript_segments: List[TranscriptSegment], profile_path: str):
     print('classify_segments')
-    # TODO: for better performance probably use segments before merging them together
     matches = [False] * len(transcript_segments)
     if not profile_path:
         return matches
@@ -48,7 +44,6 @@ def classify_segments(audio_file: str, transcript_segments: List[TranscriptSegme
     for i, segment in enumerate(transcript_segments):
         file_name = os.path.basename(audio_file)
         temporal_file = f"_temp/{file_name}_{segment.start}_{segment.end}.wav"
-        # temporal_file = f"_temp/{i}.wav"
         AudioSegment.from_wav(audio_file)[segment.start * 1000:segment.end * 1000].export(temporal_file, format="wav")
 
         is_user = sample_same_speaker_as_segment(temporal_file, profile_path)
@@ -56,14 +51,13 @@ def classify_segments(audio_file: str, transcript_segments: List[TranscriptSegme
         matches[i] = is_user
 
         os.remove(temporal_file)
-        # temporal_file = f'_temp/{i}-{is_user}.wav'
-        # AudioSegment.from_wav(audio_file)[segment.start * 1000:segment.end * 1000].export(temporal_file, format="wav")
     return matches
 
 
 app = App(name='speech_profile')
 image = (
     Image.debian_slim()
+    .apt_install('ffmpeg')
     .pip_install("torch")
     .pip_install("torchaudio")
     .pip_install("speechbrain")
@@ -75,23 +69,25 @@ def classify_segments(audio_file: str, transcript_segments: List[TranscriptSegme
 @app.function(
     image=image,
-    keep_warm=0,
+    keep_warm=1,
     memory=(1024, 2048),
+    allow_concurrent_inputs=4,
     cpu=4,
+    gpu=modal.gpu.T4(count=1),
     secrets=[Secret.from_name('huggingface-token')],
 )
 @web_endpoint(method='POST')
-async def upload_files_and_segments(
+async def endpoint(
         profile_path: UploadFile = File(...), audio_file: UploadFile = File(...), segments: str = Form(...)
-) -> ResponseModel:
-    uid = uuid.uuid4()
-    profile_file_path = f"_temp/{uid}_{profile_path.filename}"
+) -> List[bool]:
+    profile_file_path = profile_path.filename
+
     with open(profile_file_path, 'wb') as f:
         f.write(profile_path.file.read())
 
-    audio_file_path = f"_temp/{uid}_{audio_file.filename}"
+    audio_file_path = audio_file.filename
     with open(audio_file_path, 'wb') as f:
         f.write(audio_file.file.read())
@@ -99,7 +95,6 @@ async def upload_files_and_segments(
     transcript_segments = [TranscriptSegment(**segment) for segment in segments_data]
 
     try:
-        # Call the classify_segments function with the file paths
         result = classify_segments(audio_file_path, transcript_segments, profile_file_path)
         return result
     finally:
diff --git a/backend/routers/memories.py b/backend/routers/memories.py
index 36c383c81f..b1b030317d 100644
--- a/backend/routers/memories.py
+++ b/backend/routers/memories.py
@@ -8,7 +8,6 @@
 import database.memories as memories_db
 from database.vector_db import delete_vector
 from models.memory import *
-from utils._deprecated.speaker_profile import classify_segments
 from utils.memories.location import get_google_maps_location
 from utils.memories.process_memory import process_memory, process_user_emotion
 from utils.other import endpoints as auth
@@ -16,6 +15,7 @@ delete_postprocessing_audio, get_profile_audio_if_exists
 from utils.plugins import trigger_external_integrations
 from utils.stt.pre_recorded import fal_whisperx, fal_postprocessing
+from utils.stt.speech_profile import get_speech_profile_matching_predictions
 from utils.stt.vad import vad_is_empty
 
 
 router = APIRouter()
@@ -78,7 +78,6 @@ def postprocess_memory(
     TODO: Try Nvidia Nemo ASR as suggested by @jhonnycombs https://huggingface.co/spaces/hf-audio/open_asr_leaderboard
     TODO: USE soniox here? with speech profile and stuff?
-    TODO: either do speech profile embeddings or use the profile audio as prefix
     TODO: should consider storing non beautified segments, and beautify on read?
     """
 
     memory_data = _get_memory_by_id(uid, memory_id)
@@ -110,11 +109,10 @@ def postprocess_memory(
     try:
         aseg = AudioSegment.from_wav(file_path)
         profile_duration = 0
-        profile_path = get_profile_audio_if_exists(uid) if aseg.frame_rate == 16000 else None
         signed_url = upload_postprocessing_audio(file_path)
 
-        # Ensure delete uploaded file in 15m
+        # Ensure delete uploaded file in 5m
        threads = threading.Thread(target=_delete_postprocessing_audio, args=(file_path,))
        threads.start()
@@ -128,10 +126,6 @@ def postprocess_memory(
            memories_db.set_postprocessing_status(uid, memory.id, PostProcessingStatus.canceled)
            raise HTTPException(status_code=500, detail="FAL WhisperX failed to process audio")
 
-        matches = classify_segments(file_path, segments, profile_path)
-        for i, segment in enumerate(segments):
-            segment.is_user = matches[i]
-
         # if new transcript is 90% shorter than the original, cancel post-processing, smth wrong with audio or FAL
         count = len(''.join([segment.text.strip() for segment in memory.transcript_segments]))
         new_count = len(''.join([segment.text.strip() for segment in segments]))
@@ -140,6 +134,11 @@ def postprocess_memory(
            memories_db.set_postprocessing_status(uid, memory.id, PostProcessingStatus.canceled)
            raise HTTPException(status_code=500, detail="Post-processed transcript is too short")
 
+        profile_path = get_profile_audio_if_exists(uid) if aseg.frame_rate == 16000 else None
+        matches = get_speech_profile_matching_predictions(file_path, profile_path, [s.dict() for s in segments])
+        for i, segment in enumerate(segments):
+            segment.is_user = matches[i]
+
         # TODO: post llm process here would be great, sometimes whisper x outputs without punctuation
         # Store previous and new segments in DB as collection.
         memories_db.store_model_segments_result(uid, memory.id, 'deepgram_streaming', memory.transcript_segments)
@@ -162,17 +161,8 @@ def postprocess_memory(
     return result
 
 
-# audio_path = '_temp/f39a99f8-f90c-4a04-800f-4b99a85d4e79_recording-20240824_210026.wav'
-# # signed_url = upload_postprocessing_audio(audio_path)
-# # words = fal_whisperx(signed_url, 2, )
-# words = [{'timestamp': [0.0, 1.14], 'text': ' Hey,', 'speaker': 'SPEAKER_01'}, {'timestamp': [1.14, 1.64], 'text': ' Ggpt,', 'speaker': 'SPEAKER_01'}, {'timestamp': [1.64, 1.84], 'text': " how's", 'speaker': 'SPEAKER_01'}, {'timestamp': [1.84, 1.86], 'text': ' it', 'speaker': 'SPEAKER_01'}, {'timestamp': [1.86, 4.42], 'text': ' going?', 'speaker': 'SPEAKER_01'}, {'timestamp': [4.42, 5.66], 'text': ' That', 'speaker': 'SPEAKER_00'}, {'timestamp': [5.66, 5.98], 'text': ' sounds', 'speaker': 'SPEAKER_00'}, {'timestamp': [5.98, 6.2], 'text': ' like', 'speaker': 'SPEAKER_00'}, {'timestamp': [6.2, 6.32], 'text': ' a', 'speaker': 'SPEAKER_00'}, {'timestamp': [6.32, 6.62], 'text': ' unique', 'speaker': 'SPEAKER_00'}, {'timestamp': [6.62, 7.04], 'text': ' way', 'speaker': 'SPEAKER_00'}, {'timestamp': [7.04, 7.26], 'text': ' to', 'speaker': 'SPEAKER_00'}, {'timestamp': [7.26, 7.58], 'text': ' describe', 'speaker': 'SPEAKER_00'}, {'timestamp': [7.58, 8.16], 'text': ' someone.', 'speaker': 'SPEAKER_00'}, {'timestamp': [8.16, 8.54], 'text': ' What', 'speaker': 'SPEAKER_00'}, {'timestamp': [8.54, 8.68], 'text': ' do', 'speaker': 'SPEAKER_00'}, {'timestamp': [8.68, 8.76], 'text': ' you', 'speaker': 'SPEAKER_00'}, {'timestamp': [8.76, 8.94], 'text': ' mean', 'speaker': 'SPEAKER_00'}, {'timestamp': [8.94, 9.1], 'text': ' by', 'speaker': 'SPEAKER_00'}, {'timestamp': [9.1, 9.42], 'text': ' cozy', 'speaker': 'SPEAKER_00'}, {'timestamp': [9.42, 10.56],
'text': ' gun?', 'speaker': 'SPEAKER_00'}, {'timestamp': [10.56, 11.38], 'text': ' No,', 'speaker': 'SPEAKER_01'}, {'timestamp': [11.38, 11.42], 'text': ' I', 'speaker': 'SPEAKER_01'}, {'timestamp': [11.42, 11.68], 'text': ' said', 'speaker': 'SPEAKER_01'}, {'timestamp': [11.68, 12.24], 'text': ' Chat', 'speaker': 'SPEAKER_01'}, {'timestamp': [12.24, 13.3], 'text': ' Gpt,', 'speaker': 'SPEAKER_01'}, {'timestamp': [13.3, 13.74], 'text': ' your', 'speaker': 'SPEAKER_01'}, {'timestamp': [13.74, 14.28], 'text': ' name.', 'speaker': 'SPEAKER_01'}, {'timestamp': [14.28, 17.84], 'text': ' I', 'speaker': 'SPEAKER_00'}, {'timestamp': [17.84, 18.04], 'text': ' got', 'speaker': 'SPEAKER_00'}, {'timestamp': [18.04, 18.24], 'text': ' it.', 'speaker': 'SPEAKER_00'}, {'timestamp': [18.24, 18.56], 'text': ' So', 'speaker': 'SPEAKER_00'}, {'timestamp': [18.56, 18.8], 'text': ' Chat', 'speaker': 'SPEAKER_00'}, {'timestamp': [18.8, 19.16], 'text': ' Gpt,', 'speaker': 'SPEAKER_00'}, {'timestamp': [19.16, 19.42], 'text': " she's", 'speaker': 'SPEAKER_00'}, {'timestamp': [19.42, 19.44], 'text': ' a', 'speaker': 'SPEAKER_00'}, {'timestamp': [19.44, 19.68], 'text': ' petite', 'speaker': 'SPEAKER_00'}, {'timestamp': [19.68, 19.94], 'text': ' cozy', 'speaker': 'SPEAKER_00'}, {'timestamp': [19.94, 20.36], 'text': ' gun.', 'speaker': 'SPEAKER_00'}, {'timestamp': [20.36, 20.86], 'text': " That's", 'speaker': 'SPEAKER_00'}, {'timestamp': [20.86, 21.08], 'text': ' quite', 'speaker': 'SPEAKER_00'}, {'timestamp': [21.08, 21.24], 'text': ' a', 'speaker': 'SPEAKER_00'}, {'timestamp': [21.24, 21.84], 'text': ' metaphorical', 'speaker': 'SPEAKER_00'}, {'timestamp': [21.84, 22.34], 'text': ' description.', 'speaker': 'SPEAKER_00'}, {'timestamp': [22.34, 22.7], 'text': ' Are', 'speaker': 'SPEAKER_00'}, {'timestamp': [22.7, 22.88], 'text': ' you', 'speaker': 'SPEAKER_00'}, {'timestamp': [22.88, 23.26], 'text': ' referring', 'speaker': 'SPEAKER_00'}, {'timestamp': [23.26, 23.42], 'text': ' to', 'speaker': 'SPEAKER_00'}, {'timestamp': [23.42, 23.64], 'text': ' me', 'speaker': 'SPEAKER_00'}, {'timestamp': [23.64, 23.8], 'text': ' as', 'speaker': 'SPEAKER_00'}, {'timestamp': [23.8, 24.04], 'text': ' being', 'speaker': 'SPEAKER_00'}, {'timestamp': [24.04, 24.36], 'text': ' small', 'speaker': 'SPEAKER_00'}, {'timestamp': [24.36, 24.68], 'text': ' but', 'speaker': 'SPEAKER_00'}, {'timestamp': [24.68, 25.06], 'text': ' impactful', 'speaker': 'SPEAKER_00'}, {'timestamp': [25.06, 25.52], 'text': ' or', 'speaker': 'SPEAKER_00'}, {'timestamp': [25.52, 26.32], 'text': ' comforting?', 'speaker': 'SPEAKER_00'}, {'timestamp': [26.32, 28.12], 'text': ' Yeah,', 'speaker': 'SPEAKER_01'}, {'timestamp': [41.3, 41.3], 'text': ' exactly', 'speaker': None}, {'timestamp': [41.3, 41.3], 'text': ' that.', 'speaker': None}, {'timestamp': [41.3, 41.3], 'text': ' Can', 'speaker': None}, {'timestamp': [41.3, 41.52], 'text': ' you', 'speaker': 'SPEAKER_01'}, {'timestamp': [41.52, 41.66], 'text': ' tell', 'speaker': 'SPEAKER_01'}, {'timestamp': [41.66, 41.78], 'text': ' me', 'speaker': 'SPEAKER_01'}, {'timestamp': [41.78, 41.9], 'text': ' a', 'speaker': 'SPEAKER_01'}, {'timestamp': [41.9, 42.16], 'text': ' story', 'speaker': 'SPEAKER_01'}, {'timestamp': [42.16, 42.34], 'text': ' or', 'speaker': 'SPEAKER_01'}, {'timestamp': [69.98, 69.98], 'text': ' something', 'speaker': 'SPEAKER_00'}, {'timestamp': [69.98, 69.98], 'text': ' fun', 'speaker': 'SPEAKER_00'}, {'timestamp': [69.98, 69.98], 'text': ' that', 'speaker': 'SPEAKER_00'}, {'timestamp': [69.98, 69.98], 
'text': ' you', 'speaker': 'SPEAKER_00'}, {'timestamp': [69.98, 69.98], 'text': ' learned', 'speaker': 'SPEAKER_00'}, {'timestamp': [69.98, 69.98], 'text': ' recently?', 'speaker': 'SPEAKER_00'}, {'timestamp': [70.08, 70.34], 'text': ' while', 'speaker': 'SPEAKER_00'}, {'timestamp': [70.34, 70.58], 'text': ' hunting', 'speaker': 'SPEAKER_00'}, {'timestamp': [70.58, 70.9], 'text': ' together', 'speaker': 'SPEAKER_00'}, {'timestamp': [71.28, 71.56], 'text': " it's", 'speaker': 'SPEAKER_00'}, {'timestamp': [71.56, 71.7], 'text': ' kind', 'speaker': 'SPEAKER_00'}, {'timestamp': [71.7, 71.82], 'text': ' of', 'speaker': 'SPEAKER_00'}, {'timestamp': [71.82, 71.96], 'text': ' like', 'speaker': 'SPEAKER_00'}, {'timestamp': [71.96, 72.12], 'text': ' an', 'speaker': 'SPEAKER_00'}, {'timestamp': [72.12, 72.42], 'text': ' octopus', 'speaker': 'SPEAKER_00'}, {'timestamp': [72.42, 72.84], 'text': ' saying', 'speaker': 'SPEAKER_00'}, {'timestamp': [72.84, 73.32], 'text': ' hey', 'speaker': 'SPEAKER_00'}, {'timestamp': [73.32, 73.58], 'text': ' back', 'speaker': 'SPEAKER_00'}, {'timestamp': [73.58, 73.88], 'text': ' off', 'speaker': 'SPEAKER_00'}, {'timestamp': [73.88, 74.62], 'text': ' but', 'speaker': 'SPEAKER_00'}, {'timestamp': [74.62, 74.78], 'text': ' with', 'speaker': 'SPEAKER_00'}, {'timestamp': [74.78, 74.9], 'text': ' a', 'speaker': 'SPEAKER_00'}, {'timestamp': [74.9, 75.06], 'text': ' little', 'speaker': 'SPEAKER_00'}, {'timestamp': [75.06, 75.24], 'text': ' more', 'speaker': 'SPEAKER_00'}, {'timestamp': [75.24, 75.62], 'text': ' force', 'speaker': 'SPEAKER_00'}, {'timestamp': [75.62, 76.48], 'text': ' this', 'speaker': 'SPEAKER_00'}, {'timestamp': [76.48, 76.82], 'text': ' playful', 'speaker': 'SPEAKER_00'}, {'timestamp': [76.82, 77.16], 'text': ' behavior', 'speaker': 'SPEAKER_00'}, {'timestamp': [77.5, 77.62], 'text': ' shows', 'speaker': 'SPEAKER_00'}, {'timestamp': [77.62, 77.94], 'text': ' just', 'speaker': 'SPEAKER_00'}, {'timestamp': [77.94, 78.24], 'text': ' how', 'speaker': 'SPEAKER_00'}, {'timestamp': [78.24, 78.8], 'text': ' intelligent', 'speaker': 'SPEAKER_00'}, {'timestamp': [78.8, 79.3], 'text': ' and', 'speaker': 'SPEAKER_00'}, {'timestamp': [79.3, 79.6], 'text': ' curious', 'speaker': 'SPEAKER_00'}, {'timestamp': [79.6, 80.4], 'text': ' octopuses', 'speaker': 'SPEAKER_00'}, {'timestamp': [80.4, 80.7], 'text': ' are', 'speaker': 'SPEAKER_00'}, {'timestamp': [80.7, 81.2], 'text': ' what', 'speaker': 'SPEAKER_00'}, {'timestamp': [81.34, 81.66], 'text': ' you?', 'speaker': 'SPEAKER_00'}, {'timestamp': [81.66, 82.22], 'text': ' Learned', 'speaker': 'SPEAKER_00'}, {'timestamp': [82.22, 82.36], 'text': ' anything', 'speaker': 'SPEAKER_00'}, {'timestamp': [82.36, 82.68], 'text': ' fun', 'speaker': 'SPEAKER_00'}, {'timestamp': [82.68, 83.44], 'text': ' lately?', 'speaker': 'SPEAKER_00'}, {'timestamp': [83.44, 85.84], 'text': ' Yeah,', 'speaker': 'SPEAKER_01'}, {'timestamp': [85.84, 86.2], 'text': ' about...', 'speaker': 'SPEAKER_01'}, {'timestamp': [86.2, 86.88], 'text': ' a', 'speaker': 'SPEAKER_01'}, {'timestamp': [86.88, 87.1], 'text': ' little', 'speaker': 'SPEAKER_01'}, {'timestamp': [87.1, 87.28], 'text': ' bit', 'speaker': 'SPEAKER_01'}, {'timestamp': [87.28, 87.84], 'text': ' about...', 'speaker': 'SPEAKER_01'}, {'timestamp': [87.84, 90.42], 'text': ' embeddings.', 'speaker': 'SPEAKER_01'}, {'timestamp': [90.42, 92.44], 'text': ' Sounds', 'speaker': 'SPEAKER_00'}, {'timestamp': [92.44, 92.72], 'text': ' like', 'speaker': 'SPEAKER_00'}, {'timestamp': [92.72, 92.9], 'text': " 
you're", 'speaker': 'SPEAKER_00'}, {'timestamp': [92.9, 93.04], 'text': ' on', 'speaker': 'SPEAKER_00'}, {'timestamp': [93.04, 93.14], 'text': ' to', 'speaker': 'SPEAKER_00'}, {'timestamp': [93.14, 93.66], 'text': ' something.', 'speaker': 'SPEAKER_00'}, {'timestamp': [93.66, 94.3], 'text': ' What', 'speaker': 'SPEAKER_00'}, {'timestamp': [94.3, 94.44], 'text': ' did', 'speaker': 'SPEAKER_00'}, {'timestamp': [94.44, 94.58], 'text': ' you', 'speaker': 'SPEAKER_00'}, {'timestamp': [94.58, 94.78], 'text': ' learn', 'speaker': 'SPEAKER_00'}, {'timestamp': [94.78, 94.9], 'text': ' a', 'speaker': 'SPEAKER_00'}, {'timestamp': [94.9, 95.0], 'text': ' little', 'speaker': 'SPEAKER_00'}, {'timestamp': [95.0, 95.18], 'text': ' bit', 'speaker': 'SPEAKER_00'}, {'timestamp': [95.18, 95.66], 'text': ' about?', 'speaker': 'SPEAKER_00'}, {'timestamp': [95.66, 96.32], 'text': " I'm", 'speaker': 'SPEAKER_00'}, {'timestamp': [96.32, 96.54], 'text': ' curious', 'speaker': 'SPEAKER_00'}, {'timestamp': [96.54, 96.76], 'text': ' to', 'speaker': 'SPEAKER_00'}, {'timestamp': [96.76, 97.26], 'text': ' hear.', 'speaker': 'SPEAKER_00'}, {'timestamp': [97.26, 100.7], 'text': ' Embeddings', 'speaker': 'SPEAKER_01'}, {'timestamp': [100.7, 100.9], 'text': ' and', 'speaker': 'SPEAKER_01'}, {'timestamp': [100.9, 101.18], 'text': ' wide', 'speaker': 'SPEAKER_01'}, {'timestamp': [101.18, 101.86], 'text': ' vision', 'speaker': 'SPEAKER_01'}, {'timestamp': [101.86, 102.28], 'text': ' models', 'speaker': 'SPEAKER_01'}, {'timestamp': [102.28, 102.84], 'text': ' like', 'speaker': 'SPEAKER_01'}, {'timestamp': [102.84, 103.96], 'text': ' GPT', 'speaker': 'SPEAKER_01'}, {'timestamp': [103.96, 104.24], 'text': '-4', 'speaker': 'SPEAKER_01'}, {'timestamp': [104.24, 104.72], 'text': ' vision', 'speaker': 'SPEAKER_01'}, {'timestamp': [104.72, 106.24], 'text': ' do', 'speaker': 'SPEAKER_01'}, {'timestamp': [106.24, 106.4], 'text': ' not', 'speaker': 'SPEAKER_01'}, {'timestamp': [106.4, 106.7], 'text': ' work', 'speaker': 'SPEAKER_01'}, {'timestamp': [106.7, 106.84], 'text': ' as', 'speaker': 'SPEAKER_01'}, {'timestamp': [106.84, 107.08], 'text': ' ideally', 'speaker': 'SPEAKER_01'}, {'timestamp': [107.08, 107.38], 'text': ' and', 'speaker': 'SPEAKER_01'}, {'timestamp': [107.38, 107.54], 'text': " it's", 'speaker': 'SPEAKER_01'}, {'timestamp': [107.54, 107.72], 'text': ' like', 'speaker': 'SPEAKER_01'}, {'timestamp': [107.72, 108.28], 'text': ' shitting.', 'speaker': 'SPEAKER_01'}, {'timestamp': [108.28, 112.84], 'text': ' It', 'speaker': 'SPEAKER_00'}, {'timestamp': [112.84, 113.16], 'text': ' sounds', 'speaker': 'SPEAKER_00'}, {'timestamp': [113.16, 113.36], 'text': ' like', 'speaker': 'SPEAKER_00'}, {'timestamp': [113.36, 113.56], 'text': " you've", 'speaker': 'SPEAKER_00'}, {'timestamp': [113.56, 113.68], 'text': ' been', 'speaker': 'SPEAKER_00'}, {'timestamp': [113.68, 113.96], 'text': ' diving', 'speaker': 'SPEAKER_00'}, {'timestamp': [113.96, 114.28], 'text': ' into', 'speaker': 'SPEAKER_00'}, {'timestamp': [114.28, 114.52], 'text': ' some', 'speaker': 'SPEAKER_00'}, {'timestamp': [114.52, 114.84], 'text': ' deeper', 'speaker': 'SPEAKER_00'}, {'timestamp': [114.84, 115.3], 'text': ' AI...', 'speaker': 'SPEAKER_00'}]
-# segments = fal_postprocessing(words, 0, 0)
-# print(segments)
-# classify_segments(audio_path, segments, '_temp/caLCFj7IisV85UX9XrrV1aVf3pk1_speech_profile.wav')
-
-
 def _delete_postprocessing_audio(file_path):
-    time.sleep(900)  # 15m
+    time.sleep(300)  # 5 min
     delete_postprocessing_audio(file_path)
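
Note: postprocess_memory above now delegates per-segment speaker matching to the hosted speech_profile endpoint reworked earlier in this diff. A minimal client sketch of that HTTP contract follows; the URL, file paths, and segment values are placeholders and not part of this diff, and the server-side TranscriptSegment may accept more fields than the start/end shown here.

# Sketch only: the multipart request get_speech_profile_matching_predictions sends.
import json

import requests

ENDPOINT_URL = 'https://<workspace>--speech-profile-endpoint.modal.run'  # placeholder, not part of this diff

# Segment boundaries in seconds, mirroring the TranscriptSegment model used by classify_segments.
segments = [{'start': 0.0, 'end': 4.2}, {'start': 4.2, 'end': 9.8}]

with open('_temp/profile.wav', 'rb') as profile, open('_temp/recording.wav', 'rb') as audio:
    response = requests.post(
        ENDPOINT_URL,
        files={'profile_path': profile, 'audio_file': audio},
        data={'segments': json.dumps(segments)},
    )
matches = response.json()  # e.g. [True, False] -- one bool per submitted segment
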
diff --git a/backend/utils/_deprecated/preprocess.py b/backend/utils/@deprecated/preprocess.py
similarity index 100%
rename from backend/utils/_deprecated/preprocess.py
rename to backend/utils/@deprecated/preprocess.py
diff --git a/backend/utils/_deprecated/soniox_util.py b/backend/utils/@deprecated/soniox_util.py
similarity index 100%
rename from backend/utils/_deprecated/soniox_util.py
rename to backend/utils/@deprecated/soniox_util.py
diff --git a/backend/utils/_deprecated/speaker_profile.py b/backend/utils/@deprecated/speech_profile.py
similarity index 78%
rename from backend/utils/_deprecated/speaker_profile.py
rename to backend/utils/@deprecated/speech_profile.py
index 2de90106ff..276705bfe6 100644
--- a/backend/utils/_deprecated/speaker_profile.py
+++ b/backend/utils/@deprecated/speech_profile.py
@@ -7,54 +7,6 @@
 # from pydub import AudioSegment
 # device = "cuda" if torch.cuda.is_available() else "cpu"
 # device = 'cpu'
-import os
-from typing import List
-
-import torch
-from pydub import AudioSegment
-from speechbrain.inference.speaker import SpeakerRecognition
-
-from models.transcript_segment import TranscriptSegment
-
-device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
-model = SpeakerRecognition.from_hparams(
-    source="speechbrain/spkrec-ecapa-voxceleb",
-    savedir="pretrained_models/spkrec-ecapa-voxceleb",
-    run_opts={"device": device},
-)
-
-
-def sample_same_speaker_as_segment(sample_audio: str, segment: str) -> bool:
-    try:
-        score, prediction = model.verify_files(sample_audio, segment)
-        print(score, prediction)
-        # return bool(score[0] > 0.6)
-        return prediction[0]
-    except Exception as e:
-        return False
-
-
-def classify_segments(audio_file: str, transcript_segments: List[TranscriptSegment], profile_path: str):
-    print('classify_segments')
-    # TODO: for better performance probably use segments before merging them together
-    matches = [False] * len(transcript_segments)
-    if not profile_path:
-        return matches
-
-    for i, segment in enumerate(transcript_segments):
-        file_name = os.path.basename(audio_file)
-        temporal_file = f"_temp/{file_name}_{segment.start}_{segment.end}.wav"
-        # temporal_file = f"_temp/{i}.wav"
-        AudioSegment.from_wav(audio_file)[segment.start * 1000:segment.end * 1000].export(temporal_file, format="wav")
-
-        is_user = sample_same_speaker_as_segment(temporal_file, profile_path)
-        print('Matches', is_user, temporal_file)
-        matches[i] = is_user
-
-        os.remove(temporal_file)
-        # temporal_file = f'_temp/{i}-{is_user}.wav'
-        # AudioSegment.from_wav(audio_file)[segment.start * 1000:segment.end * 1000].export(temporal_file, format="wav")
-    return matches
 
 # def get_speaker_embedding(audio_path):
 # # print('get_speaker_embedding', audio_path)
diff --git a/backend/utils/_deprecated/whisper.py b/backend/utils/@deprecated/whisper.py
similarity index 100%
rename from backend/utils/_deprecated/whisper.py
rename to backend/utils/@deprecated/whisper.py
diff --git a/backend/utils/_deprecated/whisper_x.py b/backend/utils/@deprecated/whisper_x.py
similarity index 100%
rename from backend/utils/_deprecated/whisper_x.py
rename to backend/utils/@deprecated/whisper_x.py
diff --git a/backend/utils/other/storage.py b/backend/utils/other/storage.py
index b7295b2cfe..2468e9ef25 100644
--- a/backend/utils/other/storage.py
+++ b/backend/utils/other/storage.py
@@ -1,5 +1,5 @@
-import json
 import datetime
+import json
 import os
 
 from google.cloud import storage
@@ -50,6 +50,7 @@ def delete_postprocessing_audio(file_path: str):
     blob = bucket.blob(file_path)
     blob.delete()
 
+
 def create_signed_postprocessing_audio_url(file_path: str):
     bucket = storage_client.bucket(postprocessing_audio_bucket)
     blob = bucket.blob(file_path)
diff --git a/backend/utils/stt/speech_profile.py b/backend/utils/stt/speech_profile.py
new file mode 100644
index 0000000000..a5067d55ad
--- /dev/null
+++ b/backend/utils/stt/speech_profile.py
@@ -0,0 +1,25 @@
+import json
+import os
+from typing import List
+
+import requests
+
+
+def get_speech_profile_matching_predictions(audio_file_path: str, profile_path: str, segments: List) -> List[bool]:
+    print('get_speech_profile_matching_predictions')
+    files = [
+        ('audio_file', (os.path.basename(audio_file_path), open(audio_file_path, 'rb'), 'audio/wav')),
+        ('profile_path', (os.path.basename(profile_path), open(profile_path, 'rb'), 'audio/wav'))
+    ]
+    response = requests.post(
+        os.getenv('HOSTED_SPEECH_PROFILE_API_URL'),
+        data={'segments': json.dumps(segments)},
+        files=files
+    )
+    try:
+        result = response.json()
+        print('get_speech_profile_matching_predictions', result)
+        return result
+    except Exception as e:
+        print('get_speech_profile_matching_predictions', str(e))
+        return [False] * len(segments)
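
Finally, a minimal usage sketch for the new helper above, assuming HOSTED_SPEECH_PROFILE_API_URL points at the deployed speech_profile web endpoint; the URL, file paths, and segment values are illustrative only and not part of this diff.

# Sketch only: wiring the new helper into a caller, mirroring the postprocess_memory change.
import os

from utils.stt.speech_profile import get_speech_profile_matching_predictions

os.environ.setdefault('HOSTED_SPEECH_PROFILE_API_URL', 'https://<deployed-speech-profile-endpoint>')  # placeholder

# JSON-serializable dicts, as produced by [s.dict() for s in segments] in postprocess_memory.
segments = [{'start': 0.0, 'end': 4.2}, {'start': 4.2, 'end': 9.8}]

# One bool per segment; the helper falls back to all False if the response cannot be decoded as JSON.
matches = get_speech_profile_matching_predictions('_temp/recording.wav', '_temp/profile.wav', segments)
for segment, is_user in zip(segments, matches):
    segment['is_user'] = is_user  # postprocess_memory performs the same assignment on its pydantic segments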