
Commit: speech profile request to separate deployment embedded into post-processing request
josancamon19 committed Aug 25, 2024
1 parent 0c35c75 commit aed4865
Showing 10 changed files with 48 additions and 86 deletions.
3 changes: 1 addition & 2 deletions backend/main.py
@@ -3,7 +3,6 @@

import firebase_admin
from fastapi import FastAPI
from fastapi_utilities import repeat_at

from modal import Image, App, asgi_app, Secret, Cron
from routers import workflow, chat, firmware, screenpipe, plugins, memories, transcribe, notifications, speech_profile, \
@@ -65,6 +64,6 @@ def api():
os.makedirs(path)


@modal_app.function(image=image,schedule=Cron('* * * * *'))
@modal_app.function(image=image, schedule=Cron('* * * * *'))
async def start_job():
await start_cron_job()
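
For context, Cron('* * * * *') in the decorator above is a standard five-field cron expression, so the job fires every minute. A minimal sketch of the pattern, with hypothetical names:

    from modal import App, Cron, Image

    demo_app = App(name='cron-demo')  # hypothetical app name
    demo_image = Image.debian_slim()

    @demo_app.function(image=demo_image, schedule=Cron('* * * * *'))  # fires every minute
    async def heartbeat():
        print('cron tick')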
29 changes: 12 additions & 17 deletions backend/modal/speech_profile_modal.py
@@ -1,8 +1,8 @@
import json
import os
import uuid
from typing import List

import modal.gpu
import torch
from fastapi import File, UploadFile, Form
from modal import App, web_endpoint, Secret, Image
@@ -16,10 +16,6 @@ class TranscriptSegment(BaseModel):
end: float


class ResponseModel(BaseModel):
matches: List[bool]


device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = SpeakerRecognition.from_hparams(
source="speechbrain/spkrec-ecapa-voxceleb",
@@ -33,37 +29,35 @@ def sample_same_speaker_as_segment(sample_audio: str, segment: str) -> bool:
score, prediction = model.verify_files(sample_audio, segment)
print(score, prediction)
# return bool(score[0] > 0.6)
return prediction[0]
return bool(prediction[0])
except Exception as e:
print(e)
return False
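
For reference, SpeakerRecognition.verify_files returns a (score, prediction) pair of tensors, which is why the new line casts prediction[0] to bool. A minimal standalone sketch, with hypothetical file paths:

    from speechbrain.pretrained import SpeakerRecognition

    model = SpeakerRecognition.from_hparams(
        source="speechbrain/spkrec-ecapa-voxceleb",
        savedir="pretrained_models/spkrec-ecapa-voxceleb",  # assumed cache dir
    )
    # score: cosine-similarity tensor; prediction: boolean decision tensor
    score, prediction = model.verify_files("profile.wav", "segment.wav")
    print(float(score[0]), bool(prediction[0]))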


def classify_segments(audio_file: str, transcript_segments: List[TranscriptSegment], profile_path: str):
print('classify_segments')
# TODO: for better performance probably use segments before merging them together
matches = [False] * len(transcript_segments)
if not profile_path:
return matches

for i, segment in enumerate(transcript_segments):
file_name = os.path.basename(audio_file)
temporal_file = f"_temp/{file_name}_{segment.start}_{segment.end}.wav"
# temporal_file = f"_temp/{i}.wav"
AudioSegment.from_wav(audio_file)[segment.start * 1000:segment.end * 1000].export(temporal_file, format="wav")

is_user = sample_same_speaker_as_segment(temporal_file, profile_path)
print('Matches', is_user, temporal_file)
matches[i] = is_user

os.remove(temporal_file)
# temporal_file = f'_temp/{i}-{is_user}.wav'
# AudioSegment.from_wav(audio_file)[segment.start * 1000:segment.end * 1000].export(temporal_file, format="wav")
return matches
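
classify_segments relies on pydub's millisecond-indexed slicing to cut one temporary WAV per transcript segment, hence the * 1000 above. The pattern in isolation, with hypothetical paths and times:

    from pydub import AudioSegment

    audio = AudioSegment.from_wav("conversation.wav")
    start_s, end_s = 3.2, 7.8  # segment boundaries in seconds
    # pydub slices in milliseconds, so seconds are scaled by 1000
    audio[int(start_s * 1000):int(end_s * 1000)].export("_temp/clip.wav", format="wav")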


app = App(name='speech_profile')
image = (
Image.debian_slim()
.apt_install('ffmpeg')
.pip_install("torch")
.pip_install("torchaudio")
.pip_install("speechbrain")
@@ -75,31 +69,32 @@ def classify_segments(audio_file: str, transcript_segments: List[TranscriptSegment], profile_path: str):

@app.function(
image=image,
keep_warm=0,
keep_warm=1,
memory=(1024, 2048),
allow_concurrent_inputs=4,
cpu=4,
gpu=modal.gpu.T4(count=1),
secrets=[Secret.from_name('huggingface-token')],
)
@web_endpoint(method='POST')
async def upload_files_and_segments(
async def endpoint(
profile_path: UploadFile = File(...),
audio_file: UploadFile = File(...),
segments: str = Form(...)
) -> ResponseModel:
uid = uuid.uuid4()
profile_file_path = f"_temp/{uid}_{profile_path.filename}"
) -> List[bool]:
profile_file_path = profile_path.filename

with open(profile_file_path, 'wb') as f:
f.write(profile_path.file.read())

audio_file_path = f"_temp/{uid}_{audio_file.filename}"
audio_file_path = audio_file.filename
with open(audio_file_path, 'wb') as f:
f.write(audio_file.file.read())

segments_data = json.loads(segments)
transcript_segments = [TranscriptSegment(**segment) for segment in segments_data]

try:
# Call the classify_segments function with the file paths
result = classify_segments(audio_file_path, transcript_segments, profile_file_path)
return result
finally:
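
The reworked endpoint consumes multipart form data, two file parts plus a JSON-encoded segments form field, and with ResponseModel deleted it returns a bare List[bool]. A hedged sketch of the kind of client call the backend's get_speech_profile_matching_predictions presumably makes, with a hypothetical deployment URL:

    import json

    import requests

    url = "https://<workspace>--speech-profile-endpoint.modal.run"  # hypothetical URL
    segments = [{"start": 0.0, "end": 4.2}, {"start": 4.2, "end": 9.1}]  # minimal fields only

    with open("profile.wav", "rb") as profile, open("conversation.wav", "rb") as audio:
        resp = requests.post(
            url,
            files={"profile_path": profile, "audio_file": audio},
            data={"segments": json.dumps(segments)},
        )
    matches = resp.json()  # e.g. [True, False], one flag per segment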
26 changes: 8 additions & 18 deletions backend/routers/memories.py
@@ -8,14 +8,14 @@
import database.memories as memories_db
from database.vector_db import delete_vector
from models.memory import *
from utils._deprecated.speaker_profile import classify_segments
from utils.memories.location import get_google_maps_location
from utils.memories.process_memory import process_memory, process_user_emotion
from utils.other import endpoints as auth
from utils.other.storage import upload_postprocessing_audio, \
delete_postprocessing_audio, get_profile_audio_if_exists
from utils.plugins import trigger_external_integrations
from utils.stt.pre_recorded import fal_whisperx, fal_postprocessing
from utils.stt.speech_profile import get_speech_profile_matching_predictions
from utils.stt.vad import vad_is_empty

router = APIRouter()
@@ -78,7 +78,6 @@ def postprocess_memory(
TODO: Try Nvidia Nemo ASR as suggested by @jhonnycombs https://huggingface.co/spaces/hf-audio/open_asr_leaderboard
TODO: USE soniox here? with speech profile and stuff?
TODO: either do speech profile embeddings or use the profile audio as prefix
TODO: should consider storing non beautified segments, and beautify on read?
"""
memory_data = _get_memory_by_id(uid, memory_id)
@@ -110,11 +109,10 @@ def postprocess_memory(
try:
aseg = AudioSegment.from_wav(file_path)
profile_duration = 0
profile_path = get_profile_audio_if_exists(uid) if aseg.frame_rate == 16000 else None

signed_url = upload_postprocessing_audio(file_path)

# Ensure delete uploaded file in 15m
# Ensure delete uploaded file in 5m
threads = threading.Thread(target=_delete_postprocessing_audio, args=(file_path,))
threads.start()

@@ -128,10 +126,6 @@
memories_db.set_postprocessing_status(uid, memory.id, PostProcessingStatus.canceled)
raise HTTPException(status_code=500, detail="FAL WhisperX failed to process audio")

matches = classify_segments(file_path, segments, profile_path)
for i, segment in enumerate(segments):
segment.is_user = matches[i]

# if new transcript is 90% shorter than the original, cancel post-processing, smth wrong with audio or FAL
count = len(''.join([segment.text.strip() for segment in memory.transcript_segments]))
new_count = len(''.join([segment.text.strip() for segment in segments]))
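# (context collapsed here) the comparison itself is not shown in this view; presumably
# a guard like `if new_count < count * 0.1:` triggers the cancellation below, with the
# threshold assumed from the 90% comment above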
@@ -140,6 +134,11 @@
memories_db.set_postprocessing_status(uid, memory.id, PostProcessingStatus.canceled)
raise HTTPException(status_code=500, detail="Post-processed transcript is too short")

profile_path = get_profile_audio_if_exists(uid) if aseg.frame_rate == 16000 else None
matches = get_speech_profile_matching_predictions(file_path, profile_path, [s.dict() for s in segments])
for i, segment in enumerate(segments):
segment.is_user = matches[i]

# TODO: post llm process here would be great, sometimes whisper x outputs without punctuation
# Store previous and new segments in DB as collection.
memories_db.store_model_segments_result(uid, memory.id, 'deepgram_streaming', memory.transcript_segments)
@@ -162,17 +161,8 @@
return result


# audio_path = '_temp/f39a99f8-f90c-4a04-800f-4b99a85d4e79_recording-20240824_210026.wav'
# # signed_url = upload_postprocessing_audio(audio_path)
# # words = fal_whisperx(signed_url, 2, )
# words = [{'timestamp': [0.0, 1.14], 'text': ' Hey,', 'speaker': 'SPEAKER_01'}, {'timestamp': [1.14, 1.64], 'text': ' Ggpt,', 'speaker': 'SPEAKER_01'}, {'timestamp': [1.64, 1.84], 'text': " how's", 'speaker': 'SPEAKER_01'}, {'timestamp': [1.84, 1.86], 'text': ' it', 'speaker': 'SPEAKER_01'}, {'timestamp': [1.86, 4.42], 'text': ' going?', 'speaker': 'SPEAKER_01'}, {'timestamp': [4.42, 5.66], 'text': ' That', 'speaker': 'SPEAKER_00'}, {'timestamp': [5.66, 5.98], 'text': ' sounds', 'speaker': 'SPEAKER_00'}, {'timestamp': [5.98, 6.2], 'text': ' like', 'speaker': 'SPEAKER_00'}, {'timestamp': [6.2, 6.32], 'text': ' a', 'speaker': 'SPEAKER_00'}, {'timestamp': [6.32, 6.62], 'text': ' unique', 'speaker': 'SPEAKER_00'}, {'timestamp': [6.62, 7.04], 'text': ' way', 'speaker': 'SPEAKER_00'}, {'timestamp': [7.04, 7.26], 'text': ' to', 'speaker': 'SPEAKER_00'}, {'timestamp': [7.26, 7.58], 'text': ' describe', 'speaker': 'SPEAKER_00'}, {'timestamp': [7.58, 8.16], 'text': ' someone.', 'speaker': 'SPEAKER_00'}, {'timestamp': [8.16, 8.54], 'text': ' What', 'speaker': 'SPEAKER_00'}, {'timestamp': [8.54, 8.68], 'text': ' do', 'speaker': 'SPEAKER_00'}, {'timestamp': [8.68, 8.76], 'text': ' you', 'speaker': 'SPEAKER_00'}, {'timestamp': [8.76, 8.94], 'text': ' mean', 'speaker': 'SPEAKER_00'}, {'timestamp': [8.94, 9.1], 'text': ' by', 'speaker': 'SPEAKER_00'}, {'timestamp': [9.1, 9.42], 'text': ' cozy', 'speaker': 'SPEAKER_00'}, {'timestamp': [9.42, 10.56], 'text': ' gun?', 'speaker': 'SPEAKER_00'}, {'timestamp': [10.56, 11.38], 'text': ' No,', 'speaker': 'SPEAKER_01'}, {'timestamp': [11.38, 11.42], 'text': ' I', 'speaker': 'SPEAKER_01'}, {'timestamp': [11.42, 11.68], 'text': ' said', 'speaker': 'SPEAKER_01'}, {'timestamp': [11.68, 12.24], 'text': ' Chat', 'speaker': 'SPEAKER_01'}, {'timestamp': [12.24, 13.3], 'text': ' Gpt,', 'speaker': 'SPEAKER_01'}, {'timestamp': [13.3, 13.74], 'text': ' your', 'speaker': 'SPEAKER_01'}, {'timestamp': [13.74, 14.28], 'text': ' name.', 'speaker': 'SPEAKER_01'}, {'timestamp': [14.28, 17.84], 'text': ' I', 'speaker': 'SPEAKER_00'}, {'timestamp': [17.84, 18.04], 'text': ' got', 'speaker': 'SPEAKER_00'}, {'timestamp': [18.04, 18.24], 'text': ' it.', 'speaker': 'SPEAKER_00'}, {'timestamp': [18.24, 18.56], 'text': ' So', 'speaker': 'SPEAKER_00'}, {'timestamp': [18.56, 18.8], 'text': ' Chat', 'speaker': 'SPEAKER_00'}, {'timestamp': [18.8, 19.16], 'text': ' Gpt,', 'speaker': 'SPEAKER_00'}, {'timestamp': [19.16, 19.42], 'text': " she's", 'speaker': 'SPEAKER_00'}, {'timestamp': [19.42, 19.44], 'text': ' a', 'speaker': 'SPEAKER_00'}, {'timestamp': [19.44, 19.68], 'text': ' petite', 'speaker': 'SPEAKER_00'}, {'timestamp': [19.68, 19.94], 'text': ' cozy', 'speaker': 'SPEAKER_00'}, {'timestamp': [19.94, 20.36], 'text': ' gun.', 'speaker': 'SPEAKER_00'}, {'timestamp': [20.36, 20.86], 'text': " That's", 'speaker': 'SPEAKER_00'}, {'timestamp': [20.86, 21.08], 'text': ' quite', 'speaker': 'SPEAKER_00'}, {'timestamp': [21.08, 21.24], 'text': ' a', 'speaker': 'SPEAKER_00'}, {'timestamp': [21.24, 21.84], 'text': ' metaphorical', 'speaker': 'SPEAKER_00'}, {'timestamp': [21.84, 22.34], 'text': ' description.', 'speaker': 'SPEAKER_00'}, {'timestamp': [22.34, 22.7], 'text': ' Are', 'speaker': 'SPEAKER_00'}, {'timestamp': [22.7, 22.88], 'text': ' you', 'speaker': 'SPEAKER_00'}, {'timestamp': [22.88, 23.26], 'text': ' referring', 'speaker': 'SPEAKER_00'}, {'timestamp': [23.26, 23.42], 'text': ' to', 'speaker': 'SPEAKER_00'}, {'timestamp': [23.42, 23.64], 'text': ' me', 'speaker': 'SPEAKER_00'}, 
{'timestamp': [23.64, 23.8], 'text': ' as', 'speaker': 'SPEAKER_00'}, {'timestamp': [23.8, 24.04], 'text': ' being', 'speaker': 'SPEAKER_00'}, {'timestamp': [24.04, 24.36], 'text': ' small', 'speaker': 'SPEAKER_00'}, {'timestamp': [24.36, 24.68], 'text': ' but', 'speaker': 'SPEAKER_00'}, {'timestamp': [24.68, 25.06], 'text': ' impactful', 'speaker': 'SPEAKER_00'}, {'timestamp': [25.06, 25.52], 'text': ' or', 'speaker': 'SPEAKER_00'}, {'timestamp': [25.52, 26.32], 'text': ' comforting?', 'speaker': 'SPEAKER_00'}, {'timestamp': [26.32, 28.12], 'text': ' Yeah,', 'speaker': 'SPEAKER_01'}, {'timestamp': [41.3, 41.3], 'text': ' exactly', 'speaker': None}, {'timestamp': [41.3, 41.3], 'text': ' that.', 'speaker': None}, {'timestamp': [41.3, 41.3], 'text': ' Can', 'speaker': None}, {'timestamp': [41.3, 41.52], 'text': ' you', 'speaker': 'SPEAKER_01'}, {'timestamp': [41.52, 41.66], 'text': ' tell', 'speaker': 'SPEAKER_01'}, {'timestamp': [41.66, 41.78], 'text': ' me', 'speaker': 'SPEAKER_01'}, {'timestamp': [41.78, 41.9], 'text': ' a', 'speaker': 'SPEAKER_01'}, {'timestamp': [41.9, 42.16], 'text': ' story', 'speaker': 'SPEAKER_01'}, {'timestamp': [42.16, 42.34], 'text': ' or', 'speaker': 'SPEAKER_01'}, {'timestamp': [69.98, 69.98], 'text': ' something', 'speaker': 'SPEAKER_00'}, {'timestamp': [69.98, 69.98], 'text': ' fun', 'speaker': 'SPEAKER_00'}, {'timestamp': [69.98, 69.98], 'text': ' that', 'speaker': 'SPEAKER_00'}, {'timestamp': [69.98, 69.98], 'text': ' you', 'speaker': 'SPEAKER_00'}, {'timestamp': [69.98, 69.98], 'text': ' learned', 'speaker': 'SPEAKER_00'}, {'timestamp': [69.98, 69.98], 'text': ' recently?', 'speaker': 'SPEAKER_00'}, {'timestamp': [70.08, 70.34], 'text': ' while', 'speaker': 'SPEAKER_00'}, {'timestamp': [70.34, 70.58], 'text': ' hunting', 'speaker': 'SPEAKER_00'}, {'timestamp': [70.58, 70.9], 'text': ' together', 'speaker': 'SPEAKER_00'}, {'timestamp': [71.28, 71.56], 'text': " it's", 'speaker': 'SPEAKER_00'}, {'timestamp': [71.56, 71.7], 'text': ' kind', 'speaker': 'SPEAKER_00'}, {'timestamp': [71.7, 71.82], 'text': ' of', 'speaker': 'SPEAKER_00'}, {'timestamp': [71.82, 71.96], 'text': ' like', 'speaker': 'SPEAKER_00'}, {'timestamp': [71.96, 72.12], 'text': ' an', 'speaker': 'SPEAKER_00'}, {'timestamp': [72.12, 72.42], 'text': ' octopus', 'speaker': 'SPEAKER_00'}, {'timestamp': [72.42, 72.84], 'text': ' saying', 'speaker': 'SPEAKER_00'}, {'timestamp': [72.84, 73.32], 'text': ' hey', 'speaker': 'SPEAKER_00'}, {'timestamp': [73.32, 73.58], 'text': ' back', 'speaker': 'SPEAKER_00'}, {'timestamp': [73.58, 73.88], 'text': ' off', 'speaker': 'SPEAKER_00'}, {'timestamp': [73.88, 74.62], 'text': ' but', 'speaker': 'SPEAKER_00'}, {'timestamp': [74.62, 74.78], 'text': ' with', 'speaker': 'SPEAKER_00'}, {'timestamp': [74.78, 74.9], 'text': ' a', 'speaker': 'SPEAKER_00'}, {'timestamp': [74.9, 75.06], 'text': ' little', 'speaker': 'SPEAKER_00'}, {'timestamp': [75.06, 75.24], 'text': ' more', 'speaker': 'SPEAKER_00'}, {'timestamp': [75.24, 75.62], 'text': ' force', 'speaker': 'SPEAKER_00'}, {'timestamp': [75.62, 76.48], 'text': ' this', 'speaker': 'SPEAKER_00'}, {'timestamp': [76.48, 76.82], 'text': ' playful', 'speaker': 'SPEAKER_00'}, {'timestamp': [76.82, 77.16], 'text': ' behavior', 'speaker': 'SPEAKER_00'}, {'timestamp': [77.5, 77.62], 'text': ' shows', 'speaker': 'SPEAKER_00'}, {'timestamp': [77.62, 77.94], 'text': ' just', 'speaker': 'SPEAKER_00'}, {'timestamp': [77.94, 78.24], 'text': ' how', 'speaker': 'SPEAKER_00'}, {'timestamp': [78.24, 78.8], 'text': ' intelligent', 
'speaker': 'SPEAKER_00'}, {'timestamp': [78.8, 79.3], 'text': ' and', 'speaker': 'SPEAKER_00'}, {'timestamp': [79.3, 79.6], 'text': ' curious', 'speaker': 'SPEAKER_00'}, {'timestamp': [79.6, 80.4], 'text': ' octopuses', 'speaker': 'SPEAKER_00'}, {'timestamp': [80.4, 80.7], 'text': ' are', 'speaker': 'SPEAKER_00'}, {'timestamp': [80.7, 81.2], 'text': ' what', 'speaker': 'SPEAKER_00'}, {'timestamp': [81.34, 81.66], 'text': ' you?', 'speaker': 'SPEAKER_00'}, {'timestamp': [81.66, 82.22], 'text': ' Learned', 'speaker': 'SPEAKER_00'}, {'timestamp': [82.22, 82.36], 'text': ' anything', 'speaker': 'SPEAKER_00'}, {'timestamp': [82.36, 82.68], 'text': ' fun', 'speaker': 'SPEAKER_00'}, {'timestamp': [82.68, 83.44], 'text': ' lately?', 'speaker': 'SPEAKER_00'}, {'timestamp': [83.44, 85.84], 'text': ' Yeah,', 'speaker': 'SPEAKER_01'}, {'timestamp': [85.84, 86.2], 'text': ' about...', 'speaker': 'SPEAKER_01'}, {'timestamp': [86.2, 86.88], 'text': ' a', 'speaker': 'SPEAKER_01'}, {'timestamp': [86.88, 87.1], 'text': ' little', 'speaker': 'SPEAKER_01'}, {'timestamp': [87.1, 87.28], 'text': ' bit', 'speaker': 'SPEAKER_01'}, {'timestamp': [87.28, 87.84], 'text': ' about...', 'speaker': 'SPEAKER_01'}, {'timestamp': [87.84, 90.42], 'text': ' embeddings.', 'speaker': 'SPEAKER_01'}, {'timestamp': [90.42, 92.44], 'text': ' Sounds', 'speaker': 'SPEAKER_00'}, {'timestamp': [92.44, 92.72], 'text': ' like', 'speaker': 'SPEAKER_00'}, {'timestamp': [92.72, 92.9], 'text': " you're", 'speaker': 'SPEAKER_00'}, {'timestamp': [92.9, 93.04], 'text': ' on', 'speaker': 'SPEAKER_00'}, {'timestamp': [93.04, 93.14], 'text': ' to', 'speaker': 'SPEAKER_00'}, {'timestamp': [93.14, 93.66], 'text': ' something.', 'speaker': 'SPEAKER_00'}, {'timestamp': [93.66, 94.3], 'text': ' What', 'speaker': 'SPEAKER_00'}, {'timestamp': [94.3, 94.44], 'text': ' did', 'speaker': 'SPEAKER_00'}, {'timestamp': [94.44, 94.58], 'text': ' you', 'speaker': 'SPEAKER_00'}, {'timestamp': [94.58, 94.78], 'text': ' learn', 'speaker': 'SPEAKER_00'}, {'timestamp': [94.78, 94.9], 'text': ' a', 'speaker': 'SPEAKER_00'}, {'timestamp': [94.9, 95.0], 'text': ' little', 'speaker': 'SPEAKER_00'}, {'timestamp': [95.0, 95.18], 'text': ' bit', 'speaker': 'SPEAKER_00'}, {'timestamp': [95.18, 95.66], 'text': ' about?', 'speaker': 'SPEAKER_00'}, {'timestamp': [95.66, 96.32], 'text': " I'm", 'speaker': 'SPEAKER_00'}, {'timestamp': [96.32, 96.54], 'text': ' curious', 'speaker': 'SPEAKER_00'}, {'timestamp': [96.54, 96.76], 'text': ' to', 'speaker': 'SPEAKER_00'}, {'timestamp': [96.76, 97.26], 'text': ' hear.', 'speaker': 'SPEAKER_00'}, {'timestamp': [97.26, 100.7], 'text': ' Embeddings', 'speaker': 'SPEAKER_01'}, {'timestamp': [100.7, 100.9], 'text': ' and', 'speaker': 'SPEAKER_01'}, {'timestamp': [100.9, 101.18], 'text': ' wide', 'speaker': 'SPEAKER_01'}, {'timestamp': [101.18, 101.86], 'text': ' vision', 'speaker': 'SPEAKER_01'}, {'timestamp': [101.86, 102.28], 'text': ' models', 'speaker': 'SPEAKER_01'}, {'timestamp': [102.28, 102.84], 'text': ' like', 'speaker': 'SPEAKER_01'}, {'timestamp': [102.84, 103.96], 'text': ' GPT', 'speaker': 'SPEAKER_01'}, {'timestamp': [103.96, 104.24], 'text': '-4', 'speaker': 'SPEAKER_01'}, {'timestamp': [104.24, 104.72], 'text': ' vision', 'speaker': 'SPEAKER_01'}, {'timestamp': [104.72, 106.24], 'text': ' do', 'speaker': 'SPEAKER_01'}, {'timestamp': [106.24, 106.4], 'text': ' not', 'speaker': 'SPEAKER_01'}, {'timestamp': [106.4, 106.7], 'text': ' work', 'speaker': 'SPEAKER_01'}, {'timestamp': [106.7, 106.84], 'text': ' as', 'speaker': 
'SPEAKER_01'}, {'timestamp': [106.84, 107.08], 'text': ' ideally', 'speaker': 'SPEAKER_01'}, {'timestamp': [107.08, 107.38], 'text': ' and', 'speaker': 'SPEAKER_01'}, {'timestamp': [107.38, 107.54], 'text': " it's", 'speaker': 'SPEAKER_01'}, {'timestamp': [107.54, 107.72], 'text': ' like', 'speaker': 'SPEAKER_01'}, {'timestamp': [107.72, 108.28], 'text': ' shitting.', 'speaker': 'SPEAKER_01'}, {'timestamp': [108.28, 112.84], 'text': ' It', 'speaker': 'SPEAKER_00'}, {'timestamp': [112.84, 113.16], 'text': ' sounds', 'speaker': 'SPEAKER_00'}, {'timestamp': [113.16, 113.36], 'text': ' like', 'speaker': 'SPEAKER_00'}, {'timestamp': [113.36, 113.56], 'text': " you've", 'speaker': 'SPEAKER_00'}, {'timestamp': [113.56, 113.68], 'text': ' been', 'speaker': 'SPEAKER_00'}, {'timestamp': [113.68, 113.96], 'text': ' diving', 'speaker': 'SPEAKER_00'}, {'timestamp': [113.96, 114.28], 'text': ' into', 'speaker': 'SPEAKER_00'}, {'timestamp': [114.28, 114.52], 'text': ' some', 'speaker': 'SPEAKER_00'}, {'timestamp': [114.52, 114.84], 'text': ' deeper', 'speaker': 'SPEAKER_00'}, {'timestamp': [114.84, 115.3], 'text': ' AI...', 'speaker': 'SPEAKER_00'}]
# segments = fal_postprocessing(words, 0, 0)
# print(segments)
# classify_segments(audio_path, segments, '_temp/caLCFj7IisV85UX9XrrV1aVf3pk1_speech_profile.wav')


def _delete_postprocessing_audio(file_path):
time.sleep(900) # 15m
time.sleep(300) # 5 min
delete_postprocessing_audio(file_path)
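
Cleanup runs on a fire-and-forget threading.Thread started from the request handler, as shown earlier in this file. The pattern in isolation, with hypothetical names and a short delay:

    import threading
    import time

    def _delete_later(path: str, delay_s: int = 300):
        time.sleep(delay_s)        # blocks only this worker thread, not the request
        print(f"deleting {path}")  # stand-in for delete_postprocessing_audio(path)

    threading.Thread(target=_delete_later, args=("/tmp/audio.wav", 5)).start()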


File renamed without changes.
File renamed without changes.
