Skip to content

Commit

Permalink
Cleaned up some code and fixed an error in the README
Browse files Browse the repository at this point in the history
  • Loading branch information
fireblade2534 committed Feb 14, 2025
1 parent 34acb17 commit 9c1ced2
Show file tree
Hide file tree
Showing 3 changed files with 3 additions and 400 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -342,7 +342,7 @@ Key Performance Metrics:
<summary>GPU Vs. CPU</summary>
```bash
# GPU: Requires NVIDIA GPU with CUDA 12.1 support (~35x-100x realtime speed)
# GPU: Requires NVIDIA GPU with CUDA 12.8 support (~35x-100x realtime speed)
cd docker/gpu
docker compose up --build

Expand Down
157 changes: 2 additions & 155 deletions api/src/routers/development.py
Original file line number Diff line number Diff line change
Expand Up @@ -321,165 +321,12 @@ async def single_output():
)
except Exception as e:
# Handle unexpected errors
logger.error(f"Unexpected error in speech generation: {str(e)}")
logger.error(f"Unexpected error in captioned speech generation: {str(e)}")
raise HTTPException(
status_code=500,
detail={
"error": "processing_error",
"message": str(e),
"type": "server_error",
},
)

"""
try:
# Set content type based on format
content_type = {
"mp3": "audio/mpeg",
"opus": "audio/opus",
"aac": "audio/aac",
"flac": "audio/flac",
"wav": "audio/wav",
"pcm": "audio/pcm",
}.get(request.response_format, f"audio/{request.response_format}")
# Create streaming audio writer and normalizer
writer = StreamingAudioWriter(
format=request.response_format, sample_rate=24000, channels=1
)
normalizer = AudioNormalizer()
# Get voice path
voice_name, voice_path = await tts_service._get_voice_path(request.voice)
# Use provided lang_code or determine from voice name
pipeline_lang_code = request.lang_code if request.lang_code else request.voice[0].lower()
logger.info(
f"Using lang_code '{pipeline_lang_code}' for voice '{voice_name}' in text chunking"
)
# Get backend and pipeline
backend = tts_service.model_manager.get_backend()
pipeline = backend._get_pipeline(pipeline_lang_code)
# Create temp file writer for timestamps
temp_writer = TempFileWriter("json")
await temp_writer.__aenter__() # Initialize temp file
# Get just the filename without the path
timestamps_filename = Path(temp_writer.download_path).name
# Initialize variables for timestamps
word_timestamps = []
current_offset = 0.0
async def generate_chunks():
nonlocal current_offset, word_timestamps
try:
# Process text in chunks with smart splitting
async for chunk_text, tokens in smart_split(request.input):
# Process chunk with pipeline
for result in pipeline(chunk_text, voice=voice_path, speed=request.speed):
if result.audio is not None:
# Process timestamps for this chunk
if hasattr(result, "tokens") and result.tokens and result.pred_dur is not None:
try:
# Join timestamps for this chunk's tokens
KPipeline.join_timestamps(result.tokens, result.pred_dur)
# Add timestamps with offset
for token in result.tokens:
if not all(
hasattr(token, attr)
for attr in ["text", "start_ts", "end_ts"]
):
continue
if not token.text or not token.text.strip():
continue
# Apply offset to timestamps
start_time = float(token.start_ts) + current_offset
end_time = float(token.end_ts) + current_offset
word_timestamps.append(
{
"word": str(token.text).strip(),
"start_time": start_time,
"end_time": end_time,
}
)
# Update offset for next chunk
chunk_duration = float(result.pred_dur.sum()) / 80 # Convert frames to seconds
current_offset = max(current_offset + chunk_duration, end_time)
except Exception as e:
logger.error(f"Failed to process timestamps for chunk: {e}")
# Process audio
audio_chunk = result.audio.numpy()
normalized_audio = await normalizer.normalize(audio_chunk)
chunk_bytes = writer.write_chunk(normalized_audio)
if chunk_bytes:
yield chunk_bytes
# Write timestamps to temp file
timestamps_json = json.dumps(word_timestamps)
await temp_writer.write(timestamps_json.encode())
await temp_writer.finalize()
# Finalize audio
final_bytes = writer.write_chunk(finalize=True)
if final_bytes:
yield final_bytes
except Exception as e:
logger.error(f"Error in audio generation: {str(e)}")
# Clean up writer on error
writer.write_chunk(finalize=True)
await temp_writer.__aexit__(type(e), e, e.__traceback__)
# Re-raise the original exception
raise
return StreamingResponse(
generate_chunks(),
media_type=content_type,
headers={
"Content-Disposition": f"attachment; filename=speech.{request.response_format}",
"X-Accel-Buffering": "no",
"Cache-Control": "no-cache",
"Transfer-Encoding": "chunked",
"X-Timestamps-Path": timestamps_filename,
},
)
except ValueError as e:
logger.warning(f"Invalid request: {str(e)}")
raise HTTPException(
status_code=400,
detail={
"error": "validation_error",
"message": str(e),
"type": "invalid_request_error",
},
)
except RuntimeError as e:
logger.error(f"Processing error: {str(e)}")
raise HTTPException(
status_code=500,
detail={
"error": "processing_error",
"message": str(e),
"type": "server_error",
},
)
except Exception as e:
logger.error(f"Unexpected error in speech generation: {str(e)}")
raise HTTPException(
status_code=500,
detail={
"error": "processing_error",
"message": str(e),
"type": "server_error",
},
)
"""
)
Loading

0 comments on commit 9c1ced2

Please sign in to comment.