Skip to content

Commit

Permalink
Cleaned up some code and fixed an error in the README
Browse files Browse the repository at this point in the history
  • Loading branch information
fireblade2534 committed Feb 14, 2025
1 parent 34acb17 commit 9c1ced2
Show file tree
Hide file tree
Showing 3 changed files with 3 additions and 400 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -342,7 +342,7 @@ Key Performance Metrics:
<summary>GPU Vs. CPU</summary>
```bash
# GPU: Requires NVIDIA GPU with CUDA 12.1 support (~35x-100x realtime speed)
# GPU: Requires NVIDIA GPU with CUDA 12.8 support (~35x-100x realtime speed)
cd docker/gpu
docker compose up --build

Expand Down
157 changes: 2 additions & 155 deletions api/src/routers/development.py
Original file line number Diff line number Diff line change
Expand Up @@ -321,165 +321,12 @@ async def single_output():
)
except Exception as e:
# Handle unexpected errors
logger.error(f"Unexpected error in speech generation: {str(e)}")
logger.error(f"Unexpected error in captioned speech generation: {str(e)}")
raise HTTPException(
status_code=500,
detail={
"error": "processing_error",
"message": str(e),
"type": "server_error",
},
)

"""
try:
# Set content type based on format
content_type = {
"mp3": "audio/mpeg",
"opus": "audio/opus",
"aac": "audio/aac",
"flac": "audio/flac",
"wav": "audio/wav",
"pcm": "audio/pcm",
}.get(request.response_format, f"audio/{request.response_format}")
# Create streaming audio writer and normalizer
writer = StreamingAudioWriter(
format=request.response_format, sample_rate=24000, channels=1
)
normalizer = AudioNormalizer()
# Get voice path
voice_name, voice_path = await tts_service._get_voice_path(request.voice)
# Use provided lang_code or determine from voice name
pipeline_lang_code = request.lang_code if request.lang_code else request.voice[0].lower()
logger.info(
f"Using lang_code '{pipeline_lang_code}' for voice '{voice_name}' in text chunking"
)
# Get backend and pipeline
backend = tts_service.model_manager.get_backend()
pipeline = backend._get_pipeline(pipeline_lang_code)
# Create temp file writer for timestamps
temp_writer = TempFileWriter("json")
await temp_writer.__aenter__() # Initialize temp file
# Get just the filename without the path
timestamps_filename = Path(temp_writer.download_path).name
# Initialize variables for timestamps
word_timestamps = []
current_offset = 0.0
async def generate_chunks():
nonlocal current_offset, word_timestamps
try:
# Process text in chunks with smart splitting
async for chunk_text, tokens in smart_split(request.input):
# Process chunk with pipeline
for result in pipeline(chunk_text, voice=voice_path, speed=request.speed):
if result.audio is not None:
# Process timestamps for this chunk
if hasattr(result, "tokens") and result.tokens and result.pred_dur is not None:
try:
# Join timestamps for this chunk's tokens
KPipeline.join_timestamps(result.tokens, result.pred_dur)
# Add timestamps with offset
for token in result.tokens:
if not all(
hasattr(token, attr)
for attr in ["text", "start_ts", "end_ts"]
):
continue
if not token.text or not token.text.strip():
continue
# Apply offset to timestamps
start_time = float(token.start_ts) + current_offset
end_time = float(token.end_ts) + current_offset
word_timestamps.append(
{
"word": str(token.text).strip(),
"start_time": start_time,
"end_time": end_time,
}
)
# Update offset for next chunk
chunk_duration = float(result.pred_dur.sum()) / 80 # Convert frames to seconds
current_offset = max(current_offset + chunk_duration, end_time)
except Exception as e:
logger.error(f"Failed to process timestamps for chunk: {e}")
# Process audio
audio_chunk = result.audio.numpy()
normalized_audio = await normalizer.normalize(audio_chunk)
chunk_bytes = writer.write_chunk(normalized_audio)
if chunk_bytes:
yield chunk_bytes
# Write timestamps to temp file
timestamps_json = json.dumps(word_timestamps)
await temp_writer.write(timestamps_json.encode())
await temp_writer.finalize()
# Finalize audio
final_bytes = writer.write_chunk(finalize=True)
if final_bytes:
yield final_bytes
except Exception as e:
logger.error(f"Error in audio generation: {str(e)}")
# Clean up writer on error
writer.write_chunk(finalize=True)
await temp_writer.__aexit__(type(e), e, e.__traceback__)
# Re-raise the original exception
raise
return StreamingResponse(
generate_chunks(),
media_type=content_type,
headers={
"Content-Disposition": f"attachment; filename=speech.{request.response_format}",
"X-Accel-Buffering": "no",
"Cache-Control": "no-cache",
"Transfer-Encoding": "chunked",
"X-Timestamps-Path": timestamps_filename,
},
)
except ValueError as e:
logger.warning(f"Invalid request: {str(e)}")
raise HTTPException(
status_code=400,
detail={
"error": "validation_error",
"message": str(e),
"type": "invalid_request_error",
},
)
except RuntimeError as e:
logger.error(f"Processing error: {str(e)}")
raise HTTPException(
status_code=500,
detail={
"error": "processing_error",
"message": str(e),
"type": "server_error",
},
)
except Exception as e:
logger.error(f"Unexpected error in speech generation: {str(e)}")
raise HTTPException(
status_code=500,
detail={
"error": "processing_error",
"message": str(e),
"type": "server_error",
},
)
"""
)
Loading

0 comments on commit 9c1ced2

Please sign in to comment.