3 changes: 3 additions & 0 deletions .gitignore
@@ -0,0 +1,3 @@
launch.json
__pycache__
voxcpm.egg-info
14 changes: 14 additions & 0 deletions README.md
@@ -62,10 +62,12 @@ By default, when you first run the script, the model will be downloaded automatically
### 2. Basic Usage
```python
import soundfile as sf
import numpy as np
from voxcpm import VoxCPM

model = VoxCPM.from_pretrained("openbmb/VoxCPM-0.5B")

# Non-streaming
wav = model.generate(
text="VoxCPM is an innovative end-to-end TTS model from ModelBest, designed to generate highly expressive speech.",
prompt_wav_path=None, # optional: path to a prompt speech for voice cloning
@@ -81,6 +83,18 @@ wav = model.generate(

sf.write("output.wav", wav, 16000)
print("saved: output.wav")

# Streaming
chunks = []
for chunk in model.generate_streaming(
text = "Streaming text to speech is easy with VoxCPM!",
# supports same args as above
):
chunks.append(chunk)
wav = np.concatenate(chunks)

sf.write("output_streaming.wav", wav, 16000)
print("saved: output_streaming.wav")
```
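
For long inputs, the streamed chunks can also be written to disk as they arrive instead of being buffered in memory. A minimal sketch (editorial, not part of the README diff above) using soundfile's incremental `SoundFile` writer:

```python
import soundfile as sf
from voxcpm import VoxCPM

model = VoxCPM.from_pretrained("openbmb/VoxCPM-0.5B")

# Open the file once and append each generated chunk as it arrives,
# so the full waveform never has to be held in memory.
with sf.SoundFile("output_streaming.wav", "w", samplerate=16000, channels=1) as f:
    for chunk in model.generate_streaming(text="Streaming text to speech is easy with VoxCPM!"):
        f.write(chunk)
```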

### 3. CLI Usage
34 changes: 26 additions & 8 deletions src/voxcpm/core.py
@@ -1,8 +1,8 @@
import torch
import torchaudio
import os
import re
import tempfile
import numpy as np
from typing import Generator
from huggingface_hub import snapshot_download
from .model.voxcpm import VoxCPMModel

@@ -11,6 +11,7 @@ def __init__(self,
voxcpm_model_path : str,
zipenhancer_model_path : str = "iic/speech_zipenhancer_ans_multiloss_16k_base",
enable_denoiser : bool = True,
optimize: bool = True,
):
"""Initialize VoxCPM TTS pipeline.

@@ -21,9 +22,10 @@ def __init__(self,
zipenhancer_model_path: ModelScope acoustic noise suppression model
id or local path. If None, denoiser will not be initialized.
enable_denoiser: Whether to initialize the denoiser pipeline.
optimize: Whether to optimize the model with torch.compile. True by default, but can be disabled for debugging.
Copilot AI commented (Sep 22, 2025):

The parameter description mentions 'torch.compile' but the actual optimization logic in the model uses different mechanisms (torch_tensorrt, torch2trt). The documentation should accurately describe what optimization actually does or be more generic.

Suggested change:
-        optimize: Whether to optimize the model with torch.compile. True by default, but can be disabled for debugging.
+        optimize: Whether to optimize the model for inference speed. True by default, but can be disabled for debugging.
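Because `from_pretrained` now forwards `**kwargs` to the constructor (see below), the flag can be set at load time; a minimal sketch:

```python
from voxcpm import VoxCPM

# optimize=False is forwarded through **kwargs to VoxCPM.__init__,
# skipping model optimization (useful when debugging).
model = VoxCPM.from_pretrained("openbmb/VoxCPM-0.5B", optimize=False)
```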
"""
print(f"voxcpm_model_path: {voxcpm_model_path}, zipenhancer_model_path: {zipenhancer_model_path}, enable_denoiser: {enable_denoiser}")
self.tts_model = VoxCPMModel.from_local(voxcpm_model_path)
self.tts_model = VoxCPMModel.from_local(voxcpm_model_path, optimize=optimize)
self.text_normalizer = None
if enable_denoiser and zipenhancer_model_path is not None:
from .zipenhancer import ZipEnhancer
@@ -43,6 +45,7 @@ def from_pretrained(cls,
zipenhancer_model_id: str = "iic/speech_zipenhancer_ans_multiloss_16k_base",
cache_dir: str = None,
local_files_only: bool = False,
**kwargs,
):
"""Instantiate ``VoxCPM`` from a Hugging Face Hub snapshot.

@@ -54,6 +57,8 @@
cache_dir: Custom cache directory for the snapshot.
local_files_only: If True, only use local files and do not attempt
to download.
Kwargs:
Additional keyword arguments passed to the ``VoxCPM`` constructor.

Returns:
VoxCPM: Initialized instance whose ``voxcpm_model_path`` points to
@@ -82,9 +87,16 @@
voxcpm_model_path=local_path,
zipenhancer_model_path=zipenhancer_model_id if load_denoiser else None,
enable_denoiser=load_denoiser,
**kwargs,
)

def generate(self,
def generate(self, *args, **kwargs) -> np.ndarray:
return next(self._generate(*args, streaming=False, **kwargs))

def generate_streaming(self, *args, **kwargs) -> Generator[np.ndarray, None, None]:
return self._generate(*args, streaming=True, **kwargs)
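
Both public methods now route through a single private generator: `generate` consumes the one chunk that `_generate` yields in non-streaming mode, while `generate_streaming` returns the generator directly. A minimal standalone sketch of this pattern (illustrative only, not VoxCPM code):

```python
from typing import Generator
import numpy as np

def _produce(streaming: bool = False) -> Generator[np.ndarray, None, None]:
    parts = [np.zeros(160, dtype=np.float32), np.ones(160, dtype=np.float32)]
    if streaming:
        yield from parts                 # one chunk per generation step
    else:
        yield np.concatenate(parts)      # a single final waveform

def produce() -> np.ndarray:
    # Non-streaming: the generator yields exactly one array.
    return next(_produce(streaming=False))

def produce_streaming() -> Generator[np.ndarray, None, None]:
    return _produce(streaming=True)
```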

def _generate(self,
text : str,
prompt_wav_path : str = None,
prompt_text : str = None,
@@ -96,7 +108,8 @@ def generate(self,
retry_badcase : bool = True,
retry_badcase_max_times : int = 3,
retry_badcase_ratio_threshold : float = 6.0,
):
streaming: bool = False,
) -> Generator[np.ndarray, None, None]:
"""Synthesize speech for the given text and return a single waveform.

This method optionally builds and reuses a prompt cache. If an external
@@ -118,8 +131,11 @@ def generate(self,
retry_badcase: Whether to retry badcase.
retry_badcase_max_times: Maximum number of times to retry badcase.
retry_badcase_ratio_threshold: Threshold for audio-to-text ratio.
streaming: Whether to return a generator of audio chunks.
Returns:
numpy.ndarray: 1D waveform array (float32) on CPU.
Generator of numpy.ndarray: 1D waveform array (float32) on CPU.
Yields an audio chunk for each generation step if ``streaming=True``,
otherwise yields a single array containing the final audio.
"""
if not text.strip() or not isinstance(text, str):
raise ValueError("target text must be a non-empty string")
@@ -155,7 +171,7 @@ def generate(self,
self.text_normalizer = TextNormalizer()
text = self.text_normalizer.normalize(text)

wav, target_text_token, generated_audio_feat = self.tts_model.generate_with_prompt_cache(
generate_result = self.tts_model._generate_with_prompt_cache(
target_text=text,
prompt_cache=fixed_prompt_cache,
min_len=2,
@@ -165,9 +181,11 @@
retry_badcase=retry_badcase,
retry_badcase_max_times=retry_badcase_max_times,
retry_badcase_ratio_threshold=retry_badcase_ratio_threshold,
streaming=streaming,
)

return wav.squeeze(0).cpu().numpy()
for wav, _, _ in generate_result:
yield wav.squeeze(0).cpu().numpy()

finally:
if temp_prompt_wav_path and os.path.exists(temp_prompt_wav_path):