diff --git a/app_rvc.py b/app_rvc.py index 23b36bd..ea9c2eb 100644 --- a/app_rvc.py +++ b/app_rvc.py @@ -270,9 +270,16 @@ def check_openai_api_key(): class SoniTranslate(SoniTrCache): - def __init__(self, dev=False): + def __init__(self, cpu_mode=False): super().__init__() - self.device = "cuda" if torch.cuda.is_available() else "cpu" + if cpu_mode: + os.environ["SONITR_DEVICE"] = "cpu" + else: + os.environ["SONITR_DEVICE"] = ( + "cuda" if torch.cuda.is_available() else "cpu" + ) + + self.device = os.environ.get("SONITR_DEVICE") self.result_diarize = None self.align_language = None self.result_source_lang = None @@ -282,7 +289,7 @@ def __init__(self, dev=False): os.environ["VOICES_MODELS"] = "DISABLE" os.environ["VOICES_MODELS_WORKERS"] = "1" - self.vci = ClassVoices() + self.vci = ClassVoices(only_cpu=cpu_mode) self.tts_voices = self.get_tts_voice_list() @@ -1597,7 +1604,7 @@ def get_subs_path(type_subs): ) whisper_model_default = ( "large-v3" - if torch.cuda.is_available() + if SoniTr.device == "cuda" else "medium" ) @@ -1610,7 +1617,7 @@ def get_subs_path(type_subs): ) com_t_opt, com_t_default = ( [COMPUTE_TYPE_GPU, "float16"] - if torch.cuda.is_available() + if SoniTr.device == "cuda" else [COMPUTE_TYPE_CPU, "float32"] ) compute_type = gr.Dropdown( @@ -2555,6 +2562,12 @@ def create_parser(): default="english", help=" Select the language of the interface: english, spanish", ) + parser.add_argument( + "--cpu_mode", + action="store_true", + default=False, + help="Enable CPU mode to run the program without utilizing GPU acceleration.", + ) return parser @@ -2576,7 +2589,7 @@ def create_parser(): models_path, index_path = upload_model_list() - SoniTr = SoniTranslate() + SoniTr = SoniTranslate(cpu_mode=args.cpu_mode) lg_conf = get_language_config(language_data, language=args.language) diff --git a/soni_translate/mdx_net.py b/soni_translate/mdx_net.py index ce67600..1623ecd 100644 --- a/soni_translate/mdx_net.py +++ b/soni_translate/mdx_net.py @@ -119,10 +119,8 @@ class MDX: DEFAULT_CHUNK_SIZE = 0 * DEFAULT_SR DEFAULT_MARGIN_SIZE = 1 * DEFAULT_SR - DEFAULT_PROCESSOR = 0 if torch.cuda.is_available() else -1 - def __init__( - self, model_path: str, params: MDXModel, processor=DEFAULT_PROCESSOR + self, model_path: str, params: MDXModel, processor=0 ): # Set the device and the provider (CPU or CUDA) self.device = ( @@ -356,14 +354,17 @@ def run_mdx( denoise=False, keep_orig=True, m_threads=2, + device_base="cuda", ): - if torch.cuda.is_available(): + if device_base == "cuda": device = torch.device("cuda:0") + processor_num = 0 device_properties = torch.cuda.get_device_properties(device) vram_gb = device_properties.total_memory / 1024**3 m_threads = 1 if vram_gb < 8 else 2 else: device = torch.device("cpu") + processor_num = -1 m_threads = 1 model_hash = MDX.get_hash(model_path) @@ -377,7 +378,7 @@ def run_mdx( compensation=mp["compensate"], ) - mdx_sess = MDX(model_path, model) + mdx_sess = MDX(model_path, model, processor=processor_num) wave, sr = librosa.load(filename, mono=False, sr=44100) # normalizing input wave gives better output peak = max(np.max(wave), abs(np.min(wave))) @@ -478,6 +479,11 @@ def process_uvr_task( only_voiceless: bool = False, remove_files_output_dir: bool = False, ): + if os.environ.get("SONITR_DEVICE") == "cpu": + device_base = "cpu" + else: + device_base = "cuda" if torch.cuda.is_available() else "cpu" + if remove_files_output_dir: remove_directory_contents(output_dir) @@ -501,6 +507,7 @@ def process_uvr_task( denoise=False, keep_orig=True, exclude_inversion=True, + device_base=device_base, ) logger.info("Vocal Track Isolation and Voiceless Track Separation...") @@ -511,6 +518,7 @@ def process_uvr_task( orig_song_path, denoise=True, keep_orig=True, + device_base=device_base, ) if main_vocals: @@ -523,6 +531,7 @@ def process_uvr_task( suffix="Backup", invert_suffix="Main", denoise=True, + device_base=device_base, ) else: backup_vocals_path, main_vocals_path = None, vocals_path @@ -537,6 +546,7 @@ def process_uvr_task( invert_suffix="DeReverb", exclude_main=True, denoise=True, + device_base=device_base, ) else: vocals_dereverb_path = main_vocals_path diff --git a/soni_translate/speech_segmentation.py b/soni_translate/speech_segmentation.py index 2d7996e..810ab85 100644 --- a/soni_translate/speech_segmentation.py +++ b/soni_translate/speech_segmentation.py @@ -50,8 +50,6 @@ WHISPER_MODELS_PATH = './WHISPER_MODELS' -device = "cuda" if torch.cuda.is_available() else "cpu" - def find_whisper_models(): path = WHISPER_MODELS_PATH @@ -146,7 +144,7 @@ def transcribe_speech( model = whisperx.load_model( asr_model, - device, + os.environ.get("SONITR_DEVICE"), compute_type=compute_type, language=SOURCE_LANGUAGE, asr_options=asr_options, @@ -218,7 +216,7 @@ def align_speech(audio, result): model_a, metadata = whisperx.load_align_model( language_code=result["language"], - device=device, + device=os.environ.get("SONITR_DEVICE"), model_name=None if result["language"] in DAMHF.keys() else EXTRA_ALIGN[result["language"]], @@ -228,7 +226,7 @@ def align_speech(audio, result): model_a, metadata, audio, - device, + os.environ.get("SONITR_DEVICE"), return_char_alignments=True, ) del model_a @@ -286,7 +284,7 @@ def diarize_speech( diarize_model = whisperx.DiarizationPipeline( model_name=model_name, use_auth_token=YOUR_HF_TOKEN, - device=device, + device=os.environ.get("SONITR_DEVICE"), ) except Exception as error: diff --git a/soni_translate/text_to_speech.py b/soni_translate/text_to_speech.py index 7e67e65..1e619e6 100644 --- a/soni_translate/text_to_speech.py +++ b/soni_translate/text_to_speech.py @@ -24,9 +24,6 @@ import logging from .logging_setup import logger -device = "cuda:0" if torch.cuda.is_available() else "cpu" -torch_dtype_env = torch.float16 if torch.cuda.is_available() else torch.float32 - class TTS_OperationError(Exception): def __init__(self, message="The operation did not complete successfully."): @@ -197,6 +194,9 @@ def segments_bark_tts( from transformers import AutoProcessor, BarkModel from optimum.bettertransformer import BetterTransformer + device = os.environ.get("SONITR_DEVICE") + torch_dtype_env = torch.float16 if device == "cuda" else torch.float32 + # load model bark model = BarkModel.from_pretrained( model_id_bark, torch_dtype=torch_dtype_env @@ -205,7 +205,7 @@ def segments_bark_tts( processor = AutoProcessor.from_pretrained( model_id_bark, return_tensors="pt" ) # , padding=True - if torch.cuda.is_available(): + if device == "cuda": # convert to bettertransformer model = BetterTransformer.transform(model, keep_original_model=False) # enable CPU offload @@ -626,6 +626,7 @@ def segments_coqui_tts( ) # Init TTS + device = os.environ.get("SONITR_DEVICE") model = TTS(model_id_coqui).to(device) sampling_rate = 24000 @@ -729,7 +730,7 @@ def load_piper_model( try: import onnxruntime as rt - if rt.get_device() == "GPU" and torch.cuda.is_available(): + if rt.get_device() == "GPU" and os.environ.get("SONITR_DEVICE") == "cuda": logger.debug("onnxruntime device > GPU") cuda = True else: @@ -742,6 +743,7 @@ def load_piper_model( # Disable CUDA in Windows if platform.system() == "Windows": + logger.info("Employing CPU exclusivity with Piper TTS") cuda = False if not download_dir: @@ -1107,7 +1109,7 @@ def accelerate_segments( def se_process_audio_segments( - source_seg, tone_color_converter, remove_previous_processed=True + source_seg, tone_color_converter, device, remove_previous_processed=True ): # list wav seg source_audio_segs = glob.glob(f"{source_seg}/*.wav") @@ -1280,6 +1282,7 @@ def toneconverter_openvoice( url=checkpoint_url, path=model_path_openvoice ) + device = os.environ.get("SONITR_DEVICE") tone_color_converter = ToneColorConverter(config_path, device=device) tone_color_converter.load_ckpt(checkpoint_path) @@ -1290,9 +1293,9 @@ def toneconverter_openvoice( path_source_segments, path_target_segments, valid_speakers ): # source_se_path = os.path.join(source_seg, 'se.pth') - source_se = se_process_audio_segments(source_seg, tone_color_converter) + source_se = se_process_audio_segments(source_seg, tone_color_converter, device) # target_se_path = os.path.join(target_seg, 'se.pth') - target_se = se_process_audio_segments(target_seg, tone_color_converter) + target_se = se_process_audio_segments(target_seg, tone_color_converter, device) # Iterate throw segments encode_message = "@MyShell" @@ -1361,6 +1364,8 @@ def toneconverter_freevc( ) logger.info("FreeVC loading model...") + device_id = os.environ.get("SONITR_DEVICE") + device = None if device_id == "cpu" else device_id try: from TTS.api import TTS tts = TTS(