feat: Set parameter --cpu_mode to use the CPU mode R3gm#39

beethogedeon · Apr 21, 2024 · bf199f2 · bf199f2
1 parent d916723
commit bf199f2
Show file tree

Hide file tree

Showing 4 changed files with 51 additions and 25 deletions.
diff --git a/app_rvc.py b/app_rvc.py
@@ -270,9 +270,16 @@ def check_openai_api_key():
 
 
 class SoniTranslate(SoniTrCache):
-    def __init__(self, dev=False):
+    def __init__(self, cpu_mode=False):
         super().__init__()
-        self.device = "cuda" if torch.cuda.is_available() else "cpu"
+        if cpu_mode:
+            os.environ["SONITR_DEVICE"] = "cpu"
+        else:
+            os.environ["SONITR_DEVICE"] = (
+                "cuda" if torch.cuda.is_available() else "cpu"
+            )
+
+        self.device = os.environ.get("SONITR_DEVICE")
         self.result_diarize = None
         self.align_language = None
         self.result_source_lang = None
@@ -282,7 +289,7 @@ def __init__(self, dev=False):
 
         os.environ["VOICES_MODELS"] = "DISABLE"
         os.environ["VOICES_MODELS_WORKERS"] = "1"
-        self.vci = ClassVoices()
+        self.vci = ClassVoices(only_cpu=cpu_mode)
 
         self.tts_voices = self.get_tts_voice_list()
 
@@ -1597,7 +1604,7 @@ def get_subs_path(type_subs):
                             )
                             whisper_model_default = (
                                 "large-v3"
-                                if torch.cuda.is_available()
+                                if SoniTr.device == "cuda"
                                 else "medium"
                             )
 
@@ -1610,7 +1617,7 @@ def get_subs_path(type_subs):
                             )
                             com_t_opt, com_t_default = (
                                 [COMPUTE_TYPE_GPU, "float16"]
-                                if torch.cuda.is_available()
+                                if SoniTr.device == "cuda"
                                 else [COMPUTE_TYPE_CPU, "float32"]
                             )
                             compute_type = gr.Dropdown(
@@ -2555,6 +2562,12 @@ def create_parser():
         default="english",
         help=" Select the language of the interface: english, spanish",
     )
+    parser.add_argument(
+        "--cpu_mode",
+        action="store_true",
+        default=False,
+        help="Enable CPU mode to run the program without utilizing GPU acceleration.",
+    )
     return parser
 
 
@@ -2576,7 +2589,7 @@ def create_parser():
 
     models_path, index_path = upload_model_list()
 
-    SoniTr = SoniTranslate()
+    SoniTr = SoniTranslate(cpu_mode=args.cpu_mode)
 
     lg_conf = get_language_config(language_data, language=args.language)
 

diff --git a/soni_translate/mdx_net.py b/soni_translate/mdx_net.py
@@ -119,10 +119,8 @@ class MDX:
     DEFAULT_CHUNK_SIZE = 0 * DEFAULT_SR
     DEFAULT_MARGIN_SIZE = 1 * DEFAULT_SR
 
-    DEFAULT_PROCESSOR = 0 if torch.cuda.is_available() else -1
-
     def __init__(
-        self, model_path: str, params: MDXModel, processor=DEFAULT_PROCESSOR
+        self, model_path: str, params: MDXModel, processor=0
     ):
         # Set the device and the provider (CPU or CUDA)
         self.device = (
@@ -356,14 +354,17 @@ def run_mdx(
     denoise=False,
     keep_orig=True,
     m_threads=2,
+    device_base="cuda",
 ):
-    if torch.cuda.is_available():
+    if device_base == "cuda":
         device = torch.device("cuda:0")
+        processor_num = 0
         device_properties = torch.cuda.get_device_properties(device)
         vram_gb = device_properties.total_memory / 1024**3
         m_threads = 1 if vram_gb < 8 else 2
     else:
         device = torch.device("cpu")
+        processor_num = -1
         m_threads = 1
 
     model_hash = MDX.get_hash(model_path)
@@ -377,7 +378,7 @@ def run_mdx(
         compensation=mp["compensate"],
     )
 
-    mdx_sess = MDX(model_path, model)
+    mdx_sess = MDX(model_path, model, processor=processor_num)
     wave, sr = librosa.load(filename, mono=False, sr=44100)
     # normalizing input wave gives better output
     peak = max(np.max(wave), abs(np.min(wave)))
@@ -478,6 +479,11 @@ def process_uvr_task(
     only_voiceless: bool = False,
     remove_files_output_dir: bool = False,
 ):
+    if os.environ.get("SONITR_DEVICE") == "cpu":
+        device_base = "cpu"
+    else:
+        device_base = "cuda" if torch.cuda.is_available() else "cpu"
+
     if remove_files_output_dir:
         remove_directory_contents(output_dir)
 
@@ -501,6 +507,7 @@ def process_uvr_task(
             denoise=False,
             keep_orig=True,
             exclude_inversion=True,
+            device_base=device_base,
         )
 
     logger.info("Vocal Track Isolation and Voiceless Track Separation...")
@@ -511,6 +518,7 @@ def process_uvr_task(
         orig_song_path,
         denoise=True,
         keep_orig=True,
+        device_base=device_base,
     )
 
     if main_vocals:
@@ -523,6 +531,7 @@ def process_uvr_task(
             suffix="Backup",
             invert_suffix="Main",
             denoise=True,
+            device_base=device_base,
         )
     else:
         backup_vocals_path, main_vocals_path = None, vocals_path
@@ -537,6 +546,7 @@ def process_uvr_task(
             invert_suffix="DeReverb",
             exclude_main=True,
             denoise=True,
+            device_base=device_base,
         )
     else:
         vocals_dereverb_path = main_vocals_path

diff --git a/soni_translate/speech_segmentation.py b/soni_translate/speech_segmentation.py
@@ -50,8 +50,6 @@
 
 WHISPER_MODELS_PATH = './WHISPER_MODELS'
 
-device = "cuda" if torch.cuda.is_available() else "cpu"
-
 
 def find_whisper_models():
     path = WHISPER_MODELS_PATH
@@ -146,7 +144,7 @@ def transcribe_speech(
 
     model = whisperx.load_model(
         asr_model,
-        device,
+        os.environ.get("SONITR_DEVICE"),
         compute_type=compute_type,
         language=SOURCE_LANGUAGE,
         asr_options=asr_options,
@@ -218,7 +216,7 @@ def align_speech(audio, result):
 
     model_a, metadata = whisperx.load_align_model(
         language_code=result["language"],
-        device=device,
+        device=os.environ.get("SONITR_DEVICE"),
         model_name=None
         if result["language"] in DAMHF.keys()
         else EXTRA_ALIGN[result["language"]],
@@ -228,7 +226,7 @@ def align_speech(audio, result):
         model_a,
         metadata,
         audio,
-        device,
+        os.environ.get("SONITR_DEVICE"),
         return_char_alignments=True,
     )
     del model_a
@@ -286,7 +284,7 @@ def diarize_speech(
             diarize_model = whisperx.DiarizationPipeline(
                 model_name=model_name,
                 use_auth_token=YOUR_HF_TOKEN,
-                device=device,
+                device=os.environ.get("SONITR_DEVICE"),
             )
 
         except Exception as error:

diff --git a/soni_translate/text_to_speech.py b/soni_translate/text_to_speech.py
@@ -24,9 +24,6 @@
 import logging
 from .logging_setup import logger
 
-device = "cuda:0" if torch.cuda.is_available() else "cpu"
-torch_dtype_env = torch.float16 if torch.cuda.is_available() else torch.float32
-
 
 class TTS_OperationError(Exception):
     def __init__(self, message="The operation did not complete successfully."):
@@ -197,6 +194,9 @@ def segments_bark_tts(
     from transformers import AutoProcessor, BarkModel
     from optimum.bettertransformer import BetterTransformer
 
+    device = os.environ.get("SONITR_DEVICE")
+    torch_dtype_env = torch.float16 if device == "cuda" else torch.float32
+
     # load model bark
     model = BarkModel.from_pretrained(
         model_id_bark, torch_dtype=torch_dtype_env
@@ -205,7 +205,7 @@ def segments_bark_tts(
     processor = AutoProcessor.from_pretrained(
         model_id_bark, return_tensors="pt"
     )  # , padding=True
-    if torch.cuda.is_available():
+    if device == "cuda":
         # convert to bettertransformer
         model = BetterTransformer.transform(model, keep_original_model=False)
         # enable CPU offload
@@ -626,6 +626,7 @@ def segments_coqui_tts(
     )
 
     # Init TTS
+    device = os.environ.get("SONITR_DEVICE")
     model = TTS(model_id_coqui).to(device)
     sampling_rate = 24000
 
@@ -729,7 +730,7 @@ def load_piper_model(
     try:
         import onnxruntime as rt
 
-        if rt.get_device() == "GPU" and torch.cuda.is_available():
+        if rt.get_device() == "GPU" and os.environ.get("SONITR_DEVICE") == "cuda":
             logger.debug("onnxruntime device > GPU")
             cuda = True
         else:
@@ -742,6 +743,7 @@ def load_piper_model(
 
     # Disable CUDA in Windows
     if platform.system() == "Windows":
+        logger.info("Employing CPU exclusivity with Piper TTS")
         cuda = False
 
     if not download_dir:
@@ -1107,7 +1109,7 @@ def accelerate_segments(
 
 
 def se_process_audio_segments(
-    source_seg, tone_color_converter, remove_previous_processed=True
+    source_seg, tone_color_converter, device, remove_previous_processed=True
 ):
     # list wav seg
     source_audio_segs = glob.glob(f"{source_seg}/*.wav")
@@ -1280,6 +1282,7 @@ def toneconverter_openvoice(
         url=checkpoint_url, path=model_path_openvoice
     )
 
+    device = os.environ.get("SONITR_DEVICE")
     tone_color_converter = ToneColorConverter(config_path, device=device)
     tone_color_converter.load_ckpt(checkpoint_path)
 
@@ -1290,9 +1293,9 @@ def toneconverter_openvoice(
         path_source_segments, path_target_segments, valid_speakers
     ):
         # source_se_path = os.path.join(source_seg, 'se.pth')
-        source_se = se_process_audio_segments(source_seg, tone_color_converter)
+        source_se = se_process_audio_segments(source_seg, tone_color_converter, device)
         # target_se_path = os.path.join(target_seg, 'se.pth')
-        target_se = se_process_audio_segments(target_seg, tone_color_converter)
+        target_se = se_process_audio_segments(target_seg, tone_color_converter, device)
 
         # Iterate throw segments
         encode_message = "@MyShell"
@@ -1361,6 +1364,8 @@ def toneconverter_freevc(
     )
 
     logger.info("FreeVC loading model...")
+    device_id = os.environ.get("SONITR_DEVICE")
+    device = None if device_id == "cpu" else device_id
     try:
         from TTS.api import TTS
         tts = TTS(