Skip to content

Commit

Permalink
feat: Set parameter --cpu_mode to use the CPU mode R3gm#39
Browse files Browse the repository at this point in the history
  • Loading branch information
R3gm committed Apr 21, 2024
1 parent d916723 commit bf199f2
Show file tree
Hide file tree
Showing 4 changed files with 51 additions and 25 deletions.
25 changes: 19 additions & 6 deletions app_rvc.py
Original file line number Diff line number Diff line change
Expand Up @@ -270,9 +270,16 @@ def check_openai_api_key():


class SoniTranslate(SoniTrCache):
def __init__(self, dev=False):
def __init__(self, cpu_mode=False):
super().__init__()
self.device = "cuda" if torch.cuda.is_available() else "cpu"
if cpu_mode:
os.environ["SONITR_DEVICE"] = "cpu"
else:
os.environ["SONITR_DEVICE"] = (
"cuda" if torch.cuda.is_available() else "cpu"
)

self.device = os.environ.get("SONITR_DEVICE")
self.result_diarize = None
self.align_language = None
self.result_source_lang = None
Expand All @@ -282,7 +289,7 @@ def __init__(self, dev=False):

os.environ["VOICES_MODELS"] = "DISABLE"
os.environ["VOICES_MODELS_WORKERS"] = "1"
self.vci = ClassVoices()
self.vci = ClassVoices(only_cpu=cpu_mode)

self.tts_voices = self.get_tts_voice_list()

Expand Down Expand Up @@ -1597,7 +1604,7 @@ def get_subs_path(type_subs):
)
whisper_model_default = (
"large-v3"
if torch.cuda.is_available()
if SoniTr.device == "cuda"
else "medium"
)

Expand All @@ -1610,7 +1617,7 @@ def get_subs_path(type_subs):
)
com_t_opt, com_t_default = (
[COMPUTE_TYPE_GPU, "float16"]
if torch.cuda.is_available()
if SoniTr.device == "cuda"
else [COMPUTE_TYPE_CPU, "float32"]
)
compute_type = gr.Dropdown(
Expand Down Expand Up @@ -2555,6 +2562,12 @@ def create_parser():
default="english",
help=" Select the language of the interface: english, spanish",
)
parser.add_argument(
"--cpu_mode",
action="store_true",
default=False,
help="Enable CPU mode to run the program without utilizing GPU acceleration.",
)
return parser


Expand All @@ -2576,7 +2589,7 @@ def create_parser():

models_path, index_path = upload_model_list()

SoniTr = SoniTranslate()
SoniTr = SoniTranslate(cpu_mode=args.cpu_mode)

lg_conf = get_language_config(language_data, language=args.language)

Expand Down
20 changes: 15 additions & 5 deletions soni_translate/mdx_net.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,10 +119,8 @@ class MDX:
DEFAULT_CHUNK_SIZE = 0 * DEFAULT_SR
DEFAULT_MARGIN_SIZE = 1 * DEFAULT_SR

DEFAULT_PROCESSOR = 0 if torch.cuda.is_available() else -1

def __init__(
self, model_path: str, params: MDXModel, processor=DEFAULT_PROCESSOR
self, model_path: str, params: MDXModel, processor=0
):
# Set the device and the provider (CPU or CUDA)
self.device = (
Expand Down Expand Up @@ -356,14 +354,17 @@ def run_mdx(
denoise=False,
keep_orig=True,
m_threads=2,
device_base="cuda",
):
if torch.cuda.is_available():
if device_base == "cuda":
device = torch.device("cuda:0")
processor_num = 0
device_properties = torch.cuda.get_device_properties(device)
vram_gb = device_properties.total_memory / 1024**3
m_threads = 1 if vram_gb < 8 else 2
else:
device = torch.device("cpu")
processor_num = -1
m_threads = 1

model_hash = MDX.get_hash(model_path)
Expand All @@ -377,7 +378,7 @@ def run_mdx(
compensation=mp["compensate"],
)

mdx_sess = MDX(model_path, model)
mdx_sess = MDX(model_path, model, processor=processor_num)
wave, sr = librosa.load(filename, mono=False, sr=44100)
# normalizing input wave gives better output
peak = max(np.max(wave), abs(np.min(wave)))
Expand Down Expand Up @@ -478,6 +479,11 @@ def process_uvr_task(
only_voiceless: bool = False,
remove_files_output_dir: bool = False,
):
if os.environ.get("SONITR_DEVICE") == "cpu":
device_base = "cpu"
else:
device_base = "cuda" if torch.cuda.is_available() else "cpu"

if remove_files_output_dir:
remove_directory_contents(output_dir)

Expand All @@ -501,6 +507,7 @@ def process_uvr_task(
denoise=False,
keep_orig=True,
exclude_inversion=True,
device_base=device_base,
)

logger.info("Vocal Track Isolation and Voiceless Track Separation...")
Expand All @@ -511,6 +518,7 @@ def process_uvr_task(
orig_song_path,
denoise=True,
keep_orig=True,
device_base=device_base,
)

if main_vocals:
Expand All @@ -523,6 +531,7 @@ def process_uvr_task(
suffix="Backup",
invert_suffix="Main",
denoise=True,
device_base=device_base,
)
else:
backup_vocals_path, main_vocals_path = None, vocals_path
Expand All @@ -537,6 +546,7 @@ def process_uvr_task(
invert_suffix="DeReverb",
exclude_main=True,
denoise=True,
device_base=device_base,
)
else:
vocals_dereverb_path = main_vocals_path
Expand Down
10 changes: 4 additions & 6 deletions soni_translate/speech_segmentation.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,8 +50,6 @@

WHISPER_MODELS_PATH = './WHISPER_MODELS'

device = "cuda" if torch.cuda.is_available() else "cpu"


def find_whisper_models():
path = WHISPER_MODELS_PATH
Expand Down Expand Up @@ -146,7 +144,7 @@ def transcribe_speech(

model = whisperx.load_model(
asr_model,
device,
os.environ.get("SONITR_DEVICE"),
compute_type=compute_type,
language=SOURCE_LANGUAGE,
asr_options=asr_options,
Expand Down Expand Up @@ -218,7 +216,7 @@ def align_speech(audio, result):

model_a, metadata = whisperx.load_align_model(
language_code=result["language"],
device=device,
device=os.environ.get("SONITR_DEVICE"),
model_name=None
if result["language"] in DAMHF.keys()
else EXTRA_ALIGN[result["language"]],
Expand All @@ -228,7 +226,7 @@ def align_speech(audio, result):
model_a,
metadata,
audio,
device,
os.environ.get("SONITR_DEVICE"),
return_char_alignments=True,
)
del model_a
Expand Down Expand Up @@ -286,7 +284,7 @@ def diarize_speech(
diarize_model = whisperx.DiarizationPipeline(
model_name=model_name,
use_auth_token=YOUR_HF_TOKEN,
device=device,
device=os.environ.get("SONITR_DEVICE"),
)

except Exception as error:
Expand Down
21 changes: 13 additions & 8 deletions soni_translate/text_to_speech.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,9 +24,6 @@
import logging
from .logging_setup import logger

device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch_dtype_env = torch.float16 if torch.cuda.is_available() else torch.float32


class TTS_OperationError(Exception):
def __init__(self, message="The operation did not complete successfully."):
Expand Down Expand Up @@ -197,6 +194,9 @@ def segments_bark_tts(
from transformers import AutoProcessor, BarkModel
from optimum.bettertransformer import BetterTransformer

device = os.environ.get("SONITR_DEVICE")
torch_dtype_env = torch.float16 if device == "cuda" else torch.float32

# load model bark
model = BarkModel.from_pretrained(
model_id_bark, torch_dtype=torch_dtype_env
Expand All @@ -205,7 +205,7 @@ def segments_bark_tts(
processor = AutoProcessor.from_pretrained(
model_id_bark, return_tensors="pt"
) # , padding=True
if torch.cuda.is_available():
if device == "cuda":
# convert to bettertransformer
model = BetterTransformer.transform(model, keep_original_model=False)
# enable CPU offload
Expand Down Expand Up @@ -626,6 +626,7 @@ def segments_coqui_tts(
)

# Init TTS
device = os.environ.get("SONITR_DEVICE")
model = TTS(model_id_coqui).to(device)
sampling_rate = 24000

Expand Down Expand Up @@ -729,7 +730,7 @@ def load_piper_model(
try:
import onnxruntime as rt

if rt.get_device() == "GPU" and torch.cuda.is_available():
if rt.get_device() == "GPU" and os.environ.get("SONITR_DEVICE") == "cuda":
logger.debug("onnxruntime device > GPU")
cuda = True
else:
Expand All @@ -742,6 +743,7 @@ def load_piper_model(

# Disable CUDA in Windows
if platform.system() == "Windows":
logger.info("Employing CPU exclusivity with Piper TTS")
cuda = False

if not download_dir:
Expand Down Expand Up @@ -1107,7 +1109,7 @@ def accelerate_segments(


def se_process_audio_segments(
source_seg, tone_color_converter, remove_previous_processed=True
source_seg, tone_color_converter, device, remove_previous_processed=True
):
# list wav seg
source_audio_segs = glob.glob(f"{source_seg}/*.wav")
Expand Down Expand Up @@ -1280,6 +1282,7 @@ def toneconverter_openvoice(
url=checkpoint_url, path=model_path_openvoice
)

device = os.environ.get("SONITR_DEVICE")
tone_color_converter = ToneColorConverter(config_path, device=device)
tone_color_converter.load_ckpt(checkpoint_path)

Expand All @@ -1290,9 +1293,9 @@ def toneconverter_openvoice(
path_source_segments, path_target_segments, valid_speakers
):
# source_se_path = os.path.join(source_seg, 'se.pth')
source_se = se_process_audio_segments(source_seg, tone_color_converter)
source_se = se_process_audio_segments(source_seg, tone_color_converter, device)
# target_se_path = os.path.join(target_seg, 'se.pth')
target_se = se_process_audio_segments(target_seg, tone_color_converter)
target_se = se_process_audio_segments(target_seg, tone_color_converter, device)

# Iterate through segments
encode_message = "@MyShell"
Expand Down Expand Up @@ -1361,6 +1364,8 @@ def toneconverter_freevc(
)

logger.info("FreeVC loading model...")
device_id = os.environ.get("SONITR_DEVICE")
device = None if device_id == "cpu" else device_id
try:
from TTS.api import TTS
tts = TTS(
Expand Down

0 comments on commit bf199f2

Please sign in to comment.