1
1
import tempfile
2
2
from src .bark .npz_tools import save_npz
3
3
from src .bark .FullGeneration import FullGeneration
4
- from models .bark_voice_cloning_hubert_quantizer .hubert .hubert_manager import HuBERTManager
5
- from models .bark_voice_cloning_hubert_quantizer .hubert .pre_kmeans_hubert import CustomHubert
6
- from models .bark_voice_cloning_hubert_quantizer .hubert .customtokenizer import CustomTokenizer
4
+ from models .bark_voice_cloning_hubert_quantizer .hubert .hubert_manager import (
5
+ HuBERTManager ,
6
+ )
7
+ from models .bark_voice_cloning_hubert_quantizer .hubert .pre_kmeans_hubert import (
8
+ CustomHubert ,
9
+ )
10
+ from models .bark_voice_cloning_hubert_quantizer .hubert .customtokenizer import (
11
+ CustomTokenizer ,
12
+ )
7
13
import torchaudio
8
14
import torch
9
15
from encodec .utils import convert_audio
@@ -46,7 +52,9 @@ def _load_tokenizer():
46
52
tokenizer_path = HuBERTManager .make_sure_tokenizer_installed ()
47
53
global tokenizer
48
54
if tokenizer is None :
49
- tokenizer = CustomTokenizer .load_from_checkpoint ('data/models/hubert/tokenizer.pth' )
55
+ tokenizer = CustomTokenizer .load_from_checkpoint (
56
+ "data/models/hubert/tokenizer.pth"
57
+ )
50
58
tokenizer .load_state_dict (torch .load (tokenizer_path ))
51
59
return tokenizer
52
60
@@ -76,7 +84,7 @@ def get_prompts(path_to_wav: str, use_gpu: bool):
76
84
77
85
78
86
def get_encodec_prompts (path_to_wav : str , use_gpu = True ):
79
- device = ' cuda' if use_gpu else ' cpu'
87
+ device = " cuda" if use_gpu else " cpu"
80
88
model : EncodecModel = load_codec_model (use_gpu = use_gpu )
81
89
wav , sr = torchaudio .load (path_to_wav )
82
90
wav = convert_audio (wav , sr , model .sample_rate , model .channels )
@@ -87,37 +95,41 @@ def get_encodec_prompts(path_to_wav: str, use_gpu=True):
87
95
with torch .no_grad ():
88
96
encoded_frames = model .encode (wav )
89
97
90
- fine_prompt : np .ndarray = torch .cat (
91
- [encoded [0 ] for encoded in encoded_frames ], dim = - 1 ).squeeze ().cpu ().numpy ()
98
+ fine_prompt : np .ndarray = (
99
+ torch .cat ([encoded [0 ] for encoded in encoded_frames ], dim = - 1 )
100
+ .squeeze ()
101
+ .cpu ()
102
+ .numpy ()
103
+ )
92
104
coarse_prompt = fine_prompt [:2 , :]
93
105
return fine_prompt , coarse_prompt
94
106
95
107
96
108
def save_cloned_voice(
    full_generation: FullGeneration,
):
    """Persist a cloned voice prompt to a randomly-named ``.npz`` file.

    Args:
        full_generation: The semantic/coarse/fine prompt bundle produced by
            the voice-cloning pipeline.

    Returns:
        str: The relative path of the written file, e.g.
        ``"voices/test_clone_voice12345.npz"``.
    """
    # Random numeric suffix reduces the chance of clobbering a previously
    # saved voice; f-strings stringify automatically, so no str() needed.
    voice_name = f"test_clone_voice{np.random.randint(100000)}"
    filename = f"voices/{voice_name}.npz"
    # NOTE(review): assumes the "voices/" directory already exists — confirm
    # save_npz does not create parent directories.
    save_npz(filename, full_generation)
    return filename
103
115
104
116
105
117
def tab_voice_clone_demo ():
106
118
with gr .Tab ("Bark Voice Clone Demo" ):
107
- gr .Markdown ("""
119
+ gr .Markdown (
120
+ """
108
121
Unethical use of this technology is prohibited.
109
122
This demo is based on https://github.com/gitmylo/bark-voice-cloning-HuBERT-quantizer repository.
110
- """ )
123
+ """
124
+ )
111
125
112
126
# TODO: try with ffmpeg (except mp3)
113
127
# file_input = gr.Audio(label="Input Audio", type="numpy", source="upload", interactive=True)
114
- file_input = gr .File (label = "Input Audio" , file_types = [
115
- ".wav" ], interactive = True )
128
+ file_input = gr .File (label = "Input Audio" , file_types = [".wav" ], interactive = True )
116
129
117
130
use_gpu_checkbox = gr .Checkbox (label = "Use GPU" , value = True )
118
131
119
- generate_voice_button = gr .Button (
120
- value = "Generate Voice" , variant = "primary" )
132
+ generate_voice_button = gr .Button (value = "Generate Voice" , variant = "primary" )
121
133
122
134
def generate_voice (wav_file_obj : tempfile ._TemporaryFileWrapper , use_gpu : bool ):
123
135
if wav_file_obj is None :
@@ -128,8 +140,11 @@ def generate_voice(wav_file_obj: tempfile._TemporaryFileWrapper, use_gpu: bool):
128
140
filename = save_cloned_voice (full_generation )
129
141
return f"Saved: { filename } "
130
142
131
- output = gr .Label (
132
- "Output will appear here after input" , type = "auto" )
143
+ output = gr .Label ("Output will appear here after input" , type = "auto" )
133
144
134
- generate_voice_button .click (fn = generate_voice , inputs = [
135
- file_input , use_gpu_checkbox ], outputs = output , preprocess = True )
145
+ generate_voice_button .click (
146
+ fn = generate_voice ,
147
+ inputs = [file_input , use_gpu_checkbox ],
148
+ outputs = output ,
149
+ preprocess = True ,
150
+ )
0 commit comments