Commit 943c1f6

v4.1 - hash and metadata cleanup (rsxdalv#23)
* remove file names from metadata
* bugfix shadowed variables
* rename constants to uppercase
* split out bark css
* convert choices to classes
* improve setting class names
* split settings into a new file
* add sourcery to gitignore
* define FinalGenParams class
* improve types and log_generation API
* fix get_long_gen_history_prompt bug
* fix types, reformat to black
* add missing requirements
* modify hash function to improve consistency
* update readme
1 parent 5e7dfab commit 943c1f6
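The central change is hashing and metadata consistency: file names are dropped from the metadata, so the same generation produces the same hash regardless of where its files land on disk. A minimal sketch of that idea (a hypothetical helper, not the repo's actual implementation):

```python
import hashlib
import json


def stable_metadata_hash(metadata: dict) -> str:
    # Drop machine-dependent "filename*" keys, as this commit does, so the
    # digest depends only on the generation parameters.
    hashable = {k: v for k, v in metadata.items() if not k.startswith("filename")}
    # sort_keys gives a canonical serialization, hence a reproducible digest.
    payload = json.dumps(hashable, sort_keys=True)
    return hashlib.sha256(payload.encode("utf-8")).hexdigest()
```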

43 files changed, +1087 -563 lines. (Large commits have some content hidden by default; only a subset of the changed files appears below.)

.gitignore (+1)

```diff
@@ -22,3 +22,4 @@ node_modules/
 # Editors
 .vscode/
 .idea/
+.sourcery.yaml
```

README.md (+9)

```diff
@@ -35,6 +35,15 @@ This code requires the following dependencies:
 
 
 ## Changelog
+June 4:
+* Update to v4.1 - improved hash function, code improvements
+
+June 3:
+* Update to v4 - new output structure, improved history view, codebase reorganization, improved metadata, output extensions support
+
+May __:
+* Update to v3 - voice clone demo
+
 May 17:
 * Update to v2 - generate results as they appear, preview long prompt generations piece by piece, enable up to 9 outputs, UI tweaks
 
```

requirements.txt (+4)

```diff
@@ -1,2 +1,6 @@
 # pip install nodejs-bin==16.15.1a4
 ffmpeg-python # Apache 2.0
+gradio==3.33.1
+python-dotenv==1.0.0
+soundfile==0.12.1 # torchaudio platform windows
+# run_cmd("pip install sox") # torchaudio platform linux
```

server.py (+19 -13)

```diff
@@ -21,28 +21,29 @@
 
 
 def load_models(
+    text_use_gpu,
+    text_use_small,
+    coarse_use_gpu,
+    coarse_use_small,
+    fine_use_gpu,
+    fine_use_small,
+    codec_use_gpu,
+):
+    save_config_bark(
         text_use_gpu,
         text_use_small,
         coarse_use_gpu,
         coarse_use_small,
         fine_use_gpu,
         fine_use_small,
-        codec_use_gpu
-):
-    save_config_bark(text_use_gpu,
-                     text_use_small,
-                     coarse_use_gpu,
-                     coarse_use_small,
-                     fine_use_gpu,
-                     fine_use_small,
-                     codec_use_gpu)
+        codec_use_gpu,
+    )
     # download and load all models
     # TODO: try catch for memory errors
     model_manager.reload_models(config)
     return gr.Button.update(value="Reload models", interactive=True)
 
 
-
 def reload_config_and_restart_ui():
     os._exit(0)
     # print("Reloading config and restarting UI...")
@@ -53,8 +54,11 @@ def reload_config_and_restart_ui():
     # demo.launch(**gradio_interface_options)
 
 
-gradio_interface_options = config[
-    "gradio_interface_options"] if "gradio_interface_options" in config else default_config
+gradio_interface_options = (
+    config["gradio_interface_options"]
+    if "gradio_interface_options" in config
+    else default_config
+)
 
 with gr.Blocks(css=full_css) as demo:
     gr.Markdown("# TTS Generation WebUI (Bark & Tortoise)")
@@ -68,7 +72,9 @@ def reload_config_and_restart_ui():
     voices_tab(register_use_as_history_button)
 
     settings_tab_bark(config, save_config_bark, load_models)
-    settings_tab_gradio(save_config_gradio, reload_config_and_restart_ui, gradio_interface_options)
+    settings_tab_gradio(
+        save_config_gradio, reload_config_and_restart_ui, gradio_interface_options
+    )
 
 
 def print_pretty_options(options):
```
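The parenthesized conditional above is black-friendly formatting only; behaviorally it matches a plain dict lookup with a default. A one-line equivalent, shown for comparison (not what the commit uses):

```python
# Equivalent fallback, assuming config is a plain dict:
gradio_interface_options = config.get("gradio_interface_options", default_config)
```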

src/bark/BarkModelManager.py (+3 -2)

```diff
@@ -20,10 +20,11 @@ def reload_models(self, config):
         codec_use_gpu = model_config["codec_use_gpu"]
 
         print(
-            f'''\t- Text Generation:\t\t GPU: {"Yes" if text_use_gpu else "No"}, Small Model: {"Yes" if text_use_small else "No"}
+            f"""\t- Text Generation:\t\t GPU: {"Yes" if text_use_gpu else "No"}, Small Model: {"Yes" if text_use_small else "No"}
 \t- Coarse-to-Fine Inference:\t GPU: {"Yes" if coarse_use_gpu else "No"}, Small Model: {"Yes" if coarse_use_small else "No"}
 \t- Fine-tuning:\t\t\t GPU: {"Yes" if fine_use_gpu else "No"}, Small Model: {"Yes" if fine_use_small else "No"}
-\t- Codec:\t\t\t GPU: {"Yes" if codec_use_gpu else "No"}''')
+\t- Codec:\t\t\t GPU: {"Yes" if codec_use_gpu else "No"}"""
+        )
 
         preload_models(
             text_use_gpu=text_use_gpu,
```

src/bark/FinalGenParams.py (+9, new file)

```python
from typing import Any, TypedDict


class FinalGenParams(TypedDict):
    text: str
    history_prompt: str | Any
    text_temp: float
    waveform_temp: float
    output_full: bool
```
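A hypothetical usage sketch of the new TypedDict; `generate_audio` is a stand-in name, not necessarily the repo's entry point. Note that `history_prompt: str | Any` effectively accepts any value, so only the remaining keys are meaningfully type checked:

```python
from src.bark.FinalGenParams import FinalGenParams

params: FinalGenParams = {
    "text": "Hello world",
    "history_prompt": "v2/en_speaker_3",
    "text_temp": 0.7,
    "waveform_temp": 0.7,
    "output_full": True,
}
# audio = generate_audio(**params)  # hypothetical call site; a type checker
#                                   # validates the keys and value types above
```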

src/bark/bark_css.py (+15, new file)

```python
bark_css = """
.generating {
    pointer-events: none;
}
.tts-audio > .empty.small.unpadded_box {
    padding: var(--size-2);
    width: var(--size-full);
    height: var(--size-14);
    min-height: var(--size-14);
}
.tts-image > .empty.large.unpadded_box {
    height: 150px;
    min-height: 150px;
}
"""
```

src/bark/clone/tab_voice_clone_demo.py (+34 -19)

```diff
@@ -1,9 +1,15 @@
 import tempfile
 from src.bark.npz_tools import save_npz
 from src.bark.FullGeneration import FullGeneration
-from models.bark_voice_cloning_hubert_quantizer.hubert.hubert_manager import HuBERTManager
-from models.bark_voice_cloning_hubert_quantizer.hubert.pre_kmeans_hubert import CustomHubert
-from models.bark_voice_cloning_hubert_quantizer.hubert.customtokenizer import CustomTokenizer
+from models.bark_voice_cloning_hubert_quantizer.hubert.hubert_manager import (
+    HuBERTManager,
+)
+from models.bark_voice_cloning_hubert_quantizer.hubert.pre_kmeans_hubert import (
+    CustomHubert,
+)
+from models.bark_voice_cloning_hubert_quantizer.hubert.customtokenizer import (
+    CustomTokenizer,
+)
 import torchaudio
 import torch
 from encodec.utils import convert_audio
@@ -46,7 +52,9 @@ def _load_tokenizer():
     tokenizer_path = HuBERTManager.make_sure_tokenizer_installed()
     global tokenizer
     if tokenizer is None:
-        tokenizer = CustomTokenizer.load_from_checkpoint('data/models/hubert/tokenizer.pth')
+        tokenizer = CustomTokenizer.load_from_checkpoint(
+            "data/models/hubert/tokenizer.pth"
+        )
     tokenizer.load_state_dict(torch.load(tokenizer_path))
     return tokenizer
 
@@ -76,7 +84,7 @@ def get_prompts(path_to_wav: str, use_gpu: bool):
 
 
 def get_encodec_prompts(path_to_wav: str, use_gpu=True):
-    device = 'cuda' if use_gpu else 'cpu'
+    device = "cuda" if use_gpu else "cpu"
     model: EncodecModel = load_codec_model(use_gpu=use_gpu)
     wav, sr = torchaudio.load(path_to_wav)
     wav = convert_audio(wav, sr, model.sample_rate, model.channels)
@@ -87,37 +95,41 @@ def get_encodec_prompts(path_to_wav: str, use_gpu=True):
     with torch.no_grad():
         encoded_frames = model.encode(wav)
 
-    fine_prompt: np.ndarray = torch.cat(
-        [encoded[0] for encoded in encoded_frames], dim=-1).squeeze().cpu().numpy()
+    fine_prompt: np.ndarray = (
+        torch.cat([encoded[0] for encoded in encoded_frames], dim=-1)
+        .squeeze()
+        .cpu()
+        .numpy()
+    )
     coarse_prompt = fine_prompt[:2, :]
     return fine_prompt, coarse_prompt
 
 
 def save_cloned_voice(
     full_generation: FullGeneration,
 ):
-    voice_name = f'test_clone_voice{str(np.random.randint(100000))}'
-    filename = f'voices/{voice_name}.npz'
+    voice_name = f"test_clone_voice{str(np.random.randint(100000))}"
+    filename = f"voices/{voice_name}.npz"
     save_npz(filename, full_generation)
     return filename
 
 
 def tab_voice_clone_demo():
     with gr.Tab("Bark Voice Clone Demo"):
-        gr.Markdown("""
+        gr.Markdown(
+            """
     Unethical use of this technology is prohibited.
     This demo is based on https://github.com/gitmylo/bark-voice-cloning-HuBERT-quantizer repository.
-    """)
+    """
+        )
 
         # TODO: try with ffmpeg (except mp3)
         # file_input = gr.Audio(label="Input Audio", type="numpy", source="upload", interactive=True)
-        file_input = gr.File(label="Input Audio", file_types=[
-                             ".wav"], interactive=True)
+        file_input = gr.File(label="Input Audio", file_types=[".wav"], interactive=True)
 
         use_gpu_checkbox = gr.Checkbox(label="Use GPU", value=True)
 
-        generate_voice_button = gr.Button(
-            value="Generate Voice", variant="primary")
+        generate_voice_button = gr.Button(value="Generate Voice", variant="primary")
 
         def generate_voice(wav_file_obj: tempfile._TemporaryFileWrapper, use_gpu: bool):
            if wav_file_obj is None:
@@ -128,8 +140,11 @@ def generate_voice(wav_file_obj: tempfile._TemporaryFileWrapper, use_gpu: bool):
             filename = save_cloned_voice(full_generation)
             return f"Saved: {filename}"
 
-        output = gr.Label(
-            "Output will appear here after input", type="auto")
+        output = gr.Label("Output will appear here after input", type="auto")
 
-        generate_voice_button.click(fn=generate_voice, inputs=[
-                                    file_input, use_gpu_checkbox], outputs=output, preprocess=True)
+        generate_voice_button.click(
+            fn=generate_voice,
+            inputs=[file_input, use_gpu_checkbox],
+            outputs=output,
+            preprocess=True,
+        )
```
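The cloned voice is saved as a Bark-style .npz history prompt. A hedged inspection sketch, assuming Bark's usual three-array prompt layout (the filename is hypothetical; the `[:2, :]` slice above is why the coarse prompt is the first two EnCodec codebooks):

```python
import numpy as np

voice = np.load("voices/test_clone_voice12345.npz")  # hypothetical filename
print(voice["semantic_prompt"].shape)  # semantic token ids
print(voice["coarse_prompt"].shape)    # (2, T): first two EnCodec codebooks
print(voice["fine_prompt"].shape)      # (n_codebooks, T), typically (8, T)
```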

src/bark/create_voice_string.py (+5 -3)

```diff
@@ -1,10 +1,12 @@
 from models.bark.bark.generation import SUPPORTED_LANGS
-
+from typing import Union
 import os
 
 
-def create_voice_string(language: str, speaker_id: int, use_v2: bool) -> str:
-    history_prompt = f"{SUPPORTED_LANGS[language][1]}_speaker_{speaker_id}"
+def create_voice_string(
+    language: str, speaker_id: Union[int, str], use_v2: bool
+) -> str:
+    history_prompt = f"{SUPPORTED_LANGS[language][1]}_speaker_{speaker_id}"  # type: ignore
     if use_v2:
         history_prompt = os.path.join("v2", history_prompt)
     return history_prompt
```
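Assuming `SUPPORTED_LANGS` is upstream Bark's list of `(name, code)` tuples, callers index it with an integer at runtime even though the parameter is annotated `str`, which is why the `# type: ignore` was needed. A hedged example of the strings it produces:

```python
from src.bark.create_voice_string import create_voice_string

# Assumes SUPPORTED_LANGS[0] == ("English", "en"), as in upstream Bark.
create_voice_string(0, 3, use_v2=False)  # -> "en_speaker_3"
create_voice_string(0, 3, use_v2=True)   # -> "v2/en_speaker_3" (os.path.join)
```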

src/bark/generate_and_save_metadata.py (+19 -7)

```diff
@@ -1,14 +1,30 @@
 import json
 from models.bark.bark.generation import models
+from typing import Optional
 
 
-def generate_and_save_metadata(prompt: str, language: str, speaker_id: int, text_temp: float, waveform_temp: float,
-                               seed: int, filename: str, date: str, filename_png: str, filename_json: str,
-                               history_prompt_npz: str, filename_npz: str, history_prompt: str, history_hash: str):
+def generate_and_save_metadata(
+    prompt: str,
+    language: Optional[str],
+    speaker_id: Optional[int],
+    text_temp: float,
+    waveform_temp: float,
+    seed: int,
+    filename: str,
+    date: str,
+    filename_png: str,
+    filename_json: str,
+    history_prompt_npz: Optional[str],
+    filename_npz: str,
+    history_prompt: str,
+    history_hash: str,
+):
     is_big_semantic_model = models["text"]["model"].config.n_embd > 768
     is_big_coarse_model = models["coarse"].config.n_embd > 768
     is_big_fine_model = models["fine"].config.n_embd > 768
     metadata = {
+        "_version": "0.0.1",
+        "_hash_version": "0.0.2",
         # "id": generation_hash, # generation_hash is the same as history_hash but for current generation
         # "model_semantic_hash": model_semantic_hash,
         "is_big_semantic_model": is_big_semantic_model,
@@ -26,10 +42,6 @@ def generate_and_save_metadata(prompt: str, language: str, speaker_id: int, text
         "waveform_temp": waveform_temp,
         "date": date,
         "seed": str(seed),
-        "filename": filename,
-        "filename_png": filename_png,
-        "filename_json": filename_json,
-        "filename_npz": filename_npz,
         # "files": {
         #     "wav": filename,
         #     "png": filename_png,
```

src/bark/generate_choice_string.py (+3 -5)

```diff
@@ -1,12 +1,10 @@
 from src.bark.create_voice_string import create_voice_string
 from src.bark.get_speaker_gender import get_speaker_gender
-
+from typing import Union
 import gradio as gr
 
 
-def generate_choice_string(use_v2: bool, language: str, speaker_id: str):
+def generate_choice_string(use_v2: bool, language: str, speaker_id: Union[int, str]):
     history_prompt = create_voice_string(language, speaker_id, use_v2)
     gender = get_speaker_gender(history_prompt)
-    return gr.Markdown.update(
-        value=f"Chosen voice: {history_prompt}, Gender: {gender}"
-    )
+    return gr.Markdown.update(value=f"Chosen voice: {history_prompt}, Gender: {gender}")
```

src/bark/generate_random_seed.py (+1 -1)

```diff
@@ -2,4 +2,4 @@
 
 
 def generate_random_seed() -> int:
-    return np.random.default_rng().integers(1, 2 ** 32 - 1)
+    return np.random.default_rng().integers(1, 2**32 - 1)
```

src/bark/generation_settings.py (+34, new file)

```python
class HistorySettings:
    EMPTY = "Empty history"
    VOICE = "or Use a voice:"
    NPZ_FILE = "or Use old generation as history:"

    choices = [
        EMPTY,
        VOICE,
        NPZ_FILE,
    ]


class PromptSplitSettings:
    NONE = "Short prompt (<15s)"
    LINES = "Split prompt by lines"
    LENGTH = "Split prompt by length"

    choices = [
        NONE,
        LINES,
        # LENGTH,
    ]


class LongPromptHistorySettings:
    CONTINUE = "Use old generation as history"
    CONSTANT = "or Use history prompt setting"
    EMPTY = "or Clear history"

    choices = [
        CONTINUE,
        CONSTANT,
        EMPTY,
    ]
```
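A hedged sketch of how these classes might be consumed: the constants double as radio labels and as comparison targets, so callbacks can branch without repeating string literals (the wiring below is an assumption, not the repo's actual tab code):

```python
import gradio as gr

from src.bark.generation_settings import HistorySettings

history_setting = gr.Radio(
    HistorySettings.choices,
    value=HistorySettings.EMPTY,
    label="History Prompt",
)


def resolve_history(choice: str):
    # Branch on the class constants instead of repeating the label strings.
    if choice == HistorySettings.EMPTY:
        return None
    if choice == HistorySettings.VOICE:
        return "use-voice"       # placeholder action
    return "use-old-generation"  # HistorySettings.NPZ_FILE
```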
