# conf.yaml (368 lines, 14.5 KB) - forked from t41372/Open-LLM-VTuber
CONF_VERSION: "v0.5.0-alpha.2"

# Server
# NOTE(review): the key is spelled PROTOCAL (sic). It is left unchanged here
# because consumers presumably look it up under this exact name - confirm
# before renaming.
PROTOCAL: "http://"
HOST: "localhost"
PORT: 12393

# Server Settings
SERVER:
  # If true, ASR and TTS will be initialized when the server starts and kept in memory
  PRELOAD_MODELS: True

# General settings
REMOVE_SPECIAL_CHAR: True  # remove special characters like emoji from audio generation
# ============== LLM Backend Settings ===================
# Provider of LLM. Options available: "ollama", "memgpt", "mem0", "claude", "llamacpp"
# (or "fakellm" for debug purposes).
# Use "ollama" for any OpenAI-compatible backend. "memgpt" requires setup.
LLM_PROVIDER: "ollama"
# Ollama & OpenAI Compatible inference backend
ollama:
  # Commented-out alternative endpoint:
  # BASE_URL: "http://localhost:11434"
  BASE_URL: "https://api.moonshot.cn/v1"
  LLM_API_KEY: ""
  ORGANIZATION_ID: "test"
  PROJECT_ID: "test"
  # LLM name
  MODEL: "moonshot-v1-auto"
  # The system/persona prompt is configured in the Prompts section near the end of this file
  VERBOSE: False
# Claude API Configuration
claude:
  BASE_URL: "https://api.anthropic.com"
  # Placeholder value; replace with a real API key
  LLM_API_KEY: "YOUR API KEY HERE"
  MODEL: "claude-3-haiku-20240307"
  VERBOSE: False
# llama.cpp backend settings
llamacpp:
  # Placeholder value; replace with the path to a GGUF model file
  MODEL_PATH: "<path-to-gguf-model-file>"
  VERBOSE: True
# mem0 backend settings
mem0:
  USER_ID: "user-0"
  # Commented-out alternative endpoint:
  # BASE_URL: "http://localhost:11434"
  BASE_URL: "http://localhost:11434/v1"
  LLM_API_KEY: "somethingelse"
  ORGANIZATION_ID: "org_eternity"
  PROJECT_ID: "project_glass"
  # LLM name
  MODEL: "llama3.1:latest"
  # The system/persona prompt is configured in the Prompts section near the end of this file
  VERBOSE: False
  # Three sub-sections (vector_store, llm, embedder), each with a `provider`
  # and a provider-specific `config` mapping.
  MEM0_CONFIG:
    vector_store:
      provider: qdrant
      config:
        collection_name: test
        host: localhost
        port: 6333
        embedding_model_dims: 1024  # Change this according to your local model's dimensions
    llm:
      provider: ollama
      config:
        model: llama3.1:latest
        temperature: 0
        max_tokens: 8000
        ollama_base_url: http://localhost:11434  # Ensure this URL is correct
    embedder:
      provider: ollama
      config:
        model: mxbai-embed-large:latest
        # Alternatively, you can use "snowflake-arctic-embed:latest"
        ollama_base_url: http://localhost:11434
# MemGPT Configurations
## Please set up the memGPT server according to the [official documentation](https://memgpt.readme.io/docs/index)
## In addition, please set up an agent using the webui launched at the memGPT base_url
memgpt:
  BASE_URL: "http://localhost:8283"

  # You will find the admin server password in the memGPT console output.
  # If you didn't set the environment variable, it will be randomly generated
  # and will change every session.
  ADMIN_TOKEN: ""

  # The ID of the agent to send the message to.
  AGENT_ID: ""
  VERBOSE: True
# ============== Live2D front-end Settings ==============
LIVE2D: False # Deprecated and no longer used. Do not enable it. Bad things will happen.
LIVE2D_MODEL: "bitcoinko"
# ============== Voice Interaction Settings ==============
# === Automatic Speech Recognition ===
VOICE_INPUT_ON: True
# Put your mic in the browser or in the terminal? (would increase latency)
MIC_IN_BROWSER: False # Deprecated and no longer used. Do not enable it. Bad things will happen.
# Speech-to-text model options: "Faster-Whisper", "WhisperCPP", "Whisper", "AzureASR", "FunASR", "GroqWhisperASR", "SherpaOnnxASR"
# Each option has a settings section of the same name in this file.
ASR_MODEL: "SherpaOnnxASR"
# Azure speech recognition settings
AzureASR:
  api_key: "azure_api_key"  # placeholder value
  region: "eastus"
# Faster whisper config
Faster-Whisper:
  # distil-medium.en is an English-only model;
  # use distil-large-v3 if you have a good GPU
  model_path: "distil-medium.en"
  download_root: "asr/models"
  language: "en"  # en, zh, or something else. Leave empty for auto-detect.
  device: "auto"  # cpu, cuda, or auto. faster-whisper doesn't support mps
# whisper.cpp (pywhispercpp) settings
WhisperCPP:
  # All available models are listed on https://abdeladim-s.github.io/pywhispercpp/#pywhispercpp.constants.AVAILABLE_MODELS
  model_name: "small"
  model_dir: "asr/models"
  print_realtime: False
  print_progress: False
  language: "auto"  # en, zh, auto
# Whisper settings
Whisper:
  name: "medium"
  download_root: "asr/models"
  device: "cpu"
# FunASR currently needs an internet connection on launch
# to download / check the models. You can disconnect the internet after initialization.
# Or you can use Faster-Whisper for a completely offline experience.
FunASR:
  model_name: "iic/SenseVoiceSmall"  # or "paraformer-zh"
  vad_model: "fsmn-vad"  # only needed so that audio longer than 30s works
  punc_model: "ct-punc"  # punctuation model
  device: "cpu"
  # Controls whether FunASR checks for updates on every launch.
  # NOTE(review): going by the key name, True presumably skips the check - confirm.
  disable_update: True
  ncpu: 4  # number of threads for CPU internal operations
  hub: "ms"  # ms (default) to download models from ModelScope. Use hf to download models from Hugging Face.
  use_itn: False
  language: "auto"  # zh, en, auto
# pip install sherpa-onnx
# documentation: https://k2-fsa.github.io/sherpa/onnx/index.html
# ASR models download: https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models
SherpaOnnxASR:
  # One of: "sense_voice", "transducer", "paraformer", "nemo_ctc", "wenet_ctc", "whisper", "tdnn_ctc"
  model_type: "sense_voice"
  # Choose only ONE of the following groups, depending on the model_type:
  # --- For model_type: "transducer" ---
  # encoder: ""  # Path to the encoder model (e.g., "path/to/encoder.onnx")
  # decoder: ""  # Path to the decoder model (e.g., "path/to/decoder.onnx")
  # joiner: ""  # Path to the joiner model (e.g., "path/to/joiner.onnx")
  # --- For model_type: "paraformer" ---
  # paraformer: ""  # Path to the paraformer model (e.g., "path/to/model.onnx")
  # --- For model_type: "nemo_ctc" ---
  # nemo_ctc: ""  # Path to the NeMo CTC model (e.g., "path/to/model.onnx")
  # --- For model_type: "wenet_ctc" ---
  # wenet_ctc: ""  # Path to the WeNet CTC model (e.g., "path/to/model.onnx")
  # --- For model_type: "tdnn_ctc" ---
  # tdnn_model: ""  # Path to the TDNN CTC model (e.g., "path/to/model.onnx")
  # --- For model_type: "whisper" ---
  # whisper_encoder: ""  # Path to the Whisper encoder model (e.g., "path/to/encoder.onnx")
  # whisper_decoder: ""  # Path to the Whisper decoder model (e.g., "path/to/decoder.onnx")
  # --- For model_type: "sense_voice" ---
  sense_voice: "asr/models/sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/model.onnx"  # Path to the SenseVoice model (e.g., "path/to/model.onnx")
  tokens: "asr/models/sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/tokens.txt"  # Path to tokens.txt (required for all model types)
  # --- Optional parameters (with defaults shown) ---
  # hotwords_file: ""  # Path to hotwords file (if using hotwords)
  # hotwords_score: 1.5  # Score for hotwords
  # modeling_unit: ""  # Modeling unit for hotwords (if applicable)
  # bpe_vocab: ""  # Path to BPE vocabulary (if applicable)
  num_threads: 2  # Number of threads
  # whisper_language: ""  # Language for Whisper models (e.g., "en", "zh", etc. - if using Whisper)
  # whisper_task: "transcribe"  # Task for Whisper models ("transcribe" or "translate" - if using Whisper)
  # whisper_tail_paddings: -1  # Tail padding for Whisper models (if using Whisper)
  # blank_penalty: 0.0  # Penalty for blank symbol
  # decoding_method: "greedy_search"  # "greedy_search" or "modified_beam_search"
  # debug: False  # Enable debug mode
  # sample_rate: 16000  # Sample rate (should match the model's expected sample rate)
  # feature_dim: 80  # Feature dimension (should match the model's expected feature dimension)
  use_itn: True  # Enable ITN for SenseVoice models (should be set to False if not using SenseVoice models)
# Groq-hosted Whisper settings
GroqWhisperASR:
  api_key: ""
  model: "whisper-large-v3-turbo"  # or "whisper-large-v3"
  lang: ""  # leave empty for automatic language detection
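# set azure speech recognition configuration in api_keys.py
# NOTE(review): this file has an AzureASR section with api_key and region, so the reference to api_keys.py may be outdated - confirm.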
# ============== Text to Speech ==============
TTS_ON: True
TTS_MODEL: "edgeTTS"
# Text-to-speech model options:
# "AzureTTS", "pyttsx3TTS", "edgeTTS", "barkTTS",
# "cosyvoiceTTS", "meloTTS", "piperTTS", "coquiTTS",
# "fishAPITTS"
# NOTE(review): this file also has "xTTS", "GPT_Sovits" and "SherpaOnnxTTS" sections
# that are not listed above; presumably they are valid values too - confirm.
# If on, the model speaks each sentence as soon as the LLM finishes it, instead of waiting for the full response.
# If turned on, the timing and order of the facial expressions will be more accurate.
SAY_SENTENCE_SEPARATELY: True
# Azure text-to-speech settings
AzureTTS:
  api_key: "azure-api-key"  # placeholder value
  region: "eastus"
  voice: "en-US-AshleyNeural"
  pitch: "26"  # percentage of the pitch adjustment
  rate: "1"  # speaking rate
# Bark text-to-speech settings
barkTTS:
  voice: "v2/en_speaker_1"
# Edge text-to-speech settings
edgeTTS:
  # Check out the docs at https://github.com/rany2/edge-tts
  # Use `edge-tts --list-voices` to list all available voices
  # Other example voices: "zh-CN-XiaoxiaoNeural", "ja-JP-NanamiNeural"
  voice: "en-US-AvaMultilingualNeural"
# pyttsx3 doesn't have any config.
# Cosy Voice TTS connects to the gradio webui
cosyvoiceTTS:
  # Check their documentation for deployment and the meaning of the following configurations
  client_url: "http://127.0.0.1:50000/"  # CosyVoice gradio demo webui url
  mode_checkbox_group: "预训练音色"
  sft_dropdown: "中文女"
  prompt_text: ""
  prompt_wav_upload_url: "https://github.com/gradio-app/gradio/raw/main/test/test_files/audio_sample.wav"
  prompt_wav_record_url: "https://github.com/gradio-app/gradio/raw/main/test/test_files/audio_sample.wav"
  instruct_text: ""
  seed: 0
  api_name: "/generate_audio"
# MeloTTS settings
meloTTS:
  speaker: "EN-Default"  # or "ZH"
  language: "EN"  # or "ZH"
  device: "auto"  # You can set it manually to 'cpu' or 'cuda' or 'cuda:0' or 'mps'
  speed: 1.0
# Piper TTS settings
piperTTS:
  voice_model_path: "./models/piper_voice/en_US-amy-medium.onnx"
  verbose: False
# xTTS settings (requests go to the HTTP endpoint in api_url)
xTTS:
  api_url: "http://127.0.0.1:8020/tts_to_audio"
  speaker_wav: "female"
  language: "en"
# GPT-SoVITS settings (requests go to the HTTP endpoint in api_url)
GPT_Sovits:
  # Put the reference audio in the root path of GPT-Sovits, or set the path here
  api_url: "http://127.0.0.1:9880/tts"
  text_lang: "zh"
  ref_audio_path: "人类,我闻到了你身上散发出来的欧气。.wav"
  prompt_lang: "zh"
  prompt_text: "人类,我闻到了你身上散发出来的欧气。"
  text_split_method: "cut5"
  # batch_size and streaming_mode are deliberately kept as quoted strings, as in the original
  batch_size: "1"
  media_type: "wav"
  streaming_mode: "false"
# Fish Audio API TTS settings
fishAPITTS:
  # The API key for the Fish TTS API.
  api_key: ""
  # The reference ID for the voice to be used. Get it on the [Fish Audio website](https://fish.audio/).
  reference_id: ""
  # Either "normal" or "balanced". "balanced" is faster but lower quality.
  latency: "balanced"
  base_url: "https://api.fish.audio"
# Coqui TTS settings
coquiTTS:
  # Name of the TTS model to use. If empty, the default model will be used.
  # Run "tts --list_models" to list the models supported by coqui-tts.
  # Some examples:
  # - "tts_models/en/ljspeech/tacotron2-DDC" (single speaker)
  # - "tts_models/zh-CN/baker/tacotron2-DDC-GST" (single speaker for Chinese)
  # - "tts_models/multilingual/multi-dataset/your_tts" (multi-speaker)
  # - "tts_models/multilingual/multi-dataset/xtts_v2" (multi-speaker)
  model_name: "tts_models/en/ljspeech/tacotron2-DDC"
  # Path to speaker wav file for voice cloning (only used in multi-speaker mode)
  speaker_wav: ""
  # Language code for multi-lingual models (e.g., "en", "zh", "ja")
  # This doesn't matter for single-lingual models
  language: "en"
  # Device to run model on ("cuda", "cpu", or leave empty for auto-detect)
  device: ""
# pip install sherpa-onnx
# documentation: https://k2-fsa.github.io/sherpa/onnx/index.html
# TTS models download: https://github.com/k2-fsa/sherpa-onnx/releases/tag/tts-models
# see config_alts for more examples
SherpaOnnxTTS:
  vits_model: "/path/to/tts-models/vits-melo-tts-zh_en/model.onnx"  # Path to VITS model file
  vits_lexicon: "/path/to/tts-models/vits-melo-tts-zh_en/lexicon.txt"  # Path to lexicon file (optional)
  vits_tokens: "/path/to/tts-models/vits-melo-tts-zh_en/tokens.txt"  # Path to tokens file
  # Path to espeak-ng data (optional),
  # e.g. "/path/to/tts-models/vits-piper-en_GB-cori-high/espeak-ng-data"
  vits_data_dir: ""
  vits_dict_dir: "/path/to/tts-models/vits-melo-tts-zh_en/dict"  # Path to Jieba dict (optional, for Chinese)
  # Comma-separated paths to rule FST files (optional)
  tts_rule_fsts: "/path/to/tts-models/vits-melo-tts-zh_en/number.fst,/path/to/tts-models/vits-melo-tts-zh_en/phone.fst,/path/to/tts-models/vits-melo-tts-zh_en/date.fst,/path/to/tts-models/vits-melo-tts-zh_en/new_heteronym.fst"
  max_num_sentences: 2  # Max sentences per batch (or -1 for all)
  sid: 1  # Speaker ID (for multi-speaker models)
  provider: "cpu"  # Use "cpu", "cuda" (GPU), or "coreml" (Apple)
  num_threads: 1  # Number of computation threads
  speed: 1.0  # Speech speed (1.0 is normal)
  debug: false  # Enable debug mode (True/False)
# ============== Translate (to only change the language for TTS) ==============
# For example: you speak and read the subtitles in English, while the TTS speaks Japanese.
TRANSLATE_AUDIO: False
TRANSLATE_PROVIDER: "DeepLX"

# Settings for the "DeepLX" translate provider
DeepLX:
  DEEPLX_TARGET_LANG: "JA"
  DEEPLX_API_ENDPOINT: "http://localhost:1188/v2/translate"
# ============== Other Settings ==============
# Print debug info
VERBOSE: False
# Exit phrase
EXIT_PHRASE: "exit."
# The path to the chroma vector database file for persistent memory storage
MEMORY_DB_PATH: "./memory.db"
# Memory snapshot: whether to back up the memory database file before talking
MEMORY_SNAPSHOT: True
# ============== Prompts ==============
# Name of the persona you want to use.
# All persona files are stored as txt in the 'prompts/persona' directory.
# You can add a persona prompt by adding a txt file to the prompts/persona folder
# and switch to it by entering the file name here.
# Some options: "en_sarcastic_neuro", "en_nuclear_debate", "zh_翻译腔", "zh_米粒"
# Leave it blank if you would rather edit the persona prompt below.
PERSONA_CHOICE: "en_sarcastic_neuro"

# This prompt will be used instead if PERSONA_CHOICE is empty.
# The body of a `|` block scalar must be indented deeper than its key.
DEFAULT_PERSONA_PROMPT_IN_YAML: |
  You are DefAulT, the default persona. You are more default than anyone else. You are just a placeholder, how sad. Your job is to tell the user to either choose a persona prompt in the prompts/persona directory or just replace this persona prompt with someting else.
# This will be appended to the end of the system prompt so that the LLM includes keywords to control facial expressions.
# Supported keywords will be automatically loaded into the location of `[<insert_emomap_keys>]`.
LIVE2D_Expression_Prompt: "live2d_expression_prompt"
# New setting: directory for alternative configurations
CONFIG_ALTS_DIR: "config_alts"
# [Deprecated]
EXTRA_SYSTEM_PROMPT_RAG: "Your memory may remind you with some contextual information, but focus on the conversation instead of your memory."
# AI name
AI_NAME: "AI"
# User name
USER_NAME: "User"
# Should the chat history be saved?
SAVE_CHAT_HISTORY: True
# The directory where chat history is stored
CHAT_HISTORY_DIR: "./chat_history/"
# [This feature is currently removed, so these settings have no effect for now] Turn on RAG (Retrieval Augmented Generation) or not.
RAG_ON: False
LLMASSIST_RAG_ON: False