Skip to content

Commit

Permalink
Add genshin configs
Browse files Browse the repository at this point in the history
  • Loading branch information
w4123 committed Aug 20, 2022
1 parent 2e561ba commit 49ea0c7
Show file tree
Hide file tree
Showing 4 changed files with 115 additions and 7 deletions.
53 changes: 53 additions & 0 deletions configs/genshin.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
{
"train": {
"log_interval": 200,
"eval_interval": 1000,
"seed": 777,
"epochs": 2000,
"learning_rate": 2e-4,
"betas": [0.8, 0.99],
"eps": 1e-9,
"batch_size": 16,
"fp16_run": true,
"lr_decay": 0.999875,
"segment_size": 8192,
"init_lr_ratio": 1,
"warmup_epochs": 0,
"c_mel": 45,
"c_kl": 1.0
},
"data": {
"training_files":"filelists/genshin_cleaned_train.txt",
"validation_files":"filelists/genshin_cleaned_valid.txt",
"text_cleaners":["chinese_cleaners2"],
"max_wav_value": 32768.0,
"sampling_rate": 22050,
"filter_length": 1024,
"hop_length": 256,
"win_length": 1024,
"n_mel_channels": 80,
"mel_fmin": 0.0,
"mel_fmax": null,
"add_blank": true,
"n_speakers": 53,
"cleaned_text": true
},
"model": {
"inter_channels": 192,
"hidden_channels": 192,
"filter_channels": 768,
"n_heads": 2,
"n_layers": 6,
"kernel_size": 3,
"p_dropout": 0.1,
"resblock": "1",
"resblock_kernel_sizes": [3,7,11],
"resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
"upsample_rates": [8,8,2,2],
"upsample_initial_channel": 512,
"upsample_kernel_sizes": [16,16,4,4],
"n_layers_q": 3,
"use_spectral_norm": false,
"gin_channels": 256
}
}
16 changes: 11 additions & 5 deletions text/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,10 +19,7 @@ def text_to_sequence(text, cleaner_names):
sequence = []

clean_text = _clean_text(text, cleaner_names)
for symbol in clean_text:
symbol_id = _symbol_to_id[symbol]
sequence += [symbol_id]
return sequence
return cleaned_text_to_sequence(clean_text)


def cleaned_text_to_sequence(cleaned_text):
Expand All @@ -32,7 +29,16 @@ def cleaned_text_to_sequence(cleaned_text):
Returns:
List of integers corresponding to the symbols in the text
'''
sequence = [_symbol_to_id[symbol] for symbol in cleaned_text]
sequence = []
for symbol in cleaned_text.split(" "):
if symbol in _symbol_to_id:
sequence.append(_symbol_to_id[symbol])
else:
for s in symbol:
sequence.append(_symbol_to_id[s])
sequence.append(_symbol_to_id[" "])
if sequence[-1] == _symbol_to_id[" "]:
sequence = sequence[:-1]
return sequence


Expand Down
48 changes: 47 additions & 1 deletion text/cleaners.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,36 @@
import re
from unidecode import unidecode
from phonemizer import phonemize

from pypinyin import pinyin, lazy_pinyin, load_phrases_dict, Style, load_single_dict
from pypinyin.style._utils import get_finals, get_initials
from pypinyin_dict.phrase_pinyin_data import cc_cedict
from pypinyin_dict.pinyin_data import kmandarin_8105
import jieba
# Load the optional pypinyin_dict data sets at import time.
# NOTE(review): presumably these override/extend pypinyin's built-in
# single-character (kmandarin_8105) and phrase (cc_cedict) readings, and the
# load order may matter when both provide the same entry -- confirm against
# the pypinyin-dict documentation.
kmandarin_8105.load()
cc_cedict.load()
# Domain vocabulary for the Genshin data set: character names, place names,
# factions, titles and similar proper nouns. Further down this module every
# entry is passed to jieba.add_word().
# Several names are listed both in full and in parts (e.g. "神里绫华", "神里",
# "绫华").
PHRASE_LIST = [
"琴", "安柏", "丽莎", "凯亚", "芭芭拉", "迪卢克", "雷泽", "温迪", "可莉", "班尼特", "诺艾尔", "菲谢尔",
"砂糖", "莫娜", "迪奥娜", "阿贝多", "罗莎莉亚", "优菈", "魈", "北斗", "凝光", "香菱", "行秋", "重云",
"七七", "刻晴", "达达利亚", "钟离", "辛焱", "甘雨", "胡桃", "烟绯", "申鹤", "云堇", "夜兰", "神里绫华",
"神里", "绫华", "枫原万叶", "枫原", "万叶", "宵宫", "早柚", "雷电将军", "九条裟罗", "九条", "裟罗", "珊瑚宫心海",
"珊瑚宫", "心海", "托马", "荒泷", "一斗", "荒泷派", "五郎", "八重神子", "神子", "神里绫人", "绫人",
"久岐忍", "鹿野院平藏", "平藏", "蒙德", "璃月", "稻妻", "北风的王狼", "风魔龙", "特瓦林", "若陀龙王", "龙脊雪山",
"金苹果群岛", "渊下宫", "层岩巨渊", "奥赛尔", "七天神像", "钩钩果", "落落莓", "塞西莉亚花", "风车菊", "尘歌壶",
"提瓦特", "明冠山地", "风龙废墟", "明冠峡", "坠星山谷", "果酒湖", "望风山地", "坎瑞亚", "须弥", "枫丹", "纳塔",
"至冬", "丘丘人", "丘丘暴徒", "深渊法师", "深渊咏者", "盗宝团", "愚人众", "深渊教团", "骗骗花", "急冻树", "龙蜥",
"鸣神岛", "神无冢", "八酝岛", "海祇岛", "清籁岛", "鹤观", "绝云间", "群玉阁", "南十字", "死兆星", "木漏茶室", "神樱",
"鸣神大社", "天使的馈赠", "社奉行", "勘定奉行", "天领奉行", "夜叉", "风神", "岩神", "雷神", "风之神", "岩之神", "雷之神",
"风神瞳", "岩神瞳", "雷神瞳", "摩拉克斯", "契约之神", "雷电影", "雷电真", "八重宫司", "宫司大人", "巴巴托斯", "玉衡星",
"天权星", "璃月七星", "留云借风", "削月筑阳", "理水叠山", "请仙典仪"
]

# Add every custom phrase to jieba's vocabulary.
# NOTE(review): presumably this makes jieba.cut() keep each phrase as a single
# token -- confirm against the jieba documentation.
for phrase in PHRASE_LIST:
    jieba.add_word(phrase)

# Explicit per-character readings for names passed to pypinyin's phrase
# dictionary. "派蒙" is given a reading here although it is not part of
# PHRASE_LIST.
load_phrases_dict({
    "若陀": [["rě"], ["tuó"]],
    "平藏": [["píng"], ["zàng"]],
    "派蒙": [["pài"], ["méng"]],
    "安柏": [["ān"], ["bó"]],
    "一斗": [["yī"], ["dǒu"]],
})

# Regular expression matching whitespace:
_whitespace_re = re.compile(r'\s+')
Expand Down Expand Up @@ -64,6 +93,23 @@ def collapse_whitespace(text):
def convert_to_ascii(text):
return unidecode(text)

def chinese_cleaners(text):
    '''Pipeline for Chinese text: jieba segmentation followed by pinyin conversion.

    The text is split with jieba.cut(), the resulting segments are handed to
    lazy_pinyin() with Style.TONE3 and errors='ignore', and the returned
    items are joined with single spaces.

    NOTE(review): per pypinyin, TONE3 should append the tone digit to each
    syllable and errors='ignore' should drop characters that have no pinyin
    reading -- confirm against the pypinyin documentation.
    '''
    segments = jieba.cut(text)
    syllables = lazy_pinyin(segments, style=Style.TONE3, errors='ignore')
    return " ".join(syllables)

def chinese_cleaners2(text):
    '''Pipeline for Chinese text that emits pinyin initials and finals as separate tokens.

    Each entry returned by pinyin(text, style=Style.TONE3, v_to_u=True) is
    reduced to its first reading and turned into up to two tokens:

      * the initial, from get_initials(..., strict=True);
      * a second token that depends on the reading's last character:
          - digit: the digit is stripped, get_finals(..., strict=True) is
            applied to the rest, and the digit is appended back;
          - other alphanumeric character: get_finals(..., strict=True) of the
            whole reading;
          - anything else (e.g. punctuation): the reading itself, unchanged.

    Empty tokens and tokens made only of digits are discarded; the remaining
    tokens are joined with single spaces.
    '''
    tokens = []
    for entry in pinyin(text, style=Style.TONE3, v_to_u=True):
        reading = entry[0]
        # The initial is computed first, matching the original evaluation order.
        initial = get_initials(reading, strict=True)
        last_char = reading[-1]
        if last_char.isdigit():
            # Keep the tone digit attached to the final, not to the initial.
            second = get_finals(reading[:-1], strict=True) + last_char
        elif last_char.isalnum():
            second = get_finals(reading, strict=True)
        else:
            # Last character is not alphanumeric: pass the reading through.
            second = reading
        for piece in (initial, second):
            if len(piece) == 0 or piece.isdigit():
                continue
            tokens.append(piece)
    return " ".join(tokens)

def basic_cleaners(text):
'''Basic pipeline that lowercases and collapses whitespace without transliteration.'''
Expand Down
5 changes: 4 additions & 1 deletion text/symbols.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,17 @@
'''
Defines the set of symbols used in text input to the model.
'''
# ASCII digits.
_numbers = '0123456789'
# Padding symbol; it is the first element of `symbols`, so its id is 0.
_pad = '_'
# Western punctuation; the final character is a plain space (see SPACE_ID).
_punctuation = ';:,.!?¡¿—…"«»“” '
# ASCII letters, upper case first.
_letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz'
# IPA characters, one symbol per code point.
_letters_ipa = "ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘'̩'ᵻ"
# Multi-character pinyin tokens: the two-letter initials 'sh', 'zh', 'ch' and
# finals with or without a trailing tone digit (e.g. 'uo1', 'en'). Each list
# item is one symbol.
# NOTE(review): single-letter initials are presumably covered by _letters --
# confirm against the cleaner output.
_py = ['sh', 'uo1', 'i3', 'ai2', 'i4', 'en2', 'en4', 'zh', 'eng3', 'ing4', 'i1', 'ia4', 'uo3', 'en', 'u2', 'e3', 'i2', 'üan2', 'ong1', 'ü2', 'u4', 'iong4', 'ai4', 'uang1', 'ie3', 'uei1', 'an2', 'iang3', 'e4', 'üe4', 'an4', 'ian4', 'iou3', 'uei4', 'ei2', 'ua4', 'iou4', 'ch', 'u1', 'a1', 'iong1', 'ian3', 'ou1', 'ong4', 'ü4', 'ian1', 'iang4', 'uo4', 'ü3', 'eng2', 'e2', 'ou4', 'an', 'ao3', 'ua1', 'in3', 'ou2', 'ie4', 'eng1', 'ou3', 'an3', 'er2', 'ai1', 'ie2', 'ing3', 'iou2', 'o1', 'ong3', 'an1', 'in4', 'ang1', 'ing2', 'ao4', 'iao4', 'a4', 'ing1', 'a3', 'ong2', 'iao1', 'in1', 'en3', 'uan2', 'uai4', 'ian2', 'e1', 'uei2', 'ang4', 'uang4', 'eng4', 'uan3', 'ai', 'iang', 'üe2', 'iao3', 'ei3', 'iou1', 'üan4', 'uan4', 'ou', 'o2', 'ei4', 'ei', 'ia', 'u3', 'ia1', 'en1', 'uan1', 'in2', 'ing', 'ün2', 'ie1', 'uo2', 'iang1', 'ei1', 'ang2', 'iao2', 'üan3', 'a2', 'ao1', 'iou', 'uen1', 'iang2', 'ang3', 'ua3', 'uen2', 'ie', 'ai3', 'uo', 'iong2', 'uen4', 'uang3', 'o4', 'ang', 'uei3', 'üan1', 'uang', 'ua', 'ian', 'uang2', 'er3', 'eng', 'ü1', 'ao2', 'ün1', 'uan', 'üe1', 'uen3', 'ia3', 'er4', 'uai2', 'er', 'ua2', 'uai3', 'ao', 'uen', 'ün4', 'in', 'iong3', 'ong', 'ün3', 'ün', 'ia2', 'uai1', 'üe3', 'iao', 'o3', 'uai', 'ueng1', 'uei', 'ü', 'iong']

# Chinese / full-width punctuation.
# NOTE(review): '—', '…', '“' and '”' also occur in _punctuation, so they
# appear twice in `symbols`.
_zhpunc = '!,、。?—…“”《》:+()「」~;·・'

# Export all symbols:
symbols = [_pad] + list(_punctuation) + list(_letters) + list(_letters_ipa)
# This second assignment supersedes the one above. The digit, Chinese
# punctuation and pinyin groups are appended after the earlier groups, so the
# list positions of the earlier symbols are unchanged.
symbols = [_pad] + list(_punctuation) + list(_letters) + list(_letters_ipa) + list(_numbers) + list(_zhpunc) + _py

# Special symbol ids
# Position of the space character in `symbols` (list.index returns the first match).
SPACE_ID = symbols.index(" ")

0 comments on commit 49ea0c7

Please sign in to comment.