Skip to content

Commit

Permalink
Improve: NLP performance
Browse files Browse the repository at this point in the history
  • Loading branch information
tsukumijima committed May 11, 2024
1 parent 2d6baa3 commit d9b4a92
Show file tree
Hide file tree
Showing 3 changed files with 130 additions and 127 deletions.
73 changes: 37 additions & 36 deletions style_bert_vits2/nlp/chinese/normalizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,41 @@
from style_bert_vits2.nlp.symbols import PUNCTUATIONS


__REPLACE_MAP = {
":": ",",
";": ",",
",": ",",
"。": ".",
"!": "!",
"?": "?",
"\n": ".",
"·": ",",
"、": ",",
"...": "…",
"$": ".",
"“": "'",
"”": "'",
'"': "'",
"‘": "'",
"’": "'",
"(": "'",
")": "'",
"(": "'",
")": "'",
"《": "'",
"》": "'",
"【": "'",
"】": "'",
"[": "'",
"]": "'",
"—": "-",
"~": "-",
"~": "-",
"「": "'",
"」": "'",
}


def normalize_text(text: str) -> str:
numbers = re.findall(r"\d+(?:\.?\d+)?", text)
for number in numbers:
Expand All @@ -15,44 +50,10 @@ def normalize_text(text: str) -> str:

def replace_punctuation(text: str) -> str:

REPLACE_MAP = {
":": ",",
";": ",",
",": ",",
"。": ".",
"!": "!",
"?": "?",
"\n": ".",
"·": ",",
"、": ",",
"...": "…",
"$": ".",
"“": "'",
"”": "'",
'"': "'",
"‘": "'",
"’": "'",
"(": "'",
")": "'",
"(": "'",
")": "'",
"《": "'",
"》": "'",
"【": "'",
"】": "'",
"[": "'",
"]": "'",
"—": "-",
"~": "-",
"~": "-",
"「": "'",
"」": "'",
}

text = text.replace("嗯", "恩").replace("呣", "母")
pattern = re.compile("|".join(re.escape(p) for p in REPLACE_MAP.keys()))
pattern = re.compile("|".join(re.escape(p) for p in __REPLACE_MAP.keys()))

replaced_text = pattern.sub(lambda x: REPLACE_MAP[x.group()], text)
replaced_text = pattern.sub(lambda x: __REPLACE_MAP[x.group()], text)

replaced_text = re.sub(
r"[^\u4e00-\u9fa5" + "".join(PUNCTUATIONS) + r"]+", "", replaced_text
Expand Down
21 changes: 11 additions & 10 deletions style_bert_vits2/nlp/japanese/g2p_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,16 @@
from style_bert_vits2.nlp.symbols import PUNCTUATIONS


# Set of consonant phonemes: every non-None consonant appearing in the
# mora -> (consonant, vowel) phoneme table.
# (Set comprehension avoids building an intermediate list, unlike set([...]).)
__CONSONANTS = {
    consonant
    for consonant, _ in MORA_KATA_TO_MORA_PHONEMES.values()
    if consonant is not None
}


def g2kata_tone(norm_text: str) -> list[tuple[str, int]]:
"""
テキストからカタカナとアクセントのペアのリストを返す。
Expand Down Expand Up @@ -33,15 +43,6 @@ def phone_tone2kata_tone(phone_tone: list[tuple[str, int]]) -> list[tuple[str, i
カタカナと音高のリスト。
"""

# 子音の集合
CONSONANTS = set(
[
consonant
for consonant, _ in MORA_KATA_TO_MORA_PHONEMES.values()
if consonant is not None
]
)

phone_tone = phone_tone[1:] # 最初の("_", 0)を無視
phones = [phone for phone, _ in phone_tone]
tones = [tone for _, tone in phone_tone]
Expand All @@ -52,7 +53,7 @@ def phone_tone2kata_tone(phone_tone: list[tuple[str, int]]) -> list[tuple[str, i
if phone in PUNCTUATIONS:
result.append((phone, tone))
continue
if phone in CONSONANTS: # n以外の子音の場合
if phone in __CONSONANTS: # n以外の子音の場合
assert current_mora == "", f"Unexpected {phone} after {current_mora}"
assert tone == next_tone, f"Unexpected {phone} tone {tone} != {next_tone}"
current_mora = phone
Expand Down
163 changes: 82 additions & 81 deletions style_bert_vits2/nlp/japanese/normalizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,81 @@
from style_bert_vits2.nlp.symbols import PUNCTUATIONS


# Normalization map for punctuation and symbol variants.
# NOTE: insertion order matters — the keys are joined into a regex
# alternation below, and Python's regex picks the leftmost alternative,
# so longer keys (e.g. "···") must precede their prefixes (e.g. "·").
__REPLACE_MAP = {
    ":": ",",
    ";": ",",
    ",": ",",
    "。": ".",
    "!": "!",
    "?": "?",
    "\n": ".",
    ".": ".",
    "…": "...",
    "···": "...",
    "・・・": "...",
    "·": ",",
    "・": ",",
    "、": ",",
    "$": ".",
    "“": "'",
    "”": "'",
    '"': "'",
    "‘": "'",
    "’": "'",
    "(": "'",
    ")": "'",
    "(": "'",
    ")": "'",
    "《": "'",
    "》": "'",
    "【": "'",
    "】": "'",
    "[": "'",
    "]": "'",
    # Convert every hyphen/dash variant (as they appear after NFKC
    # normalization) to the plain ASCII hyphen-minus - \u002d
    "\u02d7": "\u002d",  # ˗, Modifier Letter Minus Sign
    "\u2010": "\u002d",  # ‐, Hyphen,
    # "\u2011": "\u002d",  # ‑, Non-Breaking Hyphen — NFKC folds it into \u2010
    "\u2012": "\u002d",  # ‒, Figure Dash
    "\u2013": "\u002d",  # –, En Dash
    "\u2014": "\u002d",  # —, Em Dash
    "\u2015": "\u002d",  # ―, Horizontal Bar
    "\u2043": "\u002d",  # ⁃, Hyphen Bullet
    "\u2212": "\u002d",  # −, Minus Sign
    "\u23af": "\u002d",  # ⎯, Horizontal Line Extension
    "\u23e4": "\u002d",  # ⏤, Straightness
    "\u2500": "\u002d",  # ─, Box Drawings Light Horizontal
    "\u2501": "\u002d",  # ━, Box Drawings Heavy Horizontal
    "\u2e3a": "\u002d",  # ⸺, Two-Em Dash
    "\u2e3b": "\u002d",  # ⸻, Three-Em Dash
    # "~": "-",  # handled as the long-vowel mark "ー" instead
    # "~": "-",  # also handled as the long-vowel mark "ー" instead
    "「": "'",
    "」": "'",
}
# Compiled alternation over the map keys (a dict iterates its keys directly;
# no need for .keys()).
__REPLACE_PATTERN = re.compile("|".join(map(re.escape, __REPLACE_MAP)))
# Cleanup pattern: matches every run of characters OUTSIDE the allowed
# character classes below, so matches can be deleted.
__PUNCTUATION_CLEANUP_PATTERN = re.compile(
    # Hiragana, katakana, and kanji (CJK ideographs incl. Extension A and 々)
    r"[^\u3040-\u309F\u30A0-\u30FF\u4E00-\u9FFF\u3400-\u4DBF\u3005"
    # Half-width Latin letters (upper and lower case)
    + r"\u0041-\u005A\u0061-\u007A"
    # Full-width Latin letters (upper and lower case)
    + r"\uFF21-\uFF3A\uFF41-\uFF5A"
    # Greek letters
    + r"\u0370-\u03FF\u1F00-\u1FFF"
    # "!", "?", "…", ",", ".", "'", "-" — note "…" has already been replaced by "..."
    + "".join(PUNCTUATIONS) + r"]+",
)
# Normalization patterns for numbers and currency symbols.
# Currency symbol -> Japanese reading, appended after the spelled-out amount
__CURRENCY_MAP = {"$": "ドル", "¥": "円", "£": "ポンド", "€": "ユーロ"}
# A currency symbol immediately followed by its amount (e.g. "$100", "¥1.5")
__CURRENCY_PATTERN = re.compile(r"([$¥£€])([0-9.]*[0-9])")
# A plain integer or decimal number
__NUMBER_PATTERN = re.compile(r"[0-9]+(\.[0-9]+)?")
# A number written with thousands separators (e.g. "1,000,000")
__NUMBER_WITH_SEPARATOR_PATTERN = re.compile("[0-9]{1,3}(,[0-9]{3})+")


def normalize_text(text: str) -> str:
"""
日本語のテキストを正規化する。
Expand Down Expand Up @@ -62,80 +137,11 @@ def replace_punctuation(text: str) -> str:
str: 正規化されたテキスト
"""

# 記号類の正規化変換マップ
REPLACE_MAP = {
":": ",",
";": ",",
",": ",",
"。": ".",
"!": "!",
"?": "?",
"\n": ".",
".": ".",
"…": "...",
"···": "...",
"・・・": "...",
"·": ",",
"・": ",",
"、": ",",
"$": ".",
"“": "'",
"”": "'",
'"': "'",
"‘": "'",
"’": "'",
"(": "'",
")": "'",
"(": "'",
")": "'",
"《": "'",
"》": "'",
"【": "'",
"】": "'",
"[": "'",
"]": "'",
# NFKC 正規化後のハイフン・ダッシュの変種を全て通常半角ハイフン - \u002d に変換
"\u02d7": "\u002d", # ˗, Modifier Letter Minus Sign
"\u2010": "\u002d", # ‐, Hyphen,
# "\u2011": "\u002d", # ‑, Non-Breaking Hyphen, NFKC により \u2010 に変換される
"\u2012": "\u002d", # ‒, Figure Dash
"\u2013": "\u002d", # –, En Dash
"\u2014": "\u002d", # —, Em Dash
"\u2015": "\u002d", # ―, Horizontal Bar
"\u2043": "\u002d", # ⁃, Hyphen Bullet
"\u2212": "\u002d", # −, Minus Sign
"\u23af": "\u002d", # ⎯, Horizontal Line Extension
"\u23e4": "\u002d", # ⏤, Straightness
"\u2500": "\u002d", # ─, Box Drawings Light Horizontal
"\u2501": "\u002d", # ━, Box Drawings Heavy Horizontal
"\u2e3a": "\u002d", # ⸺, Two-Em Dash
"\u2e3b": "\u002d", # ⸻, Three-Em Dash
# "~": "-", # これは長音記号「ー」として扱うよう変更
# "~": "-", # これも長音記号「ー」として扱うよう変更
"「": "'",
"」": "'",
}

pattern = re.compile("|".join(re.escape(p) for p in REPLACE_MAP.keys()))

# 句読点を辞書で置換
replaced_text = pattern.sub(lambda x: REPLACE_MAP[x.group()], text)

replaced_text = re.sub(
# ↓ ひらがな、カタカナ、漢字
r"[^\u3040-\u309F\u30A0-\u30FF\u4E00-\u9FFF\u3400-\u4DBF\u3005"
# ↓ 半角アルファベット(大文字と小文字)
+ r"\u0041-\u005A\u0061-\u007A"
# ↓ 全角アルファベット(大文字と小文字)
+ r"\uFF21-\uFF3A\uFF41-\uFF5A"
# ↓ ギリシャ文字
+ r"\u0370-\u03FF\u1F00-\u1FFF"
# ↓ "!", "?", "…", ",", ".", "'", "-", 但し`…`はすでに`...`に変換されている
+ "".join(PUNCTUATIONS) + r"]+",
# 上述以外の文字を削除
"",
replaced_text,
)
replaced_text = __REPLACE_PATTERN.sub(lambda x: __REPLACE_MAP[x.group()], text)

# 上述以外の文字を削除
replaced_text = __PUNCTUATION_CLEANUP_PATTERN.sub("", replaced_text)

return replaced_text

Expand All @@ -151,13 +157,8 @@ def __convert_numbers_to_words(text: str) -> str:
str: 変換されたテキスト
"""

NUMBER_WITH_SEPARATOR_PATTERN = re.compile("[0-9]{1,3}(,[0-9]{3})+")
CURRENCY_MAP = {"$": "ドル", "¥": "円", "£": "ポンド", "€": "ユーロ"}
CURRENCY_PATTERN = re.compile(r"([$¥£€])([0-9.]*[0-9])")
NUMBER_PATTERN = re.compile(r"[0-9]+(\.[0-9]+)?")

res = NUMBER_WITH_SEPARATOR_PATTERN.sub(lambda m: m[0].replace(",", ""), text)
res = CURRENCY_PATTERN.sub(lambda m: m[2] + CURRENCY_MAP.get(m[1], m[1]), res)
res = NUMBER_PATTERN.sub(lambda m: num2words(m[0], lang="ja"), res)
res = __NUMBER_WITH_SEPARATOR_PATTERN.sub(lambda m: m[0].replace(",", ""), text)
res = __CURRENCY_PATTERN.sub(lambda m: m[2] + __CURRENCY_MAP.get(m[1], m[1]), res)
res = __NUMBER_PATTERN.sub(lambda m: num2words(m[0], lang="ja"), res)

return res

0 comments on commit d9b4a92

Please sign in to comment.