Skip to content

Commit

Permalink
Improve: NLP performance
Browse files Browse the repository at this point in the history
  • Loading branch information
tsukumijima committed May 11, 2024
1 parent 2d6baa3 commit d9b4a92
Show file tree
Hide file tree
Showing 3 changed files with 130 additions and 127 deletions.
73 changes: 37 additions & 36 deletions style_bert_vits2/nlp/chinese/normalizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,41 @@
from style_bert_vits2.nlp.symbols import PUNCTUATIONS


__REPLACE_MAP = {
":": ",",
";": ",",
",": ",",
"。": ".",
"!": "!",
"?": "?",
"\n": ".",
"·": ",",
"、": ",",
"...": "…",
"$": ".",
"“": "'",
"”": "'",
'"': "'",
"‘": "'",
"’": "'",
"(": "'",
")": "'",
"(": "'",
")": "'",
"《": "'",
"》": "'",
"【": "'",
"】": "'",
"[": "'",
"]": "'",
"—": "-",
"~": "-",
"~": "-",
"「": "'",
"」": "'",
}


def normalize_text(text: str) -> str:
numbers = re.findall(r"\d+(?:\.?\d+)?", text)
for number in numbers:
Expand All @@ -15,44 +50,10 @@ def normalize_text(text: str) -> str:

def replace_punctuation(text: str) -> str:

REPLACE_MAP = {
":": ",",
";": ",",
",": ",",
"。": ".",
"!": "!",
"?": "?",
"\n": ".",
"·": ",",
"、": ",",
"...": "…",
"$": ".",
"“": "'",
"”": "'",
'"': "'",
"‘": "'",
"’": "'",
"(": "'",
")": "'",
"(": "'",
")": "'",
"《": "'",
"》": "'",
"【": "'",
"】": "'",
"[": "'",
"]": "'",
"—": "-",
"~": "-",
"~": "-",
"「": "'",
"」": "'",
}

text = text.replace("嗯", "恩").replace("呣", "母")
pattern = re.compile("|".join(re.escape(p) for p in REPLACE_MAP.keys()))
pattern = re.compile("|".join(re.escape(p) for p in __REPLACE_MAP.keys()))

replaced_text = pattern.sub(lambda x: REPLACE_MAP[x.group()], text)
replaced_text = pattern.sub(lambda x: __REPLACE_MAP[x.group()], text)

replaced_text = re.sub(
r"[^\u4e00-\u9fa5" + "".join(PUNCTUATIONS) + r"]+", "", replaced_text
Expand Down
21 changes: 11 additions & 10 deletions style_bert_vits2/nlp/japanese/g2p_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,16 @@
from style_bert_vits2.nlp.symbols import PUNCTUATIONS


# Set of consonant phonemes: every non-None consonant appearing in the
# mora -> (consonant, vowel) phoneme table.
# (Set comprehension avoids building an intermediate list, unlike set([...]).)
__CONSONANTS = {
    consonant
    for consonant, _ in MORA_KATA_TO_MORA_PHONEMES.values()
    if consonant is not None
}


def g2kata_tone(norm_text: str) -> list[tuple[str, int]]:
"""
テキストからカタカナとアクセントのペアのリストを返す。
Expand Down Expand Up @@ -33,15 +43,6 @@ def phone_tone2kata_tone(phone_tone: list[tuple[str, int]]) -> list[tuple[str, i
カタカナと音高のリスト。
"""

# 子音の集合
CONSONANTS = set(
[
consonant
for consonant, _ in MORA_KATA_TO_MORA_PHONEMES.values()
if consonant is not None
]
)

phone_tone = phone_tone[1:] # 最初の("_", 0)を無視
phones = [phone for phone, _ in phone_tone]
tones = [tone for _, tone in phone_tone]
Expand All @@ -52,7 +53,7 @@ def phone_tone2kata_tone(phone_tone: list[tuple[str, int]]) -> list[tuple[str, i
if phone in PUNCTUATIONS:
result.append((phone, tone))
continue
if phone in CONSONANTS: # n以外の子音の場合
if phone in __CONSONANTS: # n以外の子音の場合
assert current_mora == "", f"Unexpected {phone} after {current_mora}"
assert tone == next_tone, f"Unexpected {phone} tone {tone} != {next_tone}"
current_mora = phone
Expand Down
163 changes: 82 additions & 81 deletions style_bert_vits2/nlp/japanese/normalizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,81 @@
from style_bert_vits2.nlp.symbols import PUNCTUATIONS


# Normalization map for punctuation and symbol variants.
# NOTE: insertion order matters — the keys are joined into a regex
# alternation below, and Python's regex picks the leftmost alternative,
# so longer keys (e.g. "···") must precede their prefixes (e.g. "·").
__REPLACE_MAP = {
    ":": ",",
    ";": ",",
    ",": ",",
    "。": ".",
    "!": "!",
    "?": "?",
    "\n": ".",
    ".": ".",
    "…": "...",
    "···": "...",
    "・・・": "...",
    "·": ",",
    "・": ",",
    "、": ",",
    "$": ".",
    "“": "'",
    "”": "'",
    '"': "'",
    "‘": "'",
    "’": "'",
    "(": "'",
    ")": "'",
    "(": "'",
    ")": "'",
    "《": "'",
    "》": "'",
    "【": "'",
    "】": "'",
    "[": "'",
    "]": "'",
    # Convert every hyphen/dash variant (as they appear after NFKC
    # normalization) to the plain ASCII hyphen-minus - \u002d
    "\u02d7": "\u002d",  # ˗, Modifier Letter Minus Sign
    "\u2010": "\u002d",  # ‐, Hyphen,
    # "\u2011": "\u002d",  # ‑, Non-Breaking Hyphen — NFKC folds it into \u2010
    "\u2012": "\u002d",  # ‒, Figure Dash
    "\u2013": "\u002d",  # –, En Dash
    "\u2014": "\u002d",  # —, Em Dash
    "\u2015": "\u002d",  # ―, Horizontal Bar
    "\u2043": "\u002d",  # ⁃, Hyphen Bullet
    "\u2212": "\u002d",  # −, Minus Sign
    "\u23af": "\u002d",  # ⎯, Horizontal Line Extension
    "\u23e4": "\u002d",  # ⏤, Straightness
    "\u2500": "\u002d",  # ─, Box Drawings Light Horizontal
    "\u2501": "\u002d",  # ━, Box Drawings Heavy Horizontal
    "\u2e3a": "\u002d",  # ⸺, Two-Em Dash
    "\u2e3b": "\u002d",  # ⸻, Three-Em Dash
    # "~": "-",  # handled as the long-vowel mark "ー" instead
    # "~": "-",  # also handled as the long-vowel mark "ー" instead
    "「": "'",
    "」": "'",
}
# Compiled alternation over the map keys (a dict iterates its keys directly;
# no need for .keys()).
__REPLACE_PATTERN = re.compile("|".join(map(re.escape, __REPLACE_MAP)))
# Cleanup pattern: matches every run of characters OUTSIDE the allowed
# character classes below, so matches can be deleted.
__PUNCTUATION_CLEANUP_PATTERN = re.compile(
    # Hiragana, katakana, and kanji (CJK ideographs incl. Extension A and 々)
    r"[^\u3040-\u309F\u30A0-\u30FF\u4E00-\u9FFF\u3400-\u4DBF\u3005"
    # Half-width Latin letters (upper and lower case)
    + r"\u0041-\u005A\u0061-\u007A"
    # Full-width Latin letters (upper and lower case)
    + r"\uFF21-\uFF3A\uFF41-\uFF5A"
    # Greek letters
    + r"\u0370-\u03FF\u1F00-\u1FFF"
    # "!", "?", "…", ",", ".", "'", "-" — note "…" has already been replaced by "..."
    + "".join(PUNCTUATIONS) + r"]+",
)
# Normalization patterns for numbers and currency symbols.
# Currency symbol -> Japanese reading, appended after the spelled-out amount
__CURRENCY_MAP = {"$": "ドル", "¥": "円", "£": "ポンド", "€": "ユーロ"}
# A currency symbol immediately followed by its amount (e.g. "$100", "¥1.5")
__CURRENCY_PATTERN = re.compile(r"([$¥£€])([0-9.]*[0-9])")
# A plain integer or decimal number
__NUMBER_PATTERN = re.compile(r"[0-9]+(\.[0-9]+)?")
# A number written with thousands separators (e.g. "1,000,000")
__NUMBER_WITH_SEPARATOR_PATTERN = re.compile("[0-9]{1,3}(,[0-9]{3})+")


def normalize_text(text: str) -> str:
"""
日本語のテキストを正規化する。
Expand Down Expand Up @@ -62,80 +137,11 @@ def replace_punctuation(text: str) -> str:
str: 正規化されたテキスト
"""

# 記号類の正規化変換マップ
REPLACE_MAP = {
":": ",",
";": ",",
",": ",",
"。": ".",
"!": "!",
"?": "?",
"\n": ".",
".": ".",
"…": "...",
"···": "...",
"・・・": "...",
"·": ",",
"・": ",",
"、": ",",
"$": ".",
"“": "'",
"”": "'",
'"': "'",
"‘": "'",
"’": "'",
"(": "'",
")": "'",
"(": "'",
")": "'",
"《": "'",
"》": "'",
"【": "'",
"】": "'",
"[": "'",
"]": "'",
# NFKC 正規化後のハイフン・ダッシュの変種を全て通常半角ハイフン - \u002d に変換
"\u02d7": "\u002d", # ˗, Modifier Letter Minus Sign
"\u2010": "\u002d", # ‐, Hyphen,
# "\u2011": "\u002d", # ‑, Non-Breaking Hyphen, NFKC により \u2010 に変換される
"\u2012": "\u002d", # ‒, Figure Dash
"\u2013": "\u002d", # –, En Dash
"\u2014": "\u002d", # —, Em Dash
"\u2015": "\u002d", # ―, Horizontal Bar
"\u2043": "\u002d", # ⁃, Hyphen Bullet
"\u2212": "\u002d", # −, Minus Sign
"\u23af": "\u002d", # ⎯, Horizontal Line Extension
"\u23e4": "\u002d", # ⏤, Straightness
"\u2500": "\u002d", # ─, Box Drawings Light Horizontal
"\u2501": "\u002d", # ━, Box Drawings Heavy Horizontal
"\u2e3a": "\u002d", # ⸺, Two-Em Dash
"\u2e3b": "\u002d", # ⸻, Three-Em Dash
# "~": "-", # これは長音記号「ー」として扱うよう変更
# "~": "-", # これも長音記号「ー」として扱うよう変更
"「": "'",
"」": "'",
}

pattern = re.compile("|".join(re.escape(p) for p in REPLACE_MAP.keys()))

# 句読点を辞書で置換
replaced_text = pattern.sub(lambda x: REPLACE_MAP[x.group()], text)

replaced_text = re.sub(
# ↓ ひらがな、カタカナ、漢字
r"[^\u3040-\u309F\u30A0-\u30FF\u4E00-\u9FFF\u3400-\u4DBF\u3005"
# ↓ 半角アルファベット(大文字と小文字)
+ r"\u0041-\u005A\u0061-\u007A"
# ↓ 全角アルファベット(大文字と小文字)
+ r"\uFF21-\uFF3A\uFF41-\uFF5A"
# ↓ ギリシャ文字
+ r"\u0370-\u03FF\u1F00-\u1FFF"
# ↓ "!", "?", "…", ",", ".", "'", "-", 但し`…`はすでに`...`に変換されている
+ "".join(PUNCTUATIONS) + r"]+",
# 上述以外の文字を削除
"",
replaced_text,
)
replaced_text = __REPLACE_PATTERN.sub(lambda x: __REPLACE_MAP[x.group()], text)

# 上述以外の文字を削除
replaced_text = __PUNCTUATION_CLEANUP_PATTERN.sub("", replaced_text)

return replaced_text

Expand All @@ -151,13 +157,8 @@ def __convert_numbers_to_words(text: str) -> str:
str: 変換されたテキスト
"""

NUMBER_WITH_SEPARATOR_PATTERN = re.compile("[0-9]{1,3}(,[0-9]{3})+")
CURRENCY_MAP = {"$": "ドル", "¥": "円", "£": "ポンド", "€": "ユーロ"}
CURRENCY_PATTERN = re.compile(r"([$¥£€])([0-9.]*[0-9])")
NUMBER_PATTERN = re.compile(r"[0-9]+(\.[0-9]+)?")

res = NUMBER_WITH_SEPARATOR_PATTERN.sub(lambda m: m[0].replace(",", ""), text)
res = CURRENCY_PATTERN.sub(lambda m: m[2] + CURRENCY_MAP.get(m[1], m[1]), res)
res = NUMBER_PATTERN.sub(lambda m: num2words(m[0], lang="ja"), res)
res = __NUMBER_WITH_SEPARATOR_PATTERN.sub(lambda m: m[0].replace(",", ""), text)
res = __CURRENCY_PATTERN.sub(lambda m: m[2] + __CURRENCY_MAP.get(m[1], m[1]), res)
res = __NUMBER_PATTERN.sub(lambda m: num2words(m[0], lang="ja"), res)

return res

0 comments on commit d9b4a92

Please sign in to comment.