From 164b5c4a85744b259598bcac4b89b20dbf966e8f Mon Sep 17 00:00:00 2001
From: litagin02
Date: Mon, 11 Mar 2024 10:33:00 +0900
Subject: [PATCH] Clean unused tools module (previously used in webui.py)

---
 tools/__init__.py          |   3 -
 tools/classify_language.py | 197 -------------------------------------
 tools/sentence.py          | 173 --------------------------------
 tools/translate.py         |  62 ------------
 4 files changed, 435 deletions(-)
 delete mode 100644 tools/__init__.py
 delete mode 100644 tools/classify_language.py
 delete mode 100644 tools/sentence.py
 delete mode 100644 tools/translate.py

diff --git a/tools/__init__.py b/tools/__init__.py
deleted file mode 100644
index b68d33295..000000000
--- a/tools/__init__.py
+++ /dev/null
@@ -1,3 +0,0 @@
-"""
-工具包
-"""
diff --git a/tools/classify_language.py b/tools/classify_language.py
deleted file mode 100644
index 2b8a7ab42..000000000
--- a/tools/classify_language.py
+++ /dev/null
@@ -1,197 +0,0 @@
-import regex as re
-
-try:
-    from config import config
-
-    LANGUAGE_IDENTIFICATION_LIBRARY = (
-        config.webui_config.language_identification_library
-    )
-except:
-    LANGUAGE_IDENTIFICATION_LIBRARY = "langid"
-
-module = LANGUAGE_IDENTIFICATION_LIBRARY.lower()
-
-langid_languages = [
-    "af",
-    "am",
-    "an",
-    "ar",
-    "as",
-    "az",
-    "be",
-    "bg",
-    "bn",
-    "br",
-    "bs",
-    "ca",
-    "cs",
-    "cy",
-    "da",
-    "de",
-    "dz",
-    "el",
-    "en",
-    "eo",
-    "es",
-    "et",
-    "eu",
-    "fa",
-    "fi",
-    "fo",
-    "fr",
-    "ga",
-    "gl",
-    "gu",
-    "he",
-    "hi",
-    "hr",
-    "ht",
-    "hu",
-    "hy",
-    "id",
-    "is",
-    "it",
-    "ja",
-    "jv",
-    "ka",
-    "kk",
-    "km",
-    "kn",
-    "ko",
-    "ku",
-    "ky",
-    "la",
-    "lb",
-    "lo",
-    "lt",
-    "lv",
-    "mg",
-    "mk",
-    "ml",
-    "mn",
-    "mr",
-    "ms",
-    "mt",
-    "nb",
-    "ne",
-    "nl",
-    "nn",
-    "no",
-    "oc",
-    "or",
-    "pa",
-    "pl",
-    "ps",
-    "pt",
-    "qu",
-    "ro",
-    "ru",
-    "rw",
-    "se",
-    "si",
-    "sk",
-    "sl",
-    "sq",
-    "sr",
-    "sv",
-    "sw",
-    "ta",
-    "te",
-    "th",
-    "tl",
-    "tr",
-    "ug",
-    "uk",
-    "ur",
-    "vi",
-    "vo",
-    "wa",
-    "xh",
-    "zh",
-    "zu",
-]
-
-
-def classify_language(text: str, target_languages: list = None) -> str:
-    if module == "fastlid" or module == "fasttext":
-        from fastlid import fastlid, supported_langs
-
-        classifier = fastlid
-        if target_languages != None:
-            target_languages = [
-                lang for lang in target_languages if lang in supported_langs
-            ]
-            fastlid.set_languages = target_languages
-    elif module == "langid":
-        import langid
-
-        classifier = langid.classify
-        if target_languages != None:
-            target_languages = [
-                lang for lang in target_languages if lang in langid_languages
-            ]
-            langid.set_languages(target_languages)
-    else:
-        raise ValueError(f"Wrong module {module}")
-
-    lang = classifier(text)[0]
-
-    return lang
-
-
-def classify_zh_ja(text: str) -> str:
-    for idx, char in enumerate(text):
-        unicode_val = ord(char)
-
-        # 检测日语字符
-        if 0x3040 <= unicode_val <= 0x309F or 0x30A0 <= unicode_val <= 0x30FF:
-            return "ja"
-
-        # 检测汉字字符
-        if 0x4E00 <= unicode_val <= 0x9FFF:
-            # 检查周围的字符
-            next_char = text[idx + 1] if idx + 1 < len(text) else None
-
-            if next_char and (
-                0x3040 <= ord(next_char) <= 0x309F or 0x30A0 <= ord(next_char) <= 0x30FF
-            ):
-                return "ja"
-
-    return "zh"
-
-
-def split_alpha_nonalpha(text, mode=1):
-    if mode == 1:
-        pattern = r"(?<=[\u4e00-\u9fff\u3040-\u30FF\d\s])(?=[\p{Latin}])|(?<=[\p{Latin}\s])(?=[\u4e00-\u9fff\u3040-\u30FF\d])"
-    elif mode == 2:
-        pattern = r"(?<=[\u4e00-\u9fff\u3040-\u30FF\s])(?=[\p{Latin}\d])|(?<=[\p{Latin}\d\s])(?=[\u4e00-\u9fff\u3040-\u30FF])"
-    else:
-        raise ValueError("Invalid mode. Supported modes are 1 and 2.")
-
-    return re.split(pattern, text)
-
-
-if __name__ == "__main__":
-    text = "这是一个测试文本"
-    print(classify_language(text))
-    print(classify_zh_ja(text))  # "zh"
-
-    text = "これはテストテキストです"
-    print(classify_language(text))
-    print(classify_zh_ja(text))  # "ja"
-
-    text = "vits和Bert-VITS2是tts模型。花费3days.花费3天。Take 3 days"
-
-    print(split_alpha_nonalpha(text, mode=1))
-    # output: ['vits', '和', 'Bert-VITS', '2是', 'tts', '模型。花费3', 'days.花费3天。Take 3 days']
-
-    print(split_alpha_nonalpha(text, mode=2))
-    # output: ['vits', '和', 'Bert-VITS2', '是', 'tts', '模型。花费', '3days.花费', '3', '天。Take 3 days']
-
-    text = "vits 和 Bert-VITS2 是 tts 模型。花费3days.花费3天。Take 3 days"
-    print(split_alpha_nonalpha(text, mode=1))
-    # output: ['vits ', '和 ', 'Bert-VITS', '2 ', '是 ', 'tts ', '模型。花费3', 'days.花费3天。Take ', '3 ', 'days']
-
-    text = "vits 和 Bert-VITS2 是 tts 模型。花费3days.花费3天。Take 3 days"
-    print(split_alpha_nonalpha(text, mode=2))
-    # output: ['vits ', '和 ', 'Bert-VITS2 ', '是 ', 'tts ', '模型。花费', '3days.花费', '3', '天。Take ', '3 ', 'days']
diff --git a/tools/sentence.py b/tools/sentence.py
deleted file mode 100644
index b66864ca0..000000000
--- a/tools/sentence.py
+++ /dev/null
@@ -1,173 +0,0 @@
-import logging
-
-import regex as re
-
-from tools.classify_language import classify_language, split_alpha_nonalpha
-
-
-def check_is_none(item) -> bool:
-    """none -> True, not none -> False"""
-    return (
-        item is None
-        or (isinstance(item, str) and str(item).isspace())
-        or str(item) == ""
-    )
-
-
-def markup_language(text: str, target_languages: list = None) -> str:
-    pattern = (
-        r"[\!\"\#\$\%\&\'\(\)\*\+\,\-\.\/\:\;\<\>\=\?\@\[\]\{\}\\\\\^\_\`"
-        r"\!?。"#$%&'()*+,-/:;<=>@[\]^_`{|}~⦅⦆「」、、〃》「」"
-        r"『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘\'\‛\“\”\„\‟…‧﹏.]+"
-    )
-    sentences = re.split(pattern, text)
-
-    pre_lang = ""
-    p = 0
-
-    if target_languages is not None:
-        sorted_target_languages = sorted(target_languages)
-        if sorted_target_languages in [["en", "zh"], ["en", "ja"], ["en", "ja", "zh"]]:
-            new_sentences = []
-            for sentence in sentences:
-                new_sentences.extend(split_alpha_nonalpha(sentence))
-            sentences = new_sentences
-
-    for sentence in sentences:
-        if check_is_none(sentence):
-            continue
-
-        lang = classify_language(sentence, target_languages)
-
-        if pre_lang == "":
-            text = text[:p] + text[p:].replace(
-                sentence, f"[{lang.upper()}]{sentence}", 1
-            )
-            p += len(f"[{lang.upper()}]")
-        elif pre_lang != lang:
-            text = text[:p] + text[p:].replace(
-                sentence, f"[{pre_lang.upper()}][{lang.upper()}]{sentence}", 1
-            )
-            p += len(f"[{pre_lang.upper()}][{lang.upper()}]")
-        pre_lang = lang
-        p += text[p:].index(sentence) + len(sentence)
-    text += f"[{pre_lang.upper()}]"
-
-    return text
-
-
-def split_by_language(text: str, target_languages: list = None) -> list:
-    pattern = (
-        r"[\!\"\#\$\%\&\'\(\)\*\+\,\-\.\/\:\;\<\>\=\?\@\[\]\{\}\\\\\^\_\`"
-        r"\!?\。"#$%&'()*+,-/:;<=>@[\]^_`{|}~⦅⦆「」、、〃》「」"
-        r"『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘\'\‛\“\”\„\‟…‧﹏.]+"
-    )
-    sentences = re.split(pattern, text)
-
-    pre_lang = ""
-    start = 0
-    end = 0
-    sentences_list = []
-
-    if target_languages is not None:
-        sorted_target_languages = sorted(target_languages)
-        if sorted_target_languages in [["en", "zh"], ["en", "ja"], ["en", "ja", "zh"]]:
-            new_sentences = []
-            for sentence in sentences:
-                new_sentences.extend(split_alpha_nonalpha(sentence))
-            sentences = new_sentences
-
-    for sentence in sentences:
-        if check_is_none(sentence):
-            continue
-
-        lang = classify_language(sentence, target_languages)
-
-        end += text[end:].index(sentence)
-        if pre_lang != "" and pre_lang != lang:
-            sentences_list.append((text[start:end], pre_lang))
-            start = end
-        end += len(sentence)
-        pre_lang = lang
-    sentences_list.append((text[start:], pre_lang))
-
-    return sentences_list
-
-
-def sentence_split(text: str, max: int) -> list:
-    pattern = r"[!(),—+\-.:;??。,、;:]+"
-    sentences = re.split(pattern, text)
-    discarded_chars = re.findall(pattern, text)
-
-    sentences_list, count, p = [], 0, 0
-
-    # 按被分割的符号遍历
-    for i, discarded_chars in enumerate(discarded_chars):
-        count += len(sentences[i]) + len(discarded_chars)
-        if count >= max:
-            sentences_list.append(text[p : p + count].strip())
-            p += count
-            count = 0
-
-    # 加入最后剩余的文本
-    if p < len(text):
-        sentences_list.append(text[p:])
-
-    return sentences_list
-
-
-def sentence_split_and_markup(text, max=50, lang="auto", speaker_lang=None):
-    # 如果该speaker只支持一种语言
-    if speaker_lang is not None and len(speaker_lang) == 1:
-        if lang.upper() not in ["AUTO", "MIX"] and lang.lower() != speaker_lang[0]:
-            logging.debug(
-                f'lang "{lang}" is not in speaker_lang {speaker_lang},automatically set lang={speaker_lang[0]}'
-            )
-            lang = speaker_lang[0]
-
-    sentences_list = []
-    if lang.upper() != "MIX":
-        if max <= 0:
-            sentences_list.append(
-                markup_language(text, speaker_lang)
-                if lang.upper() == "AUTO"
-                else f"[{lang.upper()}]{text}[{lang.upper()}]"
-            )
-        else:
-            for i in sentence_split(text, max):
-                if check_is_none(i):
-                    continue
-                sentences_list.append(
-                    markup_language(i, speaker_lang)
-                    if lang.upper() == "AUTO"
-                    else f"[{lang.upper()}]{i}[{lang.upper()}]"
-                )
-    else:
-        sentences_list.append(text)
-
-    for i in sentences_list:
-        logging.debug(i)
-
-    return sentences_list
-
-
-if __name__ == "__main__":
-    text = "这几天心里颇不宁静。今晚在院子里坐着乘凉,忽然想起日日走过的荷塘,在这满月的光里,总该另有一番样子吧。月亮渐渐地升高了,墙外马路上孩子们的欢笑,已经听不见了;妻在屋里拍着闰儿,迷迷糊糊地哼着眠歌。我悄悄地披了大衫,带上门出去。"
-    print(markup_language(text, target_languages=None))
-    print(sentence_split(text, max=50))
-    print(sentence_split_and_markup(text, max=50, lang="auto", speaker_lang=None))
-
-    text = "你好,这是一段用来测试自动标注的文本。こんにちは,これは自動ラベリングのテスト用テキストです.Hello, this is a piece of text to test autotagging.你好!今天我们要介绍VITS项目,其重点是使用了GAN Duration predictor和transformer flow,并且接入了Bert模型来提升韵律。Bert embedding会在稍后介绍。"
-    print(split_by_language(text, ["zh", "ja", "en"]))
-
-    text = "vits和Bert-VITS2是tts模型。花费3days.花费3天。Take 3 days"
-
-    print(split_by_language(text, ["zh", "ja", "en"]))
-    # output: [('vits', 'en'), ('和', 'ja'), ('Bert-VITS', 'en'), ('2是', 'zh'), ('tts', 'en'), ('模型。花费3', 'zh'), ('days.', 'en'), ('花费3天。', 'zh'), ('Take 3 days', 'en')]
-
-    print(split_by_language(text, ["zh", "en"]))
-    # output: [('vits', 'en'), ('和', 'zh'), ('Bert-VITS', 'en'), ('2是', 'zh'), ('tts', 'en'), ('模型。花费3', 'zh'), ('days.', 'en'), ('花费3天。', 'zh'), ('Take 3 days', 'en')]
-
-    text = "vits 和 Bert-VITS2 是 tts 模型。花费 3 days. 花费 3天。Take 3 days"
-    print(split_by_language(text, ["zh", "en"]))
-    # output: [('vits ', 'en'), ('和 ', 'zh'), ('Bert-VITS2 ', 'en'), ('是 ', 'zh'), ('tts ', 'en'), ('模型。花费 ', 'zh'), ('3 days. ', 'en'), ('花费 3天。', 'zh'), ('Take 3 days', 'en')]
diff --git a/tools/translate.py b/tools/translate.py
deleted file mode 100644
index be0f7ea45..000000000
--- a/tools/translate.py
+++ /dev/null
@@ -1,62 +0,0 @@
-"""
-翻译api
-"""
-
-from config import config
-
-import random
-import hashlib
-import requests
-
-
-def translate(Sentence: str, to_Language: str = "jp", from_Language: str = ""):
-    """
-    :param Sentence: 待翻译语句
-    :param from_Language: 待翻译语句语言
-    :param to_Language: 目标语言
-    :return: 翻译后语句 出错时返回None
-
-    常见语言代码:中文 zh 英语 en 日语 jp
-    """
-    appid = config.translate_config.app_key
-    key = config.translate_config.secret_key
-    if appid == "" or key == "":
-        return "请开发者在config.yml中配置app_key与secret_key"
-    url = "https://fanyi-api.baidu.com/api/trans/vip/translate"
-    texts = Sentence.splitlines()
-    outTexts = []
-    for t in texts:
-        if t != "":
-            # 签名计算 参考文档 https://api.fanyi.baidu.com/product/113
-            salt = str(random.randint(1, 100000))
-            signString = appid + t + salt + key
-            hs = hashlib.md5()
-            hs.update(signString.encode("utf-8"))
-            signString = hs.hexdigest()
-            if from_Language == "":
-                from_Language = "auto"
-            headers = {"Content-Type": "application/x-www-form-urlencoded"}
-            payload = {
-                "q": t,
-                "from": from_Language,
-                "to": to_Language,
-                "appid": appid,
-                "salt": salt,
-                "sign": signString,
-            }
-            # 发送请求
-            try:
-                response = requests.post(
-                    url=url, data=payload, headers=headers, timeout=3
-                )
-                response = response.json()
-                if "trans_result" in response.keys():
-                    result = response["trans_result"][0]
-                    if "dst" in result.keys():
-                        dst = result["dst"]
-                        outTexts.append(dst)
-            except Exception:
-                return Sentence
-        else:
-            outTexts.append(t)
-    return "\n".join(outTexts)