Review notes (text before the first "diff --git" line is ignored by git apply):
- pre_chinese() chained str.replace() with the whole punctuation constants,
  which only matches each constant as one contiguous substring; it now maps
  every punctuation character to a space through a translation table.
- CHINESE_PUNCTUATION contained an unescaped apostrophe and backslash; both
  are escaped now, the character set itself is unchanged.
- The post-image blob id on the data/ofa_dataset.py "index" line predates
  these changes.
- Indentation and blank context lines were restored from a copy in which
  they had been collapsed; re-check them against the target files.

diff --git a/data/ofa_dataset.py b/data/ofa_dataset.py
index fa30b24c..e3fd31a2 100644
--- a/data/ofa_dataset.py
+++ b/data/ofa_dataset.py
@@ -7,6 +7,17 @@
 import re
 import torch.utils.data
 from fairseq.data import FairseqDataset
+import string
+
+# Characters that pre_chinese() replaces with a space. The apostrophe and
+# backslash are escaped so that the literal parses.
+CHINESE_PUNCTUATION = '"#$%&\'()*+,-/:;<=>@[\\]^_`{|}~⦅⦆「」、\u3000、〃〈〉《》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏﹑﹔·!?。。'
+ENGLISH_PUNCTUATION = string.punctuation
+# One table for both sets: every listed character maps to a single space.
+_PUNCTUATION_TO_SPACE = str.maketrans(
+    dict.fromkeys(CHINESE_PUNCTUATION + ENGLISH_PUNCTUATION, " ")
+)
+
 
 logger = logging.getLogger(__name__)
 
@@ -77,3 +88,27 @@ def pre_caption(self, caption, max_words=None):
             caption = ' '.join(caption_words[:max_words])
 
         return caption
+
+    def pre_chinese(self, text, max_words):
+        """Lowercase ``text``, blank out punctuation and squeeze whitespace.
+
+        Each character found in CHINESE_PUNCTUATION or ENGLISH_PUNCTUATION
+        becomes a space; runs of two or more whitespace characters then
+        collapse to one space, and trailing newlines and surrounding spaces
+        are dropped.
+
+        Args:
+            text: raw string to clean.
+            max_words: number of leading characters to keep (the result is
+                sliced, not split into words); ``None`` keeps everything.
+
+        Returns:
+            The cleaned string, truncated to ``max_words`` characters.
+        """
+        # str.replace() treats its argument as one substring, so chaining it
+        # with the whole punctuation set never matched; translate() maps each
+        # character individually.
+        text = text.lower().translate(_PUNCTUATION_TO_SPACE)
+        text = re.sub(r"\s{2,}", ' ', text)
+        text = text.rstrip('\n')
+        return text.strip(' ')[:max_words]
diff --git a/data/s2t_data/unify_dataset.py b/data/s2t_data/unify_dataset.py
index 986c1e89..08549b55 100644
--- a/data/s2t_data/unify_dataset.py
+++ b/data/s2t_data/unify_dataset.py
@@ -218,7 +218,6 @@ def __init__(
         self.text2phone_tokenizer = None
         if text2phone_path is not None:
             self.blank_id = self.phone_dict.index("")
-            self.phone_mask_idx = self.phone_dict.index("#1")
             self.text2phone_tokenizer = Text2Phone(text2phone_path)
 
     def set_epoch(self, epoch, **unused):
diff --git a/tasks/speech_tasks/unify_task.py b/tasks/speech_tasks/unify_task.py
index 1968e515..33b5fa73 100644
--- a/tasks/speech_tasks/unify_task.py
+++ b/tasks/speech_tasks/unify_task.py
@@ -152,7 +152,7 @@ def setup_task(cls, cfg: DictConfig, **kwargs):
         phone_dict = cls.load_dictionary(cfg.phone_dict_path)
         if cfg.text2phone_path is None:
             phone_dict.add_symbol("")
-            phone_dict.add_symbol("")
+            phone_dict.add_symbol("")
 
         # src_dict.add_symbol("")
         # tgt_dict.add_symbol("")
@@ -358,6 +358,3 @@ def phone_dictionary(self):
         """Return the phone :class:`~fairseq.data.Dictionary`."""
         return self.phone_dict
 
-    @property
-    def train_stage(self):
-        return self.train_stage
diff --git a/utils/phone/zh/phone_dict.txt b/utils/phone/zh/phone_dict.txt
index 6c775c39..c03774f4 100644
--- a/utils/phone/zh/phone_dict.txt
+++ b/utils/phone/zh/phone_dict.txt
@@ -134,3 +134,4 @@ zh 120
 #14 134
 #15 135
 #16 136
+#17 137