From 9dcea49dbab90aa3c2c797c51786e79157f7b24f Mon Sep 17 00:00:00 2001 From: Fangjun Kuang Date: Fri, 16 Aug 2024 22:10:03 +0800 Subject: [PATCH] Fix looking up OOVs in lexicon.txt for MeloTTS models. (#1266) If an English word does not exist in the lexicon, we split it into characters. For instance, if the word TTS does not exist in lexicon.txt, we split it into 3 characters T, T, and S. --- sherpa-onnx/csrc/melo-tts-lexicon.cc | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/sherpa-onnx/csrc/melo-tts-lexicon.cc b/sherpa-onnx/csrc/melo-tts-lexicon.cc index fb39de8d2..e379b9c2f 100644 --- a/sherpa-onnx/csrc/melo-tts-lexicon.cc +++ b/sherpa-onnx/csrc/melo-tts-lexicon.cc @@ -136,6 +136,22 @@ class MeloTtsLexicon::Impl { ans.tokens.insert(ans.tokens.end(), ids.tokens.begin(), ids.tokens.end()); ans.tones.insert(ans.tones.end(), ids.tones.begin(), ids.tones.end()); + } else { + // If the lexicon does not contain the word, we split the word into + // characters. + // + // For instance, if the word is TTS and it does not exist + // in the lexicon, we split it into 3 characters: T T S + std::string s; + for (char c : word) { + s = c; + if (word2ids_.count(s)) { + const auto &t = word2ids_.at(s); + ans.tokens.insert(ans.tokens.end(), t.tokens.begin(), + t.tokens.end()); + ans.tones.insert(ans.tones.end(), t.tones.begin(), t.tones.end()); + } + } } }