Skip to content

Commit 2e40ae6

Browse files
committed
去掉中文注释;使用log输出日志;格式化代码
1 parent 1a90562 commit 2e40ae6

File tree

1 file changed

+84 -146 lines changed
  • projects/llm_framework/main_melotts/src/runner

1 file changed

+84 -146 lines changed

projects/llm_framework/main_melotts/src/runner/Lexicon.hpp

Lines changed: 84 additions & 146 deletions
Original file line number | Diff line number | Diff line change
@@ -7,64 +7,55 @@
77
#include <algorithm>
88
#include <sstream>
99
#include <cassert>
10-
#include <iostream> // 用于日志输出
10+
#include <iostream>
11+
#include "../../../../../SDK/components/utilities/include/sample_log.h"
1112

12-
// 使用引用传参优化split函数,避免不必要的拷贝
13-
std::vector<std::string> split(const std::string &s, char delim) {
13+
std::vector<std::string> split(const std::string& s, char delim)
14+
{
1415
std::vector<std::string> result;
1516
std::stringstream ss(s);
1617
std::string item;
1718
while (getline(ss, item, delim)) {
18-
if (!item.empty()) { // 避免添加空字符串
19+
if (!item.empty()) {
1920
result.push_back(item);
2021
}
2122
}
2223
return result;
2324
}
24-
2525
class Lexicon {
2626
private:
2727
std::unordered_map<std::string, std::pair<std::vector<int>, std::vector<int>>> lexicon;
28-
size_t max_phrase_length; // 追踪词典中最长的词组长度
29-
std::pair<std::vector<int>, std::vector<int>> unknown_token; // '_'的发音作为未知词的默认值
30-
std::unordered_map<int, std::string> reverse_tokens; // 用于将音素ID转回音素符号,用于日志
28+
size_t max_phrase_length;
29+
std::pair<std::vector<int>, std::vector<int>> unknown_token;
30+
std::unordered_map<int, std::string> reverse_tokens;
3131

3232
public:
33-
Lexicon(const std::string& lexicon_filename, const std::string& tokens_filename) : max_phrase_length(0) {
33+
Lexicon(const std::string& lexicon_filename, const std::string& tokens_filename) : max_phrase_length(0)
34+
{
3435
std::unordered_map<std::string, int> tokens;
35-
36-
// 加载tokens
3736
std::ifstream ifs(tokens_filename);
3837
assert(ifs.is_open());
39-
4038
std::string line;
4139
while (std::getline(ifs, line)) {
4240
auto splitted_line = split(line, ' ');
4341
if (splitted_line.size() >= 2) {
4442
int token_id = std::stoi(splitted_line[1]);
4543
tokens.insert({splitted_line[0], token_id});
46-
reverse_tokens[token_id] = splitted_line[0]; // 建立反向映射
44+
reverse_tokens[token_id] = splitted_line[0];
4745
}
4846
}
4947
ifs.close();
50-
51-
// 加载lexicon
5248
ifs.open(lexicon_filename);
5349
assert(ifs.is_open());
5450
while (std::getline(ifs, line)) {
5551
auto splitted_line = split(line, ' ');
5652
if (splitted_line.empty()) continue;
57-
5853
std::string word_or_phrase = splitted_line[0];
59-
60-
// 更新最长词组长度
61-
auto chars = splitEachChar(word_or_phrase);
62-
max_phrase_length = std::max(max_phrase_length, chars.size());
63-
64-
size_t phone_tone_len = splitted_line.size() - 1;
65-
size_t half_len = phone_tone_len / 2;
54+
auto chars = splitEachChar(word_or_phrase);
55+
max_phrase_length = std::max(max_phrase_length, chars.size());
56+
size_t phone_tone_len = splitted_line.size() - 1;
57+
size_t half_len = phone_tone_len / 2;
6658
std::vector<int> phones, tones;
67-
6859
for (size_t i = 0; i < phone_tone_len; i++) {
6960
auto phone_or_tone = splitted_line[i + 1];
7061
if (i < half_len) {
@@ -75,213 +66,161 @@ class Lexicon {
7566
tones.push_back(std::stoi(phone_or_tone));
7667
}
7768
}
78-
7969
lexicon[word_or_phrase] = std::make_pair(phones, tones);
8070
}
81-
82-
// 添加特殊映射
83-
lexicon[""] = lexicon[""];
84-
lexicon[""] = lexicon[""];
85-
86-
// 添加标点符号
8771
const std::vector<std::string> punctuation{"!", "?", "", ",", ".", "'", "-"};
8872
for (const auto& p : punctuation) {
8973
if (tokens.find(p) != tokens.end()) {
90-
int i = tokens[p];
74+
int i = tokens[p];
9175
lexicon[p] = std::make_pair(std::vector<int>{i}, std::vector<int>{0});
9276
}
9377
}
94-
95-
// 设置'_'作为未知词的发音
96-
assert(tokens.find("_") != tokens.end()); // 确保tokens中包含"_"
78+
assert(tokens.find("_") != tokens.end());
9779
unknown_token = std::make_pair(std::vector<int>{tokens["_"]}, std::vector<int>{0});
98-
99-
// 空格映射到'_'的发音
100-
lexicon[" "] = unknown_token;
101-
102-
// 中文标点转换映射
80+
lexicon[" "] = unknown_token;
10381
lexicon[""] = lexicon[","];
10482
lexicon[""] = lexicon["."];
10583
lexicon[""] = lexicon["!"];
10684
lexicon[""] = lexicon["?"];
107-
108-
// 输出词典信息
109-
std::cout << "词典加载完成,包含 " << lexicon.size() << " 个条目,最长词组长度: " << max_phrase_length << std::endl;
85+
SLOGI("词典加载完成,包含 %zu 个条目,最长词组长度: %zu", lexicon.size(), max_phrase_length);
11086
}
111-
112-
std::vector<std::string> splitEachChar(const std::string& text) {
87+
std::vector<std::string> splitEachChar(const std::string& text)
88+
{
11389
std::vector<std::string> words;
11490
int len = text.length();
115-
int i = 0;
116-
91+
int i = 0;
11792
while (i < len) {
11893
int next = 1;
11994
if ((text[i] & 0x80) == 0x00) {
12095
// ASCII
12196
} else if ((text[i] & 0xE0) == 0xC0) {
122-
next = 2; // 2字节UTF-8
97+
next = 2; // 2字节UTF-8
12398
} else if ((text[i] & 0xF0) == 0xE0) {
124-
next = 3; // 3字节UTF-8
99+
next = 3; // 3字节UTF-8
125100
} else if ((text[i] & 0xF8) == 0xF0) {
126-
next = 4; // 4字节UTF-8
101+
next = 4; // 4字节UTF-8
127102
}
128103
words.push_back(text.substr(i, next));
129104
i += next;
130105
}
131106
return words;
132-
}
133-
134-
bool is_english(const std::string& s) {
107+
}
108+
bool is_english(const std::string& s)
109+
{
135110
return s.size() == 1 && ((s[0] >= 'A' && s[0] <= 'Z') || (s[0] >= 'a' && s[0] <= 'z'));
136111
}
137-
138-
// 根据词典中的内容,使用最长匹配算法处理输入文本
139-
void convert(const std::string& text, std::vector<int>& phones, std::vector<int>& tones) {
140-
std::cout << "\n开始处理文本: \"" << text << "\"" << std::endl;
141-
std::cout << "=======匹配结果=======" << std::endl;
142-
std::cout << "单元\t|\t音素\t|\t声调" << std::endl;
143-
std::cout << "-----------------------------" << std::endl;
144-
145-
// 在开头添加'_'边界标记
112+
void convert(const std::string& text, std::vector<int>& phones, std::vector<int>& tones)
113+
{
114+
SLOGI("\n开始处理文本: \"%s\"", text.c_str());
115+
SLOGI("=======匹配结果=======");
116+
SLOGI("单元\t|\t音素\t|\t声调");
117+
SLOGI("-----------------------------");
146118
phones.insert(phones.end(), unknown_token.first.begin(), unknown_token.first.end());
147119
tones.insert(tones.end(), unknown_token.second.begin(), unknown_token.second.end());
148-
std::cout << "<BOS>\t|\t" << phonesToString(unknown_token.first) << "\t|\t"
149-
<< tonesToString(unknown_token.second) << std::endl;
150-
120+
121+
SLOGI("<BOS>\t|\t%s\t|\t%s", phonesToString(unknown_token.first).c_str(),
122+
tonesToString(unknown_token.second).c_str());
151123
auto chars = splitEachChar(text);
152-
int i = 0;
153-
124+
int i = 0;
154125
while (i < chars.size()) {
155-
// 处理英文单词
156126
if (is_english(chars[i])) {
157127
std::string eng_word;
158128
int start = i;
159129
while (i < chars.size() && is_english(chars[i])) {
160130
eng_word += chars[i++];
161131
}
162-
163-
// 英文转小写
164-
std::string orig_word = eng_word; // 保留原始单词用于日志
132+
std::string orig_word = eng_word;
165133
std::transform(eng_word.begin(), eng_word.end(), eng_word.begin(),
166-
[](unsigned char c){ return std::tolower(c); });
167-
168-
// 如果词典中有这个英文单词,使用它;否则使用'_'的发音
134+
[](unsigned char c) { return std::tolower(c); });
169135
if (lexicon.find(eng_word) != lexicon.end()) {
170136
auto& [eng_phones, eng_tones] = lexicon[eng_word];
171137
phones.insert(phones.end(), eng_phones.begin(), eng_phones.end());
172138
tones.insert(tones.end(), eng_tones.begin(), eng_tones.end());
173-
174-
// 打印匹配信息
175-
std::cout << orig_word << "\t|\t" << phonesToString(eng_phones) << "\t|\t"
176-
<< tonesToString(eng_tones) << std::endl;
139+
SLOGI("%s\t|\t%s\t|\t%s", orig_word.c_str(), phonesToString(eng_phones).c_str(),
140+
tonesToString(eng_tones).c_str());
177141
} else {
178-
// 未找到单词,使用'_'的发音
179142
phones.insert(phones.end(), unknown_token.first.begin(), unknown_token.first.end());
180143
tones.insert(tones.end(), unknown_token.second.begin(), unknown_token.second.end());
181-
182-
// 打印未匹配信息
183-
std::cout << orig_word << "\t|\t" << phonesToString(unknown_token.first) << " (未匹配)\t|\t"
184-
<< tonesToString(unknown_token.second) << std::endl;
144+
SLOGI("%s\t|\t%s (未匹配)\t|\t%s", orig_word.c_str(), phonesToString(unknown_token.first).c_str(),
145+
tonesToString(unknown_token.second).c_str());
185146
}
186147
continue;
187148
}
188-
// 处理非英文字符(如空格、标点)
189149
std::string c = chars[i++];
190-
if (c == " ") continue; // 跳过空格
191-
// 回退一步,用于最长匹配
150+
if (c == " ") continue;
192151
i--;
193-
194-
195-
// 最长匹配算法处理中文/日文
196152
bool matched = false;
197-
// 尝试从最长的词组开始匹配
198153
for (size_t len = std::min(max_phrase_length, chars.size() - i); len > 0 && !matched; --len) {
199154
std::string phrase;
200155
for (size_t j = 0; j < len; ++j) {
201156
phrase += chars[i + j];
202157
}
203-
204158
if (lexicon.find(phrase) != lexicon.end()) {
205159
auto& [phrase_phones, phrase_tones] = lexicon[phrase];
206160
phones.insert(phones.end(), phrase_phones.begin(), phrase_phones.end());
207161
tones.insert(tones.end(), phrase_tones.begin(), phrase_tones.end());
208-
209-
// 打印匹配信息
210-
std::cout << phrase << "\t|\t" << phonesToString(phrase_phones) << "\t|\t"
211-
<< tonesToString(phrase_tones) << std::endl;
212-
162+
SLOGI("%s\t|\t%s\t|\t%s", phrase.c_str(), phonesToString(phrase_phones).c_str(),
163+
tonesToString(phrase_tones).c_str());
213164
i += len;
214165
matched = true;
215166
break;
216167
}
217168
}
218-
219-
// 如果没有匹配到任何词组,使用'_'的发音
220169
if (!matched) {
221-
std::string c = chars[i++];
222-
std::string s = c;
223-
224-
// 中文标点符号转换
225-
std::string orig_char = s; // 保留原始字符用于日志
226-
if (s == "") s = ",";
227-
else if (s == "") s = ".";
228-
else if (s == "") s = "!";
229-
else if (s == "") s = "?";
230-
231-
// 如果词典中找不到,则使用'_'的发音
170+
std::string c = chars[i++];
171+
std::string s = c;
172+
std::string orig_char = s;
173+
if (s == "")
174+
s = ",";
175+
else if (s == "")
176+
s = ".";
177+
else if (s == "")
178+
s = "!";
179+
else if (s == "")
180+
s = "?";
232181
if (lexicon.find(s) != lexicon.end()) {
233182
auto& [char_phones, char_tones] = lexicon[s];
234183
phones.insert(phones.end(), char_phones.begin(), char_phones.end());
235184
tones.insert(tones.end(), char_tones.begin(), char_tones.end());
236-
237-
// 打印匹配信息
238-
std::cout << orig_char << "\t|\t" << phonesToString(char_phones) << "\t|\t"
239-
<< tonesToString(char_tones) << std::endl;
185+
SLOGI("%s\t|\t%s\t|\t%s", orig_char.c_str(), phonesToString(char_phones).c_str(),
186+
tonesToString(char_tones).c_str());
240187
} else {
241188
phones.insert(phones.end(), unknown_token.first.begin(), unknown_token.first.end());
242189
tones.insert(tones.end(), unknown_token.second.begin(), unknown_token.second.end());
243-
244-
// 打印未匹配信息
245-
std::cout << orig_char << "\t|\t" << phonesToString(unknown_token.first) << " (未匹配)\t|\t"
246-
<< tonesToString(unknown_token.second) << std::endl;
190+
SLOGI("%s\t|\t%s (未匹配)\t|\t%s", orig_char.c_str(), phonesToString(unknown_token.first).c_str(),
191+
tonesToString(unknown_token.second).c_str());
247192
}
248193
}
249194
}
250-
251-
// 在末尾添加'_'边界标记
252195
phones.insert(phones.end(), unknown_token.first.begin(), unknown_token.first.end());
253196
tones.insert(tones.end(), unknown_token.second.begin(), unknown_token.second.end());
254-
std::cout << "<EOS>\t|\t" << phonesToString(unknown_token.first) << "\t|\t"
255-
<< tonesToString(unknown_token.second) << std::endl;
256-
257-
// 汇总打印最终结果
258-
std::cout << "\n处理结果汇总:" << std::endl;
259-
std::cout << "原文: " << text << std::endl;
260-
std::cout << "音素: " << phonesToString(phones) << std::endl;
261-
std::cout << "声调: " << tonesToString(tones) << std::endl;
262-
std::cout << "====================" << std::endl;
197+
SLOGI("<EOS>\t|\t%s\t|\t%s", phonesToString(unknown_token.first).c_str(),
198+
tonesToString(unknown_token.second).c_str());
199+
SLOGI("\n处理结果汇总:");
200+
SLOGI("原文: %s", text.c_str());
201+
SLOGI("音素: %s", phonesToString(phones).c_str());
202+
SLOGI("声调: %s", tonesToString(tones).c_str());
203+
SLOGI("====================");
263204
}
264205

265206
private:
266-
// 处理单个字符
267-
void processChar(const std::string& c, std::vector<int>& phones, std::vector<int>& tones) {
207+
void processChar(const std::string& c, std::vector<int>& phones, std::vector<int>& tones)
208+
{
268209
std::string s = c;
269-
270-
// 中文标点符号转换
271-
if (s == "") s = ",";
272-
else if (s == "") s = ".";
273-
else if (s == "") s = "!";
274-
else if (s == "") s = "?";
275-
276-
// 如果词典中找不到,则使用'_'的发音
210+
if (s == "")
211+
s = ",";
212+
else if (s == "")
213+
s = ".";
214+
else if (s == "")
215+
s = "!";
216+
else if (s == "")
217+
s = "?";
277218
auto& phones_and_tones = (lexicon.find(s) != lexicon.end()) ? lexicon[s] : unknown_token;
278-
279219
phones.insert(phones.end(), phones_and_tones.first.begin(), phones_and_tones.first.end());
280220
tones.insert(tones.end(), phones_and_tones.second.begin(), phones_and_tones.second.end());
281221
}
282-
283-
// 将音素ID数组转换为字符串用于日志输出
284-
std::string phonesToString(const std::vector<int>& phones) {
222+
std::string phonesToString(const std::vector<int>& phones)
223+
{
285224
std::string result;
286225
for (auto id : phones) {
287226
if (!result.empty()) result += " ";
@@ -293,14 +232,13 @@ class Lexicon {
293232
}
294233
return result;
295234
}
296-
297-
// 将声调数组转换为字符串用于日志输出
298-
std::string tonesToString(const std::vector<int>& tones) {
235+
std::string tonesToString(const std::vector<int>& tones)
236+
{
299237
std::string result;
300238
for (auto tone : tones) {
301239
if (!result.empty()) result += " ";
302240
result += std::to_string(tone);
303241
}
304242
return result;
305243
}
306-
};
244+
};

0 commit comments

Comments
 (0)