7
7
#include < algorithm>
8
8
#include < sstream>
9
9
#include < cassert>
10
- #include < iostream> // 用于日志输出
10
+ #include < iostream>
11
+ #include " ../../../../../SDK/components/utilities/include/sample_log.h"
11
12
12
- // 使用引用传参优化split函数,避免不必要的拷贝
13
/// Breaks `s` into the fields separated by `delim`.
///
/// Empty fields (leading, trailing or consecutive delimiters) are dropped,
/// so the result never contains an empty string.
///
/// @param s      Text to tokenize.
/// @param delim  Single-character field separator.
/// @return The non-empty fields, in order of appearance.
std::vector<std::string> split(const std::string& s, char delim)
{
    std::vector<std::string> fields;
    std::string::size_type begin = 0;
    while (begin < s.size()) {
        std::string::size_type end = s.find(delim, begin);
        if (end == std::string::npos) {
            end = s.size();
        }
        // A zero-length span means two delimiters in a row (or a leading one).
        if (end > begin) {
            fields.emplace_back(s, begin, end - begin);
        }
        begin = end + 1;
    }
    return fields;
}
24
-
25
25
class Lexicon {
26
26
private:
27
27
std::unordered_map<std::string, std::pair<std::vector<int >, std::vector<int >>> lexicon;
28
- size_t max_phrase_length; // 追踪词典中最长的词组长度
29
- std::pair<std::vector<int >, std::vector<int >> unknown_token; // '_'的发音作为未知词的默认值
30
- std::unordered_map<int , std::string> reverse_tokens; // 用于将音素ID转回音素符号,用于日志
28
+ size_t max_phrase_length;
29
+ std::pair<std::vector<int >, std::vector<int >> unknown_token;
30
+ std::unordered_map<int , std::string> reverse_tokens;
31
31
32
32
public:
33
- Lexicon (const std::string& lexicon_filename, const std::string& tokens_filename) : max_phrase_length(0 ) {
33
+ Lexicon (const std::string& lexicon_filename, const std::string& tokens_filename) : max_phrase_length(0 )
34
+ {
34
35
std::unordered_map<std::string, int > tokens;
35
-
36
- // 加载tokens
37
36
std::ifstream ifs (tokens_filename);
38
37
assert (ifs.is_open ());
39
-
40
38
std::string line;
41
39
while (std::getline (ifs, line)) {
42
40
auto splitted_line = split (line, ' ' );
43
41
if (splitted_line.size () >= 2 ) {
44
42
int token_id = std::stoi (splitted_line[1 ]);
45
43
tokens.insert ({splitted_line[0 ], token_id});
46
- reverse_tokens[token_id] = splitted_line[0 ]; // 建立反向映射
44
+ reverse_tokens[token_id] = splitted_line[0 ];
47
45
}
48
46
}
49
47
ifs.close ();
50
-
51
- // 加载lexicon
52
48
ifs.open (lexicon_filename);
53
49
assert (ifs.is_open ());
54
50
while (std::getline (ifs, line)) {
55
51
auto splitted_line = split (line, ' ' );
56
52
if (splitted_line.empty ()) continue ;
57
-
58
53
std::string word_or_phrase = splitted_line[0 ];
59
-
60
- // 更新最长词组长度
61
- auto chars = splitEachChar (word_or_phrase);
62
- max_phrase_length = std::max (max_phrase_length, chars.size ());
63
-
64
- size_t phone_tone_len = splitted_line.size () - 1 ;
65
- size_t half_len = phone_tone_len / 2 ;
54
+ auto chars = splitEachChar (word_or_phrase);
55
+ max_phrase_length = std::max (max_phrase_length, chars.size ());
56
+ size_t phone_tone_len = splitted_line.size () - 1 ;
57
+ size_t half_len = phone_tone_len / 2 ;
66
58
std::vector<int > phones, tones;
67
-
68
59
for (size_t i = 0 ; i < phone_tone_len; i++) {
69
60
auto phone_or_tone = splitted_line[i + 1 ];
70
61
if (i < half_len) {
@@ -75,213 +66,161 @@ class Lexicon {
75
66
tones.push_back (std::stoi (phone_or_tone));
76
67
}
77
68
}
78
-
79
69
lexicon[word_or_phrase] = std::make_pair (phones, tones);
80
70
}
81
-
82
- // 添加特殊映射
83
- lexicon[" 呣" ] = lexicon[" 母" ];
84
- lexicon[" 嗯" ] = lexicon[" 恩" ];
85
-
86
- // 添加标点符号
87
71
const std::vector<std::string> punctuation{" !" , " ?" , " …" , " ," , " ." , " '" , " -" };
88
72
for (const auto & p : punctuation) {
89
73
if (tokens.find (p) != tokens.end ()) {
90
- int i = tokens[p];
74
+ int i = tokens[p];
91
75
lexicon[p] = std::make_pair (std::vector<int >{i}, std::vector<int >{0 });
92
76
}
93
77
}
94
-
95
- // 设置'_'作为未知词的发音
96
- assert (tokens.find (" _" ) != tokens.end ()); // 确保tokens中包含"_"
78
+ assert (tokens.find (" _" ) != tokens.end ());
97
79
unknown_token = std::make_pair (std::vector<int >{tokens[" _" ]}, std::vector<int >{0 });
98
-
99
- // 空格映射到'_'的发音
100
- lexicon[" " ] = unknown_token;
101
-
102
- // 中文标点转换映射
80
+ lexicon[" " ] = unknown_token;
103
81
lexicon[" ," ] = lexicon[" ," ];
104
82
lexicon[" 。" ] = lexicon[" ." ];
105
83
lexicon[" !" ] = lexicon[" !" ];
106
84
lexicon[" ?" ] = lexicon[" ?" ];
107
-
108
- // 输出词典信息
109
- std::cout << " 词典加载完成,包含 " << lexicon.size () << " 个条目,最长词组长度: " << max_phrase_length << std::endl;
85
+ SLOGI (" 词典加载完成,包含 %zu 个条目,最长词组长度: %zu" , lexicon.size (), max_phrase_length);
110
86
}
111
-
112
/// Splits UTF-8 encoded text into one string per encoded character.
///
/// The length of each character is taken from its lead byte (1 to 4 bytes).
/// Trailing bytes are only consumed while they are continuation bytes
/// (`10xxxxxx`) and while input remains, so a truncated or malformed
/// sequence yields a short element instead of absorbing the characters
/// that follow it. A stray continuation byte is returned as a one-byte
/// element.
///
/// @param text  UTF-8 encoded input; may be empty.
/// @return The characters of `text` in order; concatenating them
///         reproduces `text` exactly.
std::vector<std::string> splitEachChar(const std::string& text)
{
    std::vector<std::string> words;
    const std::size_t len = text.size();
    std::size_t i = 0;
    while (i < len) {
        // Work on the unsigned value so the bit tests are independent of
        // whether plain `char` is signed on this platform.
        const unsigned char lead = static_cast<unsigned char>(text[i]);

        std::size_t expected = 1;  // ASCII, or a byte that is not a valid lead
        if ((lead & 0xE0) == 0xC0) {
            expected = 2;  // 2-byte UTF-8
        } else if ((lead & 0xF0) == 0xE0) {
            expected = 3;  // 3-byte UTF-8
        } else if ((lead & 0xF8) == 0xF0) {
            expected = 4;  // 4-byte UTF-8
        }

        // Stop early at end of input or at the first non-continuation byte.
        std::size_t take = 1;
        while (take < expected && i + take < len &&
               (static_cast<unsigned char>(text[i + take]) & 0xC0) == 0x80) {
            ++take;
        }

        words.push_back(text.substr(i, take));
        i += take;
    }
    return words;
}
108
/// Reports whether `s` is exactly one ASCII letter (`A`-`Z` or `a`-`z`).
///
/// @param s  A single split-out character; multi-byte or empty strings
///           are never considered English.
/// @return `true` only for a one-byte string holding an ASCII letter.
bool is_english(const std::string& s)
{
    if (s.size() != 1) {
        return false;
    }
    const char ch = s[0];
    const bool is_upper = ch >= 'A' && ch <= 'Z';
    const bool is_lower = ch >= 'a' && ch <= 'z';
    return is_upper || is_lower;
}
137
-
138
- // 根据词典中的内容,使用最长匹配算法处理输入文本
139
- void convert (const std::string& text, std::vector<int >& phones, std::vector<int >& tones) {
140
- std::cout << " \n 开始处理文本: \" " << text << " \" " << std::endl;
141
- std::cout << " =======匹配结果=======" << std::endl;
142
- std::cout << " 单元\t |\t 音素\t |\t 声调" << std::endl;
143
- std::cout << " -----------------------------" << std::endl;
144
-
145
- // 在开头添加'_'边界标记
112
+ void convert (const std::string& text, std::vector<int >& phones, std::vector<int >& tones)
113
+ {
114
+ SLOGI (" \n 开始处理文本: \" %s\" " , text.c_str ());
115
+ SLOGI (" =======匹配结果=======" );
116
+ SLOGI (" 单元\t |\t 音素\t |\t 声调" );
117
+ SLOGI (" -----------------------------" );
146
118
phones.insert (phones.end (), unknown_token.first .begin (), unknown_token.first .end ());
147
119
tones.insert (tones.end (), unknown_token.second .begin (), unknown_token.second .end ());
148
- std::cout << " <BOS> \t | \t " << phonesToString (unknown_token. first ) << " \t | \t "
149
- << tonesToString (unknown_token.second ) << std::endl;
150
-
120
+
121
+ SLOGI ( " <BOS> \t | \t %s \t | \t %s " , phonesToString (unknown_token.first ). c_str (),
122
+ tonesToString (unknown_token. second ). c_str ());
151
123
auto chars = splitEachChar (text);
152
- int i = 0 ;
153
-
124
+ int i = 0 ;
154
125
while (i < chars.size ()) {
155
- // 处理英文单词
156
126
if (is_english (chars[i])) {
157
127
std::string eng_word;
158
128
int start = i;
159
129
while (i < chars.size () && is_english (chars[i])) {
160
130
eng_word += chars[i++];
161
131
}
162
-
163
- // 英文转小写
164
- std::string orig_word = eng_word; // 保留原始单词用于日志
132
+ std::string orig_word = eng_word;
165
133
std::transform (eng_word.begin (), eng_word.end (), eng_word.begin (),
166
- [](unsigned char c){ return std::tolower (c); });
167
-
168
- // 如果词典中有这个英文单词,使用它;否则使用'_'的发音
134
+ [](unsigned char c) { return std::tolower (c); });
169
135
if (lexicon.find (eng_word) != lexicon.end ()) {
170
136
auto & [eng_phones, eng_tones] = lexicon[eng_word];
171
137
phones.insert (phones.end (), eng_phones.begin (), eng_phones.end ());
172
138
tones.insert (tones.end (), eng_tones.begin (), eng_tones.end ());
173
-
174
- // 打印匹配信息
175
- std::cout << orig_word << " \t |\t " << phonesToString (eng_phones) << " \t |\t "
176
- << tonesToString (eng_tones) << std::endl;
139
+ SLOGI (" %s\t |\t %s\t |\t %s" , orig_word.c_str (), phonesToString (eng_phones).c_str (),
140
+ tonesToString (eng_tones).c_str ());
177
141
} else {
178
- // 未找到单词,使用'_'的发音
179
142
phones.insert (phones.end (), unknown_token.first .begin (), unknown_token.first .end ());
180
143
tones.insert (tones.end (), unknown_token.second .begin (), unknown_token.second .end ());
181
-
182
- // 打印未匹配信息
183
- std::cout << orig_word << " \t |\t " << phonesToString (unknown_token.first ) << " (未匹配)\t |\t "
184
- << tonesToString (unknown_token.second ) << std::endl;
144
+ SLOGI (" %s\t |\t %s (未匹配)\t |\t %s" , orig_word.c_str (), phonesToString (unknown_token.first ).c_str (),
145
+ tonesToString (unknown_token.second ).c_str ());
185
146
}
186
147
continue ;
187
148
}
188
- // 处理非英文字符(如空格、标点)
189
149
std::string c = chars[i++];
190
- if (c == " " ) continue ; // 跳过空格
191
- // 回退一步,用于最长匹配
150
+ if (c == " " ) continue ;
192
151
i--;
193
-
194
-
195
- // 最长匹配算法处理中文/日文
196
152
bool matched = false ;
197
- // 尝试从最长的词组开始匹配
198
153
for (size_t len = std::min (max_phrase_length, chars.size () - i); len > 0 && !matched; --len) {
199
154
std::string phrase;
200
155
for (size_t j = 0 ; j < len; ++j) {
201
156
phrase += chars[i + j];
202
157
}
203
-
204
158
if (lexicon.find (phrase) != lexicon.end ()) {
205
159
auto & [phrase_phones, phrase_tones] = lexicon[phrase];
206
160
phones.insert (phones.end (), phrase_phones.begin (), phrase_phones.end ());
207
161
tones.insert (tones.end (), phrase_tones.begin (), phrase_tones.end ());
208
-
209
- // 打印匹配信息
210
- std::cout << phrase << " \t |\t " << phonesToString (phrase_phones) << " \t |\t "
211
- << tonesToString (phrase_tones) << std::endl;
212
-
162
+ SLOGI (" %s\t |\t %s\t |\t %s" , phrase.c_str (), phonesToString (phrase_phones).c_str (),
163
+ tonesToString (phrase_tones).c_str ());
213
164
i += len;
214
165
matched = true ;
215
166
break ;
216
167
}
217
168
}
218
-
219
- // 如果没有匹配到任何词组,使用'_'的发音
220
169
if (!matched) {
221
- std::string c = chars[i++];
222
- std::string s = c;
223
-
224
- // 中文标点符号转换
225
- std::string orig_char = s; // 保留原始字符用于日志
226
- if (s == " , " ) s = " , " ;
227
- else if (s == " 。 " ) s = " ." ;
228
- else if (s == " !" ) s = " ! " ;
229
- else if (s == " ? " ) s = " ? " ;
230
-
231
- // 如果词典中找不到,则使用'_'的发音
170
+ std::string c = chars[i++];
171
+ std::string s = c;
172
+ std::string orig_char = s;
173
+ if (s == " , " )
174
+ s = " , " ;
175
+ else if (s == " 。 " )
176
+ s = " ." ;
177
+ else if (s == " !" )
178
+ s = " ! " ;
179
+ else if (s == " ? " )
180
+ s = " ? " ;
232
181
if (lexicon.find (s) != lexicon.end ()) {
233
182
auto & [char_phones, char_tones] = lexicon[s];
234
183
phones.insert (phones.end (), char_phones.begin (), char_phones.end ());
235
184
tones.insert (tones.end (), char_tones.begin (), char_tones.end ());
236
-
237
- // 打印匹配信息
238
- std::cout << orig_char << " \t |\t " << phonesToString (char_phones) << " \t |\t "
239
- << tonesToString (char_tones) << std::endl;
185
+ SLOGI (" %s\t |\t %s\t |\t %s" , orig_char.c_str (), phonesToString (char_phones).c_str (),
186
+ tonesToString (char_tones).c_str ());
240
187
} else {
241
188
phones.insert (phones.end (), unknown_token.first .begin (), unknown_token.first .end ());
242
189
tones.insert (tones.end (), unknown_token.second .begin (), unknown_token.second .end ());
243
-
244
- // 打印未匹配信息
245
- std::cout << orig_char << " \t |\t " << phonesToString (unknown_token.first ) << " (未匹配)\t |\t "
246
- << tonesToString (unknown_token.second ) << std::endl;
190
+ SLOGI (" %s\t |\t %s (未匹配)\t |\t %s" , orig_char.c_str (), phonesToString (unknown_token.first ).c_str (),
191
+ tonesToString (unknown_token.second ).c_str ());
247
192
}
248
193
}
249
194
}
250
-
251
- // 在末尾添加'_'边界标记
252
195
phones.insert (phones.end (), unknown_token.first .begin (), unknown_token.first .end ());
253
196
tones.insert (tones.end (), unknown_token.second .begin (), unknown_token.second .end ());
254
- std::cout << " <EOS>\t |\t " << phonesToString (unknown_token.first ) << " \t |\t "
255
- << tonesToString (unknown_token.second ) << std::endl;
256
-
257
- // 汇总打印最终结果
258
- std::cout << " \n 处理结果汇总:" << std::endl;
259
- std::cout << " 原文: " << text << std::endl;
260
- std::cout << " 音素: " << phonesToString (phones) << std::endl;
261
- std::cout << " 声调: " << tonesToString (tones) << std::endl;
262
- std::cout << " ====================" << std::endl;
197
+ SLOGI (" <EOS>\t |\t %s\t |\t %s" , phonesToString (unknown_token.first ).c_str (),
198
+ tonesToString (unknown_token.second ).c_str ());
199
+ SLOGI (" \n 处理结果汇总:" );
200
+ SLOGI (" 原文: %s" , text.c_str ());
201
+ SLOGI (" 音素: %s" , phonesToString (phones).c_str ());
202
+ SLOGI (" 声调: %s" , tonesToString (tones).c_str ());
203
+ SLOGI (" ====================" );
263
204
}
264
205
265
206
private:
266
- // 处理单个字符
267
- void processChar ( const std::string& c, std::vector< int >& phones, std::vector< int >& tones) {
207
+ void processChar ( const std::string& c, std::vector< int >& phones, std::vector< int >& tones)
208
+ {
268
209
std::string s = c;
269
-
270
- // 中文标点符号转换
271
- if (s == " , " ) s = " , " ;
272
- else if (s == " 。 " ) s = " ." ;
273
- else if (s == " !" ) s = " ! " ;
274
- else if (s == " ? " ) s = " ? " ;
275
-
276
- // 如果词典中找不到,则使用'_'的发音
210
+ if (s == " , " )
211
+ s = " , " ;
212
+ else if (s == " 。 " )
213
+ s = " ." ;
214
+ else if (s == " !" )
215
+ s = " ! " ;
216
+ else if (s == " ? " )
217
+ s = " ? " ;
277
218
auto & phones_and_tones = (lexicon.find (s) != lexicon.end ()) ? lexicon[s] : unknown_token;
278
-
279
219
phones.insert (phones.end (), phones_and_tones.first .begin (), phones_and_tones.first .end ());
280
220
tones.insert (tones.end (), phones_and_tones.second .begin (), phones_and_tones.second .end ());
281
221
}
282
-
283
- // 将音素ID数组转换为字符串用于日志输出
284
- std::string phonesToString (const std::vector<int >& phones) {
222
+ std::string phonesToString (const std::vector<int >& phones)
223
+ {
285
224
std::string result;
286
225
for (auto id : phones) {
287
226
if (!result.empty ()) result += " " ;
@@ -293,14 +232,13 @@ class Lexicon {
293
232
}
294
233
return result;
295
234
}
296
-
297
- // 将声调数组转换为字符串用于日志输出
298
/// Renders tone values as decimal numbers separated by single spaces.
///
/// @param tones  Tone values to print; may be empty.
/// @return The space-separated list, or an empty string for empty input.
std::string tonesToString(const std::vector<int>& tones)
{
    std::string joined;
    for (std::size_t idx = 0; idx < tones.size(); ++idx) {
        // Separator goes before every element except the first.
        if (idx != 0) {
            joined.append(" ");
        }
        joined.append(std::to_string(tones[idx]));
    }
    return joined;
}
306
- };
244
+ };
0 commit comments