Skip to content

Commit a09b133

Browse files
committed
prevents malformed utf-8 from entering logs_to_json
1 parent 321286e commit a09b133

File tree

1 file changed

+105
-4
lines changed

1 file changed

+105
-4
lines changed

examples/sd-server/main.cpp

Lines changed: 105 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -156,6 +156,91 @@ std::string base64_encode(const unsigned char* data, size_t length) {
156156

157157
return encoded;
158158
}
159+
160+
// Result of splitting a byte buffer into well-formed UTF-8 text and a
// possibly incomplete trailing sequence.
struct Utf8SplitResult {
    // Sanitized text: well-formed sequences are copied verbatim and every
    // offending byte is replaced by a single '?'.
    std::string valid;
    // Trailing bytes that form a still-plausible prefix of a multi-byte
    // sequence cut off by the end of the input; empty when nothing is pending.
    std::string remainder;
};

// Splits `input` into sanitized UTF-8 (`valid`) and an incomplete trailing
// sequence (`remainder`) that the caller can prepend to the next chunk.
//
// Accepted sequences follow the well-formed UTF-8 table: lead bytes C2..DF
// (2 bytes), E0..EF (3 bytes) and F0..F4 (4 bytes), with the second byte
// restricted for E0 (no overlongs), ED (no surrogates), F0 (no overlongs) and
// F4 (nothing above U+10FFFF). Any byte that cannot start or continue such a
// sequence is replaced by '?' and scanning resumes at the following byte.
//
// A truncated sequence is only deferred to `remainder` when every byte that
// is already present is consistent with a well-formed sequence. Otherwise the
// lead byte is replaced right away, so trailing ASCII (for example a newline
// after a stray lead byte) is never held back.
Utf8SplitResult extract_complete_utf8(const std::string& input) {
    Utf8SplitResult result;
    result.valid.reserve(input.size());

    const std::size_t size = input.size();
    std::size_t i = 0;
    while (i < size) {
        const unsigned char lead = static_cast<unsigned char>(input[i]);
        if (lead < 0x80) {
            // Plain ASCII passes through untouched.
            result.valid.push_back(static_cast<char>(lead));
            ++i;
            continue;
        }

        std::size_t expected = 0;
        if (lead >= 0xC2 && lead <= 0xDF) {
            expected = 2;
        } else if (lead >= 0xE0 && lead <= 0xEF) {
            expected = 3;
        } else if (lead >= 0xF0 && lead <= 0xF4) {
            expected = 4;
        } else {
            // Stray continuation byte, overlong lead (C0/C1) or F5..FF.
            result.valid.push_back('?');
            ++i;
            continue;
        }

        // Number of bytes of this sequence that are actually present.
        const std::size_t remaining = size - i;
        const std::size_t available = remaining < expected ? remaining : expected;

        // Validate what is present BEFORE deciding whether the sequence is
        // merely truncated: a lead byte followed by a non-continuation byte
        // can never be completed by later input.
        bool valid_sequence = true;
        for (std::size_t j = 1; j < available; ++j) {
            const unsigned char continuation = static_cast<unsigned char>(input[i + j]);
            if ((continuation & 0xC0) != 0x80) {
                valid_sequence = false;
                break;
            }
        }

        if (valid_sequence && available > 1) {
            // Second-byte range restrictions for the special lead bytes.
            const unsigned char b1 = static_cast<unsigned char>(input[i + 1]);
            const bool overlong3 = (lead == 0xE0 && b1 < 0xA0);
            const bool surrogate = (lead == 0xED && b1 >= 0xA0);
            const bool overlong4 = (lead == 0xF0 && b1 < 0x90);
            const bool too_large = (lead == 0xF4 && b1 >= 0x90);
            if (overlong3 || surrogate || overlong4 || too_large) {
                valid_sequence = false;
            }
        }

        if (!valid_sequence) {
            // Replace only the lead byte; the following bytes are re-examined
            // on their own in the next iterations.
            result.valid.push_back('?');
            ++i;
            continue;
        }

        if (available < expected) {
            // Plausible prefix cut off by the end of the input: hand it back
            // so the caller can retry once more bytes arrive.
            result.remainder = input.substr(i);
            return result;
        }

        result.valid.append(input, i, expected);
        i += expected;
    }

    return result;
}
243+
159244
struct CLIOptions {
160245
std::string model_path;
161246
int port = 8000;
@@ -361,6 +446,7 @@ struct ServerState {
361446
CtxConfig ctx_config;
362447
CtxConfig default_config;
363448
LogCollector* active_collector = nullptr;
449+
std::string pending_log_fragment;
364450
bool verbose = false;
365451
};
366452

@@ -1622,13 +1708,28 @@ void sd_server_log_callback(sd_log_level_t level, const char* text, void* user_d
16221708
}
16231709

16241710
ServerState* state = static_cast<ServerState*>(user_data);
1625-
std::string message(text);
1626-
while (!message.empty() && (message.back() == '\n' || message.back() == '\r')) {
1627-
message.pop_back();
1628-
}
1711+
std::string message;
1712+
bool only_partial = false;
16291713

16301714
{
16311715
std::lock_guard<std::mutex> guard(state->log_mutex);
1716+
1717+
std::string combined = state->pending_log_fragment;
1718+
combined.append(text);
1719+
1720+
Utf8SplitResult sanitized = extract_complete_utf8(combined);
1721+
state->pending_log_fragment = std::move(sanitized.remainder);
1722+
1723+
message = std::move(sanitized.valid);
1724+
while (!message.empty() && (message.back() == '\n' || message.back() == '\r')) {
1725+
message.pop_back();
1726+
}
1727+
1728+
only_partial = message.empty() && !state->pending_log_fragment.empty();
1729+
if (only_partial) {
1730+
return;
1731+
}
1732+
16321733
if (state->active_collector != nullptr) {
16331734
state->active_collector->add(level, message);
16341735
return;

0 commit comments

Comments
 (0)