Skip to content

Commit

Permalink
🐛 fix: fix invalid utf8 character (lobehub#5732)
Browse files Browse the repository at this point in the history
* update docs

* update locale

* 🐛 fix: fix invalid utf-8 character

* improve log

* Update index.ts
  • Loading branch information
arvinxx authored Feb 4, 2025
1 parent 2b7076b commit 2905cb5
Show file tree
Hide file tree
Showing 5 changed files with 48 additions and 9 deletions.
4 changes: 3 additions & 1 deletion src/database/repositories/dataImporter/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ import {
import { LobeChatDatabase } from '@/database/type';
import { ImportResult } from '@/services/config';
import { ImporterEntryData } from '@/types/importer';
import { sanitizeUTF8 } from '@/utils/sanitizeUTF8';

export class DataImporterRepos {
private userId: string;
Expand Down Expand Up @@ -204,9 +205,10 @@ export class DataImporterRepos {
// 2. insert messages
if (shouldInsertMessages.length > 0) {
const inertValues = shouldInsertMessages.map(
({ id, extra, createdAt, updatedAt, sessionId, topicId, ...res }) => ({
({ id, extra, createdAt, updatedAt, sessionId, topicId, content, ...res }) => ({
...res,
clientId: id,
content: sanitizeUTF8(content),
createdAt: new Date(createdAt),
model: extra?.fromModel,
parentId: null,
Expand Down
2 changes: 1 addition & 1 deletion src/libs/langchain/loaders/pdf/index.ts
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import { PDFLoader } from '@langchain/community/document_loaders/fs/pdf';

/**
 * Load a PDF supplied as a Blob with LangChain's `PDFLoader` and return
 * whatever `loader.load()` resolves to.
 *
 * NOTE(review): the two `const loader` declarations below appear to be the
 * removed and added sides of a diff hunk; presumably only the second one
 * (with `splitPages: true`) belongs in the real file — confirm.
 *
 * @param fileBlob - the PDF content to load
 */
export const PdfLoader = async (fileBlob: Blob) => {
const loader = new PDFLoader(fileBlob);
const loader = new PDFLoader(fileBlob, { splitPages: true });

return await loader.load();
};
14 changes: 7 additions & 7 deletions src/server/routers/async/file.ts
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ import {
IAsyncTaskError,
} from '@/types/asyncTask';
import { safeParseJSON } from '@/utils/safeParseJSON';
import { sanitizeUTF8 } from '@/utils/sanitizeUTF8';

const fileProcedure = asyncAuthedProcedure.use(async (opts) => {
const { ctx } = opts;
Expand Down Expand Up @@ -95,16 +96,13 @@ export const fileRouter = router({
ctx.jwtPayload,
);

const number = index + 1;
console.log(`执行第 ${number} 个任务`);
console.log(`run embedding task ${index + 1}`);

console.time(`任务[${number}]: embeddings`);
const embeddings = await agentRuntime.embeddings({
dimensions: 1024,
input: chunks.map((c) => c.text),
model,
});
console.timeEnd(`任务[${number}]: embeddings`);

const items: NewEmbeddingsItem[] =
embeddings?.map((e, idx) => ({
Expand All @@ -114,9 +112,7 @@ export const fileRouter = router({
model,
})) || [];

console.time(`任务[${number}]: insert db`);
await ctx.embeddingModel.bulkCreate(items);
console.timeEnd(`任务[${number}]: insert db`);
},
{ concurrency: CONCURRENCY },
);
Expand Down Expand Up @@ -215,7 +211,11 @@ export const fileRouter = router({

// after finish partition, we need to filter out some elements
const chunks = chunkResult.chunks.map(
(item): NewChunkItem => ({ ...item, userId: ctx.userId }),
({ text, ...item }): NewChunkItem => ({
...item,
text: text ? sanitizeUTF8(text) : '',
userId: ctx.userId,
}),
);

const duration = Date.now() - startAt;
Expand Down
23 changes: 23 additions & 0 deletions src/utils/sanitizeUTF8.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
import { sanitizeUTF8 } from './sanitizeUTF8';

describe('UTF-8 Sanitization', () => {
  // A NUL byte embedded in the text is dropped.
  it('should handle null bytes', () => {
    const sanitized = sanitizeUTF8('test\u0000string');

    expect(sanitized).toBe('teststring');
  });

  // An unpaired surrogate is removed.
  it('should handle invalid UTF-8 sequences', () => {
    const withLoneSurrogate = 'test\uD800string';

    expect(sanitizeUTF8(withLoneSurrogate)).toBe('teststring');
  });

  // Leading control characters and the replacement character are stripped; the payload is kept.
  it('should handle invalid UTF-8 content', () => {
    const raw = '\u0002\u0000\u0000\u0002�{\\"error\\":{\\"code\\":\\"resource_exhausted\\",';
    const expected = '{\\"error\\":{\\"code\\":\\"resource_exhausted\\",';

    expect(sanitizeUTF8(raw)).toBe(expected);
  });

  // Ordinary non-ASCII text passes through untouched.
  it('should preserve valid UTF-8 characters', () => {
    const greeting = '你好,世界!';

    expect(sanitizeUTF8(greeting)).toBe('你好,世界!');
  });
});
14 changes: 14 additions & 0 deletions src/utils/sanitizeUTF8.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
/**
 * Strip characters from a string that are invalid or unwanted in UTF-8 text.
 *
 * Removed:
 * - the Unicode replacement character (U+FFFD);
 * - C0 control characters, DEL and C1 control characters
 *   (U+0000-U+0008, U+000B, U+000C, U+000E-U+001F, U+007F-U+009F);
 * - lone (unpaired) surrogate code units (U+D800-U+DFFF).
 *
 * Preserved: tab (U+0009), line feed (U+000A), carriage return (U+000D), and
 * valid surrogate pairs, i.e. astral characters such as emoji.
 *
 * @param str - the text to sanitize
 * @returns `str` with the characters listed above removed
 */
export const sanitizeUTF8 = (str: string): string =>
  // The `u` flag makes the regex walk the string by code point, so a valid
  // surrogate pair is seen as one astral character and is NOT matched by
  // \uD800-\uDFFF; only unpaired surrogates are. Doing everything in a single
  // pass also guarantees that removing a character can never "join" two
  // previously separated lone surrogates into a pair.
  // eslint-disable-next-line no-control-regex
  str.replace(/[\u0000-\u0008\u000B\u000C\u000E-\u001F\u007F-\u009F\uD800-\uDFFF\uFFFD]/gu, '');

0 comments on commit 2905cb5

Please sign in to comment.