diff --git a/src/database/repositories/dataImporter/index.ts b/src/database/repositories/dataImporter/index.ts index 67d4c610476e5..d41a0e0045315 100644 --- a/src/database/repositories/dataImporter/index.ts +++ b/src/database/repositories/dataImporter/index.ts @@ -14,6 +14,7 @@ import { import { LobeChatDatabase } from '@/database/type'; import { ImportResult } from '@/services/config'; import { ImporterEntryData } from '@/types/importer'; +import { sanitizeUTF8 } from '@/utils/sanitizeUTF8'; export class DataImporterRepos { private userId: string; @@ -204,9 +205,10 @@ export class DataImporterRepos { // 2. insert messages if (shouldInsertMessages.length > 0) { const inertValues = shouldInsertMessages.map( - ({ id, extra, createdAt, updatedAt, sessionId, topicId, ...res }) => ({ + ({ id, extra, createdAt, updatedAt, sessionId, topicId, content, ...res }) => ({ ...res, clientId: id, + content: sanitizeUTF8(content), createdAt: new Date(createdAt), model: extra?.fromModel, parentId: null, diff --git a/src/libs/langchain/loaders/pdf/index.ts b/src/libs/langchain/loaders/pdf/index.ts index 2ba2641602a32..f35054243ee00 100644 --- a/src/libs/langchain/loaders/pdf/index.ts +++ b/src/libs/langchain/loaders/pdf/index.ts @@ -1,7 +1,7 @@ import { PDFLoader } from '@langchain/community/document_loaders/fs/pdf'; export const PdfLoader = async (fileBlob: Blob) => { - const loader = new PDFLoader(fileBlob); + const loader = new PDFLoader(fileBlob, { splitPages: true }); return await loader.load(); }; diff --git a/src/server/routers/async/file.ts b/src/server/routers/async/file.ts index e67621b4d26a1..dc6e639722d0c 100644 --- a/src/server/routers/async/file.ts +++ b/src/server/routers/async/file.ts @@ -24,6 +24,7 @@ import { IAsyncTaskError, } from '@/types/asyncTask'; import { safeParseJSON } from '@/utils/safeParseJSON'; +import { sanitizeUTF8 } from '@/utils/sanitizeUTF8'; const fileProcedure = asyncAuthedProcedure.use(async (opts) => { const { ctx } = opts; @@ -95,16 +96,13 @@ 
export const fileRouter = router({ ctx.jwtPayload, ); - const number = index + 1; - console.log(`执行第 ${number} 个任务`); + console.log(`run embedding task ${index + 1}`); - console.time(`任务[${number}]: embeddings`); const embeddings = await agentRuntime.embeddings({ dimensions: 1024, input: chunks.map((c) => c.text), model, }); - console.timeEnd(`任务[${number}]: embeddings`); const items: NewEmbeddingsItem[] = embeddings?.map((e, idx) => ({ @@ -114,9 +112,7 @@ export const fileRouter = router({ model, })) || []; - console.time(`任务[${number}]: insert db`); await ctx.embeddingModel.bulkCreate(items); - console.timeEnd(`任务[${number}]: insert db`); }, { concurrency: CONCURRENCY }, ); @@ -215,7 +211,11 @@ export const fileRouter = router({ // after finish partition, we need to filter out some elements const chunks = chunkResult.chunks.map( - (item): NewChunkItem => ({ ...item, userId: ctx.userId }), + ({ text, ...item }): NewChunkItem => ({ + ...item, + text: text ? sanitizeUTF8(text) : '', + userId: ctx.userId, + }), ); const duration = Date.now() - startAt; diff --git a/src/utils/sanitizeUTF8.test.ts b/src/utils/sanitizeUTF8.test.ts new file mode 100644 index 0000000000000..39e30c223e488 --- /dev/null +++ b/src/utils/sanitizeUTF8.test.ts @@ -0,0 +1,23 @@ +import { sanitizeUTF8 } from './sanitizeUTF8'; + +describe('UTF-8 Sanitization', () => { + it('should handle null bytes', () => { + const input = 'test\u0000string'; + expect(sanitizeUTF8(input)).toBe('teststring'); + }); + + it('should handle invalid UTF-8 sequences', () => { + const input = 'test\uD800string'; // 未配对的代理项 + expect(sanitizeUTF8(input)).toBe('teststring'); + }); + + it('should handle invalid UTF-8 content', () => { + const input = '\u0002\u0000\u0000\u0002�{\\"error\\":{\\"code\\":\\"resource_exhausted\\",'; + expect(sanitizeUTF8(input)).toBe('{\\"error\\":{\\"code\\":\\"resource_exhausted\\",'); + }); + + it('should preserve valid UTF-8 characters', () => { + const input = '你好,世界!'; + 
/**
 * Strip characters that cannot be stored or transported safely as UTF-8 text.
 *
 * The following are removed:
 * 1. lone (unpaired) UTF-16 surrogates, which have no valid UTF-8 encoding;
 * 2. U+FFFD replacement characters;
 * 3. C0 control characters, DEL and C1 control characters
 *    (U+0000–U+0008, U+000B, U+000C, U+000E–U+001F, U+007F–U+009F).
 *
 * Tab (U+0009), line feed (U+000A) and carriage return (U+000D) are kept, and
 * well-formed surrogate pairs (emoji and other astral-plane characters) are
 * preserved untouched.
 *
 * @param str - text to clean
 * @returns `str` with the characters listed above removed
 */
export const sanitizeUTF8 = (str: string): string =>
  str
    // Lone surrogates go first so every surrogate is judged in its original
    // context: a high surrogate must be immediately followed by a low one, and
    // a low surrogate must be immediately preceded by a high one. Matching the
    // whole U+D800–U+DFFF range instead would also delete both halves of valid
    // pairs and silently drop every astral-plane character.
    .replace(/[\uD800-\uDBFF](?![\uDC00-\uDFFF])|(?<![\uD800-\uDBFF])[\uDC00-\uDFFF]/g, '')
    // Unicode replacement character
    .replace(/\uFFFD/g, '')
    // Control characters, except \t, \n and \r
    // eslint-disable-next-line no-control-regex
    .replace(/[\u0000-\u0008\u000B\u000C\u000E-\u001F\u007F-\u009F]/g, '');