forked from lobehub/lobe-chat
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
🐛 fix: fix invalid utf8 character (lobehub#5732)
* update docs * update locale * 🐛 fix: fix invalid utf-8 character * improve log * Update index.ts
- Loading branch information
Showing
5 changed files
with
48 additions
and
9 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,7 +1,7 @@ | ||
import { PDFLoader } from '@langchain/community/document_loaders/fs/pdf'; | ||
|
||
/**
 * Load a PDF supplied as an in-memory Blob through LangChain's `PDFLoader`.
 *
 * @param fileBlob - the PDF content to load
 * @returns whatever `PDFLoader#load()` resolves to for this blob
 */
export const PdfLoader = async (fileBlob: Blob) => {
  // Only one loader may be declared in this scope: the earlier form
  // `new PDFLoader(fileBlob)` is superseded by the explicit-options form.
  // NOTE(review): `splitPages: true` presumably yields one document per page —
  // confirm against the LangChain PDFLoader documentation.
  const loader = new PDFLoader(fileBlob, { splitPages: true });

  return await loader.load();
};
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,23 @@ | ||
import { sanitizeUTF8 } from './sanitizeUTF8'; | ||
|
||
describe('UTF-8 Sanitization', () => {
  // Table of scenarios: each entry is run as its own test case and checks that
  // sanitizeUTF8(input) is exactly `expected`.
  const scenarios: { expected: string; input: string; title: string }[] = [
    {
      title: 'should handle null bytes',
      input: 'test\u0000string',
      expected: 'teststring',
    },
    {
      title: 'should handle invalid UTF-8 sequences',
      // unpaired surrogate
      input: 'test\uD800string',
      expected: 'teststring',
    },
    {
      title: 'should handle invalid UTF-8 content',
      input: '\u0002\u0000\u0000\u0002�{\\"error\\":{\\"code\\":\\"resource_exhausted\\",',
      expected: '{\\"error\\":{\\"code\\":\\"resource_exhausted\\",',
    },
    {
      title: 'should preserve valid UTF-8 characters',
      input: '你好,世界!',
      expected: '你好,世界!',
    },
  ];

  for (const { title, input, expected } of scenarios) {
    it(title, () => {
      const actual = sanitizeUTF8(input);
      expect(actual).toBe(expected);
    });
  }
});
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,14 @@ | ||
/**
 * Sanitize a string so that it contains no invalid or unwanted code points.
 *
 * Removes, in this order:
 * 1. unpaired UTF-16 surrogates (a high surrogate not followed by a low one,
 *    or a low surrogate not preceded by a high one);
 * 2. the Unicode replacement character U+FFFD;
 * 3. C0/C1 control characters and DEL (U+0000–U+0008, U+000B, U+000C,
 *    U+000E–U+001F, U+007F–U+009F).
 *
 * TAB (U+0009), LF (U+000A) and CR (U+000D) are kept, and well-formed
 * surrogate pairs (e.g. emoji and other supplementary-plane characters) are
 * preserved.
 *
 * @param str - the string to sanitize
 * @returns the sanitized string
 */
export const sanitizeUTF8 = (str: string): string => {
  return (
    str
      // Strip only *unpaired* surrogates. This runs first so that deleting a
      // control character below can never glue two lone surrogates into a pair.
      .replace(/[\uD800-\uDBFF](?![\uDC00-\uDFFF])|(?<![\uD800-\uDBFF])[\uDC00-\uDFFF]/g, '')
      // Strip the Unicode replacement character.
      .replace(/\uFFFD/g, '')
      // Strip control characters, keeping \t, \n and \r.
      // eslint-disable-next-line no-control-regex
      .replace(/[\u0000-\u0008\u000B\u000C\u000E-\u001F\u007F-\u009F]/g, '')
  );
};