From 5fbb402477c41e09f56a3e5adc32f316341772bf Mon Sep 17 00:00:00 2001 From: Dmitri Qiu Date: Wed, 25 Sep 2024 13:00:03 +0300 Subject: [PATCH] fix: Sanitize null bytes before ingestion (#2090) * Sanitize null bytes before ingestion * Added comments --- private_gpt/components/ingest/ingest_helper.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/private_gpt/components/ingest/ingest_helper.py b/private_gpt/components/ingest/ingest_helper.py index a11090702..da62568bc 100644 --- a/private_gpt/components/ingest/ingest_helper.py +++ b/private_gpt/components/ingest/ingest_helper.py @@ -92,7 +92,13 @@ def _load_file_to_documents(file_name: str, file_data: Path) -> list[Document]: return string_reader.load_data([file_data.read_text()]) logger.debug("Specific reader found for extension=%s", extension) - return reader_cls().load_data(file_data) + documents = reader_cls().load_data(file_data) + + # Sanitize NUL bytes in text which can't be stored in Postgres + for i in range(len(documents)): + documents[i].text = documents[i].text.replace("\u0000", "") + + return documents @staticmethod def _exclude_metadata(documents: list[Document]) -> None: