Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Add support for custom Q&A in the knowledge base #10873 #10874

Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 16 additions & 5 deletions api/core/indexing_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -282,9 +282,12 @@ def indexing_estimate(
if doc_form and doc_form == "qa_model":
if len(preview_texts) > 0:
# qa model document
response = LLMGenerator.generate_qa_document(
current_user.current_tenant_id, preview_texts[0], doc_language
)
if "Q00001:" in preview_texts[0] and "A00001:" in preview_texts[0]:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This one doesn't seem very generic.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do you mean the separator, "Q00001:" and "A00001:"? I can change it to a more common separator.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

image
image
I referred to the format_split_text method.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It is only a runtime delimiter; it is not actually stored.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

response = preview_texts[0]
else:
response = LLMGenerator.generate_qa_document(
current_user.current_tenant_id, preview_texts[0], doc_language
)
document_qa_list = self.format_split_text(response)

return {"total_segments": total_segments * 20, "qa_preview": document_qa_list, "preview": preview_texts}
Expand Down Expand Up @@ -501,7 +504,10 @@ def _split_to_documents(
document_node.metadata["doc_hash"] = hash
# delete Splitter character
page_content = document_node.page_content
document_node.page_content = remove_leading_symbols(page_content)
if "Q00001:" in page_content and "A00001:" in page_content:
document_node.page_content = page_content
else:
document_node.page_content = remove_leading_symbols(page_content)

if document_node.page_content:
split_documents.append(document_node)
Expand Down Expand Up @@ -536,7 +542,12 @@ def format_qa_document(self, flask_app: Flask, tenant_id: str, document_node, al
with flask_app.app_context():
try:
# qa model document
response = LLMGenerator.generate_qa_document(tenant_id, document_node.page_content, document_language)
if "Q00001:" in document_node.page_content and "A00001:" in document_node.page_content:
response = document_node.page_content
else:
response = LLMGenerator.generate_qa_document(
tenant_id, document_node.page_content, document_language
)
document_qa_list = self.format_split_text(response)
qa_documents = []
for result in document_qa_list:
Expand Down
7 changes: 6 additions & 1 deletion api/core/rag/extractor/csv_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,13 +25,15 @@ def __init__(
autodetect_encoding: bool = False,
source_column: Optional[str] = None,
csv_args: Optional[dict] = None,
document_model: Optional[str] = None,
):
"""Initialize with file path."""
self._file_path = file_path
self._encoding = encoding
self._autodetect_encoding = autodetect_encoding
self.source_column = source_column
self.csv_args = csv_args or {}
self.document_model = document_model

def extract(self) -> list[Document]:
"""Load data into document objects."""
Expand Down Expand Up @@ -67,7 +69,10 @@ def _read_from_file(self, csvfile) -> list[Document]:
# create document objects

for i, row in df.iterrows():
content = ";".join(f"{col.strip()}: {str(row[col]).strip()}" for col in df.columns)
if len(df.columns) == 2 and "qa_model" == self.document_model:
content = f"Q00001:{str(row[0]).strip()}\nA00001:{str(row[1]).strip()}"
else:
content = ";".join(f"{col.strip()}: {str(row[col]).strip()}" for col in df.columns)
source = row[self.source_column] if self.source_column else ""
metadata = {"source": source, "row": i}
doc = Document(page_content=content, metadata=metadata)
Expand Down
59 changes: 37 additions & 22 deletions api/core/rag/extractor/excel_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,11 +18,18 @@ class ExcelExtractor(BaseExtractor):
file_path: Path to the file to load.
"""

def __init__(self, file_path: str, encoding: Optional[str] = None, autodetect_encoding: bool = False):
def __init__(
self,
file_path: str,
encoding: Optional[str] = None,
autodetect_encoding: bool = False,
# Optional document form; extract() compares it against "qa_model".
document_model: Optional[str] = None,
):
"""Initialize with file path."""
self._file_path = file_path
self._encoding = encoding
self._autodetect_encoding = autodetect_encoding
# Kept so extract() can emit two-column sheets as "Q00001:"/"A00001:" pairs
# when the document form is "qa_model"; otherwise rows use the key/value format.
self.document_model = document_model

def extract(self) -> list[Document]:
"""Load from Excel file in xls or xlsx format using Pandas and openpyxl."""
Expand All @@ -43,20 +50,24 @@ def extract(self) -> list[Document]:
df.dropna(how="all", inplace=True)

for index, row in df.iterrows():
page_content = []
for col_index, (k, v) in enumerate(row.items()):
if pd.notna(v):
cell = sheet.cell(
row=index + 2, column=col_index + 1
) # +2 to account for header and 1-based index
if cell.hyperlink:
value = f"[{v}]({cell.hyperlink.target})"
page_content.append(f'"{k}":"{value}"')
else:
page_content.append(f'"{k}":"{v}"')
documents.append(
Document(page_content=";".join(page_content), metadata={"source": self._file_path})
)
if len(df.columns) == 2 and "qa_model" == self.document_model:
content = f"Q00001:{str(row[0]).strip()}\nA00001:{str(row[1]).strip()}"
documents.append(Document(page_content=content, metadata={"source": self._file_path}))
else:
page_content = []
for col_index, (k, v) in enumerate(row.items()):
if pd.notna(v):
cell = sheet.cell(
row=index + 2, column=col_index + 1
) # +2 to account for header and 1-based index
if cell.hyperlink:
value = f"[{v}]({cell.hyperlink.target})"
page_content.append(f'"{k}":"{value}"')
else:
page_content.append(f'"{k}":"{v}"')
documents.append(
Document(page_content=";".join(page_content), metadata={"source": self._file_path})
)

elif file_extension == ".xls":
excel_file = pd.ExcelFile(self._file_path, engine="xlrd")
Expand All @@ -65,13 +76,17 @@ def extract(self) -> list[Document]:
df.dropna(how="all", inplace=True)

for _, row in df.iterrows():
page_content = []
for k, v in row.items():
if pd.notna(v):
page_content.append(f'"{k}":"{v}"')
documents.append(
Document(page_content=";".join(page_content), metadata={"source": self._file_path})
)
if len(df.columns) == 2 and "qa_model" == self.document_model:
content = f"Q00001:{str(row[0]).strip()}\nA00001:{str(row[1]).strip()}"
documents.append(Document(page_content=content, metadata={"source": self._file_path}))
else:
page_content = []
for k, v in row.items():
if pd.notna(v):
page_content.append(f'"{k}":"{v}"')
documents.append(
Document(page_content=";".join(page_content), metadata={"source": self._file_path})
)
else:
raise ValueError(f"Unsupported file extension: {file_extension}")

Expand Down
6 changes: 4 additions & 2 deletions api/core/rag/extractor/extract_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -138,7 +138,7 @@ def extract(
)
else:
if file_extension in {".xlsx", ".xls"}:
extractor = ExcelExtractor(file_path)
extractor = ExcelExtractor(file_path, document_model=extract_setting.document_model)
elif file_extension == ".pdf":
extractor = PdfExtractor(file_path)
elif file_extension in {".md", ".markdown"}:
Expand All @@ -148,7 +148,9 @@ def extract(
elif file_extension == ".docx":
extractor = WordExtractor(file_path, upload_file.tenant_id, upload_file.created_by)
elif file_extension == ".csv":
extractor = CSVExtractor(file_path, autodetect_encoding=True)
extractor = CSVExtractor(
file_path, autodetect_encoding=True, document_model=extract_setting.document_model
)
elif file_extension == ".epub":
extractor = UnstructuredEpubExtractor(file_path)
else:
Expand Down
13 changes: 11 additions & 2 deletions api/core/rag/index_processor/processor/qa_index_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,10 @@ def transform(self, documents: list[Document], **kwargs) -> list[Document]:
document_node.metadata["doc_hash"] = hash
# delete Splitter character
page_content = document_node.page_content
document_node.page_content = remove_leading_symbols(page_content)
if "Q00001:" in page_content and "A00001:" in page_content:
document_node.page_content = page_content
else:
document_node.page_content = remove_leading_symbols(page_content)
split_documents.append(document_node)
all_documents.extend(split_documents)
for i in range(0, len(all_documents), 10):
Expand Down Expand Up @@ -143,7 +146,13 @@ def _format_qa_document(self, flask_app: Flask, tenant_id: str, document_node, a
with flask_app.app_context():
try:
# qa model document
response = LLMGenerator.generate_qa_document(tenant_id, document_node.page_content, document_language)

if "Q00001:" in document_node.page_content and "A00001:" in document_node.page_content:
response = document_node.page_content
else:
response = LLMGenerator.generate_qa_document(
tenant_id, document_node.page_content, document_language
)
document_qa_list = self._format_split_text(response)
qa_documents = []
for result in document_qa_list:
Expand Down
7 changes: 6 additions & 1 deletion api/core/rag/splitter/text_splitter.py
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

image
image

I think you shouldn't modify the create_documents function; instead, create a separate splitter, QASplitter.

Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,12 @@ def create_documents(self, texts: list[str], metadatas: Optional[list[dict]] = N
documents = []
for i, text in enumerate(texts):
index = -1
for chunk in self.split_text(text):

if "Q00001:" in text and "A00001:" in text:
split_text_arr = [text]
else:
split_text_arr = self.split_text(text)
for chunk in split_text_arr:
metadata = copy.deepcopy(_metadatas[i])
if self._add_start_index:
index = text.find(chunk, index + 1)
Expand Down