API: completed delete_doc api #1290

Merged · 2 commits · Jun 28, 2024
60 changes: 58 additions & 2 deletions api/apps/documents_api.py
@@ -24,13 +24,16 @@
from api.db import FileType, ParserType
from api.db.services import duplicate_name
from api.db.services.document_service import DocumentService
from api.db.services.file2document_service import File2DocumentService
from api.db.services.file_service import FileService
from api.db.services.knowledgebase_service import KnowledgebaseService
from api.settings import RetCode
from api.utils import get_uuid
from api.utils.api_utils import construct_json_result
from api.utils.file_utils import filename_type, thumbnail
from rag.utils.minio_conn import MINIO
from api.db.db_models import Task, File
from api.db import FileType, TaskStatus, ParserType, FileSource


MAXIMUM_OF_UPLOADING_FILES = 256
@@ -89,6 +92,7 @@ def upload(dataset_id):
    # grab all the errs
    err = []
    MAX_FILE_NUM_PER_USER = int(os.environ.get('MAX_FILE_NUM_PER_USER', 0))
    uploaded_docs_json = []
    for file in file_objs:
        try:
            # TODO: get this value from the database as some tenants have this limit while others don't
@@ -132,21 +136,73 @@ def upload(dataset_id):
            DocumentService.insert(doc)

            FileService.add_file_from_kb(doc, kb_folder["id"], dataset.tenant_id)
            uploaded_docs_json.append(doc)
        except Exception as e:
            err.append(file.filename + ": " + str(e))

    if err:
        # return all the errors
        return construct_json_result(message="\n".join(err), code=RetCode.SERVER_ERROR)
    # success
    return construct_json_result(data=uploaded_docs_json, code=RetCode.SUCCESS)
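Since upload now collects the inserted records in uploaded_docs_json and returns them in data, a caller can read the generated document IDs straight out of the response. A minimal client-side sketch, assuming an already-created dataset and the SDK's upload_local_file shown further down:

# Sketch (assumed setup): pick up the IDs of freshly uploaded documents.
res = ragflow.upload_local_file(dataset_id, ["test_data/test.txt"])
doc_ids = [doc["id"] for doc in res["data"]]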

# ----------------------------delete a file-----------------------------------------------------
@manager.route('/<dataset_id>/<document_id>', methods=['DELETE'])
@login_required
def delete(document_id, dataset_id):  # both IDs arrive as path strings
    # get the root folder
    root_folder = FileService.get_root_folder(current_user.id)
    # parent file's id
    parent_file_id = root_folder["id"]
    # make sure the knowledgebase docs are initialized for new users
    FileService.init_knowledgebase_docs(parent_file_id, current_user.id)
    # collect any errors that occur during deletion
    errors = ""
    try:
        # check that the document exists
        exist, doc = DocumentService.get_by_id(document_id)
        if not exist:
            return construct_json_result(message=f"Document {document_id} not found!", code=RetCode.DATA_ERROR)
        # check that this tenant is authorized to operate on the document
        tenant_id = DocumentService.get_tenant_id(document_id)
        if not tenant_id:
            return construct_json_result(message=f"You cannot delete the document {document_id} for authorization"
                                                 f" reasons!", code=RetCode.AUTHENTICATION_ERROR)

        # get the dataset the document actually belongs to, and its storage location
        real_dataset_id, location = File2DocumentService.get_minio_address(doc_id=document_id)

        if real_dataset_id != dataset_id:
            return construct_json_result(message=f"The document {document_id} is not in the dataset: {dataset_id}, "
                                                 f"but in the dataset: {real_dataset_id}.", code=RetCode.ARGUMENT_ERROR)

        # the removal itself failed on the server side
        if not DocumentService.remove_document(doc, tenant_id):
            return construct_json_result(
                message="There was an error during the document removal process. Please check the status of the "
                        "RAGFlow server and try the removal again.", code=RetCode.OPERATING_ERROR)

        # fetch the File2Document record associated with the provided document ID
        file_to_doc = File2DocumentService.get_by_document_id(document_id)
        # delete the associated File record
        FileService.filter_delete([File.source_type == FileSource.KNOWLEDGEBASE, File.id == file_to_doc[0].file_id])
        # delete the File2Document record itself using the document ID; this removes the
        # association between the document and the file after the File record has been deleted
        File2DocumentService.delete_by_document_id(document_id)

        # delete the underlying object from MinIO
        MINIO.rm(dataset_id, location)
    except Exception as e:
        errors += str(e)
    if errors:
        return construct_json_result(data=False, message=errors, code=RetCode.SERVER_ERROR)

    return construct_json_result(data=True, code=RetCode.SUCCESS)
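For reference, the new route can also be exercised over raw HTTP. A minimal sketch with requests — the base URL, port, and auth header format below are assumptions for illustration (the SDK's document_url encapsulates the real prefix), and the request must satisfy the @login_required check:

import requests

# All of these values are placeholders, not the confirmed deployment defaults.
base_url = "http://127.0.0.1:9380/api/v1/documents"   # assumed mount point
headers = {"Authorization": "<API_KEY>"}              # assumed auth scheme
dataset_id, document_id = "<dataset_id>", "<document_id>"

res = requests.delete(f"{base_url}/{dataset_id}/{document_id}", headers=headers)
print(res.json())  # expect code == RetCode.SUCCESS and data == true on success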

# ----------------------------upload online files------------------------------------------------

# ----------------------------download a file-----------------------------------------------------

# ----------------------------enable rename-----------------------------------------------------

# ----------------------------list files-----------------------------------------------------
9 changes: 6 additions & 3 deletions sdk/python/ragflow/ragflow.py
@@ -101,10 +101,13 @@ def upload_local_file(self, dataset_id, file_paths):
        result_dict = json.loads(res.text)
        return result_dict

    # ----------------------------upload remote files-----------------------------------------------------

    # ----------------------------delete a file-----------------------------------------------------
    def delete_files(self, document_id, dataset_id):
        endpoint = f"{self.document_url}/{dataset_id}/{document_id}"
        res = requests.delete(endpoint, headers=self.authorization_header)
        return res.json()
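Typical usage of the new SDK method — a minimal sketch, assuming a dataset and an uploaded document already exist and that doc_id was taken from the upload response:

# Sketch: delete a previously uploaded document via the SDK.
ragflow = RAGFlow(API_KEY, HOST_ADDRESS)
deleted_res = ragflow.delete_files(doc_id, dataset_id)
# on success the server responds with code == RetCode.SUCCESS and data is True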

    # ----------------------------download a file-----------------------------------------------------

    # ----------------------------enable rename-----------------------------------------------------

90 changes: 87 additions & 3 deletions sdk/python/test/test_document.py
@@ -149,11 +149,95 @@ def test_upload_files_with_remote_file_path(self):
        res = ragflow.upload_local_file(dataset_id, file_paths)
        assert res['code'] == RetCode.ARGUMENT_ERROR and res['message'] == 'Remote files have not unsupported.'

    # ----------------------------upload remote files-----------------------------------------------------

    # ----------------------------delete a file-----------------------------------------------------
    def test_delete_one_file(self):
        """
        Test deleting one file with success.
        """
        ragflow = RAGFlow(API_KEY, HOST_ADDRESS)
        created_res = ragflow.create_dataset("test_delete_one_file")
        dataset_id = created_res['data']['dataset_id']
        file_paths = ["test_data/test.txt"]
        res = ragflow.upload_local_file(dataset_id, file_paths)
        # get the doc_id
        data = res['data'][0]
        doc_id = data['id']
        # delete the file
        deleted_res = ragflow.delete_files(doc_id, dataset_id)
        # assert value
        assert deleted_res['code'] == RetCode.SUCCESS and deleted_res['data'] is True

    def test_delete_document_with_not_existing_document(self):
        """
        Test deleting a document that does not exist, expecting failure.
        """
        ragflow = RAGFlow(API_KEY, HOST_ADDRESS)
        created_res = ragflow.create_dataset("test_delete_document_with_not_existing_document")
        dataset_id = created_res['data']['dataset_id']
        res = ragflow.delete_files("111", dataset_id)
        assert res['code'] == RetCode.DATA_ERROR and res['message'] == 'Document 111 not found!'

    def test_delete_document_with_creating_100_documents_and_deleting_100_documents(self):
        """
        Test uploading 100 documents and then deleting all of them.
        """
        # upload 100 docs
        ragflow = RAGFlow(API_KEY, HOST_ADDRESS)
        created_res = ragflow.create_dataset("test_delete_100_documents")
        dataset_id = created_res['data']['dataset_id']
        file_paths = ["test_data/test.txt"] * 100
        res = ragflow.upload_local_file(dataset_id, file_paths)

        # delete each uploaded document by its id
        data = res['data']
        for d in data:
            doc_id = d['id']
            deleted_res = ragflow.delete_files(doc_id, dataset_id)
            assert deleted_res['code'] == RetCode.SUCCESS and deleted_res['data'] is True

    def test_delete_document_from_nonexistent_dataset(self):
        """
        Test deleting a document from a dataset that does not exist.
        """
        ragflow = RAGFlow(API_KEY, HOST_ADDRESS)
        created_res = ragflow.create_dataset("test_delete_document_from_nonexistent_dataset")
        dataset_id = created_res['data']['dataset_id']
        file_paths = ["test_data/test.txt"]
        res = ragflow.upload_local_file(dataset_id, file_paths)
        # get the doc_id
        data = res['data'][0]
        doc_id = data['id']
        # delete the file, passing a dataset id that does not exist
        deleted_res = ragflow.delete_files(doc_id, "000")
        # assert value
        assert (deleted_res['code'] == RetCode.ARGUMENT_ERROR and deleted_res['message'] ==
                f'The document {doc_id} is not in the dataset: 000, but in the dataset: {dataset_id}.')

    def test_delete_document_which_is_located_in_other_dataset(self):
        """
        Test deleting a document that is located in another dataset.
        """
        ragflow = RAGFlow(API_KEY, HOST_ADDRESS)
        # upload a document
        created_res = ragflow.create_dataset("test_delete_document_which_is_located_in_other_dataset")
        created_res_id = created_res['data']['dataset_id']
        file_paths = ["test_data/test.txt"]
        res = ragflow.upload_local_file(created_res_id, file_paths)
        # create another dataset
        other_res = ragflow.create_dataset("other_dataset")
        other_dataset_id = other_res['data']['dataset_id']
        # get the doc_id
        data = res['data'][0]
        doc_id = data['id']
        # try to delete the document through the other dataset
        deleted_res = ragflow.delete_files(doc_id, other_dataset_id)
        # assert value
        assert (deleted_res['code'] == RetCode.ARGUMENT_ERROR and deleted_res['message'] ==
                f'The document {doc_id} is not in the dataset: {other_dataset_id}, but in the dataset: {created_res_id}.')

    # ----------------------------download a file-----------------------------------------------------

    # ----------------------------enable rename-----------------------------------------------------