infiniflow · JinHai-CN · Jun 25, 2024 · Jun 24, 2024 · Jun 24, 2024 · Jun 25, 2024
diff --git a/api/apps/dataset_api.py b/api/apps/dataset_api.py
@@ -14,36 +14,23 @@
 #  limitations under the License.
 
 
-import json
-import os
-import re
-from datetime import datetime, timedelta
-from flask import request, Response
+from flask import request
 from flask_login import login_required, current_user
 from httpx import HTTPError
 
-from api.db import FileType, ParserType, FileSource, StatusEnum
-from api.db.db_models import APIToken, API4Conversation, Task, File
+from api.contants import NAME_LENGTH_LIMIT
+from api.db import FileSource, StatusEnum
+from api.db.db_models import File
 from api.db.services import duplicate_name
-from api.db.services.api_service import APITokenService, API4ConversationService
-from api.db.services.dialog_service import DialogService, chat
 from api.db.services.document_service import DocumentService
 from api.db.services.file2document_service import File2DocumentService
 from api.db.services.file_service import FileService
 from api.db.services.knowledgebase_service import KnowledgebaseService
-from api.db.services.task_service import queue_tasks, TaskService
-from api.db.services.user_service import UserTenantService, TenantService
-from api.settings import RetCode, retrievaler
-from api.utils import get_uuid, current_timestamp, datetime_format
-# from api.utils.api_utils import server_error_response, get_data_error_result, get_json_result, validate_request
-from itsdangerous import URLSafeTimedSerializer
-
-from api.utils.file_utils import filename_type, thumbnail
-from rag.utils.minio_conn import MINIO
-
-# import library
+from api.db.services.user_service import TenantService
+from api.settings import RetCode
+from api.utils import get_uuid
 from api.utils.api_utils import construct_json_result, construct_result, construct_error_response, validate_request
-from api.contants import NAME_LENGTH_LIMIT
+
 
 # ------------------------------ create a dataset ---------------------------------------
 

diff --git a/api/apps/documents_api.py b/api/apps/documents_api.py
@@ -0,0 +1,172 @@
+#
+#  Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License
+#
+
+import os
+import re
+import warnings
+
+from flask import request
+from flask_login import login_required, current_user
+
+from api.db import FileType, ParserType
+from api.db.services import duplicate_name
+from api.db.services.document_service import DocumentService
+from api.db.services.file_service import FileService
+from api.db.services.knowledgebase_service import KnowledgebaseService
+from api.settings import RetCode
+from api.utils import get_uuid
+from api.utils.api_utils import construct_json_result
+from api.utils.file_utils import filename_type, thumbnail
+from rag.utils.minio_conn import MINIO
+
+
+MAXIMUM_OF_UPLOADING_FILES = 256
+
+
+# ----------------------------upload local files-----------------------------------------------------
+@manager.route('/<dataset_id>', methods=['POST'])
+@login_required
+def upload(dataset_id):
+    # no files
+    if not request.files:
+        return construct_json_result(
+            message='There is no file!', code=RetCode.ARGUMENT_ERROR)
+
+    # the number of uploading files exceeds the limit
+    file_objs = request.files.getlist('file')
+    num_file_objs = len(file_objs)
+
+    if num_file_objs > MAXIMUM_OF_UPLOADING_FILES:
+        return construct_json_result(code=RetCode.DATA_ERROR, message=f"You try to upload {num_file_objs} files, "
+                                                                      f"which exceeds the maximum number of uploading files: {MAXIMUM_OF_UPLOADING_FILES}")
+
+    for file_obj in file_objs:
+        # the content of the file
+        file_content = file_obj.read()
+        file_name = file_obj.filename
+        # no name
+        if not file_name:
+            return construct_json_result(
+                message='There is a file without name!', code=RetCode.ARGUMENT_ERROR)
+
+        # TODO: support the remote files
+        if 'http' in file_name:
+            return construct_json_result(code=RetCode.ARGUMENT_ERROR, message="Remote files have not unsupported.")
+
+        # the content is empty, raising a warning
+        if file_content == b'':
+            warnings.warn(f"[WARNING]: The file {file_name} is empty.")
+
+    # no dataset
+    exist, dataset = KnowledgebaseService.get_by_id(dataset_id)
+    if not exist:
+        return construct_json_result(message="Can't find this dataset", code=RetCode.DATA_ERROR)
+
+    # get the root_folder
+    root_folder = FileService.get_root_folder(current_user.id)
+    # get the id of the root_folder
+    parent_file_id = root_folder["id"]  # document id
+    # this is for the new user, create '.knowledgebase' file
+    FileService.init_knowledgebase_docs(parent_file_id, current_user.id)
+    # go inside this folder, get the kb_root_folder
+    kb_root_folder = FileService.get_kb_folder(current_user.id)
+    # link the file management to the kb_folder
+    kb_folder = FileService.new_a_file_from_kb(dataset.tenant_id, dataset.name, kb_root_folder["id"])
+
+    # grab all the errs
+    err = []
+    MAX_FILE_NUM_PER_USER = int(os.environ.get('MAX_FILE_NUM_PER_USER', 0))
+    for file in file_objs:
+        try:
+            # TODO: get this value from the database as some tenants have this limit while others don't
+            if MAX_FILE_NUM_PER_USER > 0 and DocumentService.get_doc_count(dataset.tenant_id) >= MAX_FILE_NUM_PER_USER:
+                return construct_json_result(code=RetCode.DATA_ERROR,
+                                             message="Exceed the maximum file number of a free user!")
+            # deal with the duplicate name
+            filename = duplicate_name(
+                DocumentService.query,
+                name=file.filename,
+                kb_id=dataset.id)
+
+            # deal with the unsupported type
+            filetype = filename_type(filename)
+            if filetype == FileType.OTHER.value:
+                return construct_json_result(code=RetCode.DATA_ERROR,
+                                             message="This type of file has not been supported yet!")
+
+            # upload to the minio
+            location = filename
+            while MINIO.obj_exist(dataset_id, location):
+                location += "_"
+            blob = file.read()
+            MINIO.put(dataset_id, location, blob)
+            doc = {
+                "id": get_uuid(),
+                "kb_id": dataset.id,
+                "parser_id": dataset.parser_id,
+                "parser_config": dataset.parser_config,
+                "created_by": current_user.id,
+                "type": filetype,
+                "name": filename,
+                "location": location,
+                "size": len(blob),
+                "thumbnail": thumbnail(filename, blob)
+            }
+            if doc["type"] == FileType.VISUAL:
+                doc["parser_id"] = ParserType.PICTURE.value
+            if re.search(r"\.(ppt|pptx|pages)$", filename):
+                doc["parser_id"] = ParserType.PRESENTATION.value
+            DocumentService.insert(doc)
+
+            FileService.add_file_from_kb(doc, kb_folder["id"], dataset.tenant_id)
+        except Exception as e:
+            err.append(file.filename + ": " + str(e))
+
+    if err:
+        # return all the errors
+        return construct_json_result(message="\n".join(err), code=RetCode.SERVER_ERROR)
+    # success
+    return construct_json_result(data=True, code=RetCode.SUCCESS)
+
+# ----------------------------upload online files------------------------------------------------
+
+# ----------------------------download a file-----------------------------------------------------
+
+# ----------------------------delete a file-----------------------------------------------------
+
+# ----------------------------enable rename-----------------------------------------------------
+
+# ----------------------------list files-----------------------------------------------------
+
+# ----------------------------start parsing-----------------------------------------------------
+
+# ----------------------------stop parsing-----------------------------------------------------
+
+# ----------------------------show the status of the file-----------------------------------------------------
+
+# ----------------------------list the chunks of the file-----------------------------------------------------
+
+# ----------------------------delete the chunk-----------------------------------------------------
+
+# ----------------------------edit the status of the chunk-----------------------------------------------------
+
+# ----------------------------insert a new chunk-----------------------------------------------------
+
+# ----------------------------upload a file-----------------------------------------------------
+
+# ----------------------------get a specific chunk-----------------------------------------------------
+
+# ----------------------------retrieval test-----------------------------------------------------
diff --git a/sdk/python/ragflow/ragflow.py b/sdk/python/ragflow/ragflow.py
@@ -13,20 +13,25 @@
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
 
+import json
 import os
+
 import requests
-import json
+
+from api.settings import RetCode
 
 
 class RAGFlow:
     def __init__(self, user_key, base_url, version='v1'):
         '''
         api_url: http://<host_address>/api/v1
         dataset_url: http://<host_address>/api/v1/dataset
+        document_url: http://<host_address>/api/v1/documents
         '''
         self.user_key = user_key
         self.api_url = f"{base_url}/api/{version}"
         self.dataset_url = f"{self.api_url}/dataset"
+        self.document_url = f"{self.api_url}/documents"
         self.authorization_header = {"Authorization": "{}".format(self.user_key)}
 
     def create_dataset(self, dataset_name):
@@ -73,3 +78,54 @@ def update_dataset(self, dataset_name, **params):
         endpoint = f"{self.dataset_url}/{dataset_id}"
         response = requests.put(endpoint, json=params, headers=self.authorization_header)
         return response.json()
+
+# -------------------- content management -----------------------------------------------------
+
+    # ----------------------------upload local files-----------------------------------------------------
+    def upload_local_file(self, dataset_id, file_paths):
+        files = []
+
+        for file_path in file_paths:
+            if not isinstance(file_path, str):
+                return {'code': RetCode.ARGUMENT_ERROR, 'message': f"{file_path} is not string."}
+            if 'http' in file_path:
+                return {'code': RetCode.ARGUMENT_ERROR, 'message': "Remote files have not unsupported."}
+            if os.path.isfile(file_path):
+                files.append(('file', open(file_path, 'rb')))
+            else:
+                return {'code': RetCode.DATA_ERROR, 'message': f"The file {file_path} does not exist"}
+
+        res = requests.request('POST', url=f"{self.document_url}/{dataset_id}", files=files,
+                               headers=self.authorization_header)
+
+        result_dict = json.loads(res.text)
+        return result_dict
+
+    # ----------------------------upload remote files-----------------------------------------------------
+    # ----------------------------download a file-----------------------------------------------------
+
+    # ----------------------------delete a file-----------------------------------------------------
+
+    # ----------------------------enable rename-----------------------------------------------------
+
+    # ----------------------------list files-----------------------------------------------------
+
+    # ----------------------------start parsing-----------------------------------------------------
+
+    # ----------------------------stop parsing-----------------------------------------------------
+
+    # ----------------------------show the status of the file-----------------------------------------------------
+
+    # ----------------------------list the chunks of the file-----------------------------------------------------
+
+    # ----------------------------delete the chunk-----------------------------------------------------
+
+    # ----------------------------edit the status of the chunk-----------------------------------------------------
+
+    # ----------------------------insert a new chunk-----------------------------------------------------
+
+    # ----------------------------upload a file-----------------------------------------------------
+
+    # ----------------------------get a specific chunk-----------------------------------------------------
+
+    # ----------------------------retrieval test-----------------------------------------------------
diff --git a/sdk/python/test/test_data/.txt b/sdk/python/test/test_data/.txt
@@ -0,0 +1,2 @@
+hhh
+hhh
diff --git a/sdk/python/test/test_data/empty.txt b/sdk/python/test/test_data/empty.txt
diff --git a/sdk/python/test/test_data/test.txt b/sdk/python/test/test_data/test.txt
@@ -0,0 +1,3 @@
+test
+test
+test
diff --git a/sdk/python/test/test_data/test1.txt b/sdk/python/test/test_data/test1.txt
@@ -0,0 +1,2 @@
+test1
+test1