Skip to content

Commit

Permalink
support snapshot download from local (infiniflow#153)
Browse files Browse the repository at this point in the history
* support snapshot download from local

* let snapshot download from local
  • Loading branch information
KevinHuSh authored Mar 27, 2024
1 parent da21320 commit 979b3a5
Show file tree
Hide file tree
Showing 12 changed files with 109 additions and 24 deletions.
12 changes: 9 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
<div align="center">
<a href="https://ragflow.io/">
<a href="https://demo.ragflow.io/">
<img src="https://github.com/infiniflow/ragflow/assets/12318111/f034fb27-b3bf-401b-b213-e1dfa7448d2a" width="320" alt="ragflow logo">
</a>
</div>
Expand All @@ -11,7 +11,7 @@
</p>

<p align="center">
<a href="https://ragflow.io" target="_blank">
<a href="https://demo.ragflow.io" target="_blank">
<img alt="Static Badge" src="https://img.shields.io/badge/RAGFLOW-LLM-white?&labelColor=dd0af7"></a>
<a href="https://hub.docker.com/r/infiniflow/ragflow" target="_blank">
<img src="https://img.shields.io/badge/docker_pull-ragflow:v1.0-brightgreen"
Expand All @@ -21,7 +21,7 @@
</a>
</p>

[RagFlow](http://ragflow.io) is a knowledge management platform built on custom-build document understanding engine and LLM,
[RagFlow](http://demo.ragflow.io) is a knowledge management platform built on a custom-built document understanding engine and LLM,
providing reasoned and well-founded answers to your questions. By cloning this repository, you can deploy your own knowledge management
platform to empower your business with AI.

Expand Down Expand Up @@ -119,6 +119,12 @@ Open your browser, enter the IP address of your server, _**Hallelujah**_ again!
> The default serving port is 80. If you want to change it, please refer to [docker-compose.yml](./docker-compose.yaml)
> and change the left part of *'80:80'*.
# System Architecture Diagram

<div align="center" style="margin-top:20px;margin-bottom:20px;">
<img src="https://github.com/infiniflow/ragflow/assets/12318111/39c8e546-51ca-4b50-a1da-83731b540cd0" width="1000"/>
</div>

# Configuration
If you need to change the default settings of the system when you deploy it, there are several ways to configure it.
Please refer to [README](./docker/README.md) and manually set the configuration.
Expand Down
7 changes: 6 additions & 1 deletion api/apps/conversation_app.py
Original file line number Diff line number Diff line change
Expand Up @@ -320,8 +320,13 @@ def get_table():
rows = re.sub(r"T[0-9]{2}:[0-9]{2}:[0-9]{2}(\.[0-9]+Z)?\|", "|", rows)
docid_idx = list(docid_idx)[0]
docnm_idx = list(docnm_idx)[0]
doc_aggs = {}
for r in tbl["rows"]:
if r[docid_idx] not in doc_aggs:
doc_aggs[r[docid_idx]] = {"doc_name": r[docnm_idx], "count": 0}
doc_aggs[r[docid_idx]]["count"] += 1
return {
"answer": "\n".join([clmns, line, rows]),
"reference": {"chunks": [{"doc_id": r[docid_idx], "docnm_kwd": r[docnm_idx]} for r in tbl["rows"]],
"doc_aggs": [{"doc_id": r[docid_idx], "doc_name": r[docnm_idx], "count": 1} for r in tbl["rows"]]}
"doc_aggs":[{"doc_id": did, "doc_name": d["doc_name"], "count": d["count"]} for did, d in doc_aggs.items()]}
}
18 changes: 15 additions & 3 deletions deepdoc/parser/pdf_parser.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
# -*- coding: utf-8 -*-
import os
import random

import fitz
Expand All @@ -12,10 +13,12 @@
import numpy as np

from PyPDF2 import PdfReader as pdf2_read

from api.utils.file_utils import get_project_base_directory
from deepdoc.vision import OCR, Recognizer, LayoutRecognizer, TableStructureRecognizer
from rag.nlp import huqie
from copy import deepcopy
from huggingface_hub import hf_hub_download
from huggingface_hub import hf_hub_download, snapshot_download

logging.getLogger("pdfminer").setLevel(logging.WARNING)

Expand All @@ -32,8 +35,17 @@ def __init__(self):
self.updown_cnt_mdl = xgb.Booster()
if torch.cuda.is_available():
self.updown_cnt_mdl.set_param({"device": "cuda"})
self.updown_cnt_mdl.load_model(hf_hub_download(repo_id="InfiniFlow/text_concat_xgb_v1.0",
filename="updown_concat_xgb.model"))
try:
model_dir = snapshot_download(
repo_id="InfiniFlow/text_concat_xgb_v1.0",
local_dir=os.path.join(
get_project_base_directory(),
"rag/res/deepdoc"),
local_files_only=True)
except Exception as e:
model_dir = snapshot_download(repo_id="InfiniFlow/text_concat_xgb_v1.0")

self.updown_cnt_mdl.load_model(os.path.join(model_dir, "updown_concat_xgb.model"))
self.page_from = 0
"""
If you have trouble downloading HuggingFace models, -_^ this might help!!
Expand Down
11 changes: 10 additions & 1 deletion deepdoc/vision/layout_recognizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,16 @@ class LayoutRecognizer(Recognizer):
"Equation",
]
def __init__(self, domain):
model_dir = snapshot_download(repo_id="InfiniFlow/deepdoc")
try:
model_dir = snapshot_download(
repo_id="InfiniFlow/deepdoc",
local_dir=os.path.join(
get_project_base_directory(),
"rag/res/deepdoc"),
local_files_only=True)
except Exception as e:
model_dir = snapshot_download(repo_id="InfiniFlow/deepdoc")

super().__init__(self.labels, domain, model_dir)#os.path.join(get_project_base_directory(), "rag/res/deepdoc/"))
self.garbage_layouts = ["footer", "header", "reference"]

Expand Down
40 changes: 31 additions & 9 deletions deepdoc/vision/ocr.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,13 +14,18 @@
import copy
import time
import os

from huggingface_hub import snapshot_download

from api.utils.file_utils import get_project_base_directory
from .operators import *
import numpy as np
import onnxruntime as ort

from .postprocess import build_post_process
from rag.settings import cron_logger


def transform(data, ops=None):
""" transform """
if ops is None:
Expand Down Expand Up @@ -66,9 +71,15 @@ def load_model(model_dir, nm):
options.intra_op_num_threads = 2
options.inter_op_num_threads = 2
if False and ort.get_device() == "GPU":
sess = ort.InferenceSession(model_file_path, options=options, providers=['CUDAExecutionProvider'])
sess = ort.InferenceSession(
model_file_path,
options=options,
providers=['CUDAExecutionProvider'])
else:
sess = ort.InferenceSession(model_file_path, options=options, providers=['CPUExecutionProvider'])
sess = ort.InferenceSession(
model_file_path,
options=options,
providers=['CPUExecutionProvider'])
return sess, sess.get_inputs()[0]


Expand Down Expand Up @@ -331,7 +342,8 @@ def __call__(self, img_list):
outputs = self.predictor.run(None, input_dict)
break
except Exception as e:
if i >= 3: raise e
if i >= 3:
raise e
time.sleep(5)
preds = outputs[0]
rec_result = self.postprocess_op(preds)
Expand Down Expand Up @@ -442,7 +454,8 @@ def __call__(self, img):
outputs = self.predictor.run(None, input_dict)
break
except Exception as e:
if i >= 3: raise e
if i >= 3:
raise e
time.sleep(5)

post_result = self.postprocess_op({"maps": outputs[0]}, shape_list)
Expand All @@ -466,7 +479,15 @@ def __init__(self, model_dir=None):
"""
if not model_dir:
model_dir = snapshot_download(repo_id="InfiniFlow/deepdoc")
try:
model_dir = snapshot_download(
repo_id="InfiniFlow/deepdoc",
local_dir=os.path.join(
get_project_base_directory(),
"rag/res/deepdoc"),
local_files_only=True)
except Exception as e:
model_dir = snapshot_download(repo_id="InfiniFlow/deepdoc")

self.text_detector = TextDetector(model_dir)
self.text_recognizer = TextRecognizer(model_dir)
Expand Down Expand Up @@ -548,14 +569,16 @@ def detect(self, img):
cron_logger.debug("dt_boxes num : {}, elapsed : {}".format(
len(dt_boxes), elapse))

return zip(self.sorted_boxes(dt_boxes), [("",0) for _ in range(len(dt_boxes))])
return zip(self.sorted_boxes(dt_boxes), [
("", 0) for _ in range(len(dt_boxes))])

def recognize(self, ori_im, box):
img_crop = self.get_rotate_crop_image(ori_im, box)

rec_res, elapse = self.text_recognizer([img_crop])
text, score = rec_res[0]
if score < self.drop_score:return ""
if score < self.drop_score:
return ""
return text

def __call__(self, img, cls=True):
Expand Down Expand Up @@ -600,8 +623,7 @@ def __call__(self, img, cls=True):
end = time.time()
time_dict['all'] = end - start


#for bno in range(len(img_crop_list)):
# for bno in range(len(img_crop_list)):
# print(f"{bno}, {rec_res[bno]}")

return list(zip([a.tolist() for a in filter_boxes], filter_rec_res))
11 changes: 10 additions & 1 deletion deepdoc/vision/recognizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
import onnxruntime as ort
from huggingface_hub import snapshot_download

from api.utils.file_utils import get_project_base_directory
from .operators import *
from rag.settings import cron_logger

Expand All @@ -35,7 +36,15 @@ def __init__(self, label_list, task_name, model_dir=None):
"""
if not model_dir:
model_dir = snapshot_download(repo_id="InfiniFlow/deepdoc")
try:
model_dir = snapshot_download(
repo_id="InfiniFlow/deepdoc",
local_dir=os.path.join(
get_project_base_directory(),
"rag/res/deepdoc"),
local_files_only=True)
except Exception as e:
model_dir = snapshot_download(repo_id="InfiniFlow/deepdoc")

model_file_path = os.path.join(model_dir, task_name + ".onnx")
if not os.path.exists(model_file_path):
Expand Down
11 changes: 10 additions & 1 deletion deepdoc/vision/table_structure_recognizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,16 @@ class TableStructureRecognizer(Recognizer):
]

def __init__(self):
model_dir = snapshot_download(repo_id="InfiniFlow/deepdoc")
try:
model_dir = snapshot_download(
repo_id="InfiniFlow/deepdoc",
local_dir=os.path.join(
get_project_base_directory(),
"rag/res/deepdoc"),
local_files_only=True)
except Exception as e:
model_dir = snapshot_download(repo_id="InfiniFlow/deepdoc")

super().__init__(self.labels, "tsr", model_dir)#os.path.join(get_project_base_directory(), "rag/res/deepdoc/"))

def __call__(self, images, thr=0.2):
Expand Down
2 changes: 1 addition & 1 deletion docker/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ The serving IP and port inside the docker container. This is not updating until
Newly signed-up users use the LLM configured in this part. Otherwise, users need to configure their own LLM in *setting*.

### factory
The LLM suppliers. '通义千问', "OpenAI" and "智谱AI" are supported.
The LLM suppliers. "Tongyi-Qianwen", "OpenAI", "Moonshot" and "ZHIPU-AI" are supported.

### api_key
The corresponding API key of your assigned LLM vendor.
Expand Down
2 changes: 1 addition & 1 deletion docker/entrypoint.sh
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ function task_bro(){

task_bro &

WS=8
WS=2
for ((i=0;i<WS;i++))
do
task_exe $i $WS &
Expand Down
2 changes: 1 addition & 1 deletion docker/service_conf.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ minio:
es:
hosts: 'http://es01:9200'
user_default_llm:
factory: '通义千问'
factory: 'Tongyi-Qianwen'
api_key: 'sk-xxxxxxxxxxxxx'
oauth:
github:
Expand Down
15 changes: 14 additions & 1 deletion rag/llm/embedding_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
import os
from abc import ABC

import dashscope
Expand All @@ -21,9 +22,21 @@
import torch
import numpy as np
from huggingface_hub import snapshot_download

from api.utils.file_utils import get_project_base_directory
from rag.utils import num_tokens_from_string

flag_model = FlagModel(snapshot_download("BAAI/bge-large-zh-v1.5", local_files_only=True),
try:
model_dir = snapshot_download(
repo_id="BAAI/bge-large-zh-v1.5",
local_dir=os.path.join(
get_project_base_directory(),
"rag/res/bge-large-zh-v1.5"),
local_files_only=True)
except Exception as e:
model_dir = snapshot_download(repo_id="BAAI/bge-large-zh-v1.5")

flag_model = FlagModel(model_dir,
query_instruction_for_retrieval="为这个句子生成表示以用于检索相关文章:",
use_fp16=torch.cuda.is_available())

Expand Down
2 changes: 1 addition & 1 deletion rag/svr/task_executor.py
Original file line number Diff line number Diff line change
Expand Up @@ -172,7 +172,7 @@ def init_kb(row):
def embedding(docs, mdl, parser_config={}, callback=None):
batch_size = 32
tts, cnts = [rmSpace(d["title_tks"]) for d in docs if d.get("title_tks")], [
d["content_with_weight"] for d in docs]
re.sub(r"</?(table|td|caption|tr|th)( [^<>]{0,12})?>", " ", d["content_with_weight"]) for d in docs]
tk_count = 0
if len(tts) == len(cnts):
tts_ = np.array([])
Expand Down

0 comments on commit 979b3a5

Please sign in to comment.