fix plainPdf bugs (infiniflow#152)
KevinHuSh authored Mar 26, 2024
1 parent 75f7c6d commit da21320
Showing 13 changed files with 36 additions and 33 deletions.
12 changes: 7 additions & 5 deletions api/apps/conversation_app.py
@@ -183,9 +183,7 @@ def chat(dialog, messages, **kwargs):
## try to use sql if field mapping is good to go
if field_map:
chat_logger.info("Use SQL to retrieval:{}".format(questions[-1]))
- markdown_tbl, chunks = use_sql(questions[-1], field_map, dialog.tenant_id, chat_mdl)
- if markdown_tbl:
- return {"answer": markdown_tbl, "reference": {"chunks": chunks, "doc_aggs": []}}
+ return use_sql(questions[-1], field_map, dialog.tenant_id, chat_mdl)

prompt_config = dialog.prompt_config
for p in prompt_config["parameters"]:
@@ -311,7 +309,7 @@ def get_table():
clmn_idx = [ii for ii in range(len(tbl["columns"])) if ii not in (docid_idx | docnm_idx)]

# compose markdown table
clmns = "|"+"|".join([re.sub(r"(/.*|([^()]+))", "", field_map.get(tbl["columns"][i]["name"], tbl["columns"][i]["name"])) for i in clmn_idx]) + ("|原文|" if docid_idx and docid_idx else "|")
clmns = "|"+"|".join([re.sub(r"(/.*|([^()]+))", "", field_map.get(tbl["columns"][i]["name"], tbl["columns"][i]["name"])) for i in clmn_idx]) + ("|Source|" if docid_idx and docid_idx else "|")
line = "|"+"|".join(["------" for _ in range(len(clmn_idx))]) + ("|------|" if docid_idx and docid_idx else "")
rows = ["|"+"|".join([rmSpace(str(r[i])) for i in clmn_idx]).replace("None", " ") + "|" for r in tbl["rows"]]
if not docid_idx or not docnm_idx:
@@ -322,4 +320,8 @@ def get_table():
rows = re.sub(r"T[0-9]{2}:[0-9]{2}:[0-9]{2}(\.[0-9]+Z)?\|", "|", rows)
docid_idx = list(docid_idx)[0]
docnm_idx = list(docnm_idx)[0]
return "\n".join([clmns, line, rows]), [{"doc_id": r[docid_idx], "docnm_kwd": r[docnm_idx]} for r in tbl["rows"]]
return {
"answer": "\n".join([clmns, line, rows]),
"reference": {"chunks": [{"doc_id": r[docid_idx], "docnm_kwd": r[docnm_idx]} for r in tbl["rows"]],
"doc_aggs": [{"doc_id": r[docid_idx], "doc_name": r[docnm_idx], "count": 1} for r in tbl["rows"]]}
}
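
Note: with this change use_sql() builds the full chat payload itself, so chat() returns its result directly instead of re-wrapping a markdown table. A minimal sketch of the payload shape implied by the new return statement (the helper name is illustrative, not from the codebase):

def build_sql_answer(markdown_tbl, rows, docid_idx, docnm_idx):
    # Same shape as the dict returned at the end of get_table() above.
    return {
        "answer": markdown_tbl,
        "reference": {
            "chunks": [{"doc_id": r[docid_idx], "docnm_kwd": r[docnm_idx]} for r in rows],
            "doc_aggs": [{"doc_id": r[docid_idx], "doc_name": r[docnm_idx], "count": 1} for r in rows],
        },
    }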
2 changes: 1 addition & 1 deletion deepdoc/parser/pdf_parser.py
@@ -996,7 +996,7 @@ def crop(self, text, ZM=3, need_position=False):
if need_position: return None, None
return

- max_width = np.max([right - left for (_, left, right, _, _) in poss])
+ max_width = max(np.max([right - left for (_, left, right, _, _) in poss]), 6)
GAP = 6
pos = poss[0]
poss.insert(0, ([pos[0][0]], pos[1], pos[2], max(0, pos[3] - 120), max(pos[3] - GAP, 0)))
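
Note: crop() now floors max_width at 6, so a set of degenerate (zero-width) boxes can no longer yield an empty crop region. A standalone sketch of the clamping with a made-up positions list:

import numpy as np

poss = [("txt", 100.0, 100.0, 10.0, 20.0)]  # hypothetical (text, left, right, top, bottom), zero width
max_width = max(np.max([right - left for (_, left, right, _, _) in poss]), 6)
assert max_width == 6  # floored instead of collapsing to 0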
6 changes: 1 addition & 5 deletions deepdoc/vision/ocr.py
@@ -14,17 +14,13 @@
import copy
import time
import os

- from huggingface_hub import snapshot_download

from .operators import *
import numpy as np
import onnxruntime as ort

from .postprocess import build_post_process
from rag.settings import cron_logger


def transform(data, ops=None):
""" transform """
if ops is None:
@@ -82,7 +78,7 @@ def __init__(self, model_dir):
self.rec_batch_num = 16
postprocess_params = {
'name': 'CTCLabelDecode',
"character_dict_path": os.path.join(os.path.dirname(os.path.realpath(__file__)), "ocr.res"),
"character_dict_path": os.path.join(model_dir, "ocr.res"),
"use_space_char": True
}
self.postprocess_op = build_post_process(postprocess_params)
4 changes: 3 additions & 1 deletion deepdoc/vision/table_structure_recognizer.py
@@ -16,6 +16,7 @@
from collections import Counter

import numpy as np
+ from huggingface_hub import snapshot_download

from api.utils.file_utils import get_project_base_directory
from rag.nlp import huqie
@@ -33,7 +34,8 @@ class TableStructureRecognizer(Recognizer):
]

def __init__(self):
- super().__init__(self.labels, "tsr",os.path.join(get_project_base_directory(), "rag/res/deepdoc/"))
+ model_dir = snapshot_download(repo_id="InfiniFlow/deepdoc")
+ super().__init__(self.labels, "tsr", model_dir)#os.path.join(get_project_base_directory(), "rag/res/deepdoc/"))

def __call__(self, images, thr=0.2):
tbls = super().__call__(images, thr)
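
Note: the table-structure-recognition weights are now pulled from the Hugging Face Hub instead of the bundled rag/res/deepdoc directory. A minimal sketch of the download step (the snapshot is cached locally after the first call):

from huggingface_hub import snapshot_download

model_dir = snapshot_download(repo_id="InfiniFlow/deepdoc")  # returns the local snapshot path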
4 changes: 2 additions & 2 deletions rag/app/laws.py
@@ -68,7 +68,7 @@ def __call__(self, filename, binary=None, from_page=0,

callback(0.8, "Text extraction finished")

return [(b["text"], self._line_tag(b, zoomin)) for b in self.boxes]
return [(b["text"], self._line_tag(b, zoomin)) for b in self.boxes], None


def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs):
@@ -91,7 +91,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
elif re.search(r"\.pdf$", filename, re.IGNORECASE):
pdf_parser = Pdf() if kwargs.get("parser_config",{}).get("layout_recognize", True) else PlainParser()
for txt, poss in pdf_parser(filename if not binary else binary,
- from_page=from_page, to_page=to_page, callback=callback):
+ from_page=from_page, to_page=to_page, callback=callback)[0]:
sections.append(txt + poss)

elif re.search(r"\.txt$", filename, re.IGNORECASE):
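
Note: laws.Pdf.__call__ now returns a (sections, tables) pair (tables is None here), which is why this caller and the ones below index [0]. A hedged stand-in showing the new calling convention:

def fake_pdf_parser(filename, from_page=0, to_page=100000):
    # Stand-in for the parser: it now returns (sections, tables) instead of a bare list.
    sections = [("Article 1 ...", " [page-1 position tag]")]
    return sections, None

for txt, poss in fake_pdf_parser("law.pdf")[0]:  # callers take element [0]
    print(txt + poss)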
2 changes: 1 addition & 1 deletion rag/app/paper.py
@@ -136,7 +136,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
"title": filename,
"authors": " ",
"abstract": "",
"sections": pdf_parser(filename if not binary else binary, from_page=from_page, to_page=to_page),
"sections": pdf_parser(filename if not binary else binary, from_page=from_page, to_page=to_page)[0],
"tables": []
}
else:
2 changes: 1 addition & 1 deletion rag/app/presentation.py
@@ -66,7 +66,7 @@ def __call__(self, filename, binary=None, from_page=0, to_page=100000, zoomin=3,

class PlainPdf(PlainParser):
def __call__(self, filename, binary=None, from_page=0, to_page=100000, callback=None, **kwargs):
- self.pdf = pdf2_read(filename if not binary else BytesIO(filename))
+ self.pdf = pdf2_read(filename if not binary else BytesIO(binary))
page_txt = []
for page in self.pdf.pages[from_page: to_page]:
page_txt.append(page.extract_text())
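
Note: this is the plainPdf bug named in the commit title: when raw bytes are supplied, BytesIO must wrap binary, not the filename string. A minimal sketch of the fixed branch, using pypdf's PdfReader as a stand-in for pdf2_read:

from io import BytesIO
from pypdf import PdfReader  # assumption: the project's pdf2_read behaves like a PdfReader

def open_pdf(filename, binary=None):
    return PdfReader(filename if not binary else BytesIO(binary))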
2 changes: 1 addition & 1 deletion rag/app/resume.py
@@ -40,7 +40,7 @@ def remote_call(filename, binary):
"encrypt_type": "base64",
"filename": filename,
"langtype": '',
"fileori": base64.b64encode(binary.stream.read()).decode('utf-8')
"fileori": base64.b64encode(binary).decode('utf-8')
},
"c": "resume_parse_module",
"m": "resume_parse"
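
Note: remote_call() now receives the file content as raw bytes, so it is base64-encoded directly rather than via binary.stream.read(). Sketch with dummy bytes:

import base64

binary = b"%PDF-1.4 ..."                         # hypothetical raw file content
fileori = base64.b64encode(binary).decode('utf-8')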
4 changes: 2 additions & 2 deletions rag/llm/embedding_model.py
@@ -20,10 +20,10 @@
from FlagEmbedding import FlagModel
import torch
import numpy as np

+ from huggingface_hub import snapshot_download
from rag.utils import num_tokens_from_string

- flag_model = FlagModel("BAAI/bge-large-zh-v1.5",
+ flag_model = FlagModel(snapshot_download("BAAI/bge-large-zh-v1.5", local_files_only=True),
query_instruction_for_retrieval="为这个句子生成表示以用于检索相关文章:",
use_fp16=torch.cuda.is_available())

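
Note: the default BGE embedding model is now resolved through the local Hugging Face cache (local_files_only=True), so importing the module does not download anything; it assumes the snapshot was fetched beforehand. Sketch:

from huggingface_hub import snapshot_download

# Fails if the model has not been downloaded/cached on this machine yet.
model_path = snapshot_download("BAAI/bge-large-zh-v1.5", local_files_only=True)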
4 changes: 2 additions & 2 deletions rag/nlp/query.py
@@ -53,7 +53,7 @@ def question(self, txt, tbl="qa", min_match="60%"):

if not self.isChinese(txt):
tks = huqie.qie(txt).split(" ")
- q = tks
+ q = copy.deepcopy(tks)
for i in range(1, len(tks)):
q.append("\"%s %s\"^2" % (tks[i - 1], tks[i]))
if not q:
@@ -138,7 +138,7 @@ def hybrid_similarity(self, avec, bvecs, atks, btkss, tkweight=0.3,

def toDict(tks):
d = {}
- if isinstance(tks, type("")):
+ if isinstance(tks, str):
tks = tks.split(" ")
for t, c in self.tw.weights(tks):
if t not in d:
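
Note: the first hunk fixes an aliasing bug: q = tks made q and tks the same list, so the quoted bigram phrases appended to q also polluted tks. (The second hunk simply replaces isinstance(tks, type("")) with the idiomatic isinstance(tks, str).) A runnable sketch of the aliasing fix:

import copy

tks = ["deep", "learning"]
q = copy.deepcopy(tks)                           # fixed: q is an independent copy
for i in range(1, len(tks)):
    q.append('"%s %s"^2' % (tks[i - 1], tks[i]))

assert tks == ["deep", "learning"]               # untouched after the fix
assert q == ["deep", "learning", '"deep learning"^2']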
4 changes: 2 additions & 2 deletions rag/nlp/search.py
@@ -234,13 +234,13 @@ def insert_citations(self, answer, chunks, chunk_v,
assert len(ans_v[0]) == len(chunk_v[0]), "The dimension of query and chunk do not match: {} vs. {}".format(
len(ans_v[0]), len(chunk_v[0]))

- chunks_tks = [huqie.qie(ck).split(" ") for ck in chunks]
+ chunks_tks = [huqie.qie(self.qryr.rmWWW(ck)).split(" ") for ck in chunks]
cites = {}
for i, a in enumerate(pieces_):
sim, tksim, vtsim = self.qryr.hybrid_similarity(ans_v[i],
chunk_v,
huqie.qie(
- pieces_[i]).split(" "),
+ self.qryr.rmWWW(pieces_[i])).split(" "),
chunks_tks,
tkweight, vtweight)
mx = np.max(sim) * 0.99
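
Note: both the chunks and the answer pieces are now passed through rmWWW() before tokenization, so common question/stop words no longer inflate the token-overlap part of the citation similarity. An illustrative stand-in for the idea (not the project's rmWWW implementation):

import re

def rm_question_words(txt):
    # Drop a few common interrogatives/stop words before token matching.
    return re.sub(r"\b(what|which|who|how|is|are|the)\b", " ", txt, flags=re.IGNORECASE).strip()

print(rm_question_words("What is the revenue in 2023"))  # roughly "revenue in 2023"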
19 changes: 10 additions & 9 deletions rag/nlp/term_weight.py
@@ -150,9 +150,10 @@ def skill(t):
return 6

def ner(t):
if re.match(r"[0-9,.]{2,}$", t): return 2
if re.match(r"[a-z]{1,2}$", t): return 0.01
if not self.ne or t not in self.ne:
return 1
if re.match(r"[0-9,.]+$", t): return 2
m = {"toxic": 2, "func": 1, "corp": 3, "loca": 3, "sch": 3, "stock": 3,
"firstnm": 1}
return m[self.ne[t]]
@@ -170,11 +171,11 @@ def postag(t):
return 1

def freq(t):
if re.match(r"[0-9\. -]+$", t):
return 10000
if re.match(r"[0-9. -]{2,}$", t):
return 3
s = huqie.freq(t)
- if not s and re.match(r"[a-z\. -]+$", t):
- return 10
+ if not s and re.match(r"[a-z. -]+$", t):
+ return 300
if not s:
s = 0

@@ -188,12 +189,12 @@ def freq(t):
return max(s, 10)

def df(t):
if re.match(r"[0-9\. -]+$", t):
return 100000
if re.match(r"[0-9. -]{2,}$", t):
return 5
if t in self.df:
return self.df[t] + 3
- elif re.match(r"[a-z\. -]+$", t):
- return 3
+ elif re.match(r"[a-z. -]+$", t):
+ return 300
elif len(t) >= 4:
s = [tt for tt in huqie.qieqie(t).split(" ") if len(tt) > 1]
if len(s) > 1:
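
Note: the tightened patterns require at least two characters before a token is treated as purely numeric, short Latin tokens are heavily down-weighted in ner(), and the hard-coded frequency/df fallbacks are rescaled. A quick runnable check of the regex change:

import re

assert re.match(r"[0-9,.]+$", "5")             # old pattern: a lone digit qualified
assert not re.match(r"[0-9,.]{2,}$", "5")      # new pattern: needs two or more characters
assert re.match(r"[0-9,.]{2,}$", "2,023")
assert re.match(r"[a-z]{1,2}$", "of")          # 1-2 letter Latin tokens now get weight 0.01 in ner()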
4 changes: 3 additions & 1 deletion rag/svr/task_broker.py
@@ -87,7 +87,9 @@ def new_task():
if r["parser_id"] == "paper": page_size = r["parser_config"].get("task_page_size", 22)
if r["parser_id"] == "one": page_size = 1000000000
if not do_layout: page_size = 1000000000
for s,e in r["parser_config"].get("pages", [(1, 100000)]):
page_ranges = r["parser_config"].get("pages")
if not page_ranges: page_ranges = [(1, 100000)]
for s,e in page_ranges:
s -= 1
s = max(0, s)
e = min(e-1, pages)
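
Note: dict.get("pages", default) only falls back when the key is missing; if parser_config stores an empty list (or None) the default was never used and no page range got scheduled. A runnable sketch of the failure and the fix:

parser_config = {"pages": []}                        # hypothetical stored config
pages = parser_config.get("pages", [(1, 100000)])    # -> [] : the default is ignored
page_ranges = parser_config.get("pages")
if not page_ranges:
    page_ranges = [(1, 100000)]                      # fix: also fall back on empty/None
assert pages == [] and page_ranges == [(1, 100000)]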
