Skip to content

Commit

Permalink
Adapt Infinity to GraphRAG. (#4663)
Browse files Browse the repository at this point in the history
### What problem does this PR solve?

Adapts the Infinity document store to GraphRAG: adds the `whitespace` analyzer to the `source_id` field in the Infinity mapping, removes the duplicated 2x-resolution page-image pass in the PDF parser (OCR now runs on the standard-resolution images), and passes `kb_id` to `docStoreConn.insert` in the GraphRAG entity/relation/graph write paths so inserts are scoped to the knowledge base.
### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
  • Loading branch information
KevinHuSh authored Jan 27, 2025
1 parent d970d0e commit 6f30397
Show file tree
Hide file tree
Showing 3 changed files with 7 additions and 10 deletions.
3 changes: 1 addition & 2 deletions conf/infinity_mapping.json
Original file line number Diff line number Diff line change
Expand Up @@ -32,12 +32,11 @@
"pagerank_fea": {"type": "integer", "default": 0},
"tag_feas": {"type": "varchar", "default": ""},

"important_kwd": {"type": "varchar", "default": "", "analyzer": "whitespace"},
"from_entity_kwd": {"type": "varchar", "default": "", "analyzer": "whitespace"},
"to_entity_kwd": {"type": "varchar", "default": "", "analyzer": "whitespace"},
"entity_kwd": {"type": "varchar", "default": "", "analyzer": "whitespace"},
"entity_type_kwd": {"type": "varchar", "default": "", "analyzer": "whitespace"},
"source_id": {"type": "varchar", "default": ""},
"source_id": {"type": "varchar", "default": "", "analyzer": "whitespace"},
"n_hop_with_weight": {"type": "varchar", "default": ""},
"removed_kwd": {"type": "varchar", "default": "", "analyzer": "whitespace"}
}
8 changes: 3 additions & 5 deletions deepdoc/parser/pdf_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -956,8 +956,6 @@ def __images__(self, fnm, zoomin=3, page_from=0,
fnm, str) else pdfplumber.open(BytesIO(fnm))
self.page_images = [p.to_image(resolution=72 * zoomin).annotated for i, p in
enumerate(self.pdf.pages[page_from:page_to])]
self.page_images_x2 = [p.to_image(resolution=72 * zoomin * 2).annotated for i, p in
enumerate(self.pdf.pages[page_from:page_to])]
try:
self.page_chars = [[{**c, 'top': c['top'], 'bottom': c['bottom']} for c in page.dedupe_chars().chars if self._has_color(c)] for page in self.pdf.pages[page_from:page_to]]
except Exception as e:
Expand Down Expand Up @@ -997,15 +995,15 @@ def dfs(arr, depth):
self.is_english = False

# st = timer()
for i, img in enumerate(self.page_images_x2):
for i, img in enumerate(self.page_images):
chars = self.page_chars[i] if not self.is_english else []
self.mean_height.append(
np.median(sorted([c["height"] for c in chars])) if chars else 0
)
self.mean_width.append(
np.median(sorted([c["width"] for c in chars])) if chars else 8
)
self.page_cum_height.append(img.size[1] / zoomin/2)
self.page_cum_height.append(img.size[1] / zoomin)
j = 0
while j + 1 < len(chars):
if chars[j]["text"] and chars[j + 1]["text"] \
Expand All @@ -1015,7 +1013,7 @@ def dfs(arr, depth):
chars[j]["text"] += " "
j += 1

self.__ocr(i + 1, img, chars, zoomin*2)
self.__ocr(i + 1, img, chars, zoomin)
if callback and i % 6 == 5:
callback(prog=(i + 1) * 0.6 / len(self.page_images), msg="")
# print("OCR:", timer()-st)
Expand Down
6 changes: 3 additions & 3 deletions graphrag/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -284,7 +284,7 @@ def set_entity(tenant_id, kb_id, embd_mdl, ent_name, meta):
logging.exception(f"Fail to embed entity: {e}")
if ebd is not None:
chunk["q_%d_vec" % len(ebd)] = ebd
settings.docStoreConn.insert([{"id": chunk_id(chunk), **chunk}], search.index_name(tenant_id))
settings.docStoreConn.insert([{"id": chunk_id(chunk), **chunk}], search.index_name(tenant_id), kb_id)


def get_relation(tenant_id, kb_id, from_ent_name, to_ent_name, size=1):
Expand Down Expand Up @@ -347,7 +347,7 @@ def set_relation(tenant_id, kb_id, embd_mdl, from_ent_name, to_ent_name, meta):
logging.exception(f"Fail to embed entity relation: {e}")
if ebd is not None:
chunk["q_%d_vec" % len(ebd)] = ebd
settings.docStoreConn.insert([{"id": chunk_id(chunk), **chunk}], search.index_name(tenant_id))
settings.docStoreConn.insert([{"id": chunk_id(chunk), **chunk}], search.index_name(tenant_id), kb_id)


def get_graph(tenant_id, kb_id):
Expand Down Expand Up @@ -382,7 +382,7 @@ def set_graph(tenant_id, kb_id, graph, docids):
settings.docStoreConn.update({"knowledge_graph_kwd": "graph"}, chunk,
search.index_name(tenant_id), kb_id)
else:
settings.docStoreConn.insert([{"id": chunk_id(chunk), **chunk}], search.index_name(tenant_id))
settings.docStoreConn.insert([{"id": chunk_id(chunk), **chunk}], search.index_name(tenant_id), kb_id)


def is_continuous_subsequence(subseq, seq):
Expand Down

0 comments on commit 6f30397

Please sign in to comment.