
Commit 0c4300e

feat(service): remove langchain to add multimodal embedder (#326)
* feat(service): add multimodal embedder
* feat(primitive): remove langchain and re-implement class
1 parent: 552a6d5


51 files changed: +2078 / -999 lines

.gitignore (+1)

```diff
@@ -56,3 +56,4 @@ evaluation/rejection/gt_bad.txt
 evaluation/rejection/gt_good.txt
 workdir832/
 workdir.bak/
+workdir-20240729-kg-included/
```

README.md (+141/-136, large diff not rendered)

README_zh.md (+94/-87, large diff not rendered)

config-2G.ini (+103/-12)

```diff
@@ -9,7 +9,7 @@ work_dir = "workdir"
 
 [web_search]
 engine = "serper"
-# web search engine support `ddgs` and `serper`
+# web search engine support ddgs and serper
 # For ddgs, see https://pypi.org/project/duckduckgo-search
 # For serper, check https://serper.dev/api-key to get a free API key
 serper_x_api_key = "YOUR-API-KEY-HERE"
@@ -20,7 +20,7 @@ save_dir = "logs/web_search_result"
 enable_local = 0
 enable_remote = 1
 # hybrid llm service address
-client_url = "http://127.0.0.1:7777/inference"
+client_url = "http://127.0.0.1:8888/inference"
 
 [llm.server]
 # local LLM configuration
@@ -30,33 +30,44 @@ client_url = "http://127.0.0.1:7777/inference"
 
 local_llm_path = "internlm/internlm2-chat-7b"
 local_llm_max_text_length = 3000
-local_llm_bind_port = 7777
+# llm server listen port
+local_llm_bind_port = 8888
 
 # remote LLM service configuration
-# support "gpt", "kimi", "deepseek", "zhipuai", "step", "xi-api" and "alles-apin"
+# support "gpt", "kimi", "deepseek", "zhipuai", "step", "internlm", "xi-api" and "alles-apin"
+# support "siliconcloud", see https://siliconflow.cn/zh-cn/siliconcloud
 # xi-api and alles-apin is chinese gpt proxy
+# for internlm, see https://internlm.intern-ai.org.cn/api/document
 
-remote_type = "kimi"
+remote_type = "siliconcloud"
 remote_api_key = "YOUR-API-KEY-HERE"
 # max text length for remote LLM.
-# use 128000 for kimi, 192000 for gpt/xi-api, 16000 for deepseek, 128000 for zhipuai
-remote_llm_max_text_length = 128000
+# use 128000 for kimi, 192000 for gpt/xi-api, 16000 for deepseek, 128000 for zhipuai, 40000 for internlm2
+remote_llm_max_text_length = 40000
 # openai API model type, support model list:
 # "auto" for kimi. To save money, we auto select model name by prompt length.
-# "auto" for step to save money, see https://platform.stepfun.com
+# "auto" for step to save money, see https://platform.stepfun.com/
 # "gpt-4-0613" for gpt/xi-api,
 # "deepseek-chat" for deepseek,
 # "glm-4" for zhipuai,
 # "gpt-4-1106-preview" for alles-apin or OpenAOE
-remote_llm_model = "auto"
+# "internlm2-latest" for internlm
+# for example "alibaba/Qwen1.5-110B-Chat", see https://siliconflow.readme.io/reference/chat-completions-1
+remote_llm_model = "alibaba/Qwen1.5-110B-Chat"
 # request per minute
 rpm = 500
 
+[coreference_resolution]
+base_url = 'http://127.0.0.1:9999/v1'
+api_key = 'token-abc123'
+
 [worker]
 # enable web search or not
 enable_web_search = 1
 # enable search enhancement or not
 enable_sg_search = 0
+# enable coreference resolution in `PreprocNode`
+enable_cr = 0
 save_path = "logs/work.txt"
 
 [worker.time]
@@ -68,7 +79,7 @@ has_weekday = 1
 [sg_search]
 # download `src` from https://github.com/sourcegraph/src-cli#installation
 binary_src_path = "/usr/local/bin/src"
-src_access_token = "${YOUR-SRC-ACCESS-TOKEN}"
+src_access_token = "YOUR-SRC-ACCESS-TOKEN"
 
 # add your repo here, we just take opencompass and lmdeploy as example
 [sg_search.opencompass]
@@ -80,12 +91,41 @@ introduction = "用于评测大型语言模型(LLM). 它提供了完整的
 github_repo_id = "internlm/lmdeploy"
 introduction = "lmdeploy 是一个用于压缩、部署和服务 LLM(Large Language Model)的工具包。是一个服务端场景下,transformer 结构 LLM 部署工具,支持 GPU 服务端部署,速度有保障,支持 Tensor Parallel,多并发优化,功能全面,包括模型转换、缓存历史会话的 cache feature 等. 它还提供了 WebUI、命令行和 gRPC 客户端接入。"
 # introduction = "lmdeploy is a toolkit for compressing, deploying, and servicing Large Language Models (LLMs). It is a deployment tool for transformer-structured LLMs in server-side scenarios, supporting GPU server-side deployment, ensuring speed, and supporting Tensor Parallel along with optimizations for multiple concurrent processes. It offers comprehensive features including model conversion, cache features for caching historical sessions and more. Additionally, it provides access via WebUI, command line, and gRPC clients."
+# add your repo here, we just take opencompass and lmdeploy as example
+
+[sg_search.mmpose]
+github_repo_id = "open-mmlab/mmpose"
+introduction = "MMPose is an open-source toolbox for pose estimation based on PyTorch"
+
+[sg_search.mmdetection]
+github_repo_id = "open-mmlab/mmdetection"
+introduction = "MMDetection is an open source object detection toolbox based on PyTorch."
+
+[sg_search.huixiangdou]
+github_repo_id = "internlm/huixiangdou"
+introduction = "HuixiangDou is an LLM-based knowledge assistant for group chats. It uses a two-stage reject/respond pipeline designed for group-chat scenarios, so it answers questions without flooding the chat with messages."
+
+[sg_search.xtuner]
+github_repo_id = "internlm/xtuner"
+introduction = "XTuner is an efficient, flexible and full-featured toolkit for fine-tuning large models."
+
+[sg_search.mmyolo]
+github_repo_id = "open-mmlab/mmyolo"
+introduction = "OpenMMLab YOLO series toolbox and benchmark. Implemented RTMDet, RTMDet-Rotated,YOLOv5, YOLOv6, YOLOv7, YOLOv8,YOLOX, PPYOLOE, etc."
+
+[sg_search.Amphion]
+github_repo_id = "open-mmlab/Amphion"
+introduction = "Amphion is a toolkit for Audio, Music, and Speech Generation. Its purpose is to support reproducible research and help junior researchers and engineers get started in the field of audio, music, and speech generation research and development."
+
+[sg_search.mmcv]
+github_repo_id = "open-mmlab/mmcv"
+introduction = "MMCV is a foundational library for computer vision research and it provides image/video processing, image and annotation visualization, image transformation, various CNN architectures and high-quality implementation of common CPU and CUDA ops"
 
 [frontend]
-# chat group assistant type, support "lark", "lark_group", "wechat_personal" and "none"
-# for "lark", open https://open.feishu.cn/document/client-docs/bot-v3/add-custom-bot to add bot, **only send, cannot receive**
+# chat group assistant type, support "lark_group", "wechat_personal", "wechat_wkteam" and "none"
 # for "lark_group", open https://open.feishu.cn/document/home/introduction-to-custom-app-development/self-built-application-development-process to create one
 # for "wechat_personal", read ./docs/add_wechat_group_zh.md to setup gateway
+# for "wkteam", see https://wkteam.cn/
 type = "none"
 
 # for "lark", it is chat group webhook url, send reply to group, for example "https://open.feishu.cn/open-apis/bot/v2/hook/xxxxxxxxxxxxxxx"
@@ -107,3 +147,54 @@ verification_token = "def"
 [frontend.wechat_personal]
 # "wechat_personal" listen port
 bind_port = 9527
+
+[frontend.wechat_wkteam]
+# wechat message callback server ip
+callback_ip = "101.133.161.11"
+callback_port = 9528
+
+# public redis config
+redis_host = "101.133.161.11"
+redis_port = "6380"
+redis_passwd = "hxd123"
+
+# wkteam
+account = ""
+password = ""
+# !!! `proxy` is a very important parameter, it's your account location
+# 1: Beijing  2: Tianjin  3: Shanghai  4: Chongqing  5: Hebei
+# 6: Shanxi  7: Jiangsu  8: Zhejiang  9: Anhui  10: Fujian
+# 11: Jiangxi  12: Shandong  13: Henan  14: Hubei  15: Hunan
+# 16: Guangdong  17: Hainan  18: Sichuan  20: Shaanxi
+# a bad proxy would cause account deactivation !!!
+proxy = -1
+
+# save dir
+dir = "wkteam"
+
+# group IDs and introductions
+# HuixiangDou-related groups
+[frontend.wechat_wkteam.43925126702]
+name = "茴香豆群(大暑)"
+introduction = "github https://github.com/InternLM/HuixiangDou user experience group"
+
+[frontend.wechat_wkteam.44546611710]
+name = "茴香豆群(立夏)"
+introduction = "github https://github.com/InternLM/HuixiangDou user experience group"
+
+[frontend.wechat_wkteam.38720590618]
+name = "茴香豆群(惊蛰)"
+introduction = "github https://github.com/InternLM/HuixiangDou user experience group"
+
+[frontend.wechat_wkteam.48437885473]
+name = "茴香豆群(谷雨)"
+introduction = "github https://github.com/InternLM/HuixiangDou user experience group"
+
+[frontend.wechat_wkteam.34744063953]
+name = "茴香豆群(雨水)"
+introduction = "github https://github.com/InternLM/HuixiangDou user experience group"
+
+# github.com/tencent/ncnn contributors
+[frontend.wechat_wkteam.18356748488]
+name = "卷卷群"
+introduction = "ncnn contributors group"
```

config-multimodal.ini (new file, +109)

```ini
[feature_store]
# `feature_store.py` use this throttle to distinct `good_questions` and `bad_questions`
reject_throttle = -1.0
# text2vec model path, support local relative path and huggingface model format.
# also support local path, model_path = "/path/to/your/text2vec-model"
embedding_model_path = "BAAI/bge-m3"
reranker_model_path = "BAAI/bge-reranker-v2-minicpm-layerwise"
work_dir = "workdir"

[web_search]
engine = "serper"
# web search engine support `ddgs` and `serper`
# For ddgs, see https://pypi.org/project/duckduckgo-search
# For serper, check https://serper.dev/api-key to get a free API key
serper_x_api_key = "YOUR-API-KEY-HERE"
domain_partial_order = ["openai.com", "pytorch.org", "readthedocs.io", "nvidia.com", "stackoverflow.com", "juejin.cn", "zhuanlan.zhihu.com", "www.cnblogs.com"]
save_dir = "logs/web_search_result"

[llm]
enable_local = 0
enable_remote = 1
# hybrid llm service address
client_url = "http://127.0.0.1:7777/inference"

[llm.server]
# local LLM configuration
# support "internlm/internlm2-chat-7b" and "qwen/qwen-7b-chat-int8"
# support local path, for example
# local_llm_path = "/path/to/your/internlm2"

local_llm_path = "internlm/internlm2-chat-7b"
local_llm_max_text_length = 3000
local_llm_bind_port = 7777

# remote LLM service configuration
# support "gpt", "kimi", "deepseek", "zhipuai", "step", "xi-api" and "alles-apin"
# xi-api and alles-apin is chinese gpt proxy

remote_type = "kimi"
remote_api_key = "YOUR-API-KEY-HERE"
# max text length for remote LLM.
# use 128000 for kimi, 192000 for gpt/xi-api, 16000 for deepseek, 128000 for zhipuai
remote_llm_max_text_length = 128000
# openai API model type, support model list:
# "auto" for kimi. To save money, we auto select model name by prompt length.
# "auto" for step to save money, see https://platform.stepfun.com
# "gpt-4-0613" for gpt/xi-api,
# "deepseek-chat" for deepseek,
# "glm-4" for zhipuai,
# "gpt-4-1106-preview" for alles-apin or OpenAOE
remote_llm_model = "auto"
# request per minute
rpm = 500

[worker]
# enable web search or not
enable_web_search = 1
# enable search enhancement or not
enable_sg_search = 0
save_path = "logs/work.txt"

[worker.time]
enable = 0
start = "00:00:00"
end = "23:59:59"
has_weekday = 1

[sg_search]
# download `src` from https://github.com/sourcegraph/src-cli#installation
binary_src_path = "/usr/local/bin/src"
src_access_token = "${YOUR-SRC-ACCESS-TOKEN}"

# add your repo here, we just take opencompass and lmdeploy as example
[sg_search.opencompass]
github_repo_id = "open-compass/opencompass"
introduction = "用于评测大型语言模型(LLM). 它提供了完整的开源可复现的评测框架,支持大语言模型、多模态模型的一站式评测,基于分布式技术,对大参数量模型亦能实现高效评测。评测方向汇总为知识、语言、理解、推理、考试五大能力维度,整合集纳了超过70个评测数据集,合计提供了超过40万个模型评测问题,并提供长文本、安全、代码3类大模型特色技术能力评测。"
# introduction = "For evaluating Large Language Models (LLMs). It provides a fully open-source, reproducible evaluation framework, supporting one-stop evaluation for large language models and multimodal models. Based on distributed technology, it can efficiently evaluate models with a large number of parameters. The evaluation directions are summarized in five capability dimensions: knowledge, language, understanding, reasoning, and examination. It integrates and collects more than 70 evaluation datasets, providing in total over 400,000 model evaluation questions. Additionally, it offers evaluations for three types of capabilities specific to large models: long text, security, and coding."

[sg_search.lmdeploy]
github_repo_id = "internlm/lmdeploy"
introduction = "lmdeploy 是一个用于压缩、部署和服务 LLM(Large Language Model)的工具包。是一个服务端场景下,transformer 结构 LLM 部署工具,支持 GPU 服务端部署,速度有保障,支持 Tensor Parallel,多并发优化,功能全面,包括模型转换、缓存历史会话的 cache feature 等. 它还提供了 WebUI、命令行和 gRPC 客户端接入。"
# introduction = "lmdeploy is a toolkit for compressing, deploying, and servicing Large Language Models (LLMs). It is a deployment tool for transformer-structured LLMs in server-side scenarios, supporting GPU server-side deployment, ensuring speed, and supporting Tensor Parallel along with optimizations for multiple concurrent processes. It offers comprehensive features including model conversion, cache features for caching historical sessions and more. Additionally, it provides access via WebUI, command line, and gRPC clients."

[frontend]
# chat group assistant type, support "lark", "lark_group", "wechat_personal" and "none"
# for "lark", open https://open.feishu.cn/document/client-docs/bot-v3/add-custom-bot to add bot, **only send, cannot receive**
# for "lark_group", open https://open.feishu.cn/document/home/introduction-to-custom-app-development/self-built-application-development-process to create one
# for "wechat_personal", read ./docs/add_wechat_group_zh.md to setup gateway
type = "none"

# for "lark", it is chat group webhook url, send reply to group, for example "https://open.feishu.cn/open-apis/bot/v2/hook/xxxxxxxxxxxxxxx"
# for "lark_group", it is the url to fetch chat group message, for example "http://101.133.161.20:6666/fetch", `101.133.161.20` is your own public IPv4 addr
# for "wechat_personal", it is useless
webhook_url = "https://open.feishu.cn/open-apis/bot/v2/hook/xxxxxxxxxxxxxxx"

# when a new group chat message is received, should it be processed immediately or wait for 18 seconds in case the user hasn't finished speaking?
# support "immediate"
message_process_policy = "immediate"

[frontend.lark_group]
# "lark_group" configuration examples, use your own app_id and secret !!!
app_id = "cli_a53a34dcb778500e"
app_secret = "2ajhg1ixSvlNm1bJkH4tJhPfTCsGGHT1"
encrypt_key = "abc"
verification_token = "def"

[frontend.wechat_personal]
# "wechat_personal" listen port
bind_port = 9527
```
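
Both config files keep `engine = "serper"` under `[web_search]`. For orientation, a serper query is a single authenticated POST; this sketch follows serper.dev's documented `/search` endpoint, with the query string purely illustrative and the header value coming from `serper_x_api_key` above:

```python
# Minimal serper.dev web search call; prints the top three organic hits.
import requests

resp = requests.post(
    'https://google.serper.dev/search',
    headers={'X-API-KEY': 'YOUR-API-KEY-HERE'},  # value of serper_x_api_key
    json={'q': 'how to deploy lmdeploy'},
    timeout=10)
resp.raise_for_status()
for hit in resp.json().get('organic', [])[:3]:
    print(hit['title'], hit['link'])
```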

docs/knowledge_graph_zh.md (+1/-1)

```diff
@@ -1,6 +1,6 @@
 # Hybrid knowledge graph and dense retrieval
 
-Mixing a knowledge graph into dense retrieval raises rejection F1 by about 2 points; in essence it **up-weights high-frequency words**. The write-up is available on [Feishu](https://aicarrier.feishu.cn/docx/F51pduYyMof8syxKe5RchiU1nIN)
+Mixing a knowledge graph into dense retrieval raises rejection F1 by about 2 points; in essence it **up-weights high-frequency words**. The write-up is available on [Feishu](https://aicarrier.feishu.cn/docx/F51pduYyMof8syxKe5RchiU1nIN) and [Zhihu](https://zhuanlan.zhihu.com/p/709589834)
 
 This scheme is fully backward compatible with older versions; the complete steps follow.
 
```
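
The doc's one-line claim, that the hybrid's essence is up-weighting high-frequency words, suggests blending dense similarity with keyword evidence from the graph. A toy sketch of that idea only; the weights, cap, and function shape are illustrative and not taken from the repo:

```python
# Toy illustration: blend dense cosine similarity with keyword evidence,
# capping the keyword term so frequent words boost but never dominate.
def hybrid_score(dense_sim: float, kg_keyword_hits: int,
                 alpha: float = 0.7) -> float:
    keyword_score = min(kg_keyword_hits, 5) / 5.0  # saturate at 5 hits
    return alpha * dense_sim + (1.0 - alpha) * keyword_score
```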

docs/send_only_lark_group_zh.md (new file, +22)

# Send one-way messages to a Lark group

This feature mainly verifies that the whole pipeline runs end to end; one-way sending is of limited practical use.

Open [create a custom Lark bot](https://open.feishu.cn/document/client-docs/bot-v3/add-custom-bot), obtain the callback WEBHOOK_URL, and fill it into config.ini:

```ini
# config.ini
..
[frontend]
type = "lark"
webhook_url = "${YOUR-LARK-WEBHOOK-URL}"
```

Run it. Once finished, the technical assistant's replies are sent **one-way** to the Lark group.

```shell
python3 -m huixiangdou.main --standalone # non-docker users
python3 -m huixiangdou.main # docker users
```

<img src="../resource/figures/lark-example.png" width="400">
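
For context, the one-way send reduces to a single HTTP POST against the custom-bot webhook. A sketch following Feishu's documented custom-bot text payload, using the placeholder URL from the config comments:

```python
# Send one text message to a Lark/Feishu custom-bot webhook.
import requests

webhook_url = 'https://open.feishu.cn/open-apis/bot/v2/hook/xxxxxxxxxxxxxxx'
resp = requests.post(
    webhook_url,
    json={'msg_type': 'text', 'content': {'text': 'reply from HuixiangDou'}},
    timeout=10)
print(resp.status_code, resp.text)  # Feishu echoes a JSON status body
```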

evaluation/rejection/build_fs_and_filter.py (+2/-3)

```diff
@@ -116,7 +116,7 @@ def calculate(chunk_size: int):
     # build the feature store with each chunk_size
     # read input.jsonl and compute F1
     cache = CacheRetriever(config_path=config_path)
-    fs_init = FeatureStore(embeddings=cache.embeddings,
+    fs_init = FeatureStore(embedder=cache.embedder,
                            config_path=config_path,
                            chunk_size=chunk_size,
                            analyze_reject=True,
@@ -126,8 +126,7 @@ def calculate(chunk_size: int):
     file_opr = FileOperation()
     files = file_opr.scan_dir(repo_dir=repo_dir)
     # fs_init.preprocess(files=files, work_dir=work_dir)
-    # fs_init.build_dense_response(files=files, work_dir=work_dir)
-    # fs_init.build_dense_reject(files=files, work_dir=work_dir)
+    # fs_init.build_dense(files=files, work_dir=work_dir)
     # del fs_init
 
     retriever = CacheRetriever(config_path=config_path).get(
```

evaluation/rerank/step1_create_candidates.py (+1/-2)

```diff
@@ -98,8 +98,7 @@ def process(param: tuple):
     config_path = 'config.ini'
     cache = CacheRetriever(config_path=config_path)
 
-    fs_init = FeatureStore(embeddings=cache.embeddings,
-                           config_path=config_path)
+    fs_init = FeatureStore(embedder=cache.embedder, config_path=config_path)
 
     file_opr = FileOperation()
     files = file_opr.scan_dir(repo_dir=filedir)
```
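
Both evaluation scripts above switch the FeatureStore keyword from `embeddings=cache.embeddings` (a LangChain object) to `embedder=cache.embedder`. The re-implemented class itself is not part of this view; purely as orientation, here is a rough sketch of the interface such a multimodal embedder implies. Model choices and the method shape are assumptions rather than the repo's code (the config's `embedding_model_path = "BAAI/bge-m3"` motivates the text default, and a real implementation would embed both modalities into one shared space):

```python
# Hypothetical sketch, not huixiangdou's actual class: one embedder that
# handles text and images without LangChain, returning L2-normalized
# vectors so inner-product search behaves like cosine similarity.
from typing import Optional

import numpy as np
from PIL import Image
from sentence_transformers import SentenceTransformer


class Embedder:
    def __init__(self,
                 text_model: str = 'BAAI/bge-m3',
                 image_model: str = 'clip-ViT-B-32'):
        self.text_encoder = SentenceTransformer(text_model)
        self.image_encoder = SentenceTransformer(image_model)

    def embed(self,
              text: Optional[str] = None,
              image_path: Optional[str] = None) -> np.ndarray:
        # images go through the CLIP-style encoder, text through BGE
        if image_path is not None:
            vec = self.image_encoder.encode(Image.open(image_path))
        else:
            vec = self.text_encoder.encode(text)
        return vec / np.linalg.norm(vec)
```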
