-
Notifications
You must be signed in to change notification settings - Fork 110
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #72 from alipay/dev_fanen
Add: Chromadb docs, dashscope embedding. Change: Fix opentelemetry-exporter-otlp-proto-grpc version to 1.25.0
- Loading branch information
Showing
6 changed files
with
204 additions
and
3 deletions.
There are no files selected for viewing
156 changes: 156 additions & 0 deletions
156
agentuniverse/agent/action/knowledge/embedding/dashscope_embedding.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,156 @@ | ||
# !/usr/bin/env python3 | ||
# -*- coding:utf-8 -*- | ||
|
||
# @Time : 2024/6/12 11:43 | ||
# @Author : wangchongshi | ||
# @Email : wangchongshi.wcs@antgroup.com | ||
# @FileName: dashscope_embedding.py | ||
import aiohttp | ||
import requests | ||
from typing import List, Generator, Optional | ||
import json | ||
|
||
from agentuniverse.base.util.env_util import get_from_env | ||
from agentuniverse.agent.action.knowledge.embedding.embedding import Embedding | ||
|
||
# DashScope accepts at most 25 strings per request; each string may contain
# at most 2048 tokens.
DASHSCOPE_MAX_BATCH_SIZE = 25
DASHSCOPE_EMBEDDING_URL = "https://dashscope.aliyuncs.com/api/v1/services/embeddings/text-embedding/text-embedding"


def batched(inputs: List,
            batch_size: int = DASHSCOPE_MAX_BATCH_SIZE) -> Generator[List, None, None]:
    """Yield consecutive slices of ``inputs`` holding at most ``batch_size`` items.

    Args:
        inputs (List): The items to split; an empty list yields nothing.
        batch_size (int): Maximum number of items per yielded slice. Defaults
            to the DashScope per-request limit.

    Yields:
        List: The next slice of ``inputs``, in original order. The last slice
            may be shorter than ``batch_size``.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1. Raised when iteration
            starts, because this is a generator function.
    """
    # A negative step would make range() empty and silently drop every input,
    # and a zero step fails with an unrelated-looking range() error.
    if batch_size < 1:
        raise ValueError(f"batch_size must be at least 1, got {batch_size}")
    for start in range(0, len(inputs), batch_size):
        yield inputs[start:start + batch_size]
|
||
|
||
class DashscopeEmbedding(Embedding):
    """Embedding component that calls the DashScope text-embedding HTTP API.

    Input texts are split with ``batched`` and one POST request is sent per
    batch to ``DASHSCOPE_EMBEDDING_URL``; the per-batch embeddings are
    concatenated in request order.
    """

    # Bearer token for the API; populated in __init__ from the
    # DASHSCOPE_API_KEY environment variable.
    dashscope_api_key: Optional[str] = None

    def __init__(self, **kwargs):
        """Initialize the component and load the DashScope API key.

        Args:
            **kwargs: Forwarded unchanged to the parent ``Embedding`` initializer.

        Raises:
            Exception: If ``get_from_env("DASHSCOPE_API_KEY")`` returns a falsy value.
        """
        super().__init__(**kwargs)
        self.dashscope_api_key = get_from_env("DASHSCOPE_API_KEY")
        if not self.dashscope_api_key:
            raise Exception("No DASHSCOPE_API_KEY in your environment.")

    def _build_headers(self) -> dict:
        """Return the HTTP headers sent with every embedding request."""
        return {
            "Content-Type": "application/json",
            "Authorization": f"Bearer {self.dashscope_api_key}"
        }

    def _build_body(self, batch: List[str]) -> bytes:
        """Serialize the request payload for one batch as UTF-8 encoded JSON."""
        payload = {
            "model": self.embedding_model_name,
            "input": {"texts": batch},
            "parameters": {
                "text_type": "query"
            }
        }
        # ensure_ascii=False keeps non-ASCII text as-is, so the body is encoded
        # explicitly instead of relying on the HTTP client's default.
        return json.dumps(payload, ensure_ascii=False).encode("utf-8")

    @staticmethod
    def _parse_embeddings(resp_json: dict) -> List[List[float]]:
        """Extract the embedding vectors from one decoded API response.

        Raises:
            Exception: If the response has no ``output.embeddings`` payload;
                the message carries the ``code`` and ``message`` fields of the
                response.
        """
        output = resp_json.get("output")
        embeddings = output.get("embeddings") if output else None
        if embeddings is None:
            error_code = resp_json.get("code", "")
            error_message = resp_json.get("message", "")
            raise Exception(f"Failed to call dashscope embedding api, "
                            f"error code:{error_code}, "
                            f"error message:{error_message}")
        return [item['embedding'] for item in embeddings if 'embedding' in item]

    def get_embeddings(self, texts: List[str]) -> List[List[float]]:
        """Retrieve text embeddings for a list of input texts.

        The texts are sent to the DashScope embedding API batch by batch using
        blocking ``requests`` calls with a 120 second timeout each.

        Args:
            texts (List[str]): A list of input texts to be embedded.

        Returns:
            List[List[float]]: The embeddings returned for the input texts;
                empty when ``texts`` is empty.

        Raises:
            Exception: If the API call to DashScope fails, an exception is
                raised with the respective error code and message.
        """
        result: List[List[float]] = []
        headers = self._build_headers()
        for batch in batched(texts):
            response = requests.post(
                url=DASHSCOPE_EMBEDDING_URL,
                headers=headers,
                data=self._build_body(batch),
                timeout=120
            )
            result.extend(self._parse_embeddings(response.json()))
        return result

    async def async_get_embeddings(self, texts: List[str]) -> List[List[float]]:
        """Async version of get_embeddings.

        The texts are sent to the DashScope embedding API batch by batch over
        a single ``aiohttp`` session; batches are awaited one after another.

        Args:
            texts (List[str]): A list of input texts to be embedded.

        Returns:
            List[List[float]]: The embeddings returned for the input texts;
                empty when ``texts`` is empty.

        Raises:
            Exception: If the API call to DashScope fails, an exception is
                raised with the respective error code and message.
        """
        result: List[List[float]] = []
        if not texts:
            # Nothing to send, so do not open a session at all.
            return result

        headers = self._build_headers()
        # aiohttp expects a ClientTimeout object; a bare number is deprecated.
        timeout = aiohttp.ClientTimeout(total=120)
        # One session for all batches so the connection can be reused.
        async with aiohttp.ClientSession(timeout=timeout) as session:
            for batch in batched(texts):
                async with session.post(
                        url=DASHSCOPE_EMBEDDING_URL,
                        headers=headers,
                        data=self._build_body(batch),
                ) as resp:
                    resp_json = await resp.json()
                result.extend(self._parse_embeddings(resp_json))
        return result
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,22 @@ | ||
## ChromaDB | ||
|
||
agentUniverse already integrates the ChromaDB-related dependencies, so you do not need to install any additional packages to use ChromaDB's features. | ||
If you want to learn about the underlying principles of ChromaDB, you can visit the [official ChromaDB website](https://www.trychroma.com/). | ||
|
||
### What can I do with ChromaDB? | ||
|
||
You can use ChromaDB in the [Knowledge component]() to store and query knowledge. You can create a storage component using ChromaDB with the following method: | ||
```python | ||
from agentuniverse.agent.action.knowledge.embedding.openai_embedding import OpenAIEmbedding | ||
from agentuniverse.agent.action.knowledge.knowledge import Knowledge | ||
from agentuniverse.agent.action.knowledge.store.chroma_store import ChromaStore | ||
|
||
|
||
# Collect the Knowledge constructor arguments; the store is a ChromaDB
# collection that embeds text with an OpenAI embedding model.
init_params = {
    'name': 'test_knowledge',
    'description': 'test_knowledge_description',
    'store': ChromaStore(
        collection_name="test_knowledge",
        embedding_model=OpenAIEmbedding(
            embedding_model_name='text-embedding-ada-002')),
}
knowledge = Knowledge(**init_params)
``` | ||
The above code will create a Knowledge component based on ChromaDB. For more details on how to use the Knowledge component, you can refer to the [Knowledge component](), or check the code in `tests/test_agentuniverse/unit/agent/action/knowledge/test_knowledge.py`. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,22 @@ | ||
## ChromaDB | ||
|
||
agentUniverse中已集成ChromaDB相关依赖,您无需额外安装包即可使用ChromaDB的相关功能。 | ||
如果您想学习ChromaDB相关的底层原理,您可以查阅ChromaDB的[官方网站](https://www.trychroma.com/)。 | ||
|
||
### 我可以用ChromaDB做些什么 | ||
|
||
您可以在[Knowledge组件]()中使用ChromaDB来存储和查询知识,您可以使用以下方式来创建一个使用ChromaDB的存储组件: | ||
```python | ||
from agentuniverse.agent.action.knowledge.embedding.openai_embedding import OpenAIEmbedding | ||
from agentuniverse.agent.action.knowledge.knowledge import Knowledge | ||
from agentuniverse.agent.action.knowledge.store.chroma_store import ChromaStore | ||
|
||
|
||
# Collect the Knowledge constructor arguments; the store is a ChromaDB
# collection that embeds text with an OpenAI embedding model.
init_params = {
    'name': 'test_knowledge',
    'description': 'test_knowledge_description',
    'store': ChromaStore(
        collection_name="test_knowledge",
        embedding_model=OpenAIEmbedding(
            embedding_model_name='text-embedding-ada-002')),
}
knowledge = Knowledge(**init_params)
``` | ||
上面的代码会创建一个基于ChromaDB的Knowledge,关于Knowledge的具体用法您可以参考[Knowledge组件](),或是参考代码`tests/test_agentuniverse/unit/agent/action/knowledge/test_knowledge.py`。 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters