63 changes: 24 additions & 39 deletions XAgent/ai_functions/request/xagent.py
@@ -1,47 +1,32 @@

import json
import openai

from openai.error import InvalidRequestError

from XAgent.logs import logger
from XAgent.config import CONFIG, get_apiconfig_by_model, get_model_name
from XAgent.config import CONFIG,get_apiconfig_by_model,get_model_name
import requests
import traceback


def chatcompletion_request(**kwargs):
"""
Performs a chat completion request to OpenAI's Chat Model taking
optional arguments as parameters to customize the request.

It expects the necessary parameters to be passed in the kwargs to create
a chat completion. By default, it uses the model specified in the CONFIG.
If the context length is exceeded, the function throws an 'InvalidRequestError'.

Args:
**kwargs (dict): The dictionary of parameters passed to the
`openai.ChatCompletion.create` method. The parameters can be any valid
input parameters accepted by this method. A 'model' key is expected
in kwargs. If it exists, it is used, else the model defined in
CONFIG is used.

Returns:
dict: response from the `openai.ChatCompletion.create` method.

Raises:
InvalidRequestError: If the context length is exceeded or
any other request error occurs during the process.
"""
model_name = get_model_name(kwargs.pop('model', CONFIG.default_completion_kwargs['model']))
# logger.info(f"xagent received {json.dumps(kwargs)}")
model_name = get_model_name(kwargs.pop('model',CONFIG.default_completion_kwargs['model']))
logger.debug("chatcompletion: using " + model_name)
chatcompletion_kwargs = get_apiconfig_by_model(model_name)
chatcompletion_kwargs.update(kwargs)

try:
response = openai.ChatCompletion.create(**chatcompletion_kwargs)
response = json.loads(str(response))
if response['choices'][0]['finish_reason'] == 'length':
raise InvalidRequestError('maximum context length exceeded', None)
except InvalidRequestError as e:
raise e


response = requests.post(
chatcompletion_kwargs.get("api_base","http://127.0.0.1:8000/chat/completions"),
headers={"accept": "application/json", "Content-Type": "application/json"},
json={
"model": model_name,
"repetition_penalty": chatcompletion_kwargs.get("repetition_penalty", 1.2),
"temperature": chatcompletion_kwargs.get("temperature", 0.8),
"top_p":chatcompletion_kwargs.get("top_p", 1.0),
"frequency_penalty":chatcompletion_kwargs.get("frequency_penalty",0.5),
"presence_penalty":chatcompletion_kwargs.get("presence_penalty", 0.0),
"max_tokens":chatcompletion_kwargs.get("max_tokens", 4096),
"messages": chatcompletion_kwargs.get("messages", []),
"arguments": chatcompletion_kwargs.get("arguments", {}),
"functions": chatcompletion_kwargs.get("functions", []),
"function_call": chatcompletion_kwargs.get("function_call", {}),
}
).json()

return response
5 changes: 4 additions & 1 deletion XAgent/utils.py
@@ -8,7 +8,10 @@
import tiktoken
from XAgent.config import CONFIG

encoding = tiktoken.encoding_for_model(CONFIG.default_completion_kwargs['model'])
if CONFIG.default_completion_kwargs['model'] == "xagentllm":
encoding = tiktoken.encoding_for_model("gpt-4") # TODO: this is not good
else:
encoding = tiktoken.encoding_for_model(CONFIG.default_completion_kwargs['model'])

def get_token_nums(text:str)->int:
"""
47 changes: 47 additions & 0 deletions XAgentGen/README.md
@@ -0,0 +1,47 @@
# 📖 Introduction of XAgentGen

XAgentGen implements guided generation for the customized model that powers XAgent.
It lets models generate function calls that conform to a given complex [JSON schema](https://json-schema.org/understanding-json-schema), similar to OpenAI's function calling.
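
For illustration, a function definition passed to the service might look like the following. This is only a sketch in the OpenAI function-calling style; the function name and fields below are hypothetical, and the exact schema XAgentGen accepts is determined by the `functions` you give it.

```python
# A hypothetical function definition, expressed as a JSON-schema object
# in the OpenAI function-calling style.
web_search_function = {
    "name": "web_search",  # hypothetical function name
    "description": "Search the web for a query and return the top results.",
    "parameters": {  # argument structure, described with JSON schema
        "type": "object",
        "properties": {
            "query": {"type": "string", "description": "The search query."},
            "top_k": {"type": "integer", "description": "How many results to return."},
        },
        "required": ["query"],
    },
}
```

The guided decoder then constrains generation so that the emitted function call is valid against the given schema.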

Currently, XAgentGen supports the following models:
- [XAgentLlama](https://huggingface.co/collections/XAgentTeam/xagentllm-655ae4091c419bb072940e74): the official model of XAgent, which is based on Code-Llama. **Note: the model is still under training, and the preview version is available now.**


# 🛠️ 1. Setup for XAgentGen
You can either pull the pre-built Docker image or build it yourself.
We recommend pulling the pre-built image, which is more convenient.

## Pull the pre-built docker image
```shell
docker pull xagentteam/xagentgen:test
docker run -it -p 13520:13520 -v ./XAgentGen:/app:rw -v /host/model/path:/model:rw --gpus all --ipc=host --name xagentgen xagentteam/xagentgen:test python app.py --model-path /model --port 13520
```
**Note:** Change the `/host/model/path` to the path of your model directory. The service should be listening on port `13520`.
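
To check that the service came up, you can call the health endpoint defined in `app.py` (a minimal sketch, assuming the service is reachable on `localhost:13520`):

```python
import requests

# POST /chat/health returns "ok" while the service is running.
resp = requests.post("http://127.0.0.1:13520/chat/health")
print(resp.status_code, resp.json())  # expected: 200 ok
```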

## Build the docker image by yourself
Make sure you are in the root directory of the project, then run the following command:
```shell
docker build -f dockerfiles/XAgentGen/Dockerfile -t xagentgen:test .
```
Note that the build may take a long time, and the default settings require at least 64 GB of memory.
You can reduce the memory requirement by lowering `MAX_JOBS` in the Dockerfile.


After the build finishes, you can run the image with:
```shell
docker run -it -p 13520:13520 -v ./XAgentGen:/app:rw -v /host/model/path:/model:rw --gpus all --ipc=host --name xagentgen xagentgen:test python app.py --model-path /model --port 13520
```

**Note:** Change the `/host/model/path` to the path of your model directory. The service should be listening on port `13520`.


# 🎮 2. Use the XAgent with the customized model

To use the customized model, point XAgent to the corresponding config file. A sample config is provided in `assets/xagentllama.yml`.
Run XAgent with the customized model:
```shell
python run.py --task "find all the prime numbers <=100" --model "xagentllm" --config-file "assets/xagentllama.yml"
```
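
Under the hood, XAgent's `chatcompletion_request` posts to the service's `/chat/completions` endpoint, so you can also call it directly. The sketch below assumes the service is running on `localhost:13520`; the message and function definition are placeholders:

```python
import json
import requests

payload = {
    "model": "xagentllm",
    "messages": [{"role": "user", "content": "Find all the prime numbers <= 100."}],
    "arguments": {},  # extra top-level arguments for the model to fill in (placeholder)
    "functions": [
        {
            "name": "report_primes",  # hypothetical function definition
            "description": "Report the prime numbers that were found.",
            "parameters": {
                "type": "object",
                "properties": {"primes": {"type": "array", "items": {"type": "integer"}}},
                "required": ["primes"],
            },
        }
    ],
    "function_call": {},  # leave empty to let the model choose a function
    "temperature": 0.8,
    "max_tokens": 4096,
}

resp = requests.post(
    "http://127.0.0.1:13520/chat/completions",
    headers={"accept": "application/json", "Content-Type": "application/json"},
    json=payload,
).json()

# On success, choices[0].message.content holds the generated function call as a JSON string.
print(json.loads(resp["choices"][0]["message"]["content"]))
```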



159 changes: 159 additions & 0 deletions XAgentGen/app.py
@@ -0,0 +1,159 @@
from xgen.parser import FunctionParser
from xgen.server.datamodel import *
from xgen.server.message_formater import format
import xgen.text.generate as generate
from xgen.models.transformers import Transformers, TransformersTokenizer
from vllm.sampling_params import LogitsProcessor
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine
from vllm.utils import random_uuid
from vllm import SamplingParams
from typing import List
import torch
from addict import Dict
import json
from fastapi import FastAPI, Response, status, Request
from fastapi.middleware.cors import CORSMiddleware
import argparse
import uvicorn

app = FastAPI()

app.add_middleware(
CORSMiddleware,
allow_origins=["*"],
allow_credentials=True,
allow_methods=["*"],
allow_headers=["*"],
)

parser = argparse.ArgumentParser()
parser.add_argument("--model-path", type=str, help="Path to the model")
parser.add_argument("--port", type=str, help="Port for server")
args = parser.parse_args()

model_path = args.model_path

engine_configs = AsyncEngineArgs(
worker_use_ray=False,
engine_use_ray=False,
model=model_path,
tokenizer=None,
tokenizer_mode='auto',
tensor_parallel_size=1,
dtype='auto',
quantization=None,
revision=None,
tokenizer_revision=None,
seed=0,
gpu_memory_utilization=0.9,
swap_space=4,
disable_log_requests=True,
max_num_batched_tokens=16384,
max_model_len=16384,
)
engine = AsyncLLMEngine.from_engine_args(engine_configs)

print("loading model finished! Service start!")


class ConstrainedLogitsProcessor(LogitsProcessor):
def __init__(self, extra_arguments, functions, function_call, tokenizer_path, device=None):
if function_call is not None and len(function_call) == 0:
function_call = None
self.dp = FunctionParser()
outline_tokenizer = TransformersTokenizer(tokenizer_path)
fake_model = Dict()
fake_model.device = device
model = Transformers(fake_model, outline_tokenizer)
self.dp.create_all_functions_model(extra_arguments, functions, function_call)
regex_list = self.dp.models_to_regex()
self.generator = generate.choice(model, regex_list)

def __call__(self, generated_token_ids: List[int], logits: torch.Tensor) -> torch.Tensor:
generated_token_ids = torch.LongTensor(generated_token_ids).view(1, -1).to(logits.device)
masked_logits = self.generator.create_proposal(generated_token_ids, logits.view(1, -1))
return masked_logits

@app.post("/chat/health", status_code=200)
async def health():
return "ok"


@app.post("/chat/completions")
async def chat_function(response:Response,request: Request):
global engine
call_msg = await request.json()
model_name = call_msg.get("model","")
if model_name != "agentllama" and model_name != "xagentllm":
return {"model": "", "choices": [{'message': {'content': f'bad model {model_name}'}, 'finish_reason': 'error', 'index': -1}]}
messages = call_msg.get("messages",None)
arguments = call_msg.get("arguments",None)
functions = call_msg.get("functions",None)
function_call = call_msg.get("function_call",None)
task_prompt = format({
"messages": messages,
"arguments": arguments,
"functions": functions,
"function_call": function_call
}, dump_method='json')
processor = ConstrainedLogitsProcessor(arguments, functions, function_call, model_path, device='cuda')
sampling_params = SamplingParams(
temperature=call_msg.get("temperature", 0.8),
top_p=call_msg.get("top_p", 1.0),
frequency_penalty=call_msg.get("frequency_penalty",0.5),
presence_penalty=call_msg.get("presence_penalty", 0.0),
repetition_penalty=call_msg.get("repetition_penalty",1.2),
max_tokens=call_msg.get("max_tokens", 4000),
logits_processors=[processor]
)
# make request
request_id = random_uuid()
results_generator = engine.generate(task_prompt, sampling_params, request_id)
final_output = None
async for request_output in results_generator:
if await request.is_disconnected():
# Abort the request if the client disconnects.
await engine.abort(request_id)
return Response(status_code=499)
final_output = request_output
sequence = final_output.outputs[0].text
try:
sequence = json.loads(sequence)
if "extra_parameters" in sequence:
sequence["arguments"] = sequence["extra_parameters"]
sequence.pop("extra_parameters")
except Exception as e:
res = {"status": "fail","broken_json":sequence,"error_message":str(e)}
else:
res = {
"status": "success",
"function_res": sequence,
"usage":{
"prompt_tokens": processor.generator.model.tokenizer.prompt_tokens,
"completion_tokens": processor.generator.model.tokenizer.completion_tokens,
"total_tokens": processor.generator.model.tokenizer.prompt_tokens + processor.generator.model.tokenizer.completion_tokens
}
}

if res["status"] == "fail":
response.status_code = 400
return {"model": "", "choices": [{'message': {'content': json.dumps(res,ensure_ascii=False)}, 'finish_reason': 'error', 'index': -1}]}

response_model = {
'model': model_name,
'usage': res["usage"],
'choices':[
{
"message":{
"content": json.dumps(res["function_res"], ensure_ascii=False)
},
"finish_reason":"stop",
"index":0,
}
]
}
return response_model

uvicorn.run(app, host="0.0.0.0", port=args.port)  # bind to all interfaces so the published Docker port is reachable
24 changes: 24 additions & 0 deletions XAgentGen/requirements_1.txt
@@ -0,0 +1,24 @@
diffusers==0.15.0
interegular==0.3.2
Jinja2==3.1.2
json5
lark==1.1.7
numpy
perscache==0.6.1
Pillow
pytest
Requests
scipy
tenacity
tiktoken
transformers==4.35.0
outlines==0.0.9
accelerate==0.24.1
fastapi
logzero==1.7.0
orjson
packaging
referencing==0.30.2
torch==2.1.0
deepspeed==0.12.2
addict==2.4.0
3 changes: 3 additions & 0 deletions XAgentGen/requirements_2.txt
@@ -0,0 +1,3 @@
pydantic==1.10.13
flash-attn==2.0.4
vllm==0.2.2
1 change: 1 addition & 0 deletions XAgentGen/xgen/models/__init__.py
@@ -0,0 +1 @@
from .transformers import transformers,Transformers,TransformersTokenizer