Added the microservice of Ray #104

Merged · 10 commits · May 31, 2024
40 changes: 39 additions & 1 deletion comps/llms/README.md
@@ -99,6 +99,13 @@ export vLLM_LLM_ENDPOINT="http://${your_ip}:8008"
python text-generation/vllm/llm.py
```

### 1.4.3 Start the Ray Service

```bash
export RAY_Serve_ENDPOINT="http://${your_ip}:8008"
python text-generation/ray_serve/llm.py
```
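
Once the Ray Serve endpoint is up, a quick smoke test can confirm it is reachable. This is a minimal sketch, assuming the backend exposes the OpenAI-compatible `/v1/chat/completions` route on port 8008 as configured above:

```bash
# Sketch: query the Ray Serve backend directly (OpenAI-compatible route,
# port 8008 per RAY_Serve_ENDPOINT above; the model name is a placeholder).
curl http://${your_ip}:8008/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "<model_name>",
    "messages": [{"role": "user", "content": "What is deep learning?"}],
    "max_tokens": 32
  }'
```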

# 🚀2. Start Microservice with Docker (Option 2)

If you start the LLM microservice with Docker, the `docker_compose_llm.yaml` file will automatically start a TGI/vLLM service in Docker as well.
@@ -127,6 +134,17 @@ export LANGCHAIN_API_KEY=${your_langchain_api_key}
export LANGCHAIN_PROJECT="opea/llms"
```

In order to start the Ray Serve and LLM services, you need to set up the following environment variables first.

```bash
export HUGGINGFACEHUB_API_TOKEN=${your_hf_api_token}
export RAY_Serve_ENDPOINT="http://${your_ip}:8008"
export LLM_MODEL=${your_hf_llm_model}
export LANGCHAIN_TRACING_V2=true
export LANGCHAIN_API_KEY=${your_langchain_api_key}
export LANGCHAIN_PROJECT="opea/llms"
```

## 2.2 Build Docker Image

### 2.2.1 TGI
@@ -143,6 +161,13 @@ cd ../../
docker build -t opea/llm-vllm:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/llms/text-generation/vllm/Dockerfile .
```

### 2.2.3 Ray Serve

```bash
cd ../../
docker build -t opea/llm-ray:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/llms/text-generation/ray_serve/Dockerfile .
```

To start a docker container, you have two options:

- A. Run Docker with CLI
@@ -164,6 +189,12 @@ docker run -d --name="llm-tgi-server" -p 9000:9000 --ipc=host -e http_proxy=$htt
docker run -d --name="llm-vllm-server" -p 9000:9000 --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e vLLM_LLM_ENDPOINT=$vLLM_LLM_ENDPOINT -e HUGGINGFACEHUB_API_TOKEN=$HUGGINGFACEHUB_API_TOKEN -e LLM_MODEL_ID=$LLM_MODEL_ID opea/llm-vllm:latest
```

### 2.3.3 Ray Serve

```bash
docker run -d --name="llm-ray-server" -p 9000:9000 --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e RAY_Serve_ENDPOINT=$RAY_Serve_ENDPOINT -e HUGGINGFACEHUB_API_TOKEN=$HUGGINGFACEHUB_API_TOKEN -e LLM_MODEL=$LLM_MODEL opea/llm-ray:latest
```
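
After launching, it can help to confirm the wrapper container came up cleanly before sending requests (container name as used above):

```bash
# Confirm the container is running and inspect its startup output.
docker ps --filter name=llm-ray-server
docker logs llm-ray-server
```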

## 2.4 Run Docker with Docker Compose (Option B)

### 2.4.1 TGI
@@ -180,6 +211,13 @@ cd text-generation/vllm
docker compose -f docker_compose_llm.yaml up -d
```

### 2.4.3 Ray Serve

```bash
cd text-generation/ray_serve
docker compose -f docker_compose_llm.yaml up -d
```
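
You can then verify both services from the compose project; the container names below follow `docker_compose_llm.yaml` in this PR:

```bash
# List the services started by the compose file.
docker compose -f docker_compose_llm.yaml ps

# Tail the Ray Serve backend logs while the model loads (container name
# ray-service, per docker_compose_llm.yaml).
docker logs -f ray-service
```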

# 🚀3. Consume LLM Service

## 3.1 Check Service Status
@@ -210,7 +248,7 @@ curl http://${your_ip}:9000/v1/chat/completions \
-H 'Content-Type: application/json'
```
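
For the Ray-backed microservice the same wrapper endpoint applies. A fuller request sketch (the `query`, `max_new_tokens`, `temperature`, and `streaming` fields follow the `LLMParamsDoc` schema read in `llm.py`):

```bash
# Sketch: non-streaming request to the LLM wrapper microservice on port 9000.
curl http://${your_ip}:9000/v1/chat/completions \
  -X POST \
  -H 'Content-Type: application/json' \
  -d '{"query": "What is deep learning?", "max_new_tokens": 32, "temperature": 0.7, "streaming": false}'
```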

## Validated Model
## 4. Validated Model

| Model | TGI-Gaudi | vLLM-CPU | Ray |
| ------------------------- | --------- | -------- | --- |
37 changes: 37 additions & 0 deletions comps/llms/text-generation/ray_serve/Dockerfile
@@ -0,0 +1,37 @@
# Copyright (c) 2024 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

FROM langchain/langchain:latest

RUN apt-get update -y && apt-get install -y --no-install-recommends --fix-missing \
    libgl1-mesa-glx \
    libjemalloc-dev \
    vim

RUN useradd -m -s /bin/bash user && \
    mkdir -p /home/user && \
    chown -R user /home/user/

USER user

COPY comps /home/user/comps

RUN pip install --no-cache-dir --upgrade pip && \
    pip install --no-cache-dir -r /home/user/comps/llms/text-generation/ray_serve/requirements.txt

ENV PYTHONPATH=$PYTHONPATH:/home/user

WORKDIR /home/user/comps/llms/text-generation/ray_serve

ENTRYPOINT ["python", "llm.py"]
6 changes: 1 addition & 5 deletions comps/llms/text-generation/ray_serve/README.md
@@ -25,12 +25,8 @@ curl http://127.0.0.1:8080/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"model": <model_name>,
"messages": [
{"role": "assistant", "content": "You are a helpful assistant."},
{"role": "user", "content": "What is Deep Learning?"},
],
"messages": [{"role": "user", "content": "What is deep learning?"}],
"max_tokens": 32,
"stream": True
}'
```

6 changes: 3 additions & 3 deletions comps/llms/text-generation/ray_serve/api_server_openai.py
@@ -22,7 +22,7 @@
from ray import serve
from ray_serve.api_openai_backend.query_client import RouterQueryClient
from ray_serve.api_openai_backend.router_app import Router, router_app
from ray_serve.ray_serve import LLMServe
from ray_serve.serve import LLMServe


def router_application(deployments, max_concurrent_queries):
@@ -102,14 +102,14 @@ def main(argv=None):

    ray.init(address="auto")

    host_port = os.environ.get("RAY_Serve_ENDPOINT", "http://127.0.0.1:8080")
    host_port = os.environ.get("RAY_Serve_ENDPOINT", "http://0.0.0.0:8080")
    host = re.search(r"([\d\.]+)", host_port).group(1)
    port = args.port_number
    model_name = args.model_id_or_path.split("/")[-1] if args.model_id_or_path else ""
    route_prefix = "/"

    infer_conf = {}
    infer_conf["use_auth_token"] = os.environ.get("HUGGINGFACEHUB_API_TOKEN", None)
    infer_conf["use_auth_token"] = os.environ.get("HF_TOKEN", None)
    infer_conf["trust_remote_code"] = os.environ.get("TRUST_REMOTE_CODE", None)
    infer_conf["model_id_or_path"] = args.model_id_or_path
    infer_conf["chat_processor"] = args.chat_processor
56 changes: 56 additions & 0 deletions comps/llms/text-generation/ray_serve/docker_compose_llm.yaml
@@ -0,0 +1,56 @@
# Copyright (c) 2024 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

version: "3.8"

services:
ray_service:
image: rayllm:habana
container_name: ray-service
ports:
- "8008:80"
runtime: habana
environment:
- OMPI_MCA_btl_vader_single_copy_mechanism=none
- HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
- TRUST_REMOTE_CODE=${TRUST_REMOTE_CODE}
- LLM_MODEL=${LLM_MODEL}
- CHAT_PROCESSOR=${CHAT_PROCESSOR}
cap_add:
- SYS_NICE
command: >
/bin/bash -c "ray start --head &&
python api_server_openai.py --port_number 80
--model_id_or_path ${LLM_MODEL}
--chat_processor ${CHAT_PROCESSOR}
--num_cpus_per_worker 8
--num_hpus_per_worker 1"
llm:
image: opea/gen-ai-comps:llm-ray-server
container_name: llm-ray-server
ports:
- "9000:9000"
ipc: host
environment:
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
RAY_Serve_ENDPOINT: ${RAY_Serve_ENDPOINT}
LLM_MODEL: ${LLM_MODEL}
HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
LANGCHAIN_API_KEY: ${LANGCHAIN_API_KEY}
restart: unless-stopped

networks:
default:
driver: bridge
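
A typical bring-up of this compose file might look as follows (a sketch: it assumes the `rayllm:habana` image has been built and the Habana container runtime is available on the host; the `CHAT_PROCESSOR` value is illustrative):

```bash
# Sketch: export the variables the compose file references, then start
# both services.
export HUGGINGFACEHUB_API_TOKEN=${your_hf_api_token}
export LLM_MODEL=${your_hf_llm_model}
export CHAT_PROCESSOR="ChatModelLlama"   # illustrative value
export RAY_Serve_ENDPOINT="http://${your_ip}:8008"
docker compose -f docker_compose_llm.yaml up -d
```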
@@ -41,4 +41,4 @@ if [ "$#" -lt 0 ] || [ "$#" -gt 5 ]; then
fi

# Build the Docker run command based on the number of cards
docker run -it --runtime=habana --name="ChatQnA_server" -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --ipc=host --network=host -e HUGGINGFACEHUB_API_TOKEN=$HUGGINGFACEHUB_API_TOKEN -e TRUST_REMOTE_CODE=$TRUST_REMOTE_CODE ray_serve:habana /bin/bash -c "ray start --head && python api_server_openai.py --port_number $port_number --model_id_or_path $model_name --chat_processor $chat_processor --num_cpus_per_worker $num_cpus_per_worker --num_hpus_per_worker $num_hpus_per_worker"
docker run -it --runtime=habana --name="ChatQnA_server" -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --ipc=host -p $port_number:$port_number -e HF_TOKEN=$HUGGINGFACEHUB_API_TOKEN -e TRUST_REMOTE_CODE=$TRUST_REMOTE_CODE ray_serve:habana /bin/bash -c "ray start --head && python api_server_openai.py --port_number $port_number --model_id_or_path $model_name --chat_processor $chat_processor --num_cpus_per_worker $num_cpus_per_worker --num_hpus_per_worker $num_hpus_per_worker"
83 changes: 83 additions & 0 deletions comps/llms/text-generation/ray_serve/llm.py
@@ -0,0 +1,83 @@
# Copyright (c) 2024 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os

from fastapi.responses import StreamingResponse
from langchain_openai import ChatOpenAI
from langsmith import traceable

from comps import GeneratedDoc, LLMParamsDoc, ServiceType, opea_microservices, register_microservice


@traceable(run_type="tool")
def post_process_text(text: str):
    # Encode tokens for SSE transport: spaces and newlines are replaced
    # with sentinel strings so they survive the event-stream framing.
    if text == " ":
        return "data: @#$\n\n"
    if text == "\n":
        return "data: <br/>\n\n"
    if text.isspace():
        return None
    new_text = text.replace(" ", "@#$")
    return f"data: {new_text}\n\n"


@register_microservice(
    name="opea_service@llm_ray",
    service_type=ServiceType.LLM,
    endpoint="/v1/chat/completions",
    host="0.0.0.0",
    port=9000,
)
@traceable(run_type="llm")
def llm_generate(input: LLMParamsDoc):
    llm_endpoint = os.getenv("RAY_Serve_ENDPOINT", "http://localhost:8080")
    llm_model = os.getenv("LLM_MODEL", "Llama-2-7b-chat-hf")
    llm = ChatOpenAI(
        openai_api_base=llm_endpoint + "/v1",
        model_name=llm_model,
        openai_api_key=os.getenv("OPENAI_API_KEY", "not_needed"),
        max_tokens=input.max_new_tokens,
        temperature=input.temperature,
        streaming=input.streaming,
        request_timeout=600,
    )

    if input.streaming:

        async def stream_generator():
            chat_response = ""
            async for text in llm.astream(input.query):
                text = text.content
                chat_response += text
                processed_text = post_process_text(text)
                if text and processed_text:
                    if "</s>" in text:
                        # Stop at the end-of-sequence marker, emitting any
                        # text that precedes it.
                        res = text.split("</s>")[0]
                        if res != "":
                            yield res
                        break
                    yield processed_text
            print(f"[llm - chat_stream] stream response: {chat_response}")
            yield "data: [DONE]\n\n"

        return StreamingResponse(stream_generator(), media_type="text/event-stream")
    else:
        response = llm.invoke(input.query)
        response = response.content
        return GeneratedDoc(text=response, prompt=input.query)


if __name__ == "__main__":
    opea_microservices["opea_service@llm_ray"].start()
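
Because `post_process_text` encodes spaces as `@#$` and newlines as `<br/>` before framing each token as an SSE `data:` event, streaming clients need to reverse that substitution. A consumption sketch using `curl` (`-N` disables buffering so events print as they arrive; the `sed` decoding assumes GNU sed):

```bash
# Sketch: stream tokens from the microservice and undo the SSE encoding
# applied by post_process_text (spaces -> @#$, newlines -> <br/>).
curl -N http://${your_ip}:9000/v1/chat/completions \
  -X POST \
  -H 'Content-Type: application/json' \
  -d '{"query": "What is deep learning?", "streaming": true}' \
  | sed -e 's/@#\$/ /g' -e 's#<br/>#\n#g'
```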
14 changes: 14 additions & 0 deletions comps/llms/text-generation/ray_serve/requirements.txt
@@ -0,0 +1,14 @@
docarray[full]
fastapi
huggingface_hub
langchain==0.1.16
langchain_openai
langserve
langsmith
openai
opentelemetry-api
opentelemetry-exporter-otlp
opentelemetry-sdk
ray[serve]>=2.10
shortuuid
transformers