feat: Llama2 inf2 Ray inference upgrade and bug fix #540

Merged 2 commits on May 28, 2024
@@ -91,4 +91,3 @@ redis:
accessModes:
- ReadWriteOnce
runAsUser: 1000

11 changes: 11 additions & 0 deletions gen-ai/inference/gradio-ui/Dockerfile-gradio-base
@@ -0,0 +1,11 @@
# Use Python base image
FROM --platform=linux/amd64 python:3.9-slim

# Set working directory in the container
WORKDIR /app

# Install necessary Python packages with pinned versions
RUN pip install --no-cache-dir gradio==4.31.5 requests==2.32.2 pillow==10.3.0

# Command to run the Python script
ENTRYPOINT ["python", "/app/gradio-app.py"]
5 changes: 4 additions & 1 deletion gen-ai/inference/llama2-13b-chat-rayserve-inf2/Dockerfile
@@ -1,6 +1,6 @@
 # docker buildx build --platform=linux/amd64 -t ray-serve-llama2:latest .
 # https://hub.docker.com/layers/rayproject/ray-ml/2.7.1-py310-gpu/images/sha256-f84ecfc82d255ff9e23b8e40343a95655ec8e23a009633a183769edac6277186?context=explore
-FROM rayproject/ray:2.7.1-py310
+FROM rayproject/ray:2.22.0-py310
 
 # Maintainer label
 LABEL maintainer="DoEKS"
@@ -29,6 +29,9 @@ RUN pip config set global.extra-index-url https://pip.repos.neuron.amazonaws.com
 # Add Neuron path to PATH
 ENV PATH /opt/aws/neuron/bin:$PATH
 
+# Set LD_LIBRARY_PATH to include the directory with libpython3.10.so.1.0
+ENV LD_LIBRARY_PATH /home/ray/anaconda3/lib:$LD_LIBRARY_PATH
+
 WORKDIR /serve_app
 
 COPY ray_serve_llama2.py /serve_app/ray_serve_llama2.py
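
The LD_LIBRARY_PATH addition above makes libpython3.10.so.1.0 in the conda environment visible to the dynamic loader. A minimal sketch of a sanity check, assuming it is run inside the image built from this Dockerfile (the check itself is not part of this change):

# Sanity check: confirm the dynamic loader can resolve libpython3.10.so.1.0
# now that /home/ray/anaconda3/lib is on LD_LIBRARY_PATH.
import ctypes

try:
    ctypes.CDLL("libpython3.10.so.1.0")
    print("libpython3.10.so.1.0 resolved successfully")
except OSError as err:
    print(f"failed to load libpython3.10.so.1.0: {err}")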
127 changes: 127 additions & 0 deletions gen-ai/inference/llama2-13b-chat-rayserve-inf2/gradio-ui.yaml
@@ -0,0 +1,127 @@
# gradio-deploy.yaml
apiVersion: v1
kind: Namespace
metadata:
  name: gradio-llama2-inf2
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: gradio-deployment
  namespace: gradio-llama2-inf2
  labels:
    app: gradio
spec:
  replicas: 1
  selector:
    matchLabels:
      app: gradio
  template:
    metadata:
      labels:
        app: gradio
    spec:
      containers:
        - name: gradio
          image: public.ecr.aws/data-on-eks/gradio-web-app-base:latest
          imagePullPolicy: IfNotPresent
          ports:
            - containerPort: 7860
          resources:
            requests:
              cpu: "512m"
              memory: "2048Mi"
            limits:
              cpu: "1"
              memory: "4096Mi"
          env:
            - name: MODEL_ENDPOINT
              value: "/infer"
            - name: SERVICE_NAME
              value: "http://llama2-serve-svc.llama2.svc.cluster.local:8000"
          volumeMounts:
            - name: gradio-app-script
              mountPath: /app/gradio-app.py
              subPath: gradio-app-llama2-inf2.py
      volumes:
        - name: gradio-app-script
          configMap:
            name: gradio-app-script
---
apiVersion: v1
kind: Service
metadata:
  name: gradio-service
  namespace: gradio-llama2-inf2
spec:
  selector:
    app: gradio
  ports:
    - name: http
      protocol: TCP
      port: 7860
      targetPort: 7860
  type: ClusterIP
---
apiVersion: v1
kind: ConfigMap
metadata:
  name: gradio-app-script
  namespace: gradio-llama2-inf2
data:
  gradio-app-llama2-inf2.py: |
    import gradio as gr
    import requests
    import os

    # Constants for model endpoint and service name
    model_endpoint = "/infer"
    service_name = os.environ.get("SERVICE_NAME", "http://localhost:8000")

    # Function to generate text
    def text_generation(message, history):
        prompt = message

        # Create the URL for the inference
        url = f"{service_name}{model_endpoint}"

        try:
            # Send the request to the model service
            response = requests.get(url, params={"sentence": prompt}, timeout=180)
            response.raise_for_status()  # Raise an exception for HTTP errors

            full_output = response.json()[0]
            # Removing the original question from the output
            answer_only = full_output.replace(prompt, "", 1).strip('["]?\n')

            # Safety filter to remove harmful or inappropriate content
            answer_only = filter_harmful_content(answer_only)
            return answer_only
        except requests.exceptions.RequestException as e:
            # Handle any request exceptions (e.g., connection errors)
            return f"AI: Error: {str(e)}"

    # Define the safety filter function (you can implement this as needed)
    def filter_harmful_content(text):
        # TODO: Implement a safety filter to remove any harmful or inappropriate content from the text

        # For now, simply return the text as-is
        return text

    # Define the Gradio ChatInterface
    chat_interface = gr.ChatInterface(
        text_generation,
        chatbot=gr.Chatbot(line_breaks=True),
        textbox=gr.Textbox(placeholder="Ask me a question", container=False, scale=7),
        title="Llama2/3 AI Chat",
        description="Ask me any question",
        theme="soft",
        examples=["How many languages are in India", "What is Generative AI?"],
        cache_examples=False,
        retry_btn=None,
        undo_btn="Delete Previous",
        clear_btn="Clear",
    )

    # Launch the ChatInterface
    chat_interface.launch(server_name="0.0.0.0")
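
The script above reaches the backend with a plain GET and a sentence query parameter. A minimal sketch of exercising the same route directly, assuming the Serve service referenced in SERVICE_NAME has been port-forwarded to localhost (the port-forward command is an assumption, not part of this change):

# Minimal sketch: call the same /infer route the Gradio app uses.
# Assumes a port-forward is running, e.g.:
#   kubectl -n llama2 port-forward svc/llama2-serve-svc 8000:8000
import requests

url = "http://localhost:8000/infer"
response = requests.get(url, params={"sentence": "What is Generative AI?"}, timeout=180)
response.raise_for_status()

# The Gradio script above assumes the endpoint returns a JSON list whose
# first element is the generated text, hence the [0] index here as well.
print(response.json()[0])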
196 changes: 113 additions & 83 deletions gen-ai/inference/llama2-13b-chat-rayserve-inf2/ray-service-llama2.yaml
@@ -2,6 +2,7 @@ apiVersion: v1
 kind: Namespace
 metadata:
   name: llama2
+
 ---
 apiVersion: ray.io/v1
 kind: RayService
@@ -14,9 +15,32 @@ spec:
   serveConfigV2: |
     applications:
       - name: llama2
-        import_path: ray_serve_llama2:entrypoint
+        import_path: "ray_serve_llama2:entrypoint"
         runtime_env:
           env_vars:
             MODEL_ID: "NousResearch/Llama-2-13b-chat-hf"
+            NEURON_CC_FLAGS: "-O1"
+            LD_LIBRARY_PATH: "/home/ray/anaconda3/lib:$LD_LIBRARY_PATH"
+            NEURON_CORES: "24"
+        deployments:
+          - name: Llama-2-13b-chat-hf
+            autoscaling_config:
+              metrics_interval_s: 0.2
+              min_replicas: 1
+              max_replicas: 1
+              look_back_period_s: 2
+              downscale_delay_s: 30
+              upscale_delay_s: 2
+              target_num_ongoing_requests_per_replica: 1
+            graceful_shutdown_timeout_s: 5
+            ray_actor_options:
+              num_cpus: 180
+              resources: {"neuron_cores": 24}
+              runtime_env:
+                env_vars:
+                  LD_LIBRARY_PATH: "/home/ray/anaconda3/lib:$LD_LIBRARY_PATH"
   rayClusterConfig:
-    rayVersion: '2.7.1'
+    rayVersion: 2.22.0
     headGroupSpec:
       headService:
         metadata:
@@ -27,75 +51,81 @@ spec:
       template:
         spec:
           containers:
-            - name: head
-              image: public.ecr.aws/data-on-eks/ray-serve-inf2-llama2:latest # Image created using the Dockerfile attached in the folder
-              imagePullPolicy: Always # Ensure the image is always pulled when updated
-              lifecycle:
-                preStop:
-                  exec:
-                    command: ["/bin/sh", "-c", "ray stop"]
-              ports:
-                - containerPort: 6379
-                  name: gcs
-                - containerPort: 8265
-                  name: dashboard
-                - containerPort: 10001
-                  name: client
-                - containerPort: 8000
-                  name: serve
-              volumeMounts:
-                - mountPath: /tmp/ray
-                  name: ray-logs
-              resources:
-                limits:
-                  cpu: 4
-                  memory: 20Gi
-                requests:
-                  cpu: 4
-                  memory: 20Gi
+            - name: head
+              image: public.ecr.aws/data-on-eks/ray2.22.0-py310-llama2-13b-neuron:latest # Image created using the Dockerfile attached in the folder
+              imagePullPolicy: Always # Ensure the image is always pulled when updated
+              lifecycle:
+                preStop:
+                  exec:
+                    command: ["/bin/sh", "-c", "ray stop"]
+              ports:
+                - containerPort: 6379
+                  name: gcs
+                - containerPort: 8265
+                  name: dashboard
+                - containerPort: 10001
+                  name: client
+                - containerPort: 8000
+                  name: serve
+              volumeMounts:
+                - mountPath: /tmp/ray
+                  name: ray-logs
+              resources:
+                limits:
+                  cpu: 4
+                  memory: 20Gi
+                requests:
+                  cpu: 4
+                  memory: 20Gi
+              env:
+                - name: LD_LIBRARY_PATH
+                  value: "/home/ray/anaconda3/lib:$LD_LIBRARY_PATH"
           nodeSelector: # This is using Karpenter Nodes with the provisioner label
             instanceType: mixed-x86
             provisionerType: Karpenter
             workload: rayhead
           volumes:
-            - name: ray-logs
-              emptyDir: {}
+            - name: ray-logs
+              emptyDir: {}
     workerGroupSpecs:
-      - groupName: inf2
-        replicas: 1
-        minReplicas: 1
-        maxReplicas: 1
-        rayStartParams: {}
-        template:
-          spec:
-            containers:
-              - name: worker
-                image: public.ecr.aws/data-on-eks/ray-serve-inf2-llama2:latest
-                imagePullPolicy: Always # Ensure the image is always pulled when updated
-                lifecycle:
-                  preStop:
-                    exec:
-                      command: ["/bin/sh", "-c", "ray stop"]
-                resources:
-                  limits:
-                    cpu: "180"
-                    memory: "700G"
-                    aws.amazon.com/neuron: "12"
-                  requests:
-                    cpu: "180"
-                    memory: "700G"
-                    aws.amazon.com/neuron: "12"
-            nodeSelector:
-              instanceType: inferentia-inf2
-              provisionerType: Karpenter
-            tolerations:
-              - key: "aws.amazon.com/neuron"
-                operator: "Exists"
-                effect: "NoSchedule"
-              - key: "hub.jupyter.org/dedicated"
-                operator: "Equal"
-                value: "user"
-                effect: "NoSchedule"
+      - groupName: inf2
+        replicas: 1
+        minReplicas: 1
+        maxReplicas: 1
+        rayStartParams: {}
+        template:
+          spec:
+            containers:
+              - name: worker
+                image: public.ecr.aws/data-on-eks/ray2.22.0-py310-llama2-13b-neuron:latest
+                imagePullPolicy: Always # Ensure the image is always pulled when updated
+                lifecycle:
+                  preStop:
+                    exec:
+                      command: ["/bin/sh", "-c", "ray stop"]
+                resources:
+                  limits:
+                    cpu: "180"
+                    memory: "700G"
+                    aws.amazon.com/neuron: "12"
+                  requests:
+                    cpu: "180"
+                    memory: "700G"
+                    aws.amazon.com/neuron: "12"
+                env:
+                  - name: LD_LIBRARY_PATH
+                    value: "/home/ray/anaconda3/lib:$LD_LIBRARY_PATH"
+            nodeSelector:
+              instanceType: inferentia-inf2
+              provisionerType: Karpenter
+            tolerations:
+              - key: "aws.amazon.com/neuron"
+                operator: "Exists"
+                effect: "NoSchedule"
+              - key: "hub.jupyter.org/dedicated"
+                operator: "Equal"
+                value: "user"
+                effect: "NoSchedule"
 ---
 apiVersion: networking.k8s.io/v1
 kind: Ingress
@@ -107,21 +137,21 @@ metadata:
 spec:
   ingressClassName: nginx
   rules:
-    - http:
-        paths:
-          # Ray Dashboard
-          - path: /dashboard/(.*)
-            pathType: ImplementationSpecific
-            backend:
-              service:
-                name: llama2
-                port:
-                  number: 8265
-          # Ray Serve
-          - path: /serve/(.*)
-            pathType: ImplementationSpecific
-            backend:
-              service:
-                name: llama2
-                port:
-                  number: 8000
+    - http:
+        paths:
+          # Ray Dashboard
+          - path: /dashboard/(.*)
+            pathType: ImplementationSpecific
+            backend:
+              service:
+                name: llama2
+                port:
+                  number: 8265
+          # Ray Serve
+          - path: /serve/(.*)
+            pathType: ImplementationSpecific
+            backend:
+              service:
+                name: llama2
+                port:
+                  number: 8000
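
For reference, the ingress rules above expose the Ray dashboard under /dashboard/ and the Serve application under /serve/ on port 8000. A minimal sketch of calling the Serve endpoint through that ingress, assuming the NGINX controller rewrites /serve/<path> to <path> (the rewrite annotation is not shown in this hunk) and using a placeholder hostname:

# Minimal sketch: query the Serve app through the NGINX ingress defined above.
# INGRESS_HOST is a hypothetical placeholder for the ingress controller's
# load balancer DNS name; replace it for your cluster.
import requests

INGRESS_HOST = "http://<ingress-load-balancer-dns>"

resp = requests.get(
    f"{INGRESS_HOST}/serve/infer",
    params={"sentence": "How many languages are in India"},
    timeout=180,
)
resp.raise_for_status()
print(resp.json()[0])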