feat: Llama2 inf2 Ray inference upgrade and bug fix #540

Merged 2 commits on May 28, 2024
@@ -91,4 +91,3 @@ redis:
accessModes:
- ReadWriteOnce
runAsUser: 1000

11 changes: 11 additions & 0 deletions gen-ai/inference/gradio-ui/Dockerfile-gradio-base
@@ -0,0 +1,11 @@
# Use Python base image
FROM --platform=linux/amd64 python:3.9-slim

# Set working directory in the container
WORKDIR /app

# Install necessary Python packages with pinned versions
RUN pip install --no-cache-dir gradio==4.31.5 requests==2.32.2 pillow==10.3.0

# Command to run the Python script
ENTRYPOINT ["python", "/app/gradio-app.py"]
5 changes: 4 additions & 1 deletion gen-ai/inference/llama2-13b-chat-rayserve-inf2/Dockerfile
@@ -1,6 +1,6 @@
 # docker buildx build --platform=linux/amd64 -t ray-serve-llama2:latest .
 # https://hub.docker.com/layers/rayproject/ray-ml/2.7.1-py310-gpu/images/sha256-f84ecfc82d255ff9e23b8e40343a95655ec8e23a009633a183769edac6277186?context=explore
-FROM rayproject/ray:2.7.1-py310
+FROM rayproject/ray:2.22.0-py310
 
 # Maintainer label
 LABEL maintainer="DoEKS"
@@ -29,6 +29,9 @@ RUN pip config set global.extra-index-url https://pip.repos.neuron.amazonaws.com
 # Add Neuron path to PATH
 ENV PATH /opt/aws/neuron/bin:$PATH
 
+# Set LD_LIBRARY_PATH to include the directory with libpython3.10.so.1.0
+ENV LD_LIBRARY_PATH /home/ray/anaconda3/lib:$LD_LIBRARY_PATH
+
 WORKDIR /serve_app
 
 COPY ray_serve_llama2.py /serve_app/ray_serve_llama2.py
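
The LD_LIBRARY_PATH addition above makes libpython3.10.so.1.0 in the conda environment visible to the dynamic loader. A minimal sketch of a sanity check, assuming it is run inside the image built from this Dockerfile (the check itself is not part of this change):

# Sanity check: confirm the dynamic loader can resolve libpython3.10.so.1.0
# now that /home/ray/anaconda3/lib is on LD_LIBRARY_PATH.
import ctypes

try:
    ctypes.CDLL("libpython3.10.so.1.0")
    print("libpython3.10.so.1.0 resolved successfully")
except OSError as err:
    print(f"failed to load libpython3.10.so.1.0: {err}")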
127 changes: 127 additions & 0 deletions gen-ai/inference/llama2-13b-chat-rayserve-inf2/gradio-ui.yaml
@@ -0,0 +1,127 @@
# gradio-deploy.yaml
apiVersion: v1
kind: Namespace
metadata:
  name: gradio-llama2-inf2
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: gradio-deployment
  namespace: gradio-llama2-inf2
  labels:
    app: gradio
spec:
  replicas: 1
  selector:
    matchLabels:
      app: gradio
  template:
    metadata:
      labels:
        app: gradio
    spec:
      containers:
        - name: gradio
          image: public.ecr.aws/data-on-eks/gradio-web-app-base:latest
          imagePullPolicy: IfNotPresent
          ports:
            - containerPort: 7860
          resources:
            requests:
              cpu: "512m"
              memory: "2048Mi"
            limits:
              cpu: "1"
              memory: "4096Mi"
          env:
            - name: MODEL_ENDPOINT
              value: "/infer"
            - name: SERVICE_NAME
              value: "http://llama2-serve-svc.llama2.svc.cluster.local:8000"
          volumeMounts:
            - name: gradio-app-script
              mountPath: /app/gradio-app.py
              subPath: gradio-app-llama2-inf2.py
      volumes:
        - name: gradio-app-script
          configMap:
            name: gradio-app-script
---
apiVersion: v1
kind: Service
metadata:
  name: gradio-service
  namespace: gradio-llama2-inf2
spec:
  selector:
    app: gradio
  ports:
    - name: http
      protocol: TCP
      port: 7860
      targetPort: 7860
  type: ClusterIP
---
apiVersion: v1
kind: ConfigMap
metadata:
  name: gradio-app-script
  namespace: gradio-llama2-inf2
data:
  gradio-app-llama2-inf2.py: |
    import gradio as gr
    import requests
    import os

    # Constants for model endpoint and service name
    model_endpoint = "/infer"
    service_name = os.environ.get("SERVICE_NAME", "http://localhost:8000")

    # Function to generate text
    def text_generation(message, history):
        prompt = message

        # Create the URL for the inference
        url = f"{service_name}{model_endpoint}"

        try:
            # Send the request to the model service
            response = requests.get(url, params={"sentence": prompt}, timeout=180)
            response.raise_for_status()  # Raise an exception for HTTP errors

            full_output = response.json()[0]
            # Removing the original question from the output
            answer_only = full_output.replace(prompt, "", 1).strip('["]?\n')

            # Safety filter to remove harmful or inappropriate content
            answer_only = filter_harmful_content(answer_only)
            return answer_only
        except requests.exceptions.RequestException as e:
            # Handle any request exceptions (e.g., connection errors)
            return f"AI: Error: {str(e)}"

    # Define the safety filter function (you can implement this as needed)
    def filter_harmful_content(text):
        # TODO: Implement a safety filter to remove any harmful or inappropriate content from the text

        # For now, simply return the text as-is
        return text

    # Define the Gradio ChatInterface
    chat_interface = gr.ChatInterface(
        text_generation,
        chatbot=gr.Chatbot(line_breaks=True),
        textbox=gr.Textbox(placeholder="Ask me a question", container=False, scale=7),
        title="Llama2/3 AI Chat",
        description="Ask me any question",
        theme="soft",
        examples=["How many languages are in India", "What is Generative AI?"],
        cache_examples=False,
        retry_btn=None,
        undo_btn="Delete Previous",
        clear_btn="Clear",
    )

    # Launch the ChatInterface
    chat_interface.launch(server_name="0.0.0.0")
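
The script above reaches the backend with a plain GET and a sentence query parameter. A minimal sketch of exercising the same route directly, assuming the Serve service referenced in SERVICE_NAME has been port-forwarded to localhost (the port-forward command is an assumption, not part of this change):

# Minimal sketch: call the same /infer route the Gradio app uses.
# Assumes a port-forward is running, e.g.:
#   kubectl -n llama2 port-forward svc/llama2-serve-svc 8000:8000
import requests

url = "http://localhost:8000/infer"
response = requests.get(url, params={"sentence": "What is Generative AI?"}, timeout=180)
response.raise_for_status()

# The Gradio script above assumes the endpoint returns a JSON list whose
# first element is the generated text, hence the [0] index here as well.
print(response.json()[0])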
196 changes: 113 additions & 83 deletions gen-ai/inference/llama2-13b-chat-rayserve-inf2/ray-service-llama2.yaml
@@ -2,6 +2,7 @@ apiVersion: v1
 kind: Namespace
 metadata:
   name: llama2
+
 ---
 apiVersion: ray.io/v1
 kind: RayService
@@ -14,9 +15,32 @@ spec:
   serveConfigV2: |
     applications:
       - name: llama2
-        import_path: ray_serve_llama2:entrypoint
+        import_path: "ray_serve_llama2:entrypoint"
         runtime_env:
           env_vars:
             MODEL_ID: "NousResearch/Llama-2-13b-chat-hf"
+            NEURON_CC_FLAGS: "-O1"
+            LD_LIBRARY_PATH: "/home/ray/anaconda3/lib:$LD_LIBRARY_PATH"
+            NEURON_CORES: "24"
+        deployments:
+          - name: Llama-2-13b-chat-hf
+            autoscaling_config:
+              metrics_interval_s: 0.2
+              min_replicas: 1
+              max_replicas: 1
+              look_back_period_s: 2
+              downscale_delay_s: 30
+              upscale_delay_s: 2
+              target_num_ongoing_requests_per_replica: 1
+            graceful_shutdown_timeout_s: 5
+            ray_actor_options:
+              num_cpus: 180
+              resources: {"neuron_cores": 24}
+              runtime_env:
+                env_vars:
+                  LD_LIBRARY_PATH: "/home/ray/anaconda3/lib:$LD_LIBRARY_PATH"
   rayClusterConfig:
-    rayVersion: '2.7.1'
+    rayVersion: 2.22.0
     headGroupSpec:
       headService:
         metadata:
@@ -27,75 +51,81 @@ spec:
       template:
         spec:
           containers:
-            - name: head
-              image: public.ecr.aws/data-on-eks/ray-serve-inf2-llama2:latest # Image created using the Dockerfile attached in the folder
-              imagePullPolicy: Always # Ensure the image is always pulled when updated
-              lifecycle:
-                preStop:
-                  exec:
-                    command: ["/bin/sh", "-c", "ray stop"]
-              ports:
-                - containerPort: 6379
-                  name: gcs
-                - containerPort: 8265
-                  name: dashboard
-                - containerPort: 10001
-                  name: client
-                - containerPort: 8000
-                  name: serve
-              volumeMounts:
-                - mountPath: /tmp/ray
-                  name: ray-logs
-              resources:
-                limits:
-                  cpu: 4
-                  memory: 20Gi
-                requests:
-                  cpu: 4
-                  memory: 20Gi
+            - name: head
+              image: public.ecr.aws/data-on-eks/ray2.22.0-py310-llama2-13b-neuron:latest # Image created using the Dockerfile attached in the folder
+              imagePullPolicy: Always # Ensure the image is always pulled when updated
+              lifecycle:
+                preStop:
+                  exec:
+                    command: ["/bin/sh", "-c", "ray stop"]
+              ports:
+                - containerPort: 6379
+                  name: gcs
+                - containerPort: 8265
+                  name: dashboard
+                - containerPort: 10001
+                  name: client
+                - containerPort: 8000
+                  name: serve
+              volumeMounts:
+                - mountPath: /tmp/ray
+                  name: ray-logs
+              resources:
+                limits:
+                  cpu: 4
+                  memory: 20Gi
+                requests:
+                  cpu: 4
+                  memory: 20Gi
+              env:
+                - name: LD_LIBRARY_PATH
+                  value: "/home/ray/anaconda3/lib:$LD_LIBRARY_PATH"
           nodeSelector: # This is using Karpenter Nodes with the provisioner label
             instanceType: mixed-x86
             provisionerType: Karpenter
             workload: rayhead
           volumes:
-            - name: ray-logs
-              emptyDir: {}
+            - name: ray-logs
+              emptyDir: {}
     workerGroupSpecs:
-      - groupName: inf2
-        replicas: 1
-        minReplicas: 1
-        maxReplicas: 1
-        rayStartParams: {}
-        template:
-          spec:
-            containers:
-              - name: worker
-                image: public.ecr.aws/data-on-eks/ray-serve-inf2-llama2:latest
-                imagePullPolicy: Always # Ensure the image is always pulled when updated
-                lifecycle:
-                  preStop:
-                    exec:
-                      command: ["/bin/sh", "-c", "ray stop"]
-                resources:
-                  limits:
-                    cpu: "180"
-                    memory: "700G"
-                    aws.amazon.com/neuron: "12"
-                  requests:
-                    cpu: "180"
-                    memory: "700G"
-                    aws.amazon.com/neuron: "12"
-            nodeSelector:
-              instanceType: inferentia-inf2
-              provisionerType: Karpenter
-            tolerations:
-              - key: "aws.amazon.com/neuron"
-                operator: "Exists"
-                effect: "NoSchedule"
-              - key: "hub.jupyter.org/dedicated"
-                operator: "Equal"
-                value: "user"
-                effect: "NoSchedule"
+      - groupName: inf2
+        replicas: 1
+        minReplicas: 1
+        maxReplicas: 1
+        rayStartParams: {}
+        template:
+          spec:
+            containers:
+              - name: worker
+                image: public.ecr.aws/data-on-eks/ray2.22.0-py310-llama2-13b-neuron:latest
+                imagePullPolicy: Always # Ensure the image is always pulled when updated
+                lifecycle:
+                  preStop:
+                    exec:
+                      command: ["/bin/sh", "-c", "ray stop"]
+                resources:
+                  limits:
+                    cpu: "180"
+                    memory: "700G"
+                    aws.amazon.com/neuron: "12"
+                  requests:
+                    cpu: "180"
+                    memory: "700G"
+                    aws.amazon.com/neuron: "12"
+                env:
+                  - name: LD_LIBRARY_PATH
+                    value: "/home/ray/anaconda3/lib:$LD_LIBRARY_PATH"
+            nodeSelector:
+              instanceType: inferentia-inf2
+              provisionerType: Karpenter
+            tolerations:
+              - key: "aws.amazon.com/neuron"
+                operator: "Exists"
+                effect: "NoSchedule"
+              - key: "hub.jupyter.org/dedicated"
+                operator: "Equal"
+                value: "user"
+                effect: "NoSchedule"
 ---
 apiVersion: networking.k8s.io/v1
 kind: Ingress
@@ -107,21 +137,21 @@ metadata:
 spec:
   ingressClassName: nginx
   rules:
-    - http:
-        paths:
-          # Ray Dashboard
-          - path: /dashboard/(.*)
-            pathType: ImplementationSpecific
-            backend:
-              service:
-                name: llama2
-                port:
-                  number: 8265
-          # Ray Serve
-          - path: /serve/(.*)
-            pathType: ImplementationSpecific
-            backend:
-              service:
-                name: llama2
-                port:
-                  number: 8000
+    - http:
+        paths:
+          # Ray Dashboard
+          - path: /dashboard/(.*)
+            pathType: ImplementationSpecific
+            backend:
+              service:
+                name: llama2
+                port:
+                  number: 8265
+          # Ray Serve
+          - path: /serve/(.*)
+            pathType: ImplementationSpecific
+            backend:
+              service:
+                name: llama2
+                port:
+                  number: 8000
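
For reference, the ingress rules above expose the Ray dashboard under /dashboard/ and the Serve application under /serve/ on port 8000. A minimal sketch of calling the Serve endpoint through that ingress, assuming the NGINX controller rewrites /serve/<path> to <path> (the rewrite annotation is not shown in this hunk) and using a placeholder hostname:

# Minimal sketch: query the Serve app through the NGINX ingress defined above.
# INGRESS_HOST is a hypothetical placeholder for the ingress controller's
# load balancer DNS name; replace it for your cluster.
import requests

INGRESS_HOST = "http://<ingress-load-balancer-dns>"

resp = requests.get(
    f"{INGRESS_HOST}/serve/infer",
    params={"sentence": "How many languages are in India"},
    timeout=180,
)
resp.raise_for_status()
print(resp.json()[0])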