From fd5f5901639feab64698d08a07d55ff39bb045f3 Mon Sep 17 00:00:00 2001
From: Swarnim Arun <swarnimarun11@gmail.com>
Date: Mon, 29 Apr 2024 15:29:58 +0530
Subject: [PATCH] docs: update langchain guide to use llama-server instead

I created a llama-server image over the weekend; it's very small, simple, and
entirely static.

No Python or other interpreted languages here. It also uses the .cache dir, so as long as that directory is volume-mounted, downloaded models are easily cached.
---
 docs/guides/langchain.md | 24 ++++++++++++++++--------
 1 file changed, 16 insertions(+), 8 deletions(-)
diff --git a/docs/guides/langchain.md b/docs/guides/langchain.md
index 432c7ec..7bdbfb5 100644
--- a/docs/guides/langchain.md
+++ b/docs/guides/langchain.md
@@ -42,9 +42,17 @@ spec:
         template:
             spec:
                 containers:
-                    - name: "ai-model-7b"
-                      image: modelzai/llm-llama-7b:latest # GPU requirements: A100(40GB)
-                      # alternatively use, modelzai/llm-bloomz-560m:latest (even works on CPU)
+                    - name: "ai-model"
+                      image: swarnimarun/llama-server:latest-cuda
+                      # GPU requirements: T4(16GB)
+                      # For CPU: use "swarnimarun/llama-server:latest" (needs at least 16GB of RAM)
+                      args:
+                        - "-m"
+                        - "lmstudio-community/Meta-Llama-3-8B-Instruct-GGUF"
+                        - "-g"
+                        - "33"
+                        - "-q"
+                        - "q8"
         accelerator:
             interface: "CUDA"
             minVersion:
@@ -52,13 +60,13 @@ spec:
         resources:
             limits:
                 cpu: "1"
-                memory: "16Gi" # a decent amount of RAM is required for loading the model as well
+                memory: "2Gi" # some RAM is required for loading the model as well; for CPU, use at least 16GB of RAM
 ```
 
 - Port forward the deployment service. If you don't have a proper ingress setup for your cluster.
 
 ```bash
-kubectl port-forward service/aideployment 8000:8000
+kubectl port-forward service/aideployment 80:8000
 ```
 
 - Now locally, install the required libraries.
@@ -73,11 +81,11 @@ pip install langchain openai
 import os
 import openai
 
-# note: we port-forwarded the service to 8000
-openai.api_base="http://localhost:8000"
+# note: we port-forwarded the service to local port 80 (the default HTTP port)
+openai.api_base="http://localhost"
 # if you have ingress setup then use your domain name
 # you can also modify the port to use http(s) port itself
-# openai.api_base="https://<DOMAIN-NAME>.tld:8000" 
+# openai.api_base="https://<DOMAIN-NAME>.tld" 
 openai.api_key = "any"
 ```