docs: use llama-server for simple guides instead (#34)

* docs: update use llama-server instead * docs: explain issues with low RAM
premAI-io · May 6, 2024 · ee76147 · ee76147
1 parent ea246b0
commit ee76147
Showing 1 changed file with 18 additions and 8 deletions.
diff --git a/docs/guides/langchain.md b/docs/guides/langchain.md
@@ -42,23 +42,33 @@ spec:
         template:
             spec:
                 containers:
-                    - name: "ai-model-7b"
-                      image: modelzai/llm-llama-7b:latest # GPU requirements: A100(40GB)
-                      # alternatively use, modelzai/llm-bloomz-560m:latest (even works on CPU)
+                    - name: "ai-model"
+                      image: swarnimarun/llama-server:latest-cuda
+                      # GPU requirements: T4(16GB)
+                      # For CPU use "swarnimarun/llama-server:latest" (needs 16GB of RAM)
+                      args:
+                        - "-m"
+                        - "lmstudio-community/Meta-Llama-3-8B-Instruct-GGUF"
+                        - "-g"
+                        - "33"
+                        - "-q"
+                        - "q8"
         accelerator:
             interface: "CUDA"
             minVersion:
                 major: 7
         resources:
             limits:
                 cpu: "1"
-                memory: "16Gi" # a decent amount of RAM is required for loading the model as well
+                memory: "2Gi" # model loading may be slow or buggy for large models with low RAM
+                # for faster initial loading of large models increase to at least 8GB of RAM
+                # if you want to use CPU inference, use at least 16GB of RAM for 7B models
 ```
 
 - Port-forward the deployment service if you don't have a proper ingress set up for your cluster.
 
 ```bash
-kubectl port-forward service/aideployment 8000:8000
+kubectl port-forward service/aideployment 80:8000
 ```
 
 - Now locally, install the required libraries.
@@ -73,11 +83,11 @@ pip install langchain openai
 import os
 import openai
 
-# note: we port-forwarded the service to 8000
-openai.api_base="http://localhost:8000"
+# note: we port-forwarded the service to port 80 (the default HTTP port)
+openai.api_base="http://localhost"
 # if you have an ingress set up, use your domain name instead
 # you can also change the port to the standard HTTP(S) port
-# openai.api_base="https://<DOMAIN-NAME>.tld:8000" 
+# openai.api_base="https://<DOMAIN-NAME>.tld" 
 openai.api_key = "any"
 ```