gpu_server.py
import subprocess
import tempfile
import time

from fastapi import FastAPI
from pydantic import BaseModel

LLAMA_BIN = "/usr/local/bin/llama"
MODEL_PATH = "/models/qwen2.5-3b-instruct-q4_k_m.gguf"

app = FastAPI()


class InferenceRequest(BaseModel):
    prompt: str
    max_tokens: int = 256
    temperature: float = 0.7


@app.post("/infer")
def infer(req: InferenceRequest):
    start = time.time()
    # Write the prompt to a temp file so llama can read it via --file.
    # The file is deleted when the `with` block exits, so the subprocess
    # must run inside it.
    with tempfile.NamedTemporaryFile(mode="w+", delete=True) as f:
        f.write(req.prompt)
        f.flush()
        cmd = [
            LLAMA_BIN,
            "-m", MODEL_PATH,
            "--file", f.name,
            "-n", str(req.max_tokens),
            "--temp", str(req.temperature),
            "--n-gpu-layers", "24",  # number of layers offloaded to the GPU
            "--ctx-size", "2048",
            "--batch-size", "128",
        ]
        # No timeout is set, so a hung llama process will block the request.
        proc = subprocess.run(
            cmd,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
        )
    return {
        "text": proc.stdout,
        "latency_ms": int((time.time() - start) * 1000),
        "exit_code": proc.returncode,
        "stderr": proc.stderr,
    }
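

# A minimal sketch of running and exercising the server locally. The host,
# port, and the `requests` dependency below are illustrative assumptions,
# not something this file pins down:
#
#   uvicorn gpu_server:app --host 0.0.0.0 --port 8000
#
# and then, from a client:
#
#   import requests
#
#   resp = requests.post(
#       "http://127.0.0.1:8000/infer",
#       json={"prompt": "Say hello.", "max_tokens": 32, "temperature": 0.2},
#   )
#   print(resp.json()["text"])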