
Commit 7ec2018

multichat
1 parent d86ef69 commit 7ec2018

File tree: 4 files changed (+140, -42 lines)

- README.md
- app/lib/__init__.py
- app/lib/endpoints.py
- app/main.py


README.md

Lines changed: 13 additions & 1 deletion
@@ -41,4 +41,16 @@ Run the docker image:
 docker run -d --name ai_container -p 8080:8080 fastapi_bitnet
 ```
 
-Once it's running navigate to http://127.0.0.1:8080/docs
+Once it's running navigate to http://127.0.0.1:8080/docs
+
+## Docker hub repository
+
+You can fetch the dockerfile at: https://hub.docker.com/repository/docker/grctest/fastapi_bitnet/general
+
+## How to add to VSCode!
+
+Run the dockerfile locally using the command above, then navigate to the VSCode Copilot chat window and find the wrench icon "Configure Tools...".
+
+In the tool configuration overview scroll to the bottom and select 'Add more tools...' then '+ Add MCP Server' then 'HTTP'.
+
+Enter into the URL field `http://127.0.0.1:8080/mcp` then your copilot will be able to launch new bitnet server instances and chat with them.
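As a quick check outside VSCode, the same API the MCP wrapper exposes can be exercised directly over HTTP. Below is a minimal client sketch using httpx; the client-side use of httpx, the spawned instance's port 8081, the prompt, and the timeouts are illustrative assumptions, while the parameter names follow the /initialize-server and /chat signatures introduced in this commit.

```python
# Minimal sketch: call the FastAPI proxy directly instead of through MCP.
# Assumes the container from the README is listening on 127.0.0.1:8080
# and that port 8081 is free for the spawned bitnet instance.
import httpx

BASE = "http://127.0.0.1:8080"

# Launch a bitnet llama-server instance (query parameters, as in /initialize-server).
httpx.post(f"{BASE}/initialize-server", params={"threads": 1, "port": 8081}, timeout=120.0)

# Chat with that instance through the /chat middleman endpoint (JSON body = ChatRequest).
reply = httpx.post(
    f"{BASE}/chat",
    json={"message": "Hello, BitNet!", "port": 8081, "n_predict": 64},
    timeout=180.0,
)
print(reply.json())
```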

app/lib/__init__.py

Lines changed: 9 additions & 0 deletions
@@ -0,0 +1,9 @@
+from .endpoints import ChatRequest
+from typing import List
+from pydantic import BaseModel
+
+__all__ = ["ChatRequest", "MultiChatRequest"]
+
+# Re-export for import convenience
+class MultiChatRequest(BaseModel):
+    requests: List[ChatRequest]
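This shim re-exports ChatRequest and defines a package-level MultiChatRequest so both models can be imported from `lib` in one place. A small sketch of that import convenience; the prompts and the second port are illustrative, and it assumes the `app` directory is on the import path (as it is inside the container).

```python
# Sketch: import both request models straight from the lib package,
# matching the __all__ list above, and build a two-request batch.
from lib import ChatRequest, MultiChatRequest

batch = MultiChatRequest(requests=[
    ChatRequest(message="Summarise BitNet in one sentence.", port=8081),
    ChatRequest(message="What is a 1-bit LLM?", port=8082),  # hypothetical second instance
])
print(batch)
```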

app/lib/endpoints.py

Lines changed: 50 additions & 19 deletions
@@ -1,4 +1,4 @@
-# --- Braincell Orchestrator (Middleman Proxy) ---
+# --- bitnet Orchestrator (Middleman Proxy) ---
 from pydantic import BaseModel
 
 from fastapi import FastAPI, HTTPException, Query, Depends
@@ -10,6 +10,11 @@
 import time
 import httpx
 
+from typing import List
+from pydantic import BaseModel, Field
+from fastapi import HTTPException
+import asyncio
+
 # --- Server Process Management ---
 # Each server instance is tracked by a unique (host, port) key
 server_processes = {}
@@ -40,12 +45,11 @@ def _max_threads():
     return os.cpu_count() or 1
 
 async def initialize_server_endpoint(
-    model: ModelEnum,
-    threads: int = Query(os.cpu_count() // 2, gt=0, le=os.cpu_count()),
+    threads: int = Query(1, gt=0, le=os.cpu_count()),
     ctx_size: int = Query(2048, gt=0),
-    port: int = Query(8081, gt=1023),
+    port: int = Query(8081, gt=8080, le=65535),
     system_prompt: str = Query("You are a helpful assistant.", description="Unique system prompt for this server instance"),
-    n_predict: int = Query(4096, gt=0, description="Number of tokens to predict for the server instance"),
+    n_predict: int = Query(256, gt=0, description="Number of tokens to predict for the server instance."),
     temperature: float = Query(0.8, gt=0.0, le=2.0, description="Temperature for sampling")
 ):
     """
@@ -71,7 +75,7 @@ async def initialize_server_endpoint(
         raise HTTPException(status_code=429, detail=f"Cannot start server: would oversubscribe CPU threads (in use: {threads_in_use}, requested: {threads}, max: {max_threads})")
     command = [
         server_path,
-        '-m', model.value,
+        '-m', "models/BitNet-b1.58-2B-4T/ggml-model-i2_s.gguf",
         '-c', str(ctx_size),
         '-t', str(threads),
         '-n', str(n_predict),
@@ -96,7 +100,7 @@ async def initialize_server_endpoint(
         raise HTTPException(status_code=500, detail=f"Server failed to start. Stderr: {stderr_output}")
     server_processes[key] = proc
     server_configs[key] = {
-        "model": model.value,
+        "model": "models/BitNet-b1.58-2B-4T/ggml-model-i2_s.gguf",
         "threads": threads,
         "ctx_size": ctx_size,
         "host": host,
@@ -241,43 +245,70 @@ def get_model_sizes():
 
 class ChatRequest(BaseModel):
     message: str
-    port: int
-    # Optionally add user/session id, etc.
+    port: int = 8081
+    threads: int = 1
+    ctx_size: int = 2048
+    n_predict: int = 256
+    temperature: float = 0.8
 
-def chat_with_braincell(
+def chat_with_bitnet(
     chat: ChatRequest
 ):
     """
-    Middleman endpoint: receives a chat message and forwards it to the specified braincell (llama server instance) by port.
-    Returns the response from the braincell.
+    Middleman endpoint: receives a chat message and forwards it to the specified bitnet (llama server instance) by port.
+    Returns the response from the bitnet.
     """
     host = "127.0.0.1"
     key = (host, chat.port)
     proc = server_processes.get(key)
     cfg = server_configs.get(key)
     if not (proc and proc.poll() is None and cfg):
-        raise HTTPException(status_code=503, detail=f"Braincell server not running on {host}:{chat.port}. Initialize it first.")
+        raise HTTPException(status_code=503, detail=f"bitnet server not running on {host}:{chat.port}. Initialize it first.")
     server_url = f"http://{host}:{chat.port}/completion"
     payload = {
-        "prompt": chat.message
+        "prompt": chat.message,
+        "threads": chat.threads,
+        "ctx_size": chat.ctx_size,
+        "n_predict": chat.n_predict,
+        "temperature": chat.temperature
     }
     async def _chat():
         async with httpx.AsyncClient() as client:
             try:
-                response = await client.post(server_url, json=payload, timeout=120.0)
+                response = await client.post(server_url, json=payload, timeout=180.0)
                 response.raise_for_status()
                 result_data = response.json()
                 content = result_data.get("content", result_data)
                 return {"result": content}
             except httpx.TimeoutException:
-                raise HTTPException(status_code=504, detail="Request to braincell server timed out.")
+                raise HTTPException(status_code=504, detail="Request to bitnet server timed out.")
             except httpx.ConnectError:
-                raise HTTPException(status_code=503, detail=f"Could not connect to braincell server at {server_url}. Is it running?")
+                raise HTTPException(status_code=503, detail=f"Could not connect to bitnet server at {server_url}. Is it running?")
             except httpx.RequestError as e:
-                raise HTTPException(status_code=500, detail=f"Error during request to braincell server: {str(e)}")
+                raise HTTPException(status_code=500, detail=f"Error during request to bitnet server: {str(e)}")
             except httpx.HTTPStatusError as e:
                 error_detail = e.response.text or str(e)
-                raise HTTPException(status_code=e.response.status_code, detail=f"Braincell server returned error: {error_detail}")
+                raise HTTPException(status_code=e.response.status_code, detail=f"bitnet server returned error: {error_detail}")
             except Exception as e:
                 raise HTTPException(status_code=500, detail=f"Unexpected error during chat: {str(e)}")
     return _chat
+
+class MultiChatRequest(BaseModel):
+    requests: List[ChatRequest]
+
+async def multichat_with_bitnet(multichat: MultiChatRequest):
+    async def run_chat(chat_req: ChatRequest):
+        chat_fn = chat_with_bitnet(chat_req)
+        return await chat_fn()
+    results = await asyncio.gather(*(run_chat(req) for req in multichat.requests), return_exceptions=True)
+    # Format results: if exception, return error message
+    formatted = []
+    for res in results:
+        if isinstance(res, Exception):
+            if isinstance(res, HTTPException):
+                formatted.append({"error": res.detail, "status_code": res.status_code})
+            else:
+                formatted.append({"error": str(res)})
+        else:
+            formatted.append(res)
+    return {"results": formatted}
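The new `multichat_with_bitnet` fans the per-port calls out with `asyncio.gather(..., return_exceptions=True)`, so one failing instance yields an error entry instead of aborting the whole batch. A minimal client-side sketch of the resulting /multichat contract follows; the host, ports, prompts, and timeout are illustrative, and both target instances are assumed to have been started via /initialize-server.

```python
# Sketch of a client call to the new /multichat fan-out endpoint.
import httpx

payload = {
    "requests": [
        {"message": "Explain ternary weights briefly.", "port": 8081, "n_predict": 64},
        {"message": "List two uses of BitNet.", "port": 8082, "n_predict": 64},
    ]
}

resp = httpx.post("http://127.0.0.1:8080/multichat", json=payload, timeout=300.0)
# Each entry is either {"result": ...} or {"error": ..., "status_code": ...},
# mirroring how multichat_with_bitnet formats gathered results.
print(resp.json()["results"])
```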

app/main.py

Lines changed: 68 additions & 22 deletions
@@ -3,39 +3,59 @@
 from fastapi_mcp import FastApiMCP
 from lib.models import ModelEnum
 import lib.endpoints as endpoints
-from lib.endpoints import chat_with_braincell, ChatRequest
+from lib.endpoints import chat_with_bitnet, ChatRequest, multichat_with_bitnet, MultiChatRequest
+import traceback
 
 app = FastAPI()
 
-# Wrap with MCP for Model Context Protocol support
-mcp = FastApiMCP(app)
-
-# Mount the MCP server directly to your FastAPI app
-mcp.mount()
-
 @app.post("/initialize-server")
 async def initialize_server(
-    model: ModelEnum,
     threads: int = Query(os.cpu_count() // 2, gt=0, le=os.cpu_count()),
     ctx_size: int = Query(2048, gt=0),
     port: int = Query(8081, gt=1023),
     system_prompt: str = Query("You are a helpful assistant.", description="Unique system prompt for this server instance"),
     n_predict: int = Query(4096, gt=0, description="Number of tokens to predict for the server instance"),
     temperature: float = Query(0.8, gt=0.0, le=2.0, description="Temperature for sampling")
 ):
-    return await endpoints.initialize_server_endpoint(
-        model=model,
-        threads=threads,
-        ctx_size=ctx_size,
-        port=port,
-        system_prompt=system_prompt,
-        n_predict=n_predict,
-        temperature=temperature
-    )
+    try:
+        return await endpoints.initialize_server_endpoint(
+            threads=threads,
+            ctx_size=ctx_size,
+            port=port,
+            system_prompt=system_prompt,
+            n_predict=n_predict,
+            temperature=temperature
+        )
+    except Exception as e:
+        print(traceback.format_exc())
+        raise HTTPException(status_code=500, detail=str(e))
+
+def _max_threads():
+    return os.cpu_count() or 1
+
+# --- Server Initialization and Shutdown Endpoints ---
+def validate_thread_allocation(requests):
+    max_threads = _max_threads()
+    total_requested = sum(req["threads"] for req in requests)
+    for req in requests:
+        if req["threads"] > max_threads:
+            raise HTTPException(
+                status_code=400,
+                detail=f"Requested {req['threads']} threads for a server, but only {max_threads} are available."
+            )
+    if total_requested > max_threads:
+        raise HTTPException(
+            status_code=400,
+            detail=f"Total requested threads ({total_requested}) exceed available threads ({max_threads})."
+        )
 
 @app.post("/shutdown-server")
 async def shutdown_server(port: int = Query(8081, gt=1023)):
-    return await endpoints.shutdown_server_endpoint(port=port)
+    try:
+        return await endpoints.shutdown_server_endpoint(port=port)
+    except Exception as e:
+        print(traceback.format_exc())
+        raise HTTPException(status_code=500, detail=str(e))
 
 @app.get("/server-status")
 async def server_status_endpoint(port: int = Query(8081, gt=1023)): # Renamed for clarity
@@ -48,7 +68,11 @@ async def benchmark(
     threads: int = Query(2, gt=0, le=os.cpu_count()),
     n_prompt: int = Query(32, gt=0)
 ):
-    return await endpoints.run_benchmark(model, n_token, threads, n_prompt)
+    try:
+        return await endpoints.run_benchmark(model, n_token, threads, n_prompt)
+    except Exception as e:
+        print(traceback.format_exc())
+        raise HTTPException(status_code=500, detail=str(e))
 
 @app.get("/perplexity")
 async def perplexity(
@@ -58,13 +82,35 @@ async def perplexity(
     ctx_size: int = Query(4, gt=0),
     ppl_stride: int = Query(0, ge=0)
 ):
-    return await endpoints.run_perplexity(model, prompt, threads, ctx_size, ppl_stride)
+    try:
+        return await endpoints.run_perplexity(model, prompt, threads, ctx_size, ppl_stride)
+    except Exception as e:
+        print(traceback.format_exc())
+        raise HTTPException(status_code=500, detail=str(e))
 
 @app.get("/model-sizes")
 def model_sizes():
     return endpoints.get_model_sizes()
 
 @app.post("/chat")
 async def chat(chat: ChatRequest):
-    chat_fn = chat_with_braincell(chat)
-    return await chat_fn()
+    try:
+        return await chat_with_bitnet(chat)
+    except Exception as e:
+        print(traceback.format_exc())
+        raise HTTPException(status_code=500, detail=str(e))
+
+# Parallel multi-chat endpoint
+@app.post("/multichat")
+async def multichat(multichat: MultiChatRequest):
+    try:
+        return await multichat_with_bitnet(multichat)
+    except Exception as e:
+        print(traceback.format_exc())
+        raise HTTPException(status_code=500, detail=str(e))
+
+# Wrap with MCP for Model Context Protocol support
+mcp = FastApiMCP(app)
+
+# Mount the MCP server directly to your FastAPI app
+mcp.mount()
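Two notes on this file: `mcp = FastApiMCP(app)` and `mcp.mount()` now run after every route is declared, presumably so the new /chat and /multichat routes already exist when the MCP wrapper is created, and the new `validate_thread_allocation` helper is defined but not invoked by any endpoint shown in this diff. A hedged sketch of how that helper behaves follows; the dict-shaped requests and the call site are assumptions for illustration.

```python
# Illustrative use of validate_thread_allocation, the helper defined in
# app/main.py above; this call site is an assumption, not part of the commit.
from fastapi import HTTPException

try:
    validate_thread_allocation([
        {"threads": 2},  # hypothetical server A
        {"threads": 2},  # hypothetical server B
    ])
except HTTPException as exc:
    # The helper raises status_code=400 when a single request, or the total,
    # exceeds os.cpu_count().
    print(exc.status_code, exc.detail)
```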
