39 changes: 39 additions & 0 deletions solo_server/commands/aid.py
@@ -0,0 +1,39 @@
import sys
import typer
from litgpt import LLM
from rich.console import Console

console = Console()

# Cache the loaded model so interactive mode does not reload it on every query.
_llm = None

def get_llm():
    global _llm
    if _llm is None:
        _llm = LLM.load("Qwen/Qwen2.5-1.5B-Instruct")
    return _llm

def query_llm(query: str):
    # Reject queries longer than 9000 characters.
    if len(query) > 9000:
        typer.echo("Error: Your query exceeds the maximum allowed length of 9000 characters. It's over 9000!")
        raise typer.Exit(1)

    # Generate a response while showing a spinner.
    with console.status("Generating response...", spinner="dots"):
        response = get_llm().generate(query)
    typer.echo(response)

def interactive_mode():
    console.print("Interactive Mode (type 'exit' or 'quit' to end):", style="bold green")
    while True:
        try:
            query_text = input(">> ")
        except EOFError:
            # Treat Ctrl-D / end of input as a request to exit.
            break
        if query_text.lower() in ("exit", "quit"):
            break
        if not query_text.strip():
            continue
        query_llm(query_text)

if __name__ == "__main__":
    # If invoked with "@@" as the first argument, treat the rest as the query.
    # Otherwise, launch interactive mode.
    if len(sys.argv) > 1 and sys.argv[1] == "@@":
        if len(sys.argv) > 2:
            query_text = " ".join(sys.argv[2:])
        else:
            typer.echo("Enter your query (end with EOF / Ctrl-D):")
            query_text = sys.stdin.read().strip()
        query_llm(query_text)
    else:
        interactive_mode()
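
For context, litgpt's Python API, as used by aid.py above, loads a checkpoint once and then serves any number of generate() calls. A minimal standalone sketch (the prompt strings are illustrative only, not part of this PR):

from litgpt import LLM

# Load the checkpoint once; it can then answer repeated prompts.
llm = LLM.load("Qwen/Qwen2.5-1.5B-Instruct")

# Illustrative prompts only.
print(llm.generate("Summarize what Typer is in one sentence."))
print(llm.generate("Name one use case for a local LLM CLI."))

Invoked as a script, the command itself would look like python solo_server/commands/aid.py @@ "your question", or with no arguments to enter interactive mode.
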
46 changes: 46 additions & 0 deletions solo_server/solo.ensemble.yaml
@@ -0,0 +1,46 @@
system_information:
  operating_system: "Windows"
  cpu: "AMD64 Family 23 Model 96 Stepping 1, AuthenticAMD"
  cpu_cores: 8
  memory: "15.42GB"
  gpu:
    vendor: "NVIDIA"
    model: "NVIDIA GeForce GTX 1660 Ti"
    memory: "6144MB"
    compute_backend: "CUDA"

server_options:
  - name: "Ollama"
    recommended: true
    details: "Optimized for systems with NVIDIA GPUs and CUDA support. (Recommended for your system.)"
  - name: "vLLM"
    recommended: false
    details: "High-performance inference engine, best suited for Linux environments."
  - name: "Llama.cpp"
    recommended: false
    details: "Lightweight and cross-platform; runs efficiently on CPU-only systems."
  - name: "LitGPT"
    recommended: false
    details: "Lightning AI's PyTorch-based implementation."

default_server: "Ollama"

models:
  solo-core-model:
    model: "Qwen/Qwen2.5-1.5B-Instruct"
    description: "Primary general-purpose model."
  coding:
    model: "qwen2.5-3b-coder"
    description: "Optimized for code generation and programming tasks."
  chat:
    model: "deepseekr1-instruct-distill"
    description: "Fine-tuned for conversational and chat applications."
  robots:
    model: "ottonomy-distill"
    description: "Targeted for robotics and automation-related tasks."
  healthcare_classification:
    model: "palm"
    description: "Optimized for healthcare data classification and analysis."
  general:
    model: "Qwen/Qwen2.5-1.5B-Instruct"
    description: "Primary general-purpose model."
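
The PR does not show how solo.ensemble.yaml is consumed. A plausible sketch of the intended lookup, using a hypothetical resolve_model helper (the function name, fallback behavior, and lookup scheme are assumptions, not part of this PR):

import yaml

# Hypothetical helper: map a task alias from solo.ensemble.yaml to a model name.
# Name and fallback behavior are assumptions, not part of this PR.
def resolve_model(config_path: str, task: str) -> str:
    with open(config_path) as f:
        config = yaml.safe_load(f)
    models = config["models"]
    # Unknown tasks fall back to the general-purpose entry.
    entry = models.get(task, models["general"])
    return entry["model"]

print(resolve_model("solo_server/solo.ensemble.yaml", "coding"))   # qwen2.5-3b-coder
print(resolve_model("solo_server/solo.ensemble.yaml", "general"))  # Qwen/Qwen2.5-1.5B-Instruct
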