-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathintelligence_router.py
More file actions
157 lines (127 loc) · 5.42 KB
/
intelligence_router.py
File metadata and controls
157 lines (127 loc) · 5.42 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
"""Sovereign — Intelligence Router.
Hybrid local/cloud routing. Always tries local Ollama first.
Turbo (NIM) fires based on:
1. Onboarding phase turbo ratio (random chance — fades as bot learns)
2. Confidence below threshold
3. Message complexity above threshold
"Local first. Always. The turbo teaches, then fades." — Claude spec
"""
from __future__ import annotations
import asyncio
import logging
import random
log = logging.getLogger("sovereign.intelligence_router")
# Routing thresholds (hand-tuned heuristics, not learned values):
CONFIDENCE_THRESHOLD = 0.72 # estimated local confidence below this → consider turbo
COMPLEXITY_THRESHOLD = 0.65 # estimated message complexity above this → consider turbo
def _estimate_complexity(message: str) -> float:
"""Quick heuristic: longer + more technical words = higher complexity."""
words = message.split()
length_score = min(1.0, len(words) / 100)
tech_words = sum(1 for w in words if len(w) > 10)
tech_score = min(1.0, tech_words / 10)
return (length_score + tech_score) / 2
class BrainResult:
    """Result of one routed LLM call.

    Attributes:
        text: the model's reply text.
        source: which brain produced it — "local", "nvidia_boost",
            or "local_fallback".
        confidence: heuristic confidence score in [0, 1].
    """

    def __init__(self, text: str, source: str, confidence: float = 0.8):
        self.text = text
        self.source = source
        self.confidence = confidence

    def __repr__(self) -> str:
        # Debug-friendly repr for a logging-heavy module; text is truncated
        # so reprs stay readable in log lines.
        return (f"{type(self).__name__}(source={self.source!r}, "
                f"confidence={self.confidence:.2f}, text={self.text[:40]!r})")
class IntelligenceRouter:
"""Routes each message to local Ollama or NIM turbo based on phase + confidence.
Always calls local first — this is the sovereign brain.
Turbo is a detachable boost that distills into local memory and fades.
"""
def __init__(self, local_llm_fn, turbo_llm_fn, onboarding, distiller) -> None:
self._local = local_llm_fn # Ollama async fn(system, user) -> str
self._turbo = turbo_llm_fn # NIM async fn(system, user) -> str
self._onboarding = onboarding
self._distiller = distiller
self._local_calls = 0
self._turbo_calls = 0
async def route(
self,
user_id: str,
system: str,
user: str,
) -> BrainResult:
"""Route a prompt through local or turbo. Always tries local first."""
# Step 1 — Local attempt (always)
try:
local_text = await self._local(system=system, user=user)
self._local_calls += 1
except Exception as e:
log.error("Local LLM failed: %s — forcing turbo", e)
return await self._call_turbo(user_id, system, user, local_text="", local_conf=0.0)
# Step 2 — Turbo decision
turbo_ratio = self._onboarding.get_turbo_ratio(user_id)
complexity = _estimate_complexity(user)
# Estimate local confidence: longer, structured responses = higher confidence
words = len(local_text.split())
local_conf = min(0.95, 0.4 + (words / 200))
should_turbo = self._should_turbo(local_conf, complexity, turbo_ratio)
if not should_turbo:
log.debug("Local handled it (conf=%.2f ratio=%.2f)", local_conf, turbo_ratio)
return BrainResult(text=local_text, source="local", confidence=local_conf)
# Step 3 — Turbo fires
return await self._call_turbo(user_id, system, user, local_text, local_conf)
async def _call_turbo(
self,
user_id: str,
system: str,
user: str,
local_text: str,
local_conf: float,
) -> BrainResult:
try:
turbo_text = await self._turbo(system=system, user=user)
self._turbo_calls += 1
boost_conf = 0.85
# Strip any model watermarks from cloud output
turbo_text = _strip_invisible(turbo_text)
# Distill into memory — teach the local model what it didn't know
if local_text:
self._distiller.learn(
question=user,
local_text=local_text,
boosted_text=turbo_text,
local_confidence=local_conf,
boost_confidence=boost_conf,
user_id=user_id,
)
log.info(
"Turbo fired (ratio=%.2f local_conf=%.2f) | "
"local=%d turbo=%d session",
self._onboarding.get_turbo_ratio(user_id), local_conf,
self._local_calls, self._turbo_calls,
)
return BrainResult(text=turbo_text, source="nvidia_boost", confidence=boost_conf * 0.85)
except Exception as e:
log.error("Turbo failed: %s — falling back to local", e)
return BrainResult(text=local_text or "I'm having trouble right now.",
source="local_fallback", confidence=0.3)
def _should_turbo(self, local_conf: float, complexity: float, turbo_ratio: float) -> bool:
# Phase-based random chance (fades as bot learns)
if random.random() < turbo_ratio:
return True
# Low local confidence
if local_conf < CONFIDENCE_THRESHOLD:
return True
# High complexity
if complexity > COMPLEXITY_THRESHOLD:
return True
return False
@property
def stats(self) -> dict:
total = self._local_calls + self._turbo_calls
return {
"local_calls": self._local_calls,
"turbo_calls": self._turbo_calls,
"turbo_rate": f"{100 * self._turbo_calls // total}%" if total else "0%",
}
_INVISIBLE_CHARS = [
"\u200b", "\u200c", "\u200d", "\u200e", "\u200f",
"\u2060", "\ufeff", "\u00ad", "\u034f", "\u180e",
]
def _strip_invisible(text: str) -> str:
for ch in _INVISIBLE_CHARS:
text = text.replace(ch, "")
return text