-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathintelligence_router.py
More file actions
157 lines (127 loc) · 5.42 KB
/
intelligence_router.py
File metadata and controls
157 lines (127 loc) · 5.42 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
"""Sovereign — Intelligence Router.
Hybrid local/cloud routing. Always tries local Ollama first.
Turbo (NIM) fires based on:
1. Onboarding phase turbo ratio (random chance — fades as bot learns)
2. Confidence below threshold
3. Message complexity above threshold
"Local first. Always. The turbo teaches, then fades." — Claude spec
"""
from __future__ import annotations
import asyncio
import logging
import random
log = logging.getLogger("sovereign.intelligence_router")
# Routing thresholds (hand-tuned heuristics, not learned values):
CONFIDENCE_THRESHOLD = 0.72 # estimated local confidence below this → consider turbo
COMPLEXITY_THRESHOLD = 0.65 # estimated message complexity above this → consider turbo
def _estimate_complexity(message: str) -> float:
"""Quick heuristic: longer + more technical words = higher complexity."""
words = message.split()
length_score = min(1.0, len(words) / 100)
tech_words = sum(1 for w in words if len(w) > 10)
tech_score = min(1.0, tech_words / 10)
return (length_score + tech_score) / 2
class BrainResult:
    """Result of one routed LLM call.

    Attributes:
        text: the model's reply text.
        source: which brain produced it — "local", "nvidia_boost",
            or "local_fallback".
        confidence: heuristic confidence score in [0, 1].
    """

    def __init__(self, text: str, source: str, confidence: float = 0.8):
        self.text = text
        self.source = source
        self.confidence = confidence

    def __repr__(self) -> str:
        # Debug-friendly repr for a logging-heavy module; text is truncated
        # so reprs stay readable in log lines.
        return (f"{type(self).__name__}(source={self.source!r}, "
                f"confidence={self.confidence:.2f}, text={self.text[:40]!r})")
class IntelligenceRouter:
"""Routes each message to local Ollama or NIM turbo based on phase + confidence.
Always calls local first — this is the sovereign brain.
Turbo is a detachable boost that distills into local memory and fades.
"""
def __init__(self, local_llm_fn, turbo_llm_fn, onboarding, distiller) -> None:
self._local = local_llm_fn # Ollama async fn(system, user) -> str
self._turbo = turbo_llm_fn # NIM async fn(system, user) -> str
self._onboarding = onboarding
self._distiller = distiller
self._local_calls = 0
self._turbo_calls = 0
async def route(
self,
user_id: str,
system: str,
user: str,
) -> BrainResult:
"""Route a prompt through local or turbo. Always tries local first."""
# Step 1 — Local attempt (always)
try:
local_text = await self._local(system=system, user=user)
self._local_calls += 1
except Exception as e:
log.error("Local LLM failed: %s — forcing turbo", e)
return await self._call_turbo(user_id, system, user, local_text="", local_conf=0.0)
# Step 2 — Turbo decision
turbo_ratio = self._onboarding.get_turbo_ratio(user_id)
complexity = _estimate_complexity(user)
# Estimate local confidence: longer, structured responses = higher confidence
words = len(local_text.split())
local_conf = min(0.95, 0.4 + (words / 200))
should_turbo = self._should_turbo(local_conf, complexity, turbo_ratio)
if not should_turbo:
log.debug("Local handled it (conf=%.2f ratio=%.2f)", local_conf, turbo_ratio)
return BrainResult(text=local_text, source="local", confidence=local_conf)
# Step 3 — Turbo fires
return await self._call_turbo(user_id, system, user, local_text, local_conf)
async def _call_turbo(
self,
user_id: str,
system: str,
user: str,
local_text: str,
local_conf: float,
) -> BrainResult:
try:
turbo_text = await self._turbo(system=system, user=user)
self._turbo_calls += 1
boost_conf = 0.85
# Strip any model watermarks from cloud output
turbo_text = _strip_invisible(turbo_text)
# Distill into memory — teach the local model what it didn't know
if local_text:
self._distiller.learn(
question=user,
local_text=local_text,
boosted_text=turbo_text,
local_confidence=local_conf,
boost_confidence=boost_conf,
user_id=user_id,
)
log.info(
"Turbo fired (ratio=%.2f local_conf=%.2f) | "
"local=%d turbo=%d session",
self._onboarding.get_turbo_ratio(user_id), local_conf,
self._local_calls, self._turbo_calls,
)
return BrainResult(text=turbo_text, source="nvidia_boost", confidence=boost_conf * 0.85)
except Exception as e:
log.error("Turbo failed: %s — falling back to local", e)
return BrainResult(text=local_text or "I'm having trouble right now.",
source="local_fallback", confidence=0.3)
def _should_turbo(self, local_conf: float, complexity: float, turbo_ratio: float) -> bool:
# Phase-based random chance (fades as bot learns)
if random.random() < turbo_ratio:
return True
# Low local confidence
if local_conf < CONFIDENCE_THRESHOLD:
return True
# High complexity
if complexity > COMPLEXITY_THRESHOLD:
return True
return False
@property
def stats(self) -> dict:
total = self._local_calls + self._turbo_calls
return {
"local_calls": self._local_calls,
"turbo_calls": self._turbo_calls,
"turbo_rate": f"{100 * self._turbo_calls // total}%" if total else "0%",
}
_INVISIBLE_CHARS = [
"\u200b", "\u200c", "\u200d", "\u200e", "\u200f",
"\u2060", "\ufeff", "\u00ad", "\u034f", "\u180e",
]
def _strip_invisible(text: str) -> str:
for ch in _INVISIBLE_CHARS:
text = text.replace(ch, "")
return text