feat(skainet-cli): swap LLaMA/Qwen branch to DSL path

michalharakal · claude · michalharakal · commit 4219088538d8 · 2026-05-04T20:06:36.000+02:00
Phase 5b consumer migration. Mirrors PR #122 (kllama CLI) and #123 (KLlamaJava facade). After this merge, no top-level CLI in this repo constructs `LlamaRuntime` for the GGUF path. `skainet-cli` previously routed Gemma + Apertus through the DSL path but kept LLaMA / Qwen / Mistral on the legacy `LlamaRuntime` + `CpuAttentionBackend` + `LlamaWeightMapper` + `MemSegWeightConverter` chain. This PR collapses the else branch onto the DSL path: - `DecoderGgufWeightLoader(NATIVE_OPTIMIZED, family.architectures + [arch])` → `DecoderGgufMemSegConverter.convert` → per-family network loader → `OptimizedLLMRuntime` DIRECT mode. - Family dispatch on the DSL side: `ModelFamily.QWEN` → `QwenNetworkLoader.fromWeights` (NEOX RoPE + QK-norm), else → `LlamaNetworkLoader.fromWeights`. Previously this CLI handled Qwen via the `LlamaRuntime`-with-detected-flags hybrid that the kllama CLI also used pre-#121 — same architectural collapse here. Imports cleaned: removed `CpuAttentionBackend`, `LlamaRuntime`, `LlamaWeightMapper`, `MemSegWeightConverter`. Added `:llm-inference:qwen` to the build.gradle.kts dependencies (it was missing because the legacy hybrid-Qwen path didn't need it). Numerical equivalence with the legacy path on identical weights is pinned by `QwenDslLegacyParityTest` (#120). Tests pass: `:llm-apps:skainet-cli:build`, `:llm-runtime:kllama:jvmTest`, `:llm-inference:qwen:jvmTest`, `:llm-inference:llama:jvmTest`. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
diff --git a/llm-apps/skainet-cli/build.gradle.kts b/llm-apps/skainet-cli/build.gradle.kts
@@ -20,6 +20,7 @@ dependencies {
 
     // Inference modules (for network loaders)
     implementation(project(":llm-inference:llama"))
+    implementation(project(":llm-inference:qwen"))
     implementation(project(":llm-inference:gemma"))
     implementation(project(":llm-inference:apertus"))
 
diff --git a/llm-apps/skainet-cli/src/main/kotlin/sk/ainet/apps/skainet/cli/Main.kt b/llm-apps/skainet-cli/src/main/kotlin/sk/ainet/apps/skainet/cli/Main.kt
@@ -1,6 +1,5 @@
 package sk.ainet.apps.skainet.cli
 
-import sk.ainet.apps.kllama.CpuAttentionBackend
 import sk.ainet.apps.kllama.cli.AgentCli
 import sk.ainet.apps.kllama.cli.ToolCallingDemo
 import sk.ainet.apps.llm.InferenceRuntime
@@ -24,10 +23,10 @@ import sk.ainet.io.JvmRandomAccessSource
 import sk.ainet.io.model.QuantPolicy
 import sk.ainet.lang.tensor.data.MemorySegmentTensorDataFactory
 import sk.ainet.lang.types.FP32
-import sk.ainet.models.llama.LlamaRuntime
+import sk.ainet.models.llama.DecoderGgufMemSegConverter
 import sk.ainet.models.llama.DecoderGgufWeightLoader
-import sk.ainet.models.llama.LlamaWeightMapper
-import sk.ainet.models.llama.MemSegWeightConverter
+import sk.ainet.models.llama.LlamaNetworkLoader
+import sk.ainet.models.qwen.QwenNetworkLoader
 import java.lang.foreign.Arena
 import java.nio.file.Path
 import kotlin.io.path.exists
@@ -164,15 +163,18 @@ fun main(args: Array<String>) {
             memSegFactory.close()
         })
 
-        // Load model based on detected family. Gemma and Apertus route
-        // through the DSL pipeline (their respective network() builder +
-        // OptimizedLLMRuntime); everything else (LLaMA, Qwen, ...) takes
-        // the LlamaRuntime path which supports NATIVE_OPTIMIZED quant
-        // tensors for low-RAM loads. Apertus had previously fallen
-        // through to the LlamaRuntime branch — that runtime doesn't
-        // implement Apertus's xIELU activation, QK-Norm, or ungated FFN,
-        // so logits silently diverged from the checkpoint's intent. See
-        // APERTUS_ROLLOUT.md (PR 1) for the rollout context.
+        // Load model based on detected family. All families route through
+        // the DSL pipeline (per-family network() builder +
+        // OptimizedLLMRuntime). The legacy LlamaRuntime path was retired
+        // for the kllama CLI in #121 / #122; this CLI now follows suit.
+        // Numerical equivalence with the legacy path on identical weights
+        // is pinned by `QwenDslLegacyParityTest` (#120).
+        //
+        // Apertus had previously fallen through to the LlamaRuntime
+        // branch — that runtime doesn't implement Apertus's xIELU
+        // activation, QK-Norm, or ungated FFN, so logits silently
+        // diverged from the checkpoint's intent. The DSL path is correct
+        // for Apertus too. See APERTUS_ROLLOUT.md (PR 1).
         val runtime: InferenceRuntime<FP32> = if (modelInfo.family == ModelFamily.GEMMA) {
             println("Loading Gemma GGUF model from $modelPath via gemmaNetwork() + OptimizedLLMRuntime (NATIVE_OPTIMIZED)...")
             if (cliArgs.contextLength != null) {
@@ -197,38 +199,44 @@ fun main(args: Array<String>) {
             ).load<FP32, Float>(ctx)
             OptimizedLLMRuntime(model, ctx, OptimizedLLMMode.DIRECT, FP32::class)
         } else {
+            // LLaMA / Qwen / Mistral DSL path. DecoderGgufWeightLoader
+            // streams the GGUF, DecoderGgufMemSegConverter wraps Q4_0/Q8_0
+            // tensors as packed MemorySegment data, then the per-family
+            // network loader builds the right module:
+            //   - Qwen → qwenNetwork() (QK-norm + NEOX RoPE)
+            //   - else → llamaNetwork() (LLaMA / Mistral default)
             val acceptedArchitectures = modelInfo.family.architectures + setOf(modelInfo.architecture)
             val loader = DecoderGgufWeightLoader(
                 randomAccessProvider = { JvmRandomAccessSource.open(modelPath.toString()) },
                 quantPolicy = QuantPolicy.NATIVE_OPTIMIZED,
-                acceptedArchitectures = acceptedArchitectures
+                acceptedArchitectures = acceptedArchitectures,
             )
 
-            println("Loading GGUF model from $modelPath (${modelInfo.family.displayName}, streaming)...")
-            val loaded = loader.loadToMapStreaming<FP32, Float>(ctx, FP32::class)
-            val rawWeights = LlamaWeightMapper.map(loaded)
+            println("Loading GGUF model from $modelPath (${modelInfo.family.displayName}, DSL streaming)...")
+            val rawWeights = loader.loadToMapStreaming<FP32, Float>(ctx)
 
-            val runtimeWeights = if (rawWeights.quantTypes.isNotEmpty()) {
+            val convertedWeights = if (rawWeights.quantTypes.isNotEmpty()) {
                 println("Converting ${rawWeights.quantTypes.size} quantized tensors to SIMD format...")
-                MemSegWeightConverter.convert(rawWeights, ctx, quantArena)
+                DecoderGgufMemSegConverter.convert(rawWeights, ctx, quantArena)
             } else {
                 rawWeights
             }
 
             if (cliArgs.contextLength != null) {
-                println("Context length capped to ${cliArgs.contextLength} (model default: ${runtimeWeights.metadata.contextLength})")
+                println("Context length capped to ${cliArgs.contextLength} (model default: ${convertedWeights.metadata.contextLength})")
             }
 
-            val backend = CpuAttentionBackend<FP32>(
-                ctx, runtimeWeights, FP32::class,
-                ropeFreqBase = runtimeWeights.metadata.ropeFreqBase,
-                maxContextLength = cliArgs.contextLength
-            )
-
-            @Suppress("DEPRECATION")
-            LlamaRuntime<FP32>(
-                ctx, runtimeWeights, backend, FP32::class,
-                eps = runtimeWeights.metadata.rmsNormEps
+            val model = if (modelInfo.family == ModelFamily.QWEN) {
+                QwenNetworkLoader.fromWeights(convertedWeights)
+            } else {
+                LlamaNetworkLoader.fromWeights(convertedWeights)
+            }
+            OptimizedLLMRuntime(
+                model = model,
+                ctx = ctx,
+                mode = OptimizedLLMMode.DIRECT,
+                dtype = FP32::class,
+                bos = convertedWeights.metadata.bosTokenId,
             )
         }