SKaiNET-developers
diff --git a/skainet-apps/skainet-kllama/src/commonMain/kotlin/sk/ainet/apps/kllama/LlamaIngestion.kt b/skainet-apps/skainet-kllama/src/commonMain/kotlin/sk/ainet/apps/kllama/LlamaIngestion.kt
Lines changed: 2 additions & 4 deletions
diff --git a/skainet-apps/skainet-kllama/src/jvmMain/kotlin/sk/ainet/apps/kllama/cli/Main.kt b/skainet-apps/skainet-kllama/src/jvmMain/kotlin/sk/ainet/apps/kllama/cli/Main.kt
Lines changed: 16 additions & 49 deletions
diff --git a/skainet-apps/skainet-kllama/src/jvmTest/kotlin/sk/ainet/apps/kllama/LlamaIngestionTest.kt b/skainet-apps/skainet-kllama/src/jvmTest/kotlin/sk/ainet/apps/kllama/LlamaIngestionTest.kt
Lines changed: 0 additions & 54 deletions
diff --git a/skainet-apps/skainet-kllama/src/jvmTest/kotlin/sk/ainet/apps/kllama/LlamaRuntimeTest.kt b/skainet-apps/skainet-kllama/src/jvmTest/kotlin/sk/ainet/apps/kllama/LlamaRuntimeTest.kt
Lines changed: 19 additions & 12 deletions
diff --git a/skainet-apps/skainet-kllama/src/nativeMain/kotlin/sk/ainet/apps/kllama/cli/Main.kt b/skainet-apps/skainet-kllama/src/nativeMain/kotlin/sk/ainet/apps/kllama/cli/Main.kt
Lines changed: 8 additions & 49 deletions
@@ -7,11 +7,10 @@ import sk.ainet.io.gguf.llama.LlamaWeightLoader
 import sk.ainet.io.gguf.llama.loadLlamaRuntimeWeights
 
 /**
- * Thin facade around the GGUF/Karpathy loader that sets sensible defaults for the KLLama app.
+ * Thin facade around the GGUF loader that sets sensible defaults for the KLLama app.
  * Default policy dequantizes to FP32 to ensure parity before quant-aware kernels are wired.
  */
 public data class LlamaLoadConfig(
-    val format: LlamaWeightLoader.Format = LlamaWeightLoader.Format.GGUF,
     val quantPolicy: LlamaWeightLoader.QuantPolicy = LlamaWeightLoader.QuantPolicy.DEQUANTIZE_TO_FP32,
     val allowQuantized: Boolean = false
 )
@@ -21,7 +20,7 @@ public class LlamaIngestion(
     private val config: LlamaLoadConfig = LlamaLoadConfig()
 ) {
     /**
-     * Load LLaMA runtime weights from the provided source (GGUF by default).
+     * Load LLaMA runtime weights from the provided GGUF source.
      *
      * @throws IllegalStateException if metadata/tensors are missing or quantized tensors are present
      * when [config.allowQuantized] is false.
@@ -30,7 +29,6 @@ public class LlamaIngestion(
         return loadLlamaRuntimeWeights(
             ctx = ctx,
             sourceProvider = sourceProvider,
-            format = config.format,
             quantPolicy = config.quantPolicy,
             allowQuantized = config.allowQuantized
         )
 
@@ -13,14 +13,12 @@ import sk.ainet.apps.kllama.GGUFTokenizer
 import sk.ainet.apps.kllama.LlamaIngestion
 import sk.ainet.apps.kllama.LlamaLoadConfig
 import sk.ainet.apps.kllama.Tokenizer
-import sk.ainet.apps.kllama.TokenizerUtils
 import sk.ainet.apps.kllama.LlamaRuntime
 import sk.ainet.context.DirectCpuExecutionContext
 import sk.ainet.io.gguf.llama.LlamaWeightLoader
 
 private fun usage(): Nothing {
-    println("Usage: kllama <model-path> <prompt> [tokenizer-path] [steps=64] [temperature=0.8]")
-    println("       For GGUF models, tokenizer-path is optional (uses embedded tokenizer)")
+    println("Usage: kllama <model.gguf> <prompt> [steps=64] [temperature=0.8]")
     exitProcess(1)
 }
 
@@ -30,78 +28,47 @@ fun main(args: Array<String>) {
 
         val modelPath = Path.of(args[0])
         val prompt = args[1]
-
-        // Parse remaining args: tokenizer-path is optional for GGUF
-        var tokenizerPath: Path? = null
-        var steps = 64
-        var temperature = 0.8f
-
-        // Check if args[2] is a file path or a number (steps)
-        if (args.size > 2) {
-            val arg2 = args[2]
-            if (arg2.toIntOrNull() != null) {
-                // It's steps
-                steps = arg2.toInt()
-                temperature = args.getOrNull(3)?.toFloatOrNull() ?: 0.8f
-            } else {
-                // It's tokenizer path
-                tokenizerPath = Path.of(arg2)
-                steps = args.getOrNull(3)?.toIntOrNull() ?: 64
-                temperature = args.getOrNull(4)?.toFloatOrNull() ?: 0.8f
-            }
-        }
+        val steps = args.getOrNull(2)?.toIntOrNull() ?: 64
+        val temperature = args.getOrNull(3)?.toFloatOrNull() ?: 0.8f
 
         if (!modelPath.exists()) error("Model not found: $modelPath")
 
-        val format = when (modelPath.extension.lowercase()) {
-            "gguf" -> LlamaWeightLoader.Format.GGUF
-            "bin" -> LlamaWeightLoader.Format.KARPATHY_BIN
-            else -> error("Unknown model extension: ${modelPath.extension}. Use .gguf or .bin")
-        }
-
-        // For .bin format, tokenizer is required
-        if (format == LlamaWeightLoader.Format.KARPATHY_BIN && tokenizerPath == null) {
-            error("Tokenizer path is required for .bin format models")
-        }
-        if (tokenizerPath != null && !tokenizerPath.exists()) {
-            error("Tokenizer not found: $tokenizerPath")
+        if (modelPath.extension.lowercase() != "gguf") {
+            error("Only GGUF format is supported. Use a .gguf model file.")
         }
 
         val ctx = DirectCpuExecutionContext()
         val ingestion = LlamaIngestion(
             ctx = ctx,
             config = LlamaLoadConfig(
-                format = format,
                 quantPolicy = LlamaWeightLoader.QuantPolicy.DEQUANTIZE_TO_FP32,
                 allowQuantized = false
             )
         )
 
+        println("Loading model from $modelPath...")
         val runtimeWeights = ingestion.load {
             Files.newInputStream(modelPath).asSource().buffered()
         }
         val runtime = LlamaRuntime(ctx, runtimeWeights)
 
-        // Load tokenizer: use embedded GGUF tokenizer if no external path provided
-        val tokenizer: Tokenizer = if (tokenizerPath != null) {
-            loadTokenizer(tokenizerPath, runtimeWeights.metadata.vocabSize)
-        } else {
-            println("Using embedded GGUF tokenizer...")
-            GGUFTokenizer.fromSource(Files.newInputStream(modelPath).asSource().buffered())
-        }
+        // Load embedded GGUF tokenizer
+        println("Loading embedded GGUF tokenizer...")
+        val tokenizer: Tokenizer = GGUFTokenizer.fromSource(Files.newInputStream(modelPath).asSource().buffered())
 
         val promptTokens = tokenizer.encode(prompt)
 
+        println("Generating $steps tokens with temperature=$temperature...")
+        println("---")
+
         val elapsed = measureTime {
             runtime.generate(prompt = promptTokens, steps = steps, temperature = temperature) { id ->
                 print(tokenizer.decode(id))
             }
         }.inWholeMilliseconds
-        println("\n\ntok/s: ${steps / elapsed.toDouble() * 1000}")
-    }
-}
 
-private fun loadTokenizer(path: Path, vocabSize: Int): Tokenizer {
-    val source = Files.newInputStream(path).asSource().buffered()
-    return TokenizerUtils.buildTokenizer(source, vocabSize)
+        val tokPerSec = steps / elapsed.toDouble() * 1000
+        println("\n---")
+        println("tok/s: $tokPerSec")
+    }
 }
@@ -21,10 +21,16 @@ class LlamaRuntimeTest {
         val seqLen = 4
         val vocab = 3
 
+        // GGUF format shapes:
+        // - attention weights: [dim, dim] or [dim, kv_dim]
+        // - ffn gate/up: [dim, ff_dim]
+        // - ffn down: [ff_dim, dim]
+        // - token embedding: [dim, vocab]
+        // - output weight: [dim, vocab]
         val ones1d = ctx.full<FP32, Float>(Shape(dim), FP32::class, 1f)
         val ones2d = ctx.full<FP32, Float>(Shape(dim, dim), FP32::class, 0.25f)
-        val gate = ctx.full<FP32, Float>(Shape(hidden, dim), FP32::class, 0.1f)
-        val down = ctx.full<FP32, Float>(Shape(dim, hidden), FP32::class, 0.05f)
+        val gateUp = ctx.full<FP32, Float>(Shape(dim, hidden), FP32::class, 0.1f)  // [dim, ff_dim]
+        val down = ctx.full<FP32, Float>(Shape(hidden, dim), FP32::class, 0.05f)   // [ff_dim, dim]
         val ropeReal = ctx.full<FP32, Float>(Shape(seqLen, headSize / 2), FP32::class, 1f)
         val ropeImag = ctx.full<FP32, Float>(Shape(seqLen, headSize / 2), FP32::class, 0f)
 
@@ -35,9 +41,9 @@ class LlamaRuntimeTest {
             wv = ones2d,
             wo = ones2d,
             ffnNorm = ones1d,
-            ffnGate = gate,
+            ffnGate = gateUp,
             ffnDown = down,
-            ffnUp = gate
+            ffnUp = gateUp
         )
 
         val weights = LlamaRuntimeWeights(
@@ -52,12 +58,12 @@ class LlamaRuntimeTest {
                 ropeDimensionCount = headSize,
                 vocabSize = vocab
             ),
-            tokenEmbedding = ctx.full(Shape(vocab, dim), FP32::class, 0.2f),
+            tokenEmbedding = ctx.full(Shape(dim, vocab), FP32::class, 0.2f),  // [dim, vocab]
             ropeFreqReal = ropeReal,
             ropeFreqImag = ropeImag,
             layers = listOf(layer),
             outputNorm = ones1d,
-            outputWeight = ctx.full(Shape(vocab, dim), FP32::class, 0.3f)
+            outputWeight = ctx.full(Shape(dim, vocab), FP32::class, 0.3f)     // [dim, vocab]
         )
 
         val runtime = LlamaRuntime(ctx, weights)
@@ -75,10 +81,11 @@ class LlamaRuntimeTest {
         val seqLen = 6
         val vocab = 4
 
+        // GGUF format shapes
         val ones1d = ctx.full<FP32, Float>(Shape(dim), FP32::class, 1f)
         val ones2d = ctx.full<FP32, Float>(Shape(dim, dim), FP32::class, 0.1f)
-        val gate = ctx.full<FP32, Float>(Shape(hidden, dim), FP32::class, 0.05f)
-        val down = ctx.full<FP32, Float>(Shape(dim, hidden), FP32::class, 0.05f)
+        val gateUp = ctx.full<FP32, Float>(Shape(dim, hidden), FP32::class, 0.05f)  // [dim, ff_dim]
+        val down = ctx.full<FP32, Float>(Shape(hidden, dim), FP32::class, 0.05f)    // [ff_dim, dim]
         val ropeReal = ctx.full<FP32, Float>(Shape(seqLen, dim / 2), FP32::class, 1f)
         val ropeImag = ctx.full<FP32, Float>(Shape(seqLen, dim / 2), FP32::class, 0f)
 
@@ -89,9 +96,9 @@ class LlamaRuntimeTest {
             wv = ones2d,
             wo = ones2d,
             ffnNorm = ones1d,
-            ffnGate = gate,
+            ffnGate = gateUp,
             ffnDown = down,
-            ffnUp = gate
+            ffnUp = gateUp
         )
 
         val weights = LlamaRuntimeWeights(
@@ -106,12 +113,12 @@ class LlamaRuntimeTest {
                 ropeDimensionCount = dim,
                 vocabSize = vocab
             ),
-            tokenEmbedding = ctx.full(Shape(vocab, dim), FP32::class, 0.2f),
+            tokenEmbedding = ctx.full(Shape(dim, vocab), FP32::class, 0.2f),  // [dim, vocab]
             ropeFreqReal = ropeReal,
             ropeFreqImag = ropeImag,
             layers = listOf(layer),
             outputNorm = ones1d,
-            outputWeight = ctx.full(Shape(vocab, dim), FP32::class, 0.3f)
+            outputWeight = ctx.full(Shape(dim, vocab), FP32::class, 0.3f)     // [dim, vocab]
         )
 
         val runtime = LlamaRuntime(ctx, weights)
 
@@ -9,14 +9,12 @@ import sk.ainet.apps.kllama.GGUFTokenizer
 import sk.ainet.apps.kllama.LlamaIngestion
 import sk.ainet.apps.kllama.LlamaLoadConfig
 import sk.ainet.apps.kllama.Tokenizer
-import sk.ainet.apps.kllama.TokenizerUtils
 import sk.ainet.apps.kllama.LlamaRuntime
 import sk.ainet.context.DirectCpuExecutionContext
 import sk.ainet.io.gguf.llama.LlamaWeightLoader
 
 private fun usage(): Nothing {
-    println("Usage: kllama <model-path> <prompt> [tokenizer-path] [steps=64] [temperature=0.8]")
-    println("       For GGUF models, tokenizer-path is optional (uses embedded tokenizer)")
+    println("Usage: kllama <model.gguf> <prompt> [steps=64] [temperature=0.8]")
     throw IllegalArgumentException("Invalid arguments")
 }
 
@@ -25,52 +23,23 @@ fun main(args: Array<String>) = runBlocking {
 
     val modelPathStr = args[0]
     val prompt = args[1]
-
-    // Parse remaining args: tokenizer-path is optional for GGUF
-    var tokenizerPathStr: String? = null
-    var steps = 64
-    var temperature = 0.8f
-
-    // Check if args[2] is a file path or a number (steps)
-    if (args.size > 2) {
-        val arg2 = args[2]
-        if (arg2.toIntOrNull() != null) {
-            // It's steps
-            steps = arg2.toInt()
-            temperature = args.getOrNull(3)?.toFloatOrNull() ?: 0.8f
-        } else {
-            // It's tokenizer path
-            tokenizerPathStr = arg2
-            steps = args.getOrNull(3)?.toIntOrNull() ?: 64
-            temperature = args.getOrNull(4)?.toFloatOrNull() ?: 0.8f
-        }
-    }
+    val steps = args.getOrNull(2)?.toIntOrNull() ?: 64
+    val temperature = args.getOrNull(3)?.toFloatOrNull() ?: 0.8f
 
     val modelPath = Path(modelPathStr)
 
     if (!SystemFileSystem.exists(modelPath)) {
         error("Model not found: $modelPathStr")
     }
 
-    val modelFormat = when {
-        modelPathStr.endsWith(".gguf", ignoreCase = true) -> LlamaWeightLoader.Format.GGUF
-        modelPathStr.endsWith(".bin", ignoreCase = true) -> LlamaWeightLoader.Format.KARPATHY_BIN
-        else -> error("Unknown model extension. Use .gguf or .bin")
-    }
-
-    // For .bin format, tokenizer is required
-    if (modelFormat == LlamaWeightLoader.Format.KARPATHY_BIN && tokenizerPathStr == null) {
-        error("Tokenizer path is required for .bin format models")
-    }
-    if (tokenizerPathStr != null && !SystemFileSystem.exists(Path(tokenizerPathStr))) {
-        error("Tokenizer not found: $tokenizerPathStr")
+    if (!modelPathStr.endsWith(".gguf", ignoreCase = true)) {
+        error("Only GGUF format is supported. Use a .gguf model file.")
     }
 
     val ctx = DirectCpuExecutionContext()
     val ingestion = LlamaIngestion(
         ctx = ctx,
         config = LlamaLoadConfig(
-            format = modelFormat,
             quantPolicy = LlamaWeightLoader.QuantPolicy.DEQUANTIZE_TO_FP32,
             allowQuantized = false
         )
@@ -81,14 +50,9 @@ fun main(args: Array<String>) = runBlocking {
         SystemFileSystem.source(modelPath).buffered()
     }
 
-    // Load tokenizer: use embedded GGUF tokenizer if no external path provided
-    val tokenizer: Tokenizer = if (tokenizerPathStr != null) {
-        println("Loading tokenizer from $tokenizerPathStr...")
-        loadTokenizer(Path(tokenizerPathStr), runtimeWeights.metadata.vocabSize)
-    } else {
-        println("Using embedded GGUF tokenizer...")
-        GGUFTokenizer.fromSource(SystemFileSystem.source(modelPath).buffered())
-    }
+    // Load embedded GGUF tokenizer
+    println("Loading embedded GGUF tokenizer...")
+    val tokenizer: Tokenizer = GGUFTokenizer.fromSource(SystemFileSystem.source(modelPath).buffered())
 
     val runtime = LlamaRuntime(ctx, runtimeWeights)
     val promptTokens = tokenizer.encode(prompt)
@@ -106,8 +70,3 @@ fun main(args: Array<String>) = runBlocking {
     println("\n---")
     println("tok/s: $tokPerSec")
 }
-
-private fun loadTokenizer(path: Path, vocabSize: Int): Tokenizer {
-    val source = SystemFileSystem.source(path).buffered()
-    return TokenizerUtils.buildTokenizer(source, vocabSize)
-}