Merged
31 commits
df3a6aa
Add initial support for Qwen3 models on CPU
orionpapadakis Jun 17, 2025
fa30068
Move ModelLoader classes to model.loader package
orionpapadakis Jun 26, 2025
489d1ee
Move State classes to inference.state package
orionpapadakis Jun 26, 2025
6e8b0f5
Move Weights to inference.weights package
orionpapadakis Jun 26, 2025
388aecd
[WIP] Refactor Weight class for modularity and extensibility
orionpapadakis Jun 27, 2025
c1ae6bc
Add weights for qwen3 in tornado format
orionpapadakis Jun 27, 2025
3652383
Refactor Model design. Abandon Records, adopt interface with abstract…
orionpapadakis Jul 11, 2025
7893bfa
Increase bytecode size
orionpapadakis Jul 24, 2025
68e2d70
[WIP] Add a initial Tornado inference implementation for Qwen3 with c…
orionpapadakis Jul 24, 2025
e1eed87
[WIP] Cleanup
orionpapadakis Jul 25, 2025
6b01570
Use optimized tornado kernel for Attention
orionpapadakis Jul 28, 2025
7fa548e
Optimize Qcur rmsnorm
orionpapadakis Jul 29, 2025
86378ad
Apply optimizations to Kcur rmsnorm and rename some Qcur fields
orionpapadakis Jul 29, 2025
e6b279c
Add an optimized kernel for attention
orionpapadakis Jul 29, 2025
9f5929a
Cleanup and add some comments in forwardJavaQwen3
orionpapadakis Jul 30, 2025
5f3b6c2
Cleanup InferenceEngine
orionpapadakis Jul 30, 2025
717257a
Fix naming consistency of generateTokensXXX methods and add comments
orionpapadakis Jul 30, 2025
4a29063
Cleanup dbg buffers functionality
orionpapadakis Jul 30, 2025
c4cd588
Clean up Qwen3State
orionpapadakis Jul 30, 2025
4541c14
Cleanup model
orionpapadakis Jul 30, 2025
e1a4632
Provide an optimized rmsnorm kernel that fuses steps 1 and 2
orionpapadakis Jul 30, 2025
6224ae0
Cleanup Qwen3Kernels
orionpapadakis Jul 30, 2025
5b5e9e3
Cleanup Qwen3TornadoVMLayerPlanner
orionpapadakis Jul 30, 2025
7dc5056
General cleanup
orionpapadakis Jul 30, 2025
7b5f052
Point to latest tornadovm
orionpapadakis Jul 30, 2025
480a1f0
Additional cleanup
orionpapadakis Jul 30, 2025
5224381
Move things around for smooth interactive mode and consistency
orionpapadakis Jul 31, 2025
dabbdfb
Remove duplicative kernel
orionpapadakis Jul 31, 2025
04ba434
Refactor and improve code formatting across multiple files
orionpapadakis Jul 31, 2025
16f5114
Finalize review comments
orionpapadakis Jul 31, 2025
2fd98ef
Update README.md
orionpapadakis Jul 31, 2025
21 changes: 20 additions & 1 deletion README.md
@@ -17,7 +17,7 @@
<strong>Llama3</strong> models written in <strong>native Java</strong> automatically accelerated on GPUs with <a href="https://github.com/beehive-lab/TornadoVM" target="_blank"><strong>TornadoVM</strong></a>.
Runs Llama3 inference efficiently using TornadoVM's GPU acceleration.
<br><br>
Currently, supports <strong>Llama3</strong> and <strong>Mistral</strong> models in the GGUF format.
Currently supports <strong>Llama3</strong>, <strong>Mistral</strong>, and <strong>Qwen3</strong> models in the GGUF format.
<br><br>
Builds on <a href="https://github.com/mukel/llama3.java">Llama3.java</a> by <a href="https://github.com/mukel">Alfonso² Peterssen</a>.
The previous integration of TornadoVM and Llama2 can be found in <a href="https://github.com/mikepapadim/llama2.tornadovm.java">llama2.tornadovm</a>.
@@ -187,6 +187,7 @@ llama-tornado --gpu --model beehive-llama-3.2-1b-instruct-fp16.gguf --prompt "te
-Dtornado.load.tornado.implementation=uk.ac.manchester.tornado.runtime.common.Tornado \
-Dtornado.load.annotation.implementation=uk.ac.manchester.tornado.annotation.ASMClassVisitor \
-Dtornado.load.annotation.parallel=uk.ac.manchester.tornado.api.annotations.Parallel \
-Dtornado.tvm.maxbytecodesize=65536 \
-Duse.tornadovm=true \
-Dtornado.threadInfo=false \
-Dtornado.debug=false \
@@ -237,6 +238,12 @@ Download `FP16` quantized `Llama-3` .gguf files from:
Download `FP16` quantized `Mistral` .gguf files from:
- https://huggingface.co/collections/beehive-lab/mistral-gpullama3java-684afabb206136d2e9cd47e0

Download `FP16` quantized `Qwen3` .gguf files from:
- https://huggingface.co/ggml-org/Qwen3-0.6B-GGUF
- https://huggingface.co/ggml-org/Qwen3-1.7B-GGUF
- https://huggingface.co/ggml-org/Qwen3-4B-GGUF
- https://huggingface.co/ggml-org/Qwen3-8B-GGUF

Please be gentle with [huggingface.co](https://huggingface.co) servers:

**Note** FP16 models are first-class citizens for the current version.
Expand All @@ -252,6 +259,18 @@ wget https://huggingface.co/beehive-lab/Llama-3.2-8B-Instruct-GGUF-FP16/resolve/

# Mistral (7B) - FP16
wget https://huggingface.co/MaziyarPanahi/Mistral-7B-Instruct-v0.3-GGUF/resolve/main/Mistral-7B-Instruct-v0.3.fp16.gguf

# Qwen3 (0.6B) - FP16
wget https://huggingface.co/ggml-org/Qwen3-0.6B-GGUF/resolve/main/Qwen3-0.6B-f16.gguf

# Qwen3 (1.7B) - FP16
wget https://huggingface.co/ggml-org/Qwen3-1.7B-GGUF/resolve/main/Qwen3-1.7B-f16.gguf

# Qwen3 (4B) - FP16
wget https://huggingface.co/ggml-org/Qwen3-4B-GGUF/resolve/main/Qwen3-4B-f16.gguf

# Qwen3 (8B) - FP16
wget https://huggingface.co/ggml-org/Qwen3-8B-GGUF/resolve/main/Qwen3-8B-f16.gguf
```

**[Experimental]** You can download the Q8 and Q4 models used in the original implementation of Llama3.java, but for now they are dequantized to FP16 for TornadoVM support:
2 changes: 1 addition & 1 deletion external/tornadovm
1 change: 1 addition & 0 deletions llama-tornado
@@ -75,6 +75,7 @@ class LlamaRunner:
"-Dtornado.load.tornado.implementation=uk.ac.manchester.tornado.runtime.common.Tornado",
"-Dtornado.load.annotation.implementation=uk.ac.manchester.tornado.annotation.ASMClassVisitor",
"-Dtornado.load.annotation.parallel=uk.ac.manchester.tornado.api.annotations.Parallel",
"-Dtornado.tvm.maxbytecodesize=65536"
]
cmd.extend(tornado_config)

2 changes: 1 addition & 1 deletion src/main/java/com/example/LlamaApp.java
@@ -5,7 +5,7 @@
import com.example.inference.sampler.CategoricalSampler;
import com.example.inference.sampler.Sampler;
import com.example.inference.sampler.ToppSampler;
import com.example.loader.weights.ModelLoader;
import com.example.model.loader.ModelLoader;
import com.example.model.Model;
import com.example.tornadovm.FloatArrayUtils;
import uk.ac.manchester.tornado.api.types.arrays.FloatArray;
26 changes: 13 additions & 13 deletions src/main/java/com/example/aot/AOT.java
@@ -3,11 +3,13 @@
import com.example.auxiliary.Timer;
import com.example.core.model.GGUF;
import com.example.core.model.tensor.GGMLTensorEntry;
import com.example.model.loader.LlamaModelLoader;
import com.example.model.Model;
import com.example.Options;
import com.example.model.format.LlamaChatFormat;
import com.example.model.llama.Llama;
import com.example.loader.weights.ModelLoader;
import com.example.loader.weights.Weights;
import com.example.inference.weights.Weights;
import com.example.tokenizer.impl.LlamaTokenizer;

import java.io.IOException;
import java.nio.channels.FileChannel;
@@ -28,8 +30,10 @@
public final class AOT {
AOT.PartialModel preLoaded = AOT.PRELOADED_GGUF;

static LlamaModelLoader modelLoader;

record PartialModel(String modelFileName, Llama model, long tensorDataOffset, Map<String, GGUF.GGUFTensorInfo> tensorInfos) {}
record PartialModel(String modelFileName, Llama model, long tensorDataOffset, Map<String, GGUF.GGUFTensorInfo> tensorInfos) {
}

private static final PartialModel PRELOADED_GGUF = preLoadGGUF(System.getProperty("llama.PreloadGGUF"));

Expand All @@ -44,12 +48,9 @@ private static PartialModel preLoadGGUF(String modelPath) {
}
GGUF gguf = GGUF.loadModel(path);
try (FileChannel fileChannel = FileChannel.open(path, StandardOpenOption.READ)) {
return new PartialModel(
path.getFileName().toString(),
Llama.loadModel(fileChannel, gguf, Options.DEFAULT_MAX_TOKENS, false), // TODO: needs proper handling for AOT
gguf.getTensorDataOffset(),
gguf.getTensorInfos()
);
modelLoader = new LlamaModelLoader(fileChannel, gguf, Options.DEFAULT_MAX_TOKENS, false);
return new PartialModel(path.getFileName().toString(), modelLoader.loadModel(), // TODO: needs proper handling for AOT
gguf.getTensorDataOffset(), gguf.getTensorInfos());
}
} catch (IOException e) {
throw new RuntimeException(e);
Expand All @@ -73,12 +74,11 @@ public static Model tryUsePreLoaded(Path modelPath, int contextLength) throws IO
return null;
}
Llama baseModel = preLoaded.model();
try (var timer = Timer.log("Load tensors from pre-loaded model");
var fileChannel = FileChannel.open(modelPath, StandardOpenOption.READ)) {
try (var timer = Timer.log("Load tensors from pre-loaded model"); var fileChannel = FileChannel.open(modelPath, StandardOpenOption.READ)) {
// Load only the tensors (mmap slices).
Map<String, GGMLTensorEntry> tensorEntries = GGUF.loadTensors(fileChannel, preLoaded.tensorDataOffset(), preLoaded.tensorInfos());
Weights weights = ModelLoader.loadWeights(tensorEntries, baseModel.configuration());
return new Llama(baseModel.configuration().withContextLength(contextLength), baseModel.tokenizer(), weights);
Weights weights = modelLoader.loadWeights(tensorEntries, baseModel.configuration());
return new Llama(baseModel.configuration().withContextLength(contextLength), baseModel.tokenizer(), weights, new LlamaChatFormat((LlamaTokenizer) baseModel.tokenizer()));
}
}
}
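The AOT path above caches GGUF metadata in a record at build time and reuses it at runtime only when it matches the requested model file. A minimal, self-contained sketch of that preload-and-match pattern (the `PreloadDemo` class and its faked metadata are illustrative, not from the PR):

```java
import java.util.Map;

public class PreloadDemo {
    /** Simplified stand-in for AOT.PartialModel: metadata cached ahead of time. */
    record PartialModel(String modelFileName, long tensorDataOffset, Map<String, String> tensorInfos) {}

    // Eagerly initialized once, like AOT.PRELOADED_GGUF (null when no path is configured).
    static final PartialModel PRELOADED = preLoad(System.getProperty("llama.PreloadGGUF"));

    static PartialModel preLoad(String modelPath) {
        if (modelPath == null) {
            return null;
        }
        // The real code parses the GGUF header here; this sketch fakes the metadata.
        return new PartialModel(modelPath, 4096L, Map.of("token_embd.weight", "F16"));
    }

    /** Reuse the preloaded metadata only if it matches the requested file, as AOT.tryUsePreLoaded does. */
    static PartialModel tryUsePreLoaded(String fileName) {
        if (PRELOADED == null || !PRELOADED.modelFileName().equals(fileName)) {
            return null;
        }
        return PRELOADED;
    }

    public static void main(String[] args) {
        // Prints null unless the JVM was started with -Dllama.PreloadGGUF=model.gguf
        System.out.println(tryUsePreLoaded("model.gguf"));
    }
}
```

The eager static initializer is what lets a native-image or AOT build capture the parsed header once, so runtime startup skips re-parsing when the file name matches.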
12 changes: 12 additions & 0 deletions src/main/java/com/example/auxiliary/Utf8Mask.java
@@ -0,0 +1,12 @@
package com.example.auxiliary;

/** mask of a byte-sequence in UTF-8 encoding */
public record Utf8Mask(int mask, int pattern, int len) {
//@formatter:off
public static final Utf8Mask[] MASKS = {
new Utf8Mask(0b11100000, 0b11000000, 2),
new Utf8Mask(0b11110000, 0b11100000, 3),
new Utf8Mask(0b11111000, 0b11110000, 4)
};
//@formatter:on
}
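The `Utf8Mask` record classifies a UTF-8 lead byte: masking it and comparing against the pattern reveals how many bytes the sequence occupies. A minimal, self-contained sketch of that use (the demo class and `sequenceLength` helper are illustrative, not part of the PR; the record is inlined here to keep it runnable):

```java
public class Utf8MaskDemo {
    /** Inlined copy of the PR's Utf8Mask record. */
    record Utf8Mask(int mask, int pattern, int len) {}

    static final Utf8Mask[] MASKS = {
        new Utf8Mask(0b11100000, 0b11000000, 2),
        new Utf8Mask(0b11110000, 0b11100000, 3),
        new Utf8Mask(0b11111000, 0b11110000, 4)
    };

    /** Byte length of the UTF-8 sequence starting at leadByte; 1 for ASCII. */
    static int sequenceLength(int leadByte) {
        for (Utf8Mask m : MASKS) {
            // Keep only the prefix bits and compare against the expected pattern.
            if ((leadByte & m.mask()) == m.pattern()) {
                return m.len();
            }
        }
        return 1;
    }

    public static void main(String[] args) {
        System.out.println(sequenceLength(0x41)); // 'A' -> 1
        System.out.println(sequenceLength(0xC3)); // lead byte of 'é' -> 2
        System.out.println(sequenceLength(0xE2)); // lead byte of '€' -> 3
        System.out.println(sequenceLength(0xF0)); // lead byte of many emoji -> 4
    }
}
```

A tokenizer streaming raw bytes uses this to know how many continuation bytes to buffer before a code point can be decoded.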
@@ -13,7 +13,7 @@ public final class ArrayFloatTensor extends FloatTensor {

final float[] values;

ArrayFloatTensor(float[] values) {
public ArrayFloatTensor(float[] values) {
this.values = values;
}

48 changes: 48 additions & 0 deletions src/main/java/com/example/core/model/tensor/F32FloatTensor.java
@@ -0,0 +1,48 @@
package com.example.core.model.tensor;

import com.example.core.model.GGMLType;
import jdk.incubator.vector.FloatVector;
import jdk.incubator.vector.VectorSpecies;

import java.lang.foreign.MemorySegment;
import java.lang.foreign.ValueLayout;

public final class F32FloatTensor extends FloatTensor {
final int size;
final MemorySegment segment;

public F32FloatTensor(int size, MemorySegment segment) {
this.size = size;
this.segment = segment;
}

@Override
public int size() {
return size;
}

@Override
public GGMLType type() {
return GGMLType.F32;
}

@Override
public MemorySegment asMemorySegment() {
return segment;
}

@Override
public float getFloat(int index) {
return segment.get(ValueLayout.OfFloat.JAVA_FLOAT, index * Float.BYTES);
}

@Override
public void setFloat(int index, float value) {
segment.set(ValueLayout.OfFloat.JAVA_FLOAT, index * Float.BYTES, value);
}

@Override
protected FloatVector getFloatVector(VectorSpecies<Float> species, int offset) {
throw new UnsupportedOperationException("getFloatVector is not yet implemented.");
}
}
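`F32FloatTensor` reads and writes 32-bit floats through a `MemorySegment` at byte offset `index * Float.BYTES`. A minimal, self-contained sketch of that accessor pattern using the FFM API (the `F32SegmentDemo` class and `roundTrip` helper are illustrative, not from the PR):

```java
import java.lang.foreign.Arena;
import java.lang.foreign.MemorySegment;
import java.lang.foreign.ValueLayout;

public class F32SegmentDemo {
    /** Write i * 0.5f into each slot, then read one slot back. */
    static float roundTrip(int size, int index) {
        try (Arena arena = Arena.ofConfined()) {
            // Off-heap buffer sized for `size` floats, freed when the arena closes.
            MemorySegment segment = arena.allocate((long) size * Float.BYTES);
            for (int i = 0; i < size; i++) {
                // Same layout and offset arithmetic as setFloat in the PR.
                segment.set(ValueLayout.JAVA_FLOAT, (long) i * Float.BYTES, i * 0.5f);
            }
            // Same as getFloat in the PR.
            return segment.get(ValueLayout.JAVA_FLOAT, (long) index * Float.BYTES);
        }
    }

    public static void main(String[] args) {
        System.out.println(roundTrip(4, 2)); // 1.0
    }
}
```

Backing the tensor with a `MemorySegment` lets the loader expose mmap'd GGUF tensor data directly, without copying it into a Java `float[]`.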