.github/workflows/build-and-run.yml (2 changes: 1 addition & 1 deletion)

@@ -32,7 +32,7 @@ jobs:
 
       - name: Clone TornadoVM explicitly
        run: |
-          git clone --depth 1 --branch develop \
+          git clone --depth 1 --branch master \
            https://github.com/beehive-lab/TornadoVM.git \
            GPULlama3.java/external/tornadovm
      - name: Set up Python venv for TornadoVM

@@ -583,7 +583,7 @@ public static FloatArray forwardTornadoVM(Model model, State state, int token, i
         final Configuration configuration = model.configuration();
         final TornadoWeights weights = (TornadoWeights) model.weights();
 
-        MemorySegment.copy(weights.getTokenEmbeddingTable().asFloatArray().getSegment(), (long) token * configuration.dim() * Float.BYTES, state.wrapX.getSegment(), 0, configuration.dim() * Float.BYTES);
+        MemorySegment.copy(weights.getTokenEmbeddingTable().asHalfFloatArray().getSegment(), (long) token * configuration.dim() * Short.BYTES, state.embeddingX.getSegment(), 0, configuration.dim() * Short.BYTES);
 
         return tornadoVMMasterPlan.tornadoVMForwardExecuteLayered(position);
     }
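
The crux of this hunk: the token-embedding row is now read through asHalfFloatArray() and staged into state.embeddingX, so both the source offset and the copy length scale by Short.BYTES (2 bytes per FP16 value) instead of Float.BYTES (4). A minimal, self-contained sketch of the same offset arithmetic follows; the class and parameter names (Fp16RowCopy, table, dst) are illustrative, not from the PR.

    import java.lang.foreign.MemorySegment;

    final class Fp16RowCopy {
        // Copies one FP16 embedding row (2 bytes per element) out of a
        // row-major [vocabSize x dim] table, mirroring the copy above.
        static void copyRow(MemorySegment table, MemorySegment dst, int token, int dim) {
            long bytesPerElem = Short.BYTES;                    // FP16 is 16 bits wide
            long srcOffset = (long) token * dim * bytesPerElem; // start of this token's row
            MemorySegment.copy(table, srcOffset, dst, 0L, (long) dim * bytesPerElem);
        }
    }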

@@ -4,6 +4,7 @@
 import org.beehive.gpullama3.tensor.standard.FloatTensor;
 import org.beehive.gpullama3.model.Configuration;
 import uk.ac.manchester.tornado.api.types.arrays.FloatArray;
+import uk.ac.manchester.tornado.api.types.arrays.HalfFloatArray;
 import uk.ac.manchester.tornado.api.types.arrays.IntArray;
 
 import java.util.stream.Stream;
@@ -52,6 +53,8 @@ protected StateFields createStateFields(Configuration config) {
         fields.wrapHb = new FloatArray(config.hiddenDim());
         fields.wrapHb2 = new FloatArray(config.hiddenDim());
 
+        fields.embeddingX = new HalfFloatArray(config.dim());
+
         fields.wrapLogits = new FloatArray(config.vocabularySize());
         fields.wrapQ = new FloatArray(config.dim());
         fields.wrapK = new FloatArray(config.dim());

@@ -5,6 +5,7 @@
 import org.beehive.gpullama3.model.Configuration;
 import org.beehive.gpullama3.model.phi3.Phi3Configuration;
 import uk.ac.manchester.tornado.api.types.arrays.FloatArray;
+import uk.ac.manchester.tornado.api.types.arrays.HalfFloatArray;
 import uk.ac.manchester.tornado.api.types.arrays.IntArray;
 
 import java.util.stream.Stream;
@@ -79,6 +80,7 @@ protected StateFields createStateFields(Configuration config) {
         fields.valueCache = Stream.generate(() -> ArrayFloatTensor.allocate(contextLength, kvDim)).limit(nLayers).toArray(FloatTensor[]::new);
 
         // TornadoVM wrapper arrays for GPU acceleration
+        fields.embeddingX = new HalfFloatArray(config.dim());
         fields.wrapX = new FloatArray(dim);
         fields.wrapXb = new FloatArray(dim);
         fields.wrapXb2 = new FloatArray(dim);

@@ -5,6 +5,7 @@
 import org.beehive.gpullama3.model.Configuration;
 import org.beehive.gpullama3.model.qwen2.Qwen2Configuration;
 import uk.ac.manchester.tornado.api.types.arrays.FloatArray;
+import uk.ac.manchester.tornado.api.types.arrays.HalfFloatArray;
 import uk.ac.manchester.tornado.api.types.arrays.IntArray;
 
 import java.util.stream.Stream;
@@ -40,6 +41,7 @@ protected StateFields createStateFields(Configuration configuration) {
         fields.valueCache = Stream.generate(() -> ArrayFloatTensor.allocate(config.contextLength(), nEmbdGqa)).limit(config.numberOfLayers()).toArray(FloatTensor[]::new);
 
         // TornadoVM wrappers with Qwen2 dimensions
+        fields.embeddingX = new HalfFloatArray(config.dim());
         fields.wrapX = new FloatArray(config.dim());
         fields.wrapXb = new FloatArray(config.dim());
         fields.wrapXb2 = new FloatArray(config.dim());

@@ -5,6 +5,7 @@
 import org.beehive.gpullama3.model.Configuration;
 import org.beehive.gpullama3.model.qwen3.Qwen3Configuration;
 import uk.ac.manchester.tornado.api.types.arrays.FloatArray;
+import uk.ac.manchester.tornado.api.types.arrays.HalfFloatArray;
 import uk.ac.manchester.tornado.api.types.arrays.IntArray;
 
 import java.util.stream.Stream;
@@ -65,6 +66,8 @@ protected StateFields createStateFields(Configuration configuration) {
         fields.valueCache = Stream.generate(() -> ArrayFloatTensor.allocate(config.contextLength(), nEmbdGqa)).limit(config.numberOfLayers()).toArray(FloatTensor[]::new);
 
         // TornadoVM wrappers with Qwen3-specific sizes
+
+        fields.embeddingX = new HalfFloatArray(config.dim());
         fields.wrapX = new FloatArray(config.dim());
         fields.wrapXb = new FloatArray(nEmbdHeadK * config.numberOfHeads());
         fields.wrapXb2 = new FloatArray(config.dim());
@@ -74,7 +77,7 @@ protected StateFields createStateFields(Configuration configuration) {
         fields.wrapQ = new FloatArray(nEmbdHeadK * config.numberOfHeads());
         fields.wrapK = new FloatArray(nEmbdKGqa);
         fields.wrapV = new FloatArray(nEmbdKGqa);
-
+        fields.embeddingX = new HalfFloatArray(config.dim());

Copilot AI (Nov 26, 2025): Duplicate field initialization. The field fields.embeddingX is initialized twice, once at line 70 and again at line 80, with the same value. The second initialization should be removed.

Suggested change:
-        fields.embeddingX = new HalfFloatArray(config.dim());
+        // Removed duplicate initialization of fields.embeddingX

         fields.wrapKeyCache = new FloatArray(config.contextLength() * nEmbdGqa * config.numberOfLayers());
         fields.wrapValueCache = new FloatArray(config.contextLength() * nEmbdGqa * config.numberOfLayers());
         fields.wrapValueCache.init(0.f);

@@ -2,7 +2,9 @@
 
 import org.beehive.gpullama3.tensor.standard.FloatTensor;
 import org.beehive.gpullama3.model.Configuration;
+import uk.ac.manchester.tornado.api.types.HalfFloat;
 import uk.ac.manchester.tornado.api.types.arrays.FloatArray;
+import uk.ac.manchester.tornado.api.types.arrays.HalfFloatArray;
 import uk.ac.manchester.tornado.api.types.arrays.IntArray;
 
 /**

@@ -57,6 +59,7 @@ public abstract class State {
     public final FloatArray wrapValueCache; // FloatArray wrapper for the value cache, optimized for TornadoVM.
     public final IntArray positionHolder;
 
+    public HalfFloatArray embeddingX;
     // store inter
     public int localSize;
     public FloatArray temp; // Temporary buffer for intermediate calculations, size adjusted for local workgroup size.
@@ -88,6 +91,7 @@ protected State(Configuration config, int batchsize) {
         this.keyCache = fields.keyCache;
         this.valueCache = fields.valueCache;
 
+        this.embeddingX = fields.embeddingX;
         this.wrapX = fields.wrapX;
         this.wrapXb = fields.wrapXb;
         this.wrapXb2 = fields.wrapXb2;
@@ -121,6 +125,7 @@ protected static class StateFields {
         public FloatArray wrapQ, wrapK, wrapV, wrapAtt, wrapKeyCache, wrapValueCache;
         public IntArray positionHolder;
         public FloatArray temp, tempFFN, tempLogits;
+        public HalfFloatArray embeddingX;
     }
 
     @Override
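
Aside for readers new to TornadoVM's half-precision types: embeddingX is a staging buffer of 16-bit floats accessed through the HalfFloat wrapper. A minimal round-trip sketch, using only the calls that appear elsewhere in this diff (new HalfFloatArray(int), set, get, getFloat32):

    import uk.ac.manchester.tornado.api.types.HalfFloat;
    import uk.ac.manchester.tornado.api.types.arrays.HalfFloatArray;

    final class HalfFloatRoundTrip {
        public static void main(String[] args) {
            HalfFloatArray buf = new HalfFloatArray(4);   // four FP16 slots
            buf.set(0, new HalfFloat(0.5f));              // FP32 -> FP16 (0.5 is exact in FP16)
            float back = buf.get(0).getFloat32();         // FP16 -> FP32
            System.out.println(back);                     // prints 0.5
        }
    }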

@@ -120,7 +120,7 @@ protected Weights createTornadoVMWeights(Map<String, GGMLTensorEntry> tensorEntr
 
         // Load all tensors uniformly as TornadoTensor hierarchy
         return new LlamaTornadoWeights(
-                loadTornadoTensorAsFP32(tokenEmbeddings),
+                loadTornadoTensor(tokenEmbeddings),
                 loadArrayOfTornadoTensors(nl, i -> tensorEntries.get("blk." + i + ".attn_norm.weight")), // fp32
                 loadArrayOfTornadoTensors(nl, i -> tensorEntries.get("blk." + i + ".attn_q.weight")),
                 loadArrayOfTornadoTensors(nl, i -> tensorEntries.get("blk." + i + ".attn_k.weight")),
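
The same one-line swap recurs in the four loaders below (a second Llama-family loader, then Phi3, Qwen2, and Qwen3). Neither loader helper's body appears in this diff, so the reading is inferred: loadTornadoTensorAsFP32 widened the token-embedding table to FP32 at load time, while loadTornadoTensor keeps the tensor in its native GGUF precision; that is what lets forwardTornadoVM above read the table through asHalfFloatArray().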

@@ -130,7 +130,7 @@ protected Weights createTornadoVMWeights(Map<String, GGMLTensorEntry> tensorEntr
 
         // Load all tensors uniformly as TornadoTensor hierarchy
         return new LlamaTornadoWeights(
-                loadTornadoTensorAsFP32(tokenEmbeddings),
+                loadTornadoTensor(tokenEmbeddings),
                 loadArrayOfTornadoTensors(nl, i -> tensorEntries.get("blk." + i + ".attn_norm.weight")), // fp32
                 loadArrayOfTornadoTensors(nl, i -> tensorEntries.get("blk." + i + ".attn_q.weight")),
                 loadArrayOfTornadoTensors(nl, i -> tensorEntries.get("blk." + i + ".attn_k.weight")),

@@ -140,7 +140,7 @@ protected Weights createTornadoVMWeights(Map<String, GGMLTensorEntry> tensorEntr
 
         // Load all tensors uniformly as TornadoTensor hierarchy
         return new Phi3TornadoWeights(
-                loadTornadoTensorAsFP32(tokenEmbeddings),
+                loadTornadoTensor(tokenEmbeddings),
                 loadArrayOfTornadoTensors(nl, i -> tensorEntries.get("blk." + i + ".attn_norm.weight")), // fp32
                 loadArrayOfTornadoTensors(nl, i -> tensorEntries.get("blk." + i + ".attn_qkv.weight")),
                 loadArrayOfTornadoTensors(nl, i -> tensorEntries.get("blk." + i + ".attn_output.weight")),

@@ -137,7 +137,7 @@ protected Weights createTornadoVMWeights(Map<String, GGMLTensorEntry> tensorEntr
 
         // Load all tensors uniformly as TornadoTensor hierarchy
         return new Qwen2TornadoWeights(
-                loadTornadoTensorAsFP32(tokenEmbeddings),
+                loadTornadoTensor(tokenEmbeddings),
                 loadArrayOfTornadoTensors(nl, i -> tensorEntries.get("blk." + i + ".attn_norm.weight")), // fp32
                 loadArrayOfTornadoTensors(nl, i -> tensorEntries.get("blk." + i + ".attn_q.weight")),
                 loadArrayOfTornadoTensors(nl, i -> tensorEntries.get("blk." + i + ".attn_k.weight")),

@@ -137,7 +137,7 @@ protected Weights createTornadoVMWeights(Map<String, GGMLTensorEntry> tensorEntr
         final int nl = config.numberOfLayers();
 
         return new Qwen3TornadoWeights(
-                loadTornadoTensorAsFP32(tokenEmbeddings),
+                loadTornadoTensor(tokenEmbeddings),
                 loadArrayOfTornadoTensors(nl, i -> tensorEntries.get("blk." + i + ".attn_norm.weight")), // fp32
                 loadArrayOfTornadoTensors(nl, i -> tensorEntries.get("blk." + i + ".attn_q.weight")),
                 loadArrayOfTornadoTensors(nl, i -> tensorEntries.get("blk." + i + ".attn_k.weight")),

@@ -2,7 +2,9 @@
 
 import uk.ac.manchester.tornado.api.KernelContext;
 import uk.ac.manchester.tornado.api.math.TornadoMath;
+import uk.ac.manchester.tornado.api.types.HalfFloat;
 import uk.ac.manchester.tornado.api.types.arrays.FloatArray;
+import uk.ac.manchester.tornado.api.types.arrays.HalfFloatArray;
 
 public class TransformerComputeKernels {
 
@@ -19,6 +21,18 @@ public static void emptyTaskToForceCopyIn(FloatArray buffer) {
         }
     }
 
+    public static void convertFP16toFP32(KernelContext context, HalfFloatArray x, FloatArray wrapX) {
+        int i = context.globalIdx;
+        wrapX.set(i, x.get(i).getFloat32());

Copilot AI (Nov 26, 2025): Missing bounds check in kernel. The kernel should validate that context.globalIdx is within the valid range of both arrays before accessing them, to prevent out-of-bounds access. Add a check like if (i < x.getSize() && i < wrapX.getSize()).

Suggested change:
-        wrapX.set(i, x.get(i).getFloat32());
+        if (i < x.getSize() && i < wrapX.getSize()) {
+            wrapX.set(i, x.get(i).getFloat32());
+        }

+    }
+
+    public static void convertFP32toFP16(KernelContext context,  FloatArray wrapX, HalfFloatArray x) {

Copilot AI (Nov 26, 2025): Extra whitespace after parameter. There are two spaces between the comma and FloatArray; there should be one.

Suggested change:
-    public static void convertFP32toFP16(KernelContext context,  FloatArray wrapX, HalfFloatArray x) {
+    public static void convertFP32toFP16(KernelContext context, FloatArray wrapX, HalfFloatArray x) {

+        int i = context.globalIdx;
+        float valInput = wrapX.get(i);
+        HalfFloat val = new HalfFloat(valInput);
+        x.set(i,val);

Copilot AI (Nov 26, 2025), on lines +26 to +33: Missing bounds check in kernel. The kernel should validate that context.globalIdx is within the valid range of both arrays before accessing them, to prevent out-of-bounds access. Add a check like if (i < wrapX.getSize() && i < x.getSize()).

Suggested change:
-        wrapX.set(i, x.get(i).getFloat32());
-    }
-
-    public static void convertFP32toFP16(KernelContext context,  FloatArray wrapX, HalfFloatArray x) {
-        int i = context.globalIdx;
-        float valInput = wrapX.get(i);
-        HalfFloat val = new HalfFloat(valInput);
-        x.set(i,val);
+        if (i < wrapX.getSize() && i < x.getSize()) {
+            wrapX.set(i, x.get(i).getFloat32());
+        }
+    }
+
+    public static void convertFP32toFP16(KernelContext context, FloatArray wrapX, HalfFloatArray x) {
+        int i = context.globalIdx;
+        if (i < wrapX.getSize() && i < x.getSize()) {
+            float valInput = wrapX.get(i);
+            HalfFloat val = new HalfFloat(valInput);
+            x.set(i, val);
+        }


Copilot AI (Nov 26, 2025): Missing space after comma. Add a space after the comma for consistency with code style.

Suggested change:
-        x.set(i,val);
+        x.set(i, val);

+    }
+
     /**
      * Performs RMS (Root Mean Square) normalization using parallel reduction.
      * This is a two-phase reduction: first within work groups, then across work groups.

@@ -7,8 +7,10 @@
 import org.beehive.gpullama3.tornadovm.layerplanner.WorkerGridFactory;
 import uk.ac.manchester.tornado.api.GridScheduler;
 import uk.ac.manchester.tornado.api.ImmutableTaskGraph;
+import uk.ac.manchester.tornado.api.KernelContext;
 import uk.ac.manchester.tornado.api.TaskGraph;
 import uk.ac.manchester.tornado.api.WorkerGrid;
+import uk.ac.manchester.tornado.api.WorkerGrid1D;
 import uk.ac.manchester.tornado.api.enums.DataTransferMode;
 
 public class Activation extends AbstractLayer {
@@ -17,16 +19,20 @@ public class Activation extends AbstractLayer {
     public Activation(String taskGraphHandle, State state, Weights weights, Configuration config) {
         super(taskGraphHandle, state, weights, config);
 
-        // formatter:off
-        this.activationUpdate = new TaskGraph(taskGraphHandle).transferToDevice(DataTransferMode.EVERY_EXECUTION, state.wrapX)
-                .task("updateX", TransformerComputeKernels::emptyTaskToForceCopyIn, state.wrapX).persistOnDevice(state.wrapX);
-        // formatter:on
+        KernelContext kernelContext = new KernelContext();
+        // @formatter:off
+        this.activationUpdate = new TaskGraph(taskGraphHandle)
+                .transferToDevice(DataTransferMode.EVERY_EXECUTION, state.embeddingX)
+                .task("updateX", TransformerComputeKernels::convertFP16toFP32, kernelContext, state.embeddingX, state.wrapX)
+                .persistOnDevice(state.wrapX);
+        // @formatter:on
     }
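
Net effect of the reworked task graph: previously wrapX (FP32, 4 bytes per element) was pushed to the device every execution via the empty forced-copy task; now only embeddingX (FP16, 2 bytes per element) crosses the host-device boundary each token, and the widening to FP32 happens on-device in convertFP16toFP32. For this buffer that halves the per-token transfer. A one-line check of the arithmetic, with an arbitrary example width:

    final class TransferBytes {
        public static void main(String[] args) {
            int dim = 2048;                          // example model width
            System.out.println(dim * Float.BYTES);   // before: 8192 bytes per token
            System.out.println(dim * Short.BYTES);   // after:  4096 bytes per token
        }
    }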

     @Override
     public GridScheduler updateGridScheduler(GridScheduler scheduler) {
-        WorkerGrid singleWorker = WorkerGridFactory.createSingleWorker();
-        scheduler.addWorkerGrid("activationUpdate.updateX", singleWorker);
+        WorkerGrid worker = new WorkerGrid1D(config.dim());
+        worker.setLocalWork(128, 1, 1);
+        scheduler.addWorkerGrid("activationUpdate.updateX", worker);
         return scheduler;
     }
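
This grid change is also why the reviewers' bounds-check suggestions matter: updateX now runs over a 1D grid of config.dim() threads with a local work size of 128. If dim is not a multiple of 128, the runtime may round the global size up to the next multiple, and the surplus threads would index past the arrays unless the kernel guards on i. A small self-contained sketch of that ceiling arithmetic (the dim value is an arbitrary example):

    final class GridRounding {
        public static void main(String[] args) {
            int dim = 2000;                                    // example: not a multiple of 128
            int local = 128;                                   // local work size from the diff
            int global = ((dim + local - 1) / local) * local;  // rounded-up global size: 2048
            System.out.println((global - dim) + " surplus threads would see i >= dim");
        }
    }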
