@@ -14608,8 +14608,6 @@ static int llama_decode_internal(
 
     const struct llama_hparams & hparams = model.hparams;
     const int64_t kv_head = kv_self.head;
-    const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
-    const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa();
 
     for (int i = 0; i < gf->n_nodes; i++) {
         ggml_tensor * node = gf->nodes[i];
@@ -14619,6 +14617,7 @@ static int llama_decode_internal(
             const char* k_prefix = "k_cache_view-";
             if (strncmp(node->src[1]->name, k_prefix, strlen(k_prefix)) == 0) {
                 int il = atoi(node->src[1]->name + strlen(k_prefix)); // Layer index from name
+                const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il);
                 ggml_tensor * tmp_tensor = kv_self.k_l[il];
                 size_t tmp_offset = (ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa))*kv_head;
                 node->src[1]->data = static_cast<char*>(tmp_tensor->data) + tmp_offset;
@@ -14628,6 +14627,7 @@ static int llama_decode_internal(
             const char* v_prefix = "v_cache_view-";
             if (strncmp(node->src[1]->name, v_prefix, strlen(v_prefix)) == 0) {
                 int il = atoi(node->src[1]->name + strlen(v_prefix)); // Layer index from name
+                const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(il);
                 ggml_tensor * tmp_tensor = kv_self.v_l[il];
                 size_t tmp_offset;
                 if (cparams.flash_attn) {
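
For context on the change above: the offset applied to each k_cache_view-/v_cache_view- node is the byte size of one cache row for that layer multiplied by the current cache head, so switching to the per-layer overloads hparams.n_embd_k_gqa(il) / hparams.n_embd_v_gqa(il) lets that row size differ between layers. Below is a minimal sketch of the offset arithmetic, not part of the patch, assuming a non-quantized F16 K cache (where ggml_row_size reduces to element count times element size) and hypothetical values for the layer width and head position:

#include <cstdint>
#include <cstdio>

int main() {
    // Hypothetical stand-ins for hparams.n_embd_k_gqa(il) and kv_self.head.
    const int64_t n_embd_k_gqa = 1024;
    const int64_t kv_head      = 37;

    // For an F16 cache, ggml_row_size(GGML_TYPE_F16, n) is n * 2 bytes;
    // quantized cache types would account for their block layout instead.
    const size_t row_size   = (size_t) n_embd_k_gqa * sizeof(uint16_t);
    const size_t tmp_offset = row_size * (size_t) kv_head; // byte offset of the view into kv_self.k_l[il]

    std::printf("row_size = %zu bytes, tmp_offset = %zu bytes\n", row_size, tmp_offset);
    return 0;
}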