Skip to content

Commit 503dda2

Browse files
committed
memory : improve status handling
1 parent efe0bc9 commit 503dda2

7 files changed

+69
-27
lines changed

src/llama-context.cpp

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -451,7 +451,7 @@ bool llama_context::kv_self_update(bool optimize) {
451451

452452
const auto kv_state = kv_self->init_update(this, optimize);
453453
if (kv_state->get_status() == LLAMA_MEMORY_STATUS_NO_UPDATE) {
454-
// no updates have been performed
454+
// no updates need to be performed
455455
return false;
456456
}
457457

@@ -979,6 +979,7 @@ int llama_context::decode(llama_batch & inp_batch) {
979979
case LLAMA_MEMORY_STATUS_NO_UPDATE:
980980
{
981981
LLAMA_LOG_ERROR("%s: unexpected memory state status: %d\n", __func__, kv_state->get_status());
982+
982983
return -2;
983984
}
984985
case LLAMA_MEMORY_STATUS_FAILED_PREPARE:
@@ -993,12 +994,14 @@ int llama_context::decode(llama_batch & inp_batch) {
993994
}
994995
}
995996

996-
LLAMA_LOG_WARN("%s: failed to find KV cache slot for batch of size %d\n", __func__, batch.n_tokens);
997+
LLAMA_LOG_WARN("%s: failed to find a memory slot for batch of size %d\n", __func__, batch.n_tokens);
997998

998999
return 1;
9991000
}
10001001
case LLAMA_MEMORY_STATUS_FAILED_COMPUTE:
10011002
{
1003+
LLAMA_LOG_ERROR("%s: compute failed while preparing batch of size %d\n", __func__, batch.n_tokens);
1004+
10021005
return -2;
10031006
}
10041007
}

src/llama-kv-cache-recurrent.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
#include "llama-kv-cache-recurrent.h"
22

33
#include "llama-impl.h"
4+
#include "llama-io.h"
45
#include "llama-batch.h"
56
#include "llama-model.h"
67

src/llama-kv-cache-unified-iswa.cpp

Lines changed: 13 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -167,6 +167,8 @@ llama_kv_cache_unified_iswa_state::llama_kv_cache_unified_iswa_state(
167167
llama_kv_cache_unified_iswa * kv) : status(LLAMA_MEMORY_STATUS_SUCCESS) {
168168
state_base = kv->get_base()->init_full();
169169
state_swa = kv->get_swa ()->init_full();
170+
171+
status = llama_memory_status_combine(state_base->get_status(), state_swa->get_status());
170172
}
171173

172174
llama_kv_cache_unified_iswa_state::llama_kv_cache_unified_iswa_state(
@@ -176,22 +178,7 @@ llama_kv_cache_unified_iswa_state::llama_kv_cache_unified_iswa_state(
176178
state_base = kv->get_base()->init_update(lctx, optimize);
177179
state_swa = kv->get_swa ()->init_update(lctx, optimize);
178180

179-
// TODO: this is very ugly - how to make it simpler?
180-
// the llama_memory_status enum is not very well designed
181-
if (state_base->get_status() != LLAMA_MEMORY_STATUS_SUCCESS && state_base->get_status() != LLAMA_MEMORY_STATUS_NO_UPDATE) {
182-
status = state_base->get_status();
183-
return;
184-
}
185-
186-
if (state_swa->get_status() != LLAMA_MEMORY_STATUS_SUCCESS && state_swa->get_status() != LLAMA_MEMORY_STATUS_NO_UPDATE) {
187-
status = state_swa->get_status();
188-
return;
189-
}
190-
191-
if (state_base->get_status() == LLAMA_MEMORY_STATUS_NO_UPDATE && state_swa->get_status() == LLAMA_MEMORY_STATUS_NO_UPDATE) {
192-
status = LLAMA_MEMORY_STATUS_NO_UPDATE;
193-
return;
194-
}
181+
status = llama_memory_status_combine(state_base->get_status(), state_swa->get_status());
195182
}
196183

197184
llama_kv_cache_unified_iswa_state::llama_kv_cache_unified_iswa_state(
@@ -200,13 +187,15 @@ llama_kv_cache_unified_iswa_state::llama_kv_cache_unified_iswa_state(
200187
std::vector<uint32_t> heads_base,
201188
std::vector<uint32_t> heads_swa,
202189
std::vector<llama_ubatch> ubatches)
203-
: status(LLAMA_MEMORY_STATUS_SUCCESS),
204-
sbatch(std::move(sbatch)),
205-
ubatches(std::move(ubatches)) {
206-
// note: here we copy the ubatches. not sure if this is ideal
207-
state_base.reset(new llama_kv_cache_unified_state(kv->get_base(), {}, std::move(heads_base), this->ubatches));
208-
state_swa .reset(new llama_kv_cache_unified_state(kv->get_swa (), {}, std::move(heads_swa), this->ubatches));
209-
}
190+
: status(LLAMA_MEMORY_STATUS_SUCCESS),
191+
sbatch(std::move(sbatch)),
192+
ubatches(std::move(ubatches)) {
193+
// note: here we copy the ubatches. not sure if this is ideal
194+
state_base.reset(new llama_kv_cache_unified_state(kv->get_base(), {}, std::move(heads_base), this->ubatches));
195+
state_swa .reset(new llama_kv_cache_unified_state(kv->get_swa (), {}, std::move(heads_swa), this->ubatches));
196+
197+
status = llama_memory_status_combine(state_base->get_status(), state_swa->get_status());
198+
}
210199

211200
llama_kv_cache_unified_iswa_state:: ~llama_kv_cache_unified_iswa_state() = default;
212201

@@ -246,6 +235,7 @@ llama_memory_status llama_kv_cache_unified_iswa_state::get_status() const {
246235

247236
const llama_ubatch & llama_kv_cache_unified_iswa_state::get_ubatch() const {
248237
assert(status == LLAMA_MEMORY_STATUS_SUCCESS);
238+
249239
return ubatches[i_next];
250240
}
251241

src/llama-kv-cache-unified.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
#include "llama-kv-cache-unified.h"
22

33
#include "llama-impl.h"
4+
#include "llama-io.h"
45
#include "llama-model.h"
56
#include "llama-context.h"
67

src/llama-kv-cache.h

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,11 @@
11
#pragma once
22

33
#include "llama.h"
4-
#include "llama-io.h"
54
#include "llama-memory.h"
65

6+
class llama_io_write_i;
7+
class llama_io_read_i;
8+
79
struct llama_kv_cache : public llama_memory_i {
810
virtual ~llama_kv_cache() = default;
911

src/llama-memory.cpp

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1 +1,42 @@
11
#include "llama-memory.h"
2+
3+
llama_memory_status llama_memory_status_combine(llama_memory_status s0, llama_memory_status s1) {
4+
bool has_update = false;
5+
6+
switch (s0) {
7+
case LLAMA_MEMORY_STATUS_SUCCESS:
8+
{
9+
has_update = true;
10+
break;
11+
}
12+
case LLAMA_MEMORY_STATUS_NO_UPDATE:
13+
{
14+
break;
15+
}
16+
case LLAMA_MEMORY_STATUS_FAILED_PREPARE:
17+
case LLAMA_MEMORY_STATUS_FAILED_COMPUTE:
18+
{
19+
return s0;
20+
}
21+
}
22+
23+
switch (s1) {
24+
case LLAMA_MEMORY_STATUS_SUCCESS:
25+
{
26+
has_update = true;
27+
break;
28+
}
29+
case LLAMA_MEMORY_STATUS_NO_UPDATE:
30+
{
31+
break;
32+
}
33+
case LLAMA_MEMORY_STATUS_FAILED_PREPARE:
34+
case LLAMA_MEMORY_STATUS_FAILED_COMPUTE:
35+
{
36+
return s1;
37+
}
38+
}
39+
40+
// if either status has an update, then the combined status has an update
41+
return has_update ? LLAMA_MEMORY_STATUS_SUCCESS : LLAMA_MEMORY_STATUS_NO_UPDATE;
42+
}

src/llama-memory.h

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,10 @@ enum llama_memory_status {
4545
LLAMA_MEMORY_STATUS_FAILED_COMPUTE,
4646
};
4747

48+
// helper function for combining the status of two memory states
49+
// useful for implementing hybrid memory types (e.g. iSWA)
50+
llama_memory_status llama_memory_status_combine(llama_memory_status s0, llama_memory_status s1);
51+
4852
// the interface for managing the memory state during batch processing
4953
// this interface is implemented per memory type. see:
5054
// - llama_kv_cache_unified_state
@@ -72,7 +76,7 @@ class llama_memory_state_i {
7276
// get the current ubatch
7377
virtual const llama_ubatch & get_ubatch() const = 0;
7478

75-
// get the status of the memory state
79+
// get the status of the memory state - used for error handling and checking if any updates would be applied
7680
virtual llama_memory_status get_status() const = 0;
7781
};
7882

0 commit comments

Comments (0)