Fix some crash bugs!

JamePeng · JamePeng · commit 246e3dace0b4 · 2025-05-17T01:20:17.000+08:00
diff --git a/llama_cpp/_internals.py b/llama_cpp/_internals.py
@@ -522,6 +522,7 @@ def set_batch(self, batch: Sequence[int], n_past: int):
             self.batch.pos[i] = n_past + i
             self.batch.seq_id[i][0] = 0
             self.batch.n_seq_id[i] = 1
+        self.batch.logits[n_tokens - 1] = True
 
     def add_sequence(self, batch: Sequence[int], seq_id: int):
         n_tokens = len(batch)
@@ -533,6 +534,7 @@ def add_sequence(self, batch: Sequence[int], seq_id: int):
             self.batch.pos[j] = i
             self.batch.seq_id[j][0] = seq_id
             self.batch.n_seq_id[j] = 1
+        self.batch.logits[n_tokens - 1] = True
 
 
 class LlamaTokenDataArray:
@@ -983,7 +985,7 @@ def get_seed(self) -> int:
         assert self.sampler is not None
         return llama_cpp.llama_sampler_get_seed(self.sampler)
 
-    def sample(self, ctx: LlamaContext, idx: int) -> int:
+    def sample(self, ctx: LlamaContext, idx: ctypes.c_int32) -> ctypes.c_int32:
         assert self.sampler is not None
         assert ctx.ctx is not None
         return llama_cpp.llama_sampler_sample(self.sampler, ctx.ctx, idx)
diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py
@@ -2681,9 +2681,9 @@ def llama_batch_get_one(
     "llama_batch_init", [ctypes.c_int32, ctypes.c_int32, ctypes.c_int32], llama_batch
 )
 def llama_batch_init(
-    n_tokens: Union[ctypes.c_int32, int],
-    embd: Union[ctypes.c_int32, int],
-    n_seq_max: Union[ctypes.c_int32, int],
+    n_tokens: ctypes.c_int32,
+    embd: ctypes.c_int32,
+    n_seq_max: ctypes.c_int32,
     /,
 ) -> llama_batch:
     """Allocates a batch of tokens on the heap that can hold a maximum of n_tokens
@@ -2872,10 +2872,10 @@ def llama_get_logits(ctx: llama_context_p, /) -> CtypesArray[ctypes.c_float]:
     ctypes.POINTER(ctypes.c_float),
 )
 def llama_get_logits_ith(
-    ctx: llama_context_p, i: Union[ctypes.c_int32, int], /
-) -> CtypesArray[ctypes.c_float]:
+    ctx: llama_context_p, i: ctypes.c_int32, /
+) -> ctypes.POINTER(ctypes.c_float):
     """Logits for the ith token. Equivalent to:
-    llama_get_logits(ctx) + i*n_vocab"""
+    llama_get_logits(ctx) + ctx->output_ids[i]*n_vocab"""
     ...