
Commit 3251097

committed
Sync with llama.cpp: remove the llama_kv_cache_view API and the deprecated llama_kv_cache_* / llama_get_kv_cache_* functions; simplify the kv-cache interface
1 parent 35ccfa0 commit 3251097

File tree

1 file changed (+11, -276 lines)

llama_cpp/llama_cpp.py

Lines changed: 11 additions & 276 deletions
@@ -1773,152 +1773,34 @@ def llama_apply_adapter_cvec(
 # //


-# // Information associated with an individual cell in the KV cache view.
-# struct llama_kv_cache_view_cell {
-#     // The position for this cell. Takes KV cache shifts into account.
-#     //   May be negative if the cell is not populated.
-#     llama_pos pos;
-# };
-class llama_kv_cache_view_cell(ctypes.Structure):
-    """Information associated with an individual cell in the KV cache view.
-
-    Attributes:
-        pos (llama_pos): The position for this cell. Takes KV cache shifts into account.
-            May be negative if the cell is not populated."""
-
-    if TYPE_CHECKING:
-        pos: llama_pos
-
-    _fields_ = [("pos", llama_pos)]
-
-
-# // An updateable view of the KV cache.
-# struct llama_kv_cache_view {
-#     // Number of KV cache cells. This will be the same as the context size.
-#     int32_t n_cells;
-
-#     // Maximum number of sequences that can exist in a cell. It's not an error
-#     // if there are more sequences in a cell than this value, however they will
-#     // not be visible in the view cells_sequences.
-#     int32_t n_seq_max;
-
-#     // Number of tokens in the cache. For example, if there are two populated
-#     // cells, the first with 1 sequence id in it and the second with 2 sequence
-#     // ids then you'll have 3 tokens.
-#     int32_t token_count;
-
-#     // Number of populated cache cells.
-#     int32_t used_cells;
-
-#     // Maximum contiguous empty slots in the cache.
-#     int32_t max_contiguous;
-
-#     // Index to the start of the max_contiguous slot range. Can be negative
-#     // when cache is full.
-#     int32_t max_contiguous_idx;
-
-#     // Information for an individual cell.
-#     struct llama_kv_cache_view_cell * cells;
-
-
-#     // The sequences for each cell. There will be n_seq_max items per cell.
-#     llama_seq_id * cells_sequences;
-# };
-class llama_kv_cache_view(ctypes.Structure):
-    if TYPE_CHECKING:
-        n_cells: int
-        n_max_seq: int
-        token_count: int
-        used_cells: int
-        max_contiguous: int
-        max_contiguous_idx: int
-        cells: CtypesArray[llama_kv_cache_view_cell]
-        cells_sequences: CtypesArray[llama_seq_id]
-
-    _fields_ = [
-        ("n_cells", ctypes.c_int32),
-        ("n_max_seq", ctypes.c_int32),
-        ("token_count", ctypes.c_int32),
-        ("used_cells", ctypes.c_int32),
-        ("max_contiguous", ctypes.c_int32),
-        ("max_contiguous_idx", ctypes.c_int32),
-        ("cells", ctypes.POINTER(llama_kv_cache_view_cell)),
-        ("cells_sequences", ctypes.POINTER(llama_seq_id)),
-    ]
-
-
-llama_kv_cache_view_p = ctypes.POINTER(llama_kv_cache_view)
-
-
-# // Create an empty KV cache view. (use only for debugging purposes)
-# LLAMA_API struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_context * ctx, int32_t n_seq_max);
-@ctypes_function(
-    "llama_kv_cache_view_init",
-    [llama_context_p_ctypes, ctypes.c_int32],
-    llama_kv_cache_view,
-)
-def llama_kv_cache_view_init(
-    ctx: llama_context_p, n_seq_max: Union[ctypes.c_int32, int], /
-) -> llama_kv_cache_view:
-    """Create an empty KV cache view. (use only for debugging purposes)"""
-    ...
-
-
-# // Free a KV cache view. (use only for debugging purposes)
-# LLAMA_API void llama_kv_cache_view_free(struct llama_kv_cache_view * view);
-@ctypes_function("llama_kv_cache_view_free", [llama_kv_cache_view_p], None)
-def llama_kv_cache_view_free(view: "ctypes.pointer[llama_kv_cache_view]", /):  # type: ignore
-    """Free a KV cache view. (use only for debugging purposes)"""
-    ...
-
-
-# // Update the KV cache view structure with the current state of the KV cache. (use only for debugging purposes)
-# LLAMA_API void llama_kv_cache_view_update(const struct llama_context * ctx, struct llama_kv_cache_view * view);
-@ctypes_function(
-    "llama_kv_cache_view_update", [llama_context_p_ctypes, llama_kv_cache_view_p], None
-)
-def llama_kv_cache_view_update(ctx: llama_context_p, view: CtypesPointerOrRef[llama_kv_cache_view], /):  # type: ignore
-    """Update the KV cache view structure with the current state of the KV cache. (use only for debugging purposes)"""
-    ...
-
-
 # // Returns the number of tokens in the KV cache (slow, use only for debug)
 # // If a KV cell has multiple sequences assigned to it, it will be counted multiple times
-# LLAMA_API int32_t llama_kv_self_n_tokens(const struct llama_context * ctx);
+# DEPRECATED(LLAMA_API int32_t llama_kv_self_n_tokens(const struct llama_context * ctx),
+#            "Use llama_kv_self_seq_pos_max() instead");
 @ctypes_function(
     "llama_kv_self_n_tokens", [llama_context_p_ctypes], ctypes.c_int32
 )
 def llama_kv_self_n_tokens(ctx: llama_context_p, /) -> int:
-    """Returns the number of tokens in the KV cache (slow, use only for debug)
-    If a KV cell has multiple sequences assigned to it, it will be counted multiple times
     """
-    ...
-
-# DEPRECATED(LLAMA_API int32_t llama_get_kv_cache_token_count(const struct llama_context * ctx),
-#            "use llama_kv_self_n_tokens instead");
-@ctypes_function(
-    "llama_get_kv_cache_token_count", [llama_context_p_ctypes], ctypes.c_int32
-)
-def llama_get_kv_cache_token_count(ctx: llama_context_p, /) -> int:
+    DEPRECATED
+    Use llama_kv_self_seq_pos_max() instead
+    """
     ...


 # // Returns the number of used KV cells (i.e. have at least one sequence assigned to them)
-# LLAMA_API int32_t llama_kv_self_used_cells(const struct llama_context * ctx);
+# DEPRECATED(LLAMA_API int32_t llama_kv_self_used_cells(const struct llama_context * ctx),
+#            "Use llama_kv_self_seq_pos_max() instead");
 @ctypes_function(
     "llama_kv_self_used_cells", [llama_context_p_ctypes], ctypes.c_int32
 )
 def llama_kv_self_used_cells(ctx: llama_context_p, /) -> int:
-    """Returns the number of used KV cells (i.e. have at least one sequence assigned to them)"""
+    """
+    DEPRECATED
+    Use llama_kv_self_seq_pos_max() instead
+    """
     ...

-# DEPRECATED(LLAMA_API int32_t llama_get_kv_cache_used_cells(const struct llama_context * ctx),
-#            "use llama_kv_self_used_cells instead");
-@ctypes_function(
-    "llama_get_kv_cache_used_cells", [llama_context_p_ctypes], ctypes.c_int32
-)
-def llama_get_kv_cache_used_cells(ctx: llama_context_p, /) -> int:
-    ...

 # // Clear the KV cache - both cell info is erased and KV data is zeroed
 # LLAMA_API void llama_kv_self_clear(
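Note: the updated docstrings steer callers toward llama_kv_self_seq_pos_max() rather than the token/cell counters or the removed view API. Below is a rough, untested sketch of per-sequence inspection using the remaining bindings; it assumes a loaded low-level context ctx and that llama_kv_self_seq_pos_min / llama_kv_self_seq_pos_max are exposed by this module (the helper name kv_positions_per_seq is made up for illustration).

import llama_cpp

def kv_positions_per_seq(ctx, n_seq_max: int) -> dict:
    # For each sequence id, report the [pos_min, pos_max] range currently
    # held in the KV cache; pos_max == -1 means the sequence is empty.
    ranges = {}
    for seq_id in range(n_seq_max):
        pos_max = llama_cpp.llama_kv_self_seq_pos_max(ctx, seq_id)
        if pos_max < 0:
            continue
        pos_min = llama_cpp.llama_kv_self_seq_pos_min(ctx, seq_id)
        ranges[seq_id] = (pos_min, pos_max)
    return ranges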
@@ -1928,12 +1810,6 @@ def llama_kv_self_clear(ctx: llama_context_p, /):
     """Clear the KV cache"""
     ...

-# DEPRECATED(LLAMA_API void llama_kv_cache_clear(struct llama_context * ctx),
-#            "use llama_kv_self_clear instead");
-@ctypes_function("llama_kv_cache_clear", [llama_context_p_ctypes], None)
-def llama_kv_cache_clear(ctx: llama_context_p, /):
-    """Clear the KV cache"""
-    ...

 # // Removes all tokens that belong to the specified sequence and have positions in [p0, p1)
 # // Returns false if a partial sequence cannot be removed. Removing a whole sequence never fails
@@ -1972,32 +1848,6 @@ def llama_kv_self_seq_rm(
     ...


-# DEPRECATED(LLAMA_API bool llama_kv_cache_seq_rm(
-#         struct llama_context * ctx,
-#                 llama_seq_id   seq_id,
-#                    llama_pos   p0,
-#                    llama_pos   p1),
-#     "use llama_kv_self_seq_rm instead");
-@ctypes_function(
-    "llama_kv_cache_seq_rm",
-    [
-        llama_context_p_ctypes,
-        llama_seq_id,
-        llama_pos,
-        llama_pos,
-    ],
-    ctypes.c_bool,
-)
-def llama_kv_cache_seq_rm(
-    ctx: llama_context_p,
-    seq_id: Union[llama_seq_id, int],
-    p0: Union[llama_pos, int],
-    p1: Union[llama_pos, int],
-    /,
-) -> bool:
-    ...
-
-
 # // Copy all tokens that belong to the specified sequence to another sequence
 # // Note that this does not allocate extra KV cache memory - it simply assigns the tokens to the new sequence
 # // p0 < 0 : [0, p1]
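A small usage sketch for the surviving llama_kv_self_seq_rm binding, e.g. truncating a sequence before re-decoding from a shorter prefix. It assumes a loaded context ctx; the helper name truncate_sequence is illustrative.

import llama_cpp

def truncate_sequence(ctx, seq_id: int, n_keep: int) -> bool:
    # Remove every cached token of seq_id with position >= n_keep.
    # p1 = -1 means "to the end of the sequence"; returns False if a
    # partial removal is not supported by the underlying cache.
    return llama_cpp.llama_kv_self_seq_rm(ctx, seq_id, n_keep, -1)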
@@ -2033,34 +1883,6 @@ def llama_kv_self_seq_cp(
     p1 < 0  : [p0, inf)"""
     ...

-# DEPRECATED(LLAMA_API void llama_kv_cache_seq_cp(
-#         struct llama_context * ctx,
-#                 llama_seq_id   seq_id_src,
-#                 llama_seq_id   seq_id_dst,
-#                    llama_pos   p0,
-#                    llama_pos   p1),
-#     "use llama_kv_self_seq_cp instead");
-@ctypes_function(
-    "llama_kv_cache_seq_cp",
-    [
-        llama_context_p_ctypes,
-        llama_seq_id,
-        llama_seq_id,
-        llama_pos,
-        llama_pos,
-    ],
-    None,
-)
-def llama_kv_cache_seq_cp(
-    ctx: llama_context_p,
-    seq_id_src: Union[llama_seq_id, int],
-    seq_id_dst: Union[llama_seq_id, int],
-    p0: Union[llama_pos, int],
-    p1: Union[llama_pos, int],
-    /,
-):
-    ...
-

 # // Removes all tokens that do not belong to the specified sequence
 # LLAMA_API void llama_kv_self_seq_keep(
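A hedged sketch of llama_kv_self_seq_cp, e.g. sharing an already-decoded prompt prefix across parallel sequences. It assumes a loaded context ctx; the helper name share_prefix and its arguments are illustrative.

import llama_cpp

def share_prefix(ctx, src_seq: int, dst_seqs, n_prefix: int) -> None:
    # Assign the first n_prefix cached positions of src_seq to each
    # destination sequence; per the comment above, no extra KV memory
    # is allocated by this call.
    for dst in dst_seqs:
        llama_cpp.llama_kv_self_seq_cp(ctx, src_seq, dst, 0, n_prefix)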
@@ -2073,17 +1895,6 @@ def llama_kv_self_seq_keep(ctx: llama_context_p, seq_id: Union[llama_seq_id, int
     """Removes all tokens that do not belong to the specified sequence"""
     ...

-# DEPRECATED(LLAMA_API void llama_kv_cache_seq_keep(
-#         struct llama_context * ctx,
-#                 llama_seq_id   seq_id),
-#     "use llama_kv_self_seq_keep instead");
-@ctypes_function(
-    "llama_kv_cache_seq_keep", [llama_context_p_ctypes, llama_seq_id], None
-)
-def llama_kv_cache_seq_keep(ctx: llama_context_p, seq_id: Union[llama_seq_id, int], /):
-    """Removes all tokens that do not belong to the specified sequence"""
-    ...
-

 # // Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1)
 # // If the KV cache is RoPEd, the KV data is updated accordingly:
@@ -2124,34 +1935,6 @@ def llama_kv_self_seq_add(
     p1 < 0  : [p0, inf)"""
     ...

-# DEPRECATED(LLAMA_API void llama_kv_cache_seq_add(
-#         struct llama_context * ctx,
-#                 llama_seq_id   seq_id,
-#                    llama_pos   p0,
-#                    llama_pos   p1,
-#                    llama_pos   delta),
-#     "use llama_kv_self_seq_add instead");
-@ctypes_function(
-    "llama_kv_cache_seq_add",
-    [
-        llama_context_p_ctypes,
-        llama_seq_id,
-        llama_pos,
-        llama_pos,
-        llama_pos,
-    ],
-    None,
-)
-def llama_kv_cache_seq_add(
-    ctx: llama_context_p,
-    seq_id: Union[llama_seq_id, int],
-    p0: Union[llama_pos, int],
-    p1: Union[llama_pos, int],
-    delta: Union[llama_pos, int],
-    /,
-):
-    ...
-

 # // Integer division of the positions by factor of `d > 1`
 # // If the KV cache is RoPEd, the KV data is updated accordingly
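The seq_rm/seq_add pair is what a context-shift ("sliding window") loop is typically built from. A minimal sketch under the assumptions that the model uses RoPE, ctx is a loaded context, and llama_kv_self_can_shift(ctx) (see further below) returns True; the helper name context_shift is illustrative.

import llama_cpp

def context_shift(ctx, seq_id: int, n_keep: int, n_discard: int) -> None:
    # Drop n_discard tokens that follow the kept prefix, then slide the
    # remaining tokens left so new tokens fit at the end of the context.
    llama_cpp.llama_kv_self_seq_rm(ctx, seq_id, n_keep, n_keep + n_discard)
    llama_cpp.llama_kv_self_seq_add(
        ctx, seq_id, n_keep + n_discard, -1, -n_discard
    )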
@@ -2189,35 +1972,6 @@ def llama_kv_self_seq_div(
     ...


-# DEPRECATED(LLAMA_API void llama_kv_cache_seq_div(
-#         struct llama_context * ctx,
-#                 llama_seq_id   seq_id,
-#                    llama_pos   p0,
-#                    llama_pos   p1,
-#                          int   d),
-#     "use llama_kv_self_seq_div instead");
-@ctypes_function(
-    "llama_kv_cache_seq_div",
-    [
-        llama_context_p_ctypes,
-        llama_seq_id,
-        llama_pos,
-        llama_pos,
-        ctypes.c_int,
-    ],
-    None,
-)
-def llama_kv_cache_seq_div(
-    ctx: llama_context_p,
-    seq_id: Union[llama_seq_id, int],
-    p0: Union[llama_pos, int],
-    p1: Union[llama_pos, int],
-    d: Union[ctypes.c_int, int],
-    /,
-):
-    ...
-
-
 # // Returns the smallest position present in the KV cache for the specified sequence
 # // This is typically non-zero only for SWA caches
 # // Return -1 if the sequence is empty
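One common use of llama_kv_self_seq_div is grouping several tokens onto one RoPE position (self-extend style grouped attention). A minimal, untested sketch; the helper name compress_positions is illustrative and a loaded context ctx is assumed.

import llama_cpp

def compress_positions(ctx, seq_id: int, p0: int, p1: int, factor: int) -> None:
    # Integer-divide the cached positions in [p0, p1) by factor (> 1),
    # so groups of `factor` tokens share a single position.
    if factor <= 1:
        raise ValueError("factor must be > 1")
    llama_cpp.llama_kv_self_seq_div(ctx, seq_id, p0, p1, factor)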
@@ -2273,26 +2027,13 @@ def llama_kv_self_defrag(ctx: llama_context_p, /):
     ...


-# DEPRECATED(LLAMA_API void llama_kv_cache_defrag(struct llama_context * ctx),
-#            "use llama_kv_self_defrag instead");
-@ctypes_function("llama_kv_cache_defrag", [llama_context_p_ctypes], None)
-def llama_kv_cache_defrag(ctx: llama_context_p, /):
-    ...
-
-
 # // Check if the context supports KV cache shifting
 # LLAMA_API bool llama_kv_self_can_shift(struct llama_context * ctx);
 @ctypes_function("llama_kv_self_can_shift", [llama_context_p_ctypes], ctypes.c_bool)
 def llama_kv_self_can_shift(ctx: llama_context_p, /) -> bool:
     """Check if the context supports KV cache shifting"""
     ...

-# DEPRECATED(LLAMA_API bool llama_kv_cache_can_shift(const struct llama_context * ctx),
-#            "use llama_kv_self_can_shift instead");
-@ctypes_function("llama_kv_cache_can_shift", [llama_context_p_ctypes], ctypes.c_bool)
-def llama_kv_cache_can_shift(ctx: llama_context_p, /) -> bool:
-    ...

 # // Apply the KV cache updates (such as K-shifts, defragmentation, etc.)
 # LLAMA_API void llama_kv_self_update(struct llama_context * ctx);
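How the remaining maintenance calls fit together, per the comments above: defragmentation is requested with llama_kv_self_defrag and applied by llama_kv_self_update, and llama_kv_self_can_shift reports whether position shifting is supported at all. A sketch assuming a loaded context ctx; maintain_cache and supports_context_shift are illustrative names.

import llama_cpp

def maintain_cache(ctx) -> None:
    # Request defragmentation, then explicitly apply pending KV cache
    # updates (K-shifts, defragmentation, etc.).
    llama_cpp.llama_kv_self_defrag(ctx)
    llama_cpp.llama_kv_self_update(ctx)

def supports_context_shift(ctx) -> bool:
    # Guard before attempting the seq_add-based shifting shown earlier.
    return bool(llama_cpp.llama_kv_self_can_shift(ctx))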
@@ -2301,12 +2042,6 @@ def llama_kv_self_update(ctx: llama_context_p, /):
     """Apply the KV cache updates (such as K-shifts, defragmentation, etc.)"""
     ...

-# DEPRECATED(LLAMA_API void llama_kv_cache_update(struct llama_context * ctx),
-#            "use llama_kv_self_update instead");
-@ctypes_function("llama_kv_cache_update", [llama_context_p_ctypes], None)
-def llama_kv_cache_update(ctx: llama_context_p, /):
-    ...

 # //
 # // State / sessions
