@@ -1773,152 +1773,34 @@ def llama_apply_adapter_cvec(
1773
1773
# //
1774
1774
1775
1775
1776
- # // Information associated with an individual cell in the KV cache view.
1777
- # struct llama_kv_cache_view_cell {
1778
- # // The position for this cell. Takes KV cache shifts into account.
1779
- # // May be negative if the cell is not populated.
1780
- # llama_pos pos;
1781
- # };
1782
- class llama_kv_cache_view_cell (ctypes .Structure ):
1783
- """Information associated with an individual cell in the KV cache view.
1784
-
1785
- Attributes:
1786
- pos (llama_pos): The position for this cell. Takes KV cache shifts into account.
1787
- May be negative if the cell is not populated."""
1788
-
1789
- if TYPE_CHECKING :
1790
- pos : llama_pos
1791
-
1792
- _fields_ = [("pos" , llama_pos )]
1793
-
1794
-
1795
- # // An updateable view of the KV cache.
1796
- # struct llama_kv_cache_view {
1797
- # // Number of KV cache cells. This will be the same as the context size.
1798
- # int32_t n_cells;
1799
-
1800
- # // Maximum number of sequences that can exist in a cell. It's not an error
1801
- # // if there are more sequences in a cell than this value, however they will
1802
- # // not be visible in the view cells_sequences.
1803
- # int32_t n_seq_max;
1804
-
1805
- # // Number of tokens in the cache. For example, if there are two populated
1806
- # // cells, the first with 1 sequence id in it and the second with 2 sequence
1807
- # // ids then you'll have 3 tokens.
1808
- # int32_t token_count;
1809
-
1810
- # // Number of populated cache cells.
1811
- # int32_t used_cells;
1812
-
1813
- # // Maximum contiguous empty slots in the cache.
1814
- # int32_t max_contiguous;
1815
-
1816
- # // Index to the start of the max_contiguous slot range. Can be negative
1817
- # // when cache is full.
1818
- # int32_t max_contiguous_idx;
1819
-
1820
- # // Information for an individual cell.
1821
- # struct llama_kv_cache_view_cell * cells;
1822
-
1823
-
1824
- # // The sequences for each cell. There will be n_seq_max items per cell.
1825
- # llama_seq_id * cells_sequences;
1826
- # };
1827
- class llama_kv_cache_view (ctypes .Structure ):
1828
- if TYPE_CHECKING :
1829
- n_cells : int
1830
- n_max_seq : int
1831
- token_count : int
1832
- used_cells : int
1833
- max_contiguous : int
1834
- max_contiguous_idx : int
1835
- cells : CtypesArray [llama_kv_cache_view_cell ]
1836
- cells_sequences : CtypesArray [llama_seq_id ]
1837
-
1838
- _fields_ = [
1839
- ("n_cells" , ctypes .c_int32 ),
1840
- ("n_max_seq" , ctypes .c_int32 ),
1841
- ("token_count" , ctypes .c_int32 ),
1842
- ("used_cells" , ctypes .c_int32 ),
1843
- ("max_contiguous" , ctypes .c_int32 ),
1844
- ("max_contiguous_idx" , ctypes .c_int32 ),
1845
- ("cells" , ctypes .POINTER (llama_kv_cache_view_cell )),
1846
- ("cells_sequences" , ctypes .POINTER (llama_seq_id )),
1847
- ]
1848
-
1849
-
1850
- llama_kv_cache_view_p = ctypes .POINTER (llama_kv_cache_view )
1851
-
1852
-
1853
- # // Create an empty KV cache view. (use only for debugging purposes)
1854
- # LLAMA_API struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_context * ctx, int32_t n_seq_max);
1855
- @ctypes_function (
1856
- "llama_kv_cache_view_init" ,
1857
- [llama_context_p_ctypes , ctypes .c_int32 ],
1858
- llama_kv_cache_view ,
1859
- )
1860
- def llama_kv_cache_view_init (
1861
- ctx : llama_context_p , n_seq_max : Union [ctypes .c_int32 , int ], /
1862
- ) -> llama_kv_cache_view :
1863
- """Create an empty KV cache view. (use only for debugging purposes)"""
1864
- ...
1865
-
1866
-
1867
- # // Free a KV cache view. (use only for debugging purposes)
1868
- # LLAMA_API void llama_kv_cache_view_free(struct llama_kv_cache_view * view);
1869
- @ctypes_function ("llama_kv_cache_view_free" , [llama_kv_cache_view_p ], None )
1870
- def llama_kv_cache_view_free (view : "ctypes.pointer[llama_kv_cache_view]" , / ): # type: ignore
1871
- """Free a KV cache view. (use only for debugging purposes)"""
1872
- ...
1873
-
1874
-
1875
- # // Update the KV cache view structure with the current state of the KV cache. (use only for debugging purposes)
1876
- # LLAMA_API void llama_kv_cache_view_update(const struct llama_context * ctx, struct llama_kv_cache_view * view);
1877
- @ctypes_function (
1878
- "llama_kv_cache_view_update" , [llama_context_p_ctypes , llama_kv_cache_view_p ], None
1879
- )
1880
- def llama_kv_cache_view_update (ctx : llama_context_p , view : CtypesPointerOrRef [llama_kv_cache_view ], / ): # type: ignore
1881
- """Update the KV cache view structure with the current state of the KV cache. (use only for debugging purposes)"""
1882
- ...
1883
-
1884
-
1885
1776
# // Returns the number of tokens in the KV cache (slow, use only for debug)
1886
1777
# // If a KV cell has multiple sequences assigned to it, it will be counted multiple times
1887
- # LLAMA_API int32_t llama_kv_self_n_tokens(const struct llama_context * ctx);
1778
+ # DEPRECATED(LLAMA_API int32_t llama_kv_self_n_tokens(const struct llama_context * ctx),
1779
+ # "Use llama_kv_self_seq_pos_max() instead");
1888
1780
@ctypes_function (
1889
1781
"llama_kv_self_n_tokens" , [llama_context_p_ctypes ], ctypes .c_int32
1890
1782
)
1891
1783
def llama_kv_self_n_tokens (ctx : llama_context_p , / ) -> int :
1892
- """Returns the number of tokens in the KV cache (slow, use only for debug)
1893
- If a KV cell has multiple sequences assigned to it, it will be counted multiple times
1894
1784
"""
1895
- ...
1896
-
1897
- # DEPRECATED(LLAMA_API int32_t llama_get_kv_cache_token_count(const struct llama_context * ctx),
1898
- # "use llama_kv_self_n_tokens instead");
1899
- @ctypes_function (
1900
- "llama_get_kv_cache_token_count" , [llama_context_p_ctypes ], ctypes .c_int32
1901
- )
1902
- def llama_get_kv_cache_token_count (ctx : llama_context_p , / ) -> int :
1785
+ DEPRECATED
1786
+ Use llama_kv_self_seq_pos_max() instead
1787
+ """
1903
1788
...
1904
1789
1905
1790
1906
1791
# // Returns the number of used KV cells (i.e. have at least one sequence assigned to them)
1907
- # LLAMA_API int32_t llama_kv_self_used_cells(const struct llama_context * ctx);
1792
+ # DEPRECATED(LLAMA_API int32_t llama_kv_self_used_cells(const struct llama_context * ctx),
1793
+ # "Use llama_kv_self_seq_pos_max() instead");
1908
1794
@ctypes_function (
1909
1795
"llama_kv_self_used_cells" , [llama_context_p_ctypes ], ctypes .c_int32
1910
1796
)
1911
1797
def llama_kv_self_used_cells (ctx : llama_context_p , / ) -> int :
1912
- """Returns the number of used KV cells (i.e. have at least one sequence assigned to them)"""
1798
+ """
1799
+ DEPRECATED
1800
+ Use llama_kv_self_seq_pos_max() instead
1801
+ """
1913
1802
...
1914
1803
1915
- # DEPRECATED(LLAMA_API int32_t llama_get_kv_cache_used_cells(const struct llama_context * ctx),
1916
- # "use llama_kv_self_used_cells instead");
1917
- @ctypes_function (
1918
- "llama_get_kv_cache_used_cells" , [llama_context_p_ctypes ], ctypes .c_int32
1919
- )
1920
- def llama_get_kv_cache_used_cells (ctx : llama_context_p , / ) -> int :
1921
- ...
1922
1804
1923
1805
# // Clear the KV cache - both cell info is erased and KV data is zeroed
1924
1806
# LLAMA_API void llama_kv_self_clear(
@@ -1928,12 +1810,6 @@ def llama_kv_self_clear(ctx: llama_context_p, /):
1928
1810
"""Clear the KV cache"""
1929
1811
...
1930
1812
1931
- # DEPRECATED(LLAMA_API void llama_kv_cache_clear(struct llama_context * ctx),
1932
- # "use llama_kv_self_clear instead");
1933
- @ctypes_function ("llama_kv_cache_clear" , [llama_context_p_ctypes ], None )
1934
- def llama_kv_cache_clear (ctx : llama_context_p , / ):
1935
- """Clear the KV cache"""
1936
- ...
1937
1813
1938
1814
# // Removes all tokens that belong to the specified sequence and have positions in [p0, p1)
1939
1815
# // Returns false if a partial sequence cannot be removed. Removing a whole sequence never fails
@@ -1972,32 +1848,6 @@ def llama_kv_self_seq_rm(
1972
1848
...
1973
1849
1974
1850
1975
- # DEPRECATED(LLAMA_API bool llama_kv_cache_seq_rm(
1976
- # struct llama_context * ctx,
1977
- # llama_seq_id seq_id,
1978
- # llama_pos p0,
1979
- # llama_pos p1),
1980
- # "use llama_kv_self_seq_rm instead");
1981
- @ctypes_function (
1982
- "llama_kv_cache_seq_rm" ,
1983
- [
1984
- llama_context_p_ctypes ,
1985
- llama_seq_id ,
1986
- llama_pos ,
1987
- llama_pos ,
1988
- ],
1989
- ctypes .c_bool ,
1990
- )
1991
- def llama_kv_cache_seq_rm (
1992
- ctx : llama_context_p ,
1993
- seq_id : Union [llama_seq_id , int ],
1994
- p0 : Union [llama_pos , int ],
1995
- p1 : Union [llama_pos , int ],
1996
- / ,
1997
- ) -> bool :
1998
- ...
1999
-
2000
-
2001
1851
# // Copy all tokens that belong to the specified sequence to another sequence
2002
1852
# // Note that this does not allocate extra KV cache memory - it simply assigns the tokens to the new sequence
2003
1853
# // p0 < 0 : [0, p1]
@@ -2033,34 +1883,6 @@ def llama_kv_self_seq_cp(
2033
1883
p1 < 0 : [p0, inf)"""
2034
1884
...
2035
1885
2036
- # DEPRECATED(LLAMA_API void llama_kv_cache_seq_cp(
2037
- # struct llama_context * ctx,
2038
- # llama_seq_id seq_id_src,
2039
- # llama_seq_id seq_id_dst,
2040
- # llama_pos p0,
2041
- # llama_pos p1),
2042
- # "use llama_kv_self_seq_cp instead");
2043
- @ctypes_function (
2044
- "llama_kv_cache_seq_cp" ,
2045
- [
2046
- llama_context_p_ctypes ,
2047
- llama_seq_id ,
2048
- llama_seq_id ,
2049
- llama_pos ,
2050
- llama_pos ,
2051
- ],
2052
- None ,
2053
- )
2054
- def llama_kv_cache_seq_cp (
2055
- ctx : llama_context_p ,
2056
- seq_id_src : Union [llama_seq_id , int ],
2057
- seq_id_dst : Union [llama_seq_id , int ],
2058
- p0 : Union [llama_pos , int ],
2059
- p1 : Union [llama_pos , int ],
2060
- / ,
2061
- ):
2062
- ...
2063
-
2064
1886
2065
1887
# // Removes all tokens that do not belong to the specified sequence
2066
1888
# LLAMA_API void llama_kv_self_seq_keep(
@@ -2073,17 +1895,6 @@ def llama_kv_self_seq_keep(ctx: llama_context_p, seq_id: Union[llama_seq_id, int
2073
1895
"""Removes all tokens that do not belong to the specified sequence"""
2074
1896
...
2075
1897
2076
- # DEPRECATED(LLAMA_API void llama_kv_cache_seq_keep(
2077
- # struct llama_context * ctx,
2078
- # llama_seq_id seq_id),
2079
- # "use llama_kv_self_seq_keep instead");
2080
- @ctypes_function (
2081
- "llama_kv_cache_seq_keep" , [llama_context_p_ctypes , llama_seq_id ], None
2082
- )
2083
- def llama_kv_cache_seq_keep (ctx : llama_context_p , seq_id : Union [llama_seq_id , int ], / ):
2084
- """Removes all tokens that do not belong to the specified sequence"""
2085
- ...
2086
-
2087
1898
2088
1899
# // Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1)
2089
1900
# // If the KV cache is RoPEd, the KV data is updated accordingly:
@@ -2124,34 +1935,6 @@ def llama_kv_self_seq_add(
2124
1935
p1 < 0 : [p0, inf)"""
2125
1936
...
2126
1937
2127
- # DEPRECATED(LLAMA_API void llama_kv_cache_seq_add(
2128
- # struct llama_context * ctx,
2129
- # llama_seq_id seq_id,
2130
- # llama_pos p0,
2131
- # llama_pos p1,
2132
- # llama_pos delta),
2133
- # "use llama_kv_self_seq_add instead");
2134
- @ctypes_function (
2135
- "llama_kv_cache_seq_add" ,
2136
- [
2137
- llama_context_p_ctypes ,
2138
- llama_seq_id ,
2139
- llama_pos ,
2140
- llama_pos ,
2141
- llama_pos ,
2142
- ],
2143
- None ,
2144
- )
2145
- def llama_kv_cache_seq_add (
2146
- ctx : llama_context_p ,
2147
- seq_id : Union [llama_seq_id , int ],
2148
- p0 : Union [llama_pos , int ],
2149
- p1 : Union [llama_pos , int ],
2150
- delta : Union [llama_pos , int ],
2151
- / ,
2152
- ):
2153
- ...
2154
-
2155
1938
2156
1939
# // Integer division of the positions by factor of `d > 1`
2157
1940
# // If the KV cache is RoPEd, the KV data is updated accordingly
@@ -2189,35 +1972,6 @@ def llama_kv_self_seq_div(
2189
1972
...
2190
1973
2191
1974
2192
- # DEPRECATED(LLAMA_API void llama_kv_cache_seq_div(
2193
- # struct llama_context * ctx,
2194
- # llama_seq_id seq_id,
2195
- # llama_pos p0,
2196
- # llama_pos p1,
2197
- # int d),
2198
- # "use llama_kv_self_seq_div instead");
2199
- @ctypes_function (
2200
- "llama_kv_cache_seq_div" ,
2201
- [
2202
- llama_context_p_ctypes ,
2203
- llama_seq_id ,
2204
- llama_pos ,
2205
- llama_pos ,
2206
- ctypes .c_int ,
2207
- ],
2208
- None ,
2209
- )
2210
- def llama_kv_cache_seq_div (
2211
- ctx : llama_context_p ,
2212
- seq_id : Union [llama_seq_id , int ],
2213
- p0 : Union [llama_pos , int ],
2214
- p1 : Union [llama_pos , int ],
2215
- d : Union [ctypes .c_int , int ],
2216
- / ,
2217
- ):
2218
- ...
2219
-
2220
-
2221
1975
# // Returns the smallest position present in the KV cache for the specified sequence
2222
1976
# // This is typically non-zero only for SWA caches
2223
1977
# // Return -1 if the sequence is empty
@@ -2273,26 +2027,13 @@ def llama_kv_self_defrag(ctx: llama_context_p, /):
2273
2027
...
2274
2028
2275
2029
2276
- # DEPRECATED(LLAMA_API void llama_kv_cache_defrag(struct llama_context * ctx),
2277
- # "use llama_kv_self_defrag instead");
2278
- @ctypes_function ("llama_kv_cache_defrag" , [llama_context_p_ctypes ], None )
2279
- def llama_kv_cache_defrag (ctx : llama_context_p , / ):
2280
- ...
2281
-
2282
-
2283
2030
# // Check if the context supports KV cache shifting
2284
2031
# LLAMA_API bool llama_kv_self_can_shift(struct llama_context * ctx);
2285
2032
@ctypes_function ("llama_kv_self_can_shift" , [llama_context_p_ctypes ], ctypes .c_bool )
2286
2033
def llama_kv_self_can_shift (ctx : llama_context_p , / ) -> bool :
2287
2034
"""Check if the context supports KV cache shifting"""
2288
2035
...
2289
2036
2290
- # DEPRECATED(LLAMA_API bool llama_kv_cache_can_shift(const struct llama_context * ctx),
2291
- # "use llama_kv_self_can_shift instead");
2292
- @ctypes_function ("llama_kv_cache_can_shift" , [llama_context_p_ctypes ], ctypes .c_bool )
2293
- def llama_kv_cache_can_shift (ctx : llama_context_p , / ) -> bool :
2294
- ...
2295
-
2296
2037
2297
2038
# // Apply the KV cache updates (such as K-shifts, defragmentation, etc.)
2298
2039
# LLAMA_API void llama_kv_self_update(struct llama_context * ctx);
@@ -2301,12 +2042,6 @@ def llama_kv_self_update(ctx: llama_context_p, /):
2301
2042
"""Apply the KV cache updates (such as K-shifts, defragmentation, etc.)"""
2302
2043
...
2303
2044
2304
- # DEPRECATED(LLAMA_API void llama_kv_cache_update(struct llama_context * ctx),
2305
- # "use llama_kv_self_update instead");
2306
- @ctypes_function ("llama_kv_cache_update" , [llama_context_p_ctypes ], None )
2307
- def llama_kv_cache_update (ctx : llama_context_p , / ):
2308
- ...
2309
-
2310
2045
2311
2046
# //
2312
2047
# // State / sessions
0 commit comments