@@ -1755,18 +1755,6 @@ def llama_kv_self_n_tokens(ctx: llama_context_p, /) -> int:
     ...
 
 
-# DEPRECATED(LLAMA_API int32_t llama_get_kv_cache_token_count(const struct llama_context * ctx),
-#            "use llama_kv_self_n_tokens instead");
-@ctypes_function(
-    "llama_get_kv_cache_token_count", [llama_context_p_ctypes], ctypes.c_int32
-)
-def llama_get_kv_cache_token_count(ctx: llama_context_p, /) -> int:
-    """Returns the number of tokens in the KV cache (slow, use only for debug)
-    If a KV cell has multiple sequences assigned to it, it will be counted multiple times
-    """
-    ...
-
-
 # // Returns the number of used KV cells (i.e. have at least one sequence assigned to them)
 # LLAMA_API int32_t llama_kv_self_used_cells(const struct llama_context * ctx);
 @ctypes_function(
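Migration note: callers that still used the removed `llama_get_kv_cache_token_count` wrapper can switch to the retained `llama_kv_self_n_tokens` binding shown in the context above. A minimal sketch, assuming the `llama_cpp` package re-exports the low-level bindings and `ctx` is an already-created `llama_context_p`:

    import llama_cpp

    def kv_token_count(ctx: llama_cpp.llama_context_p) -> int:
        # Old (removed): llama_cpp.llama_get_kv_cache_token_count(ctx)
        # Same semantics: slow, debug-only count; a cell assigned to several
        # sequences is counted once per sequence.
        return llama_cpp.llama_kv_self_n_tokens(ctx)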
@@ -1777,16 +1765,6 @@ def llama_kv_self_used_cells(ctx: llama_context_p, /) -> int:
     ...
 
 
-# DEPRECATED(LLAMA_API int32_t llama_get_kv_cache_used_cells(const struct llama_context * ctx),
-#            "use llama_kv_self_used_cells instead");
-@ctypes_function(
-    "llama_get_kv_cache_used_cells", [llama_context_p_ctypes], ctypes.c_int32
-)
-def llama_get_kv_cache_used_cells(ctx: llama_context_p, /) -> int:
-    """Returns the number of used KV cells (i.e. have at least one sequence assigned to them)"""
-    ...
-
-
 # // Clear the KV cache - both cell info is erased and KV data is zeroed
 # LLAMA_API void llama_kv_self_clear(
 #         struct llama_context * ctx);
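The same rename applies to the used-cell counter. A small sketch of a cache-occupancy helper, assuming the retained `llama_kv_self_used_cells` binding and an existing context:

    import llama_cpp

    def kv_occupancy(ctx: llama_cpp.llama_context_p) -> tuple[int, int]:
        # Old (removed): llama_cpp.llama_get_kv_cache_used_cells(ctx)
        # Returns (tokens in the cache, cells with at least one sequence assigned).
        return (
            llama_cpp.llama_kv_self_n_tokens(ctx),
            llama_cpp.llama_kv_self_used_cells(ctx),
        )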
@@ -1797,49 +1775,6 @@ def llama_kv_self_clear(ctx: llama_context_p, /):
     """Clear the KV cache - both cell info is erased and KV data is zeroed"""
     ...
 
-# NOTE: Deprecated
-@ctypes_function("llama_kv_self_clear", [llama_context_p_ctypes], None)
-def llama_kv_cache_clear(ctx: llama_context_p, /):
-    """Clear the KV cache"""
-    ...
-
-
-# // Removes all tokens that belong to the specified sequence and have positions in [p0, p1)
-# // Returns false if a partial sequence cannot be removed. Removing a whole sequence never fails
-# // seq_id < 0 : match any sequence
-# // p0 < 0 : [0, p1]
-# // p1 < 0 : [p0, inf)
-# LLAMA_API bool llama_kv_cache_seq_rm(
-#         struct llama_context * ctx,
-#         llama_seq_id seq_id,
-#         llama_pos p0,
-#         llama_pos p1);
-@ctypes_function(
-    "llama_kv_cache_seq_rm",
-    [
-        llama_context_p_ctypes,
-        llama_seq_id,
-        llama_pos,
-        llama_pos,
-    ],
-    ctypes.c_bool,
-)
-def llama_kv_cache_seq_rm(
-    ctx: llama_context_p,
-    seq_id: Union[llama_seq_id, int],
-    p0: Union[llama_pos, int],
-    p1: Union[llama_pos, int],
-    /,
-) -> bool:
-    """Removes all tokens that belong to the specified sequence and have positions in [p0, p1)
-
-    Returns false if a partial sequence cannot be removed. Removing a whole sequence never fails
-
-    seq_id < 0 : match any sequence
-    p0 < 0 : [0, p1]
-    p1 < 0 : [p0, inf)"""
-    ...
-
 
 # // Copy all tokens that belong to the specified sequence to another sequence
 # // Note that this does not allocate extra KV cache memory - it simply assigns the tokens to the new sequence
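Migration note for the removed `llama_kv_cache_clear` and `llama_kv_cache_seq_rm` wrappers: a sketch of trimming one sequence, assuming the renamed `llama_kv_self_seq_rm` binding (same C signature, defined elsewhere in this module) and an existing context:

    import llama_cpp

    def trim_sequence(ctx: llama_cpp.llama_context_p, seq_id: int, keep_until: int) -> None:
        # Remove tokens of `seq_id` with positions in [keep_until, inf);
        # p1 < 0 means "to the end of the sequence".
        removed = llama_cpp.llama_kv_self_seq_rm(ctx, seq_id, keep_until, -1)
        if not removed:
            # Partial removal can fail; removing everything never does.
            llama_cpp.llama_kv_self_clear(ctx)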
@@ -1877,33 +1812,6 @@ def llama_kv_self_seq_cp(
     ...
 
 
-# NOTE: Deprecated
-@ctypes_function(
-    "llama_kv_self_seq_cp",
-    [
-        llama_context_p_ctypes,
-        llama_seq_id,
-        llama_seq_id,
-        llama_pos,
-        llama_pos,
-    ],
-    None,
-)
-def llama_kv_cache_seq_cp(
-    ctx: llama_context_p,
-    seq_id_src: Union[llama_seq_id, int],
-    seq_id_dst: Union[llama_seq_id, int],
-    p0: Union[llama_pos, int],
-    p1: Union[llama_pos, int],
-    /,
-):
-    """Copy all tokens that belong to the specified sequence to another sequence
-    Note that this does not allocate extra KV cache memory - it simply assigns the tokens to the new sequence
-    p0 < 0 : [0, p1]
-    p1 < 0 : [p0, inf)"""
-    ...
-
-
 # // Removes all tokens that do not belong to the specified sequence
 # LLAMA_API void llama_kv_self_seq_keep(
 #         struct llama_context * ctx,
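The retained `llama_kv_self_seq_cp` keeps the semantics documented in the removed wrapper. A sketch of forking a cached prompt into a second sequence for parallel decoding, assuming an existing context:

    import llama_cpp

    def fork_sequence(ctx: llama_cpp.llama_context_p, src: int, dst: int) -> None:
        # Copy every cached token of `src` to `dst`; p0 < 0 and p1 < 0 select
        # the full [0, inf) range. No extra KV memory is allocated - the cells
        # are simply tagged with the additional sequence id.
        llama_cpp.llama_kv_self_seq_cp(ctx, src, dst, -1, -1)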
@@ -1916,13 +1824,6 @@ def llama_kv_self_seq_keep(ctx: llama_context_p, seq_id: Union[llama_seq_id, int
     ...
 
 
-# NOTE: Deprecated
-@ctypes_function(
-    "llama_kv_self_seq_keep", [llama_context_p_ctypes, llama_seq_id], None
-)
-def llama_kv_cache_seq_keep(ctx: llama_context_p, seq_id: Union[llama_seq_id, int], /):
-    """Removes all tokens that do not belong to the specified sequence"""
-    ...
 
 
 # // Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1)
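A sketch of the typical use of the retained `llama_kv_self_seq_keep`, assuming an existing context (e.g. after choosing one beam out of several parallel sequences):

    import llama_cpp

    def keep_only(ctx: llama_cpp.llama_context_p, winner: int) -> None:
        # Drop every cached token that is not tagged with the winning sequence id.
        llama_cpp.llama_kv_self_seq_keep(ctx, winner)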
@@ -1964,49 +1865,6 @@ def llama_kv_self_seq_add(
     p0 < 0 : [0, p1]
     p1 < 0 : [p0, inf)"""
     ...
-
-
-# // NOTE: Deprecated
-# // Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1)
-# // If the KV cache is RoPEd, the KV data is updated accordingly:
-# //   - lazily on next llama_decode()
-# //   - explicitly with llama_kv_cache_update()
-# // p0 < 0 : [0, p1]
-# // p1 < 0 : [p0, inf)
-# LLAMA_API void llama_kv_cache_seq_add(
-#         struct llama_context * ctx,
-#         llama_seq_id seq_id,
-#         llama_pos p0,
-#         llama_pos p1,
-#         llama_pos delta);
-@ctypes_function(
-    "llama_kv_self_seq_add",
-    [
-        llama_context_p_ctypes,
-        llama_seq_id,
-        llama_pos,
-        llama_pos,
-        llama_pos,
-    ],
-    None,
-)
-def llama_kv_cache_seq_add(
-    ctx: llama_context_p,
-    seq_id: Union[llama_seq_id, int],
-    p0: Union[llama_pos, int],
-    p1: Union[llama_pos, int],
-    delta: Union[llama_pos, int],
-    /,
-):
-    """Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1)
-    If the KV cache is RoPEd, the KV data is updated accordingly:
-    - lazily on next llama_decode()
-    - explicitly with llama_kv_cache_update()
-    p0 < 0 : [0, p1]
-    p1 < 0 : [p0, inf)"""
-    ...
-
-
 # // Integer division of the positions by factor of `d > 1`
 # // If the KV cache is RoPEd, the KV data is updated accordingly
 # // p0 < 0 : [0, p1]
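The retained `llama_kv_self_seq_add` is typically paired with token removal for "context shifting". A sketch under the assumption that both `llama_kv_self_seq_rm` and `llama_kv_self_seq_add` are available from `llama_cpp` and `ctx` already holds a decoded sequence:

    import llama_cpp

    def shift_context(ctx: llama_cpp.llama_context_p, seq_id: int,
                      n_keep: int, n_discard: int) -> None:
        # Drop the oldest tokens after the prefix we want to keep...
        llama_cpp.llama_kv_self_seq_rm(ctx, seq_id, n_keep, n_keep + n_discard)
        # ...then shift the remaining positions left by n_discard. With a RoPEd
        # cache the K data is re-rotated lazily on the next llama_decode().
        llama_cpp.llama_kv_self_seq_add(ctx, seq_id, n_keep + n_discard, -1, -n_discard)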
@@ -2043,43 +1901,6 @@ def llama_kv_self_seq_div(
     ...
 
 
-# // NOTE: Deprecated
-# // Integer division of the positions by factor of `d > 1`
-# // If the KV cache is RoPEd, the KV data is updated accordingly
-# // p0 < 0 : [0, p1]
-# // p1 < 0 : [p0, inf)
-# LLAMA_API void llama_kv_cache_seq_div(
-#         struct llama_context * ctx,
-#         llama_seq_id seq_id,
-#         llama_pos p0,
-#         llama_pos p1,
-#         int d);
-@ctypes_function(
-    "llama_kv_self_seq_div",
-    [
-        llama_context_p_ctypes,
-        llama_seq_id,
-        llama_pos,
-        llama_pos,
-        ctypes.c_int,
-    ],
-    None,
-)
-def llama_kv_cache_seq_div(
-    ctx: llama_context_p,
-    seq_id: Union[llama_seq_id, int],
-    p0: Union[llama_pos, int],
-    p1: Union[llama_pos, int],
-    d: Union[ctypes.c_int, int],
-    /,
-):
-    """Integer division of the positions by factor of `d > 1`
-    If the KV cache is RoPEd, the KV data is updated accordingly
-    p0 < 0 : [0, p1]
-    p1 < 0 : [p0, inf)"""
-    ...
-
-
 # // Returns the largest position present in the KV cache for the specified sequence
 # LLAMA_API llama_pos llama_kv_self_seq_pos_max(
 #         struct llama_context * ctx,
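A sketch of the retained `llama_kv_self_seq_div`, assuming an existing context; the negative bounds follow the same convention as above:

    import llama_cpp

    def compress_positions(ctx: llama_cpp.llama_context_p, seq_id: int, d: int) -> None:
        # Integer-divide all cached positions of `seq_id` by d (d > 1);
        # p0 < 0 and p1 < 0 select the whole range. RoPEd K data is updated accordingly.
        llama_cpp.llama_kv_self_seq_div(ctx, seq_id, -1, -1, d)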
@@ -2108,21 +1929,6 @@ def llama_kv_self_defrag(ctx: llama_context_p, /):
     ...
 
 
-# NOTE: Deprecated
-# // Defragment the KV cache
-# // This will be applied:
-# //   - lazily on next llama_decode()
-# //   - explicitly with llama_kv_self_update()
-# LLAMA_API void llama_kv_cache_defrag(struct llama_context * ctx);
-@ctypes_function("llama_kv_cache_defrag", [llama_context_p_ctypes], None)
-def llama_kv_cache_defrag(ctx: llama_context_p, /):
-    """Defragment the KV cache
-    This will be applied:
-    - lazily on next llama_decode()
-    - explicitly with llama_kv_cache_update()"""
-    ...
-
-
 # // Apply the KV cache updates (such as K-shifts, defragmentation, etc.)
 # LLAMA_API void llama_kv_cache_update(struct llama_context * ctx);
 @ctypes_function("llama_kv_self_update", [llama_context_p_ctypes], None)
@@ -2147,15 +1953,6 @@ def llama_kv_self_can_shift(ctx: llama_context_p, /) -> bool:
     ...
 
 
-# // NOTE: Deprecated
-# // Check if the context supports KV cache shifting
-# LLAMA_API bool llama_kv_cache_can_shift(struct llama_context * ctx);
-@ctypes_function("llama_kv_self_can_shift", [llama_context_p_ctypes], ctypes.c_bool)
-def llama_kv_cache_can_shift(ctx: llama_context_p, /) -> bool:
-    """Check if the context supports KV cache shifting"""
-    ...
-
-
 # //
 # // State / sessions
 # //
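Finally, a sketch of guarding a position shift with the retained `llama_kv_self_can_shift`, assuming `llama_kv_self_seq_add` is available and `ctx` is an existing context:

    import llama_cpp

    def try_shift(ctx: llama_cpp.llama_context_p, seq_id: int, delta: int) -> bool:
        # Only apply a position shift when the context supports KV cache shifting.
        if not llama_cpp.llama_kv_self_can_shift(ctx):
            return False
        llama_cpp.llama_kv_self_seq_add(ctx, seq_id, 0, -1, delta)
        return True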