April 2024 Binary Update (#662)
* Updated binaries, using [this build](https://github.com/SciSharp/LLamaSharp/actions/runs/8654672719/job/23733195669) for llama.cpp commit `f7001ccc5aa359fcf41bba19d1c99c3d25c9bcc7`.

 - Added all new functions.
 - Moved some functions (e.g. `SafeLlamaModelHandle`-specific functions) into `SafeLlamaModelHandle.cs`.
 - Exposed tokens on `SafeLlamaModelHandle` and `LLamaWeights` through a `Tokens` property. As new special tokens are added in the future, they can be added here.
 - Changed all token properties to return nullable tokens, to handle models that do not define certain tokens (see the sketch below).
 - Fixed `DefaultSamplingPipeline` to handle models that have no newline token.
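 A minimal usage sketch of the new `Tokens` accessor (the model path and the `Newline` property name are illustrative assumptions; `Tokens.EOS` is the only member confirmed by the diffs below):

 using System;
 using LLama;
 using LLama.Common;
 using LLama.Native;

 // Load weights from a hypothetical GGUF file.
 var parameters = new ModelParams("path/to/model.gguf");
 using var weights = LLamaWeights.LoadFromFile(parameters);

 // Token properties are nullable because some models do not define them.
 LLamaToken? eos = weights.Tokens.EOS;
 LLamaToken? newline = weights.Tokens.Newline;

 if (newline is null)
     Console.WriteLine("This model has no newline token; sampling pipelines must cope with that.");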

* Moved native methods to more specific locations.

 - Context-specific functions have been moved into `SafeLLamaContextHandle.cs` and made private; they are already exposed through C# properties and methods.
 - Added a check that the GPU layer count is zero when GPU offload is not supported.
 - Moved methods for creating default structs (`llama_model_quantize_default_params` and `llama_context_default_params`) into the relevant structs (see the snippet below).
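 For example, matching the extension-method diffs further down (a sketch; the surrounding usings are assumed):

 using LLama.Native;

 // Default-parameter structs are now created via static factory methods on the
 // structs themselves rather than through NativeApi.
 LLamaContextParams ctxParams = LLamaContextParams.Default();
 LLamaModelParams modelParams = LLamaModelParams.Default();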

* Removed exception if `GpuLayerCount > 0` when GPU is not supported.

* Added low-level wrapper methods for the new per-sequence state load/save in `SafeLLamaContextHandle`.
 - Added high-level wrapper methods (save/load with a `State` object or a memory-mapped file) in `LLamaContext`; see the sketch below.
 - Moved native methods for per-sequence state load/save into `SafeLLamaContextHandle`.
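 A rough usage sketch of the state wrappers (the per-sequence overload names are assumptions based on the description above, not confirmed signatures):

 using LLama;
 using LLama.Common;
 using LLama.Native;

 var parameters = new ModelParams("path/to/model.gguf"); // hypothetical path
 using var weights = LLamaWeights.LoadFromFile(parameters);
 using var context = weights.CreateContext(parameters);

 // Whole-context state, as before.
 var fullState = context.GetState();
 context.LoadState(fullState);

 // Hypothetical per-sequence overloads: capture and restore the state of a
 // single sequence (e.g. one conversation in a batched executor).
 var seqState = context.GetState(LLamaSeqId.Zero);
 context.LoadState(seqState, LLamaSeqId.Zero);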

* Added update and defrag methods for the KV cache in `SafeLLamaContextHandle` (rough sketch below).
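 Roughly like this (the wrapper method names are assumptions; only the underlying llama.cpp defrag/update operations are implied by the bullet above):

 using LLama.Native;

 // Schedule a defragmentation of the KV cache, then apply any pending
 // KV cache operations on the native context. Hypothetical wrappers over
 // llama_kv_cache_defrag / llama_kv_cache_update.
 static void CompactKvCache(SafeLLamaContextHandle ctx)
 {
     ctx.KvCacheDefrag();
     ctx.KvCacheUpdate();
 }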

* Updated submodule to `f7001ccc5aa359fcf41bba19d1c99c3d25c9bcc7`

* Passing the sequence ID when saving a single sequence state
martindevans authored Apr 16, 2024
1 parent 399e81d commit c325ac9
Showing 81 changed files with 1,709 additions and 1,620 deletions.
2 changes: 1 addition & 1 deletion LLama.Examples/Examples/BatchedExecutorGuidance.cs
@@ -79,7 +79,7 @@ await AnsiConsole
 guidance.Prompt(g);
 // Early exit if we reach the natural end of the guided sentence
-if (g == model.EndOfSentenceToken)
+if (g == model.Tokens.EOS)
 break;
 // Update progress bar
2 changes: 1 addition & 1 deletion LLama.Examples/Examples/GetEmbeddings.cs
@@ -9,7 +9,7 @@ public static void Run()
 string modelPath = UserSettings.GetModelPath();

 Console.ForegroundColor = ConsoleColor.DarkGray;
-var @params = new ModelParams(modelPath) { EmbeddingMode = true };
+var @params = new ModelParams(modelPath) { Embeddings = true };
 using var weights = LLamaWeights.LoadFromFile(@params);
 var embedder = new LLamaEmbedder(weights, @params);
2 changes: 1 addition & 1 deletion LLama.Examples/Examples/SemanticKernelMemory.cs
@@ -20,7 +20,7 @@ public static async Task Run()
 var parameters = new ModelParams(modelPath)
 {
 Seed = seed,
-EmbeddingMode = true
+Embeddings = true
 };

 using var model = LLamaWeights.LoadFromFile(parameters);
2 changes: 1 addition & 1 deletion LLama.KernelMemory/BuilderExtensions.cs
@@ -84,7 +84,7 @@ public static IKernelMemoryBuilder WithLLamaSharpDefaults(this IKernelMemoryBuil
 ContextSize = config?.ContextSize ?? 2048,
 Seed = config?.Seed ?? 0,
 GpuLayerCount = config?.GpuLayerCount ?? 20,
-EmbeddingMode = true,
+Embeddings = true,
 MainGpu = config?.MainGpu ?? 0,
 SplitMode = config?.SplitMode ?? GPUSplitMode.None,
 };
4 changes: 2 additions & 2 deletions LLama.KernelMemory/LLamaSharpTextEmbeddingGenerator.cs
@@ -29,7 +29,7 @@ public LLamaSharpTextEmbeddingGenerator(LLamaSharpConfig config)
 this._config = config;
 var @params = new ModelParams(_config.ModelPath)
 {
-EmbeddingMode = true,
+Embeddings = true,
 MainGpu = _config.MainGpu,
 SplitMode = _config.SplitMode
 };
@@ -49,7 +49,7 @@ public LLamaSharpTextEmbeddingGenerator(LLamaSharpConfig config, LLamaWeights we
 this._config = config;
 var @params = new ModelParams(_config.ModelPath)
 {
-EmbeddingMode = true,
+Embeddings = true,
 MainGpu = _config.MainGpu,
 SplitMode = _config.SplitMode
 };
2 changes: 1 addition & 1 deletion LLama.Unittest/BasicTest.cs
@@ -15,7 +15,7 @@ public sealed class BasicTest
 public BasicTest(ITestOutputHelper testOutputHelper)
 {
 _testOutputHelper = testOutputHelper;
-_params = new ModelParams(Constants.ModelPath)
+_params = new ModelParams(Constants.GenerativeModelPath)
 {
 ContextSize = 2048
 };
2 changes: 1 addition & 1 deletion LLama.Unittest/BeamTests.cs
@@ -15,7 +15,7 @@ public sealed class BeamTests
 public BeamTests(ITestOutputHelper testOutputHelper)
 {
 _testOutputHelper = testOutputHelper;
-_params = new ModelParams(Constants.ModelPath)
+_params = new ModelParams(Constants.GenerativeModelPath)
 {
 ContextSize = 2048
 };
10 changes: 6 additions & 4 deletions LLama.Unittest/Constants.cs
@@ -2,9 +2,11 @@
 {
 internal static class Constants
 {
-public static string ModelPath = "Models/llama-2-7b-chat.Q3_K_S.gguf";
-public static string LLavaModelPath = "Models/llava-v1.6-mistral-7b.Q3_K_XS.gguf";
-public static string LLavaMmpPath = "Models/mmproj-model-f16.gguf";
-public static string LLavaImage = "Models/extreme-ironing-taxi-610x427.jpg";
+public static readonly string GenerativeModelPath = "Models/llama-2-7b-chat.Q3_K_S.gguf";
+public static readonly string EmbeddingModelPath = "Models/all-MiniLM-L12-v2.Q8_0.gguf";
+
+public static readonly string LLavaModelPath = "Models/llava-v1.6-mistral-7b.Q3_K_XS.gguf";
+public static readonly string LLavaMmpPath = "Models/mmproj-model-f16.gguf";
+public static readonly string LLavaImage = "Models/extreme-ironing-taxi-610x427.jpg";
 }
 }
2 changes: 1 addition & 1 deletion LLama.Unittest/GrammarTest.cs
@@ -12,7 +12,7 @@ public sealed class GrammarTest

 public GrammarTest()
 {
-_params = new ModelParams(Constants.ModelPath)
+_params = new ModelParams(Constants.GenerativeModelPath)
 {
 ContextSize = 2048,
 Seed = 92,
6 changes: 6 additions & 0 deletions LLama.Unittest/LLama.Unittest.csproj
@@ -31,6 +31,9 @@
 <DownloadFile SourceUrl="https://huggingface.co/TheBloke/Llama-2-7b-Chat-GGUF/resolve/main/llama-2-7b-chat.Q3_K_S.gguf" DestinationFolder="Models" DestinationFileName="llama-2-7b-chat.Q3_K_S.gguf" SkipUnchangedFiles="true"></DownloadFile>
 <DownloadFile SourceUrl="https://huggingface.co/cjpais/llava-1.6-mistral-7b-gguf/resolve/main/llava-v1.6-mistral-7b.Q3_K_XS.gguf" DestinationFolder="Models" DestinationFileName="llava-v1.6-mistral-7b.Q3_K_XS.gguf" SkipUnchangedFiles="true"></DownloadFile>
 <DownloadFile SourceUrl="https://huggingface.co/cjpais/llava-1.6-mistral-7b-gguf/resolve/main/mmproj-model-f16.gguf" DestinationFolder="Models" DestinationFileName="mmproj-model-f16.gguf" SkipUnchangedFiles="true"></DownloadFile>
+<DownloadFile SourceUrl="https://huggingface.co/leliuga/all-MiniLM-L12-v2-GGUF/resolve/main/all-MiniLM-L12-v2.Q8_0.gguf" DestinationFolder="Models" DestinationFileName="all-MiniLM-L12-v2.Q8_0.gguf" SkipUnchangedFiles="true"></DownloadFile>
+
+
 </Target>

 <ItemGroup>
@@ -43,6 +46,9 @@
 </ItemGroup>

 <ItemGroup>
+<None Update="Models\all-MiniLM-L12-v2.Q8_0.gguf">
+<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
+</None>
 <None Update="Models\llama-2-7b-chat.Q3_K_S.gguf">
 <CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
 </None>
2 changes: 1 addition & 1 deletion LLama.Unittest/LLamaContextTests.cs
@@ -11,7 +11,7 @@ public sealed class LLamaContextTests

 public LLamaContextTests()
 {
-var @params = new ModelParams(Constants.ModelPath)
+var @params = new ModelParams(Constants.GenerativeModelPath)
 {
 ContextSize = 768,
 };
16 changes: 14 additions & 2 deletions LLama.Unittest/LLamaEmbedderTests.cs
@@ -1,5 +1,7 @@
 using LLama.Common;
+using LLama.Native;
 using Xunit.Abstractions;
+using Xunit.Sdk;

 namespace LLama.Unittest;

@@ -12,11 +14,11 @@ public sealed class LLamaEmbedderTests
 public LLamaEmbedderTests(ITestOutputHelper testOutputHelper)
 {
 _testOutputHelper = testOutputHelper;
-var @params = new ModelParams(Constants.ModelPath)
+var @params = new ModelParams(Constants.EmbeddingModelPath)
 {
 ContextSize = 4096,
 Threads = 5,
-EmbeddingMode = true,
+Embeddings = true,
 };
 using var weights = LLamaWeights.LoadFromFile(@params);
 _embedder = new(weights, @params);
@@ -38,15 +40,25 @@ private static float Dot(float[] a, float[] b)
 public async Task EmbedCompare()
 {
 var cat = await _embedder.GetEmbeddings("The cat is cute");
+Assert.DoesNotContain(float.NaN, cat);

 var kitten = await _embedder.GetEmbeddings("The kitten is kawaii");
+Assert.DoesNotContain(float.NaN, kitten);

 var spoon = await _embedder.GetEmbeddings("The spoon is not real");
+Assert.DoesNotContain(float.NaN, spoon);

 _testOutputHelper.WriteLine($"Cat = [{string.Join(",", cat.AsMemory().Slice(0, 7).ToArray())}...]");
 _testOutputHelper.WriteLine($"Kitten = [{string.Join(",", kitten.AsMemory().Slice(0, 7).ToArray())}...]");
 _testOutputHelper.WriteLine($"Spoon = [{string.Join(",", spoon.AsMemory().Slice(0, 7).ToArray())}...]");

 var close = 1 - Dot(cat, kitten);
 var far = 1 - Dot(cat, spoon);

+_testOutputHelper.WriteLine("");
+_testOutputHelper.WriteLine($"Cat.Kitten (Close): {close:F4}");
+_testOutputHelper.WriteLine($"Cat.Spoon (Far): {far:F4}");

 Assert.True(close < far);
 }
 }
2 changes: 1 addition & 1 deletion LLama.Unittest/LLavaWeightsTests.cs
@@ -14,7 +14,7 @@ public sealed class LLavaWeightTests

 public LLavaWeightTests()
 {
-var @params = new ModelParams(Constants.ModelPath)
+var @params = new ModelParams(Constants.GenerativeModelPath)
 {
 // Llava models requires big context
 ContextSize = 4096
4 changes: 2 additions & 2 deletions LLama.Unittest/MemoryDisposalTests.cs
@@ -7,7 +7,7 @@ public class MemoryDisposalTests
 [Fact]
 public void ModelDisposal()
 {
-var @params = new ModelParams(Constants.ModelPath)
+var @params = new ModelParams(Constants.GenerativeModelPath)
 {
 ContextSize = 2048
 };
@@ -21,7 +21,7 @@ public void ModelDisposal()
 [Fact]
 public void ContextDisposal()
 {
-var @params = new ModelParams(Constants.ModelPath)
+var @params = new ModelParams(Constants.GenerativeModelPath)
 {
 ContextSize = 2048
 };
2 changes: 1 addition & 1 deletion LLama.Unittest/StatelessExecutorTest.cs
@@ -15,7 +15,7 @@ public class StatelessExecutorTest
 public StatelessExecutorTest(ITestOutputHelper testOutputHelper)
 {
 _testOutputHelper = testOutputHelper;
-_params = new ModelParams(Constants.ModelPath)
+_params = new ModelParams(Constants.GenerativeModelPath)
 {
 ContextSize = 60,
 Seed = 1754,
2 changes: 1 addition & 1 deletion LLama.Unittest/StreamingTextDecoderTests.cs
@@ -14,7 +14,7 @@ public class StreamingTextDecoderTests
 public StreamingTextDecoderTests(ITestOutputHelper testOutputHelper)
 {
 _testOutputHelper = testOutputHelper;
-_params = new ModelParams(Constants.ModelPath);
+_params = new ModelParams(Constants.GenerativeModelPath);
 _model = LLamaWeights.LoadFromFile(_params);
 }
2 changes: 1 addition & 1 deletion LLama.Unittest/TokenTests.cs
@@ -12,7 +12,7 @@ public sealed class TokenTests

 public TokenTests()
 {
-_params = new ModelParams(Constants.ModelPath)
+_params = new ModelParams(Constants.GenerativeModelPath)
 {
 ContextSize = 2048
 };
8 changes: 6 additions & 2 deletions LLama.Web/Common/ModelOptions.cs
@@ -29,9 +29,13 @@ public class ModelOptions
 /// <inheritdoc />
 public int GpuLayerCount { get; set; } = 20;

+public uint SeqMax { get; }
+
 /// <inheritdoc />
 public uint Seed { get; set; } = 1686349486;

+public bool Embeddings { get; }
+
 /// <inheritdoc />
 public bool UseMemorymap { get; set; } = true;

@@ -57,7 +61,7 @@ public class ModelOptions
 public uint BatchSize { get; set; } = 512;

 /// <inheritdoc />
-public bool EmbeddingMode { get; set; } = false;
+public uint UBatchSize { get; set; } = 512;

 /// <inheritdoc />
 public TensorSplitsCollection TensorSplits { get; set; } = new();
@@ -108,6 +112,6 @@ public class ModelOptions
 public float DefragThreshold { get; set; }

 /// <inheritdoc />
-public bool DoPooling { get; set; }
+public LLamaPoolingType PoolingType { get; set; }
 }
 }
21 changes: 15 additions & 6 deletions LLama/Abstractions/IContextParams.cs
@@ -14,20 +14,29 @@ public interface IContextParams
 uint? ContextSize { get; }

 /// <summary>
-/// batch size for prompt processing (must be >=32 to use BLAS) (n_batch)
+/// maximum batch size that can be submitted at once (must be >=32 to use BLAS) (n_batch)
 /// </summary>
 uint BatchSize { get; }

+/// <summary>
+/// Physical batch size
+/// </summary>
+uint UBatchSize { get; }
+
+/// <summary>
+/// max number of sequences (i.e. distinct states for recurrent models)
+/// </summary>
+uint SeqMax { get; }
+
 /// <summary>
 /// Seed for the random number generator (seed)
 /// </summary>
 uint Seed { get; }

 /// <summary>
-/// Whether to use embedding mode. (embedding) Note that if this is set to true,
-/// The LLamaModel won't produce text response anymore.
+/// If true, extract embeddings (together with logits).
 /// </summary>
-bool EmbeddingMode { get; }
+bool Embeddings { get; }

 /// <summary>
 /// RoPE base frequency (null to fetch from the model)
@@ -105,7 +114,7 @@ public interface IContextParams
 float DefragThreshold { get; }

 /// <summary>
-/// Whether to pool (sum) embedding results by sequence id (ignored if no pooling layer)
+/// How to pool (sum) embedding results by sequence id (ignored if no pooling layer)
 /// </summary>
-bool DoPooling { get; }
+LLamaPoolingType PoolingType { get; }
 }
10 changes: 8 additions & 2 deletions LLama/Common/ModelParams.cs
@@ -24,6 +24,9 @@ public record ModelParams
 /// <inheritdoc />
 public int GpuLayerCount { get; set; } = 20;

+/// <inheritdoc />
+public uint SeqMax { get; set; } = 1;
+
 /// <inheritdoc />
 public uint Seed { get; set; } = 0xFFFFFFFF;

@@ -52,7 +55,10 @@ public record ModelParams
 public uint BatchSize { get; set; } = 512;

 /// <inheritdoc />
-public bool EmbeddingMode { get; set; }
+public uint UBatchSize { get; set; } = 512;
+
+/// <inheritdoc />
+public bool Embeddings { get; set; }

 /// <inheritdoc />
 public TensorSplitsCollection TensorSplits { get; set; } = new();
@@ -97,7 +103,7 @@ public record ModelParams
 public float DefragThreshold { get; set; }

 /// <inheritdoc />
-public bool DoPooling { get; set; }
+public LLamaPoolingType PoolingType { get; set; } = LLamaPoolingType.Unspecified;

 /// <inheritdoc />
 public bool VocabOnly { get; set; }
12 changes: 9 additions & 3 deletions LLama/Extensions/IContextParamsExtensions.cs
@@ -20,11 +20,14 @@ public static class IContextParamsExtensions
 /// <exception cref="ArgumentException"></exception>
 public static void ToLlamaContextParams(this IContextParams @params, out LLamaContextParams result)
 {
-result = NativeApi.llama_context_default_params();
+result = LLamaContextParams.Default();
+
 result.n_ctx = @params.ContextSize ?? 0;
 result.n_batch = @params.BatchSize;
+result.n_ubatch = @params.UBatchSize;
+result.n_seq_max = @params.SeqMax;
 result.seed = @params.Seed;
-result.embedding = @params.EmbeddingMode;
+result.embeddings = @params.Embeddings;
 result.rope_freq_base = @params.RopeFrequencyBase ?? 0;
 result.rope_freq_scale = @params.RopeFrequencyScale ?? 0;

@@ -41,10 +44,13 @@ public static void ToLlamaContextParams(this IContextParams @params, out LLamaCo
 result.cb_eval = IntPtr.Zero;
 result.cb_eval_user_data = IntPtr.Zero;

+result.abort_callback = IntPtr.Zero;
+result.abort_callback_user_data = IntPtr.Zero;
+
 result.type_k = @params.TypeK ?? GGMLType.GGML_TYPE_F16;
 result.type_k = @params.TypeV ?? GGMLType.GGML_TYPE_F16;
 result.offload_kqv = !@params.NoKqvOffload;
-result.do_pooling = @params.DoPooling;
+result.llama_pooling_type = @params.PoolingType;

 result.n_threads = Threads(@params.Threads);
 result.n_threads_batch = Threads(@params.BatchThreads);
3 changes: 2 additions & 1 deletion LLama/Extensions/IModelParamsExtensions.cs
@@ -28,7 +28,8 @@ public static IDisposable ToLlamaModelParams(this IModelParams @params, out LLam

 var disposer = new GroupDisposable();

-result = NativeApi.llama_model_default_params();
+result = LLamaModelParams.Default();
+
 result.main_gpu = @params.MainGpu;
 result.split_mode = @params.SplitMode;
 result.n_gpu_layers = @params.GpuLayerCount < 0 ? int.MaxValue : @params.GpuLayerCount;

5 comments on commit c325ac9

@zsogitbe
Contributor

Martin,
In LLamaEmbedder.cs you have changed the code to the following, and it no longer works (it returns no embeddings):

private float[] GetEmbeddingsArray()
{
    var embeddings = NativeApi.llama_get_embeddings_seq(Context.NativeHandle, LLamaSeqId.Zero);
    if (embeddings.Length == 0)
        return Array.Empty<float>();

    return embeddings.ToArray();
}

You need to change it back to:

private float[] GetEmbeddingsArray()
{
    var embeddings = NativeApi.llama_get_embeddings(Context.NativeHandle);
    if (embeddings == null)
        return Array.Empty<float>();

    return embeddings.ToArray();
}

or adjust your code so that it works.

@martindevans
Member Author

Could you open an issue reporting this? That way we definitely won't lose track of it! I actually didn't know you could comment on commits like this and it took me a minute to find the message 😆

What do you see if you run this unit test?

I get this:

Cat    = [0.10725131,-0.05721225,0.0153462645,-0.014325623,-0.04853852,-0.044675037,0.08669097...]
Kitten = [0.014667237,0.035338245,0.0015388717,-0.00449735,-0.09358472,-0.038022343,0.0500625...]
Spoon  = [-0.008424397,-0.05501036,0.0077858428,0.0032298383,-0.068041705,-0.042401392,-0.008253851...]

Cat.Kitten (Close): 0.4054
Cat.Spoon  (Far):   0.9616

If I change it to use llama_get_embeddings then llama.cpp returns a null pointer.

@martindevans
Member Author

@zsogitbe See the message above; I should have pinged you there.

@zsogitbe
Contributor

The strange model you use for the test (all-MiniLM-L12-v2.Q8_0.gguf) is not compatible with the old code, but all normal models need the old code.
Very strangely, the new code works with this strange and useless (not really a production model) model. It is now up to you to find a solution for this strange problem :).
Are you sure that this all-MiniLM-L12-v2.Q8_0.gguf is compatible with llama.cpp?

My advice is to change it back to the old code (I have done this in my local version).

@martindevans
Member Author

all-MiniLM is specifically an embedding model, which is why I'm using it. It's more representative of the kind of model that should be used for embeddings (it's also smaller, which helps the poor CI, which struggles with even 7B models!).

The change to use llama_get_embeddings_seq was also made on the llama.cpp side a couple of months ago in ggerganov/llama.cpp#5796. However, they do have a fallback to use the old method if the new one returns null. I think that's what we're missing; I'll look into adding it now.
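Something along these lines, a sketch of that fallback combining the two snippets quoted earlier in this thread (not necessarily the exact fix that will land):

private float[] GetEmbeddingsArray()
{
    // Prefer the pooled per-sequence embeddings, which models with a pooling
    // layer (such as all-MiniLM) produce...
    var seq = NativeApi.llama_get_embeddings_seq(Context.NativeHandle, LLamaSeqId.Zero);
    if (seq.Length > 0)
        return seq.ToArray();

    // ...and fall back to the whole-context embeddings that generative models
    // without a pooling layer still return.
    var all = NativeApi.llama_get_embeddings(Context.NativeHandle);
    if (all == null)
        return Array.Empty<float>();

    return all.ToArray();
}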

Quoting that PR:

fix llama_get_embeddings_ith() to return token embeddings
add llama_get_embeddings_seq() to return sequence embeddings

This is interesting, and I'm not exactly sure what that means. For example, if you input an entire sentence and then use llama_get_embeddings_ith, are you just going to get the embedding for that one single token? If so, I think that's not what people would expect! It might need some further investigation and a clearer API in the future.
