January 2025 Update #1036

Merged
merged 25 commits on Jan 21, 2025
Changes from 1 commit
Commits
25 commits
ac29c34
code changes for december update (not working yet)
martindevans Dec 3, 2024
e4f4fed
Changes to support up to https://github.com/ggerganov/llama.cpp/commi…
martindevans Dec 20, 2024
c90ddd9
Updated to latest llama.cpp binaries, this works on Windows CPU but n…
martindevans Dec 27, 2024
c27cfde
Updated to latest deps, fixed kernel memory failing to load
martindevans Jan 4, 2025
a5c9759
Copy missing Mac libraries libggml-base and libggml-cpu
SignalRT Jan 4, 2025
34198f9
Removed any mention of AVX in MacOS loading
martindevans Jan 4, 2025
3d93174
Added file copying for some more targets (still missing macos)
martindevans Jan 11, 2025
0647df9
Updated to latest set of binaries
martindevans Jan 11, 2025
756a88f
Fixed copy path for CUDA12 DLLs
martindevans Jan 11, 2025
4950e0d
Compatibility with llama.cpp backend split (PR #10256) on all platforms
m0nsky Jan 17, 2025
40a8c6c
Restore original comment
m0nsky Jan 17, 2025
3521b27
Merge pull request #5 from m0nsky/wip_december_update_fixes
martindevans Jan 17, 2025
dc3dff1
Update the dependency loader for ggml-metal and ggml-blas
m0nsky Jan 18, 2025
7b558ce
Update the runtime targets for ggml-metal and ggml-blas
m0nsky Jan 18, 2025
6d0b421
Add CPU backend (fallback) dependency for the GPU backends
m0nsky Jan 18, 2025
4dbdc82
Fix icons for the nuget backends
m0nsky Jan 18, 2025
556a7c1
Update nuspec files for the GPU backends
m0nsky Jan 19, 2025
f526cbe
Update BinaryReleaseId
m0nsky Jan 19, 2025
91effe9
Update nuspec for CPU & OSX
m0nsky Jan 19, 2025
695a4da
Merge pull request #6 from m0nsky/wip_december_update_fixes_v2
martindevans Jan 19, 2025
3be20b1
Update CPU nuspec to use noavx folder
m0nsky Jan 19, 2025
686627c
Update Runtime.targets to use noavx folder
m0nsky Jan 19, 2025
1913966
Update BinaryReleaseId
m0nsky Jan 19, 2025
014ef78
CUDA & Vulkan native libraries now correctly store the detected or us…
m0nsky Jan 20, 2025
830a078
Merge pull request #7 from m0nsky/wip_december_update_fixes_v3
martindevans Jan 20, 2025
Updated to latest llama.cpp binaries, this works on Windows CPU but needs more changes for other backends
martindevans committed Dec 27, 2024
commit c90ddd989a47eb0c36f5e49c8d9f415b7681056d
6 changes: 2 additions & 4 deletions LLama.Examples/Program.cs
@@ -1,6 +1,5 @@
using LLama.Native;
using Spectre.Console;
using System.Runtime.InteropServices;

AnsiConsole.MarkupLineInterpolated(
$"""
@@ -30,9 +29,8 @@ __ __ ____ __
// Configure native library to use. This must be done before any other llama.cpp methods are called!
NativeLibraryConfig
.All
.WithCuda(false)
.WithVulkan(false)
.DryRun(out var loadedllamaLibrary, out var loadedLLavaLibrary);
.WithCuda()
.WithVulkan();

// Calling this method forces loading to occur now.
NativeApi.llama_empty_call();
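For reference, the updated example from this hunk boils down to the following minimal sketch; the comments are added here, and whether CUDA or Vulkan is actually used still depends on what the loader detects at runtime:

using LLama.Native;

// Configure native library loading. This must happen before any other llama.cpp call.
// WithCuda()/WithVulkan() now simply enable those backends; the DryRun overload used
// previously has been dropped from the example.
NativeLibraryConfig
    .All
    .WithCuda()
    .WithVulkan();

// Force the native library to load now rather than on first use.
NativeApi.llama_empty_call();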
5 changes: 4 additions & 1 deletion LLama.Web/Common/ModelOptions.cs
@@ -24,7 +24,7 @@ public class ModelOptions
public int MainGpu { get; set; } = 0;

/// <inheritdoc />
public GPUSplitMode SplitMode { get; set; } = GPUSplitMode.None;
public GPUSplitMode? SplitMode { get; set; }

/// <inheritdoc />
public int GpuLayerCount { get; set; } = 20;
@@ -59,6 +59,9 @@ public class ModelOptions
/// <inheritdoc />
public TensorSplitsCollection TensorSplits { get; set; } = new();

/// <inheritdoc />
public bool CheckTensors { get; }

/// <inheritdoc />
public List<MetadataOverride> MetadataOverrides { get; } = new();

7 changes: 6 additions & 1 deletion LLama/Abstractions/IModelParams.cs
@@ -36,7 +36,7 @@ public interface IModelParams
/// <summary>
/// How to split the model across multiple GPUs
/// </summary>
GPUSplitMode SplitMode { get; }
GPUSplitMode? SplitMode { get; }

/// <summary>
/// Number of layers to run in VRAM / GPU memory (n_gpu_layers)
@@ -68,6 +68,11 @@ public interface IModelParams
/// </summary>
bool VocabOnly { get; }

/// <summary>
/// Validate model tensor data before loading
/// </summary>
bool CheckTensors { get; }

/// <summary>
/// Override specific metadata items in the model
/// </summary>
5 changes: 4 additions & 1 deletion LLama/Common/ModelParams.cs
@@ -19,7 +19,7 @@ public record ModelParams
public int MainGpu { get; set; } = 0;

/// <inheritdoc />
public GPUSplitMode SplitMode { get; set; } = GPUSplitMode.None;
public GPUSplitMode? SplitMode { get; set; }

/// <inheritdoc />
public int GpuLayerCount { get; set; } = 20;
@@ -54,6 +54,9 @@ public record ModelParams
/// <inheritdoc />
public TensorSplitsCollection TensorSplits { get; set; } = new();

/// <inheritdoc />
public bool CheckTensors { get; }

/// <inheritdoc />
public List<MetadataOverride> MetadataOverrides { get; set; } = new();

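To illustrate the nullable SplitMode from the caller's side, a small sketch: the model path is a placeholder, and GPUSplitMode.Layer is assumed from the existing enum rather than something changed in this commit.

using LLama.Common;
using LLama.Native;

// Leaving SplitMode unset (null) now defers to llama.cpp's own default
// instead of always forcing GPUSplitMode.None.
var defaults = new ModelParams("model.gguf")
{
    GpuLayerCount = 20,
};

// Explicitly request splitting layers across GPUs (assumed enum member).
var layerSplit = new ModelParams("model.gguf")
{
    GpuLayerCount = 20,
    SplitMode = GPUSplitMode.Layer,
};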
6 changes: 4 additions & 2 deletions LLama/Extensions/IModelParamsExtensions.cs
@@ -30,13 +30,15 @@ public static IDisposable ToLlamaModelParams(this IModelParams @params, out LLam

result = LLamaModelParams.Default();

result.devices = IntPtr.Zero;
result.main_gpu = @params.MainGpu;
result.split_mode = @params.SplitMode;
result.n_gpu_layers = @params.GpuLayerCount < 0 ? int.MaxValue : @params.GpuLayerCount;
if (@params.SplitMode.HasValue)
result.split_mode = @params.SplitMode.Value;

result.use_mlock = @params.UseMemoryLock;
result.use_mmap = @params.UseMemorymap;
result.vocab_only = @params.VocabOnly;
result.check_tensors = @params.CheckTensors;

unsafe
{
2 changes: 1 addition & 1 deletion LLama/Extensions/LLamaExecutorExtensions.cs
@@ -147,7 +147,7 @@ private string CreatePrompt(IList<ChatMessage> messages)
PreventEOS = options?.AdditionalProperties?.TryGetValue(nameof(DefaultSamplingPipeline.PreventEOS), out bool eos) is true ? eos : s_defaultPipeline.PreventEOS,
PenalizeNewline = options?.AdditionalProperties?.TryGetValue(nameof(DefaultSamplingPipeline.PenalizeNewline), out bool pnl) is true ? pnl : s_defaultPipeline.PenalizeNewline,
RepeatPenalty = options?.AdditionalProperties?.TryGetValue(nameof(DefaultSamplingPipeline.RepeatPenalty), out float rp) is true ? rp : s_defaultPipeline.RepeatPenalty,
RepeatPenaltyCount = options?.AdditionalProperties?.TryGetValue(nameof(DefaultSamplingPipeline.RepeatPenaltyCount), out int rpc) is true ? rpc : s_defaultPipeline.RepeatPenaltyCount,
PenaltyCount = options?.AdditionalProperties?.TryGetValue(nameof(DefaultSamplingPipeline.PenaltyCount), out int rpc) is true ? rpc : s_defaultPipeline.PenaltyCount,
Grammar = options?.AdditionalProperties?.TryGetValue(nameof(DefaultSamplingPipeline.Grammar), out Grammar? g) is true ? g : s_defaultPipeline.Grammar,
MinKeep = options?.AdditionalProperties?.TryGetValue(nameof(DefaultSamplingPipeline.MinKeep), out int mk) is true ? mk : s_defaultPipeline.MinKeep,
MinP = options?.AdditionalProperties?.TryGetValue(nameof(DefaultSamplingPipeline.MinP), out float mp) is true ? mp : s_defaultPipeline.MinP,
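Because the lookup key changes with the property rename, callers feeding options through this extension need the new name. A hedged sketch, assuming the Microsoft.Extensions.AI ChatOptions/AdditionalPropertiesDictionary types that this code reads from:

using LLama.Sampling;
using Microsoft.Extensions.AI;

// The extension now looks up "PenaltyCount" instead of "RepeatPenaltyCount".
var options = new ChatOptions
{
    AdditionalProperties = new AdditionalPropertiesDictionary
    {
        [nameof(DefaultSamplingPipeline.PenaltyCount)] = 64, // illustrative value
    },
};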
3 changes: 0 additions & 3 deletions LLama/LLamaQuantizer.cs
@@ -106,9 +106,6 @@ private static bool ValidateFtype(LLamaFtype ftype)
case LLamaFtype.MOSTLY_IQ3_S:
case LLamaFtype.MOSTLY_IQ3_M:

case LLamaFtype.MOSTLY_Q4_0_4_4:
case LLamaFtype.MOSTLY_Q4_0_4_8:
case LLamaFtype.MOSTLY_Q4_0_8_8:
return true;

case LLamaFtype.GUESSED:
36 changes: 36 additions & 0 deletions LLama/LLamaSharp.Runtime.targets
@@ -12,6 +12,15 @@
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
<Link>runtimes/win-x64/native/noavx/ggml.dll</Link>
</None>
<None Include="$(MSBuildThisFileDirectory)runtimes/deps/ggml-base.dll">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
<Link>runtimes/win-x64/native/noavx/ggml-base.dll</Link>
</None>
<None Include="$(MSBuildThisFileDirectory)runtimes/deps/ggml-cpu.dll">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
<Link>runtimes/win-x64/native/noavx/ggml-cpu.dll</Link>
</None>

<None Include="$(MSBuildThisFileDirectory)runtimes/deps/avx/llama.dll">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
<Link>runtimes/win-x64/native/avx/llama.dll</Link>
@@ -20,22 +29,49 @@
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
<Link>runtimes/win-x64/native/avx/ggml.dll</Link>
</None>
<None Include="$(MSBuildThisFileDirectory)runtimes/deps/avx/ggml-base.dll">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
<Link>runtimes/win-x64/native/avx/ggml-base.dll</Link>
</None>
<None Include="$(MSBuildThisFileDirectory)runtimes/deps/avx/ggml-cpu.dll">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
<Link>runtimes/win-x64/native/avx/ggml-cpu.dll</Link>
</None>

<None Include="$(MSBuildThisFileDirectory)runtimes/deps/avx2/llama.dll">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
<Link>runtimes/win-x64/native/avx2/llama.dll</Link>
</None>
<None Include="$(MSBuildThisFileDirectory)runtimes/deps/avx2/ggml-base.dll">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
<Link>runtimes/win-x64/native/avx2/ggml-base.dll</Link>
</None>
<None Include="$(MSBuildThisFileDirectory)runtimes/deps/avx2/ggml.dll">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
<Link>runtimes/win-x64/native/avx2/ggml.dll</Link>
</None>
<None Include="$(MSBuildThisFileDirectory)runtimes/deps/avx2/ggml-cpu.dll">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
<Link>runtimes/win-x64/native/avx2/ggml-cpu.dll</Link>
</None>

<None Include="$(MSBuildThisFileDirectory)runtimes/deps/avx512/llama.dll">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
<Link>runtimes/win-x64/native/avx512/llama.dll</Link>
</None>
<None Include="$(MSBuildThisFileDirectory)runtimes/deps/avx512/ggml-base.dll">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
<Link>runtimes/win-x64/native/avx512/ggml-base.dll</Link>
</None>
<None Include="$(MSBuildThisFileDirectory)runtimes/deps/avx512/ggml.dll">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
<Link>runtimes/win-x64/native/avx512/ggml.dll</Link>
</None>
<None Include="$(MSBuildThisFileDirectory)runtimes/deps/avx512/ggml-cpu.dll">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
<Link>runtimes/win-x64/native/avx512/ggml-cpu.dll</Link>
</None>

<None Include="$(MSBuildThisFileDirectory)runtimes/deps/cu11.7.1/llama.dll">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
<Link>runtimes/win-x64/native/cuda11/llama.dll</Link>
2 changes: 1 addition & 1 deletion LLama/LLamaSharp.csproj
@@ -56,7 +56,7 @@
</ItemGroup>

<PropertyGroup>
<BinaryReleaseId>c9b00a70b080d</BinaryReleaseId>
<BinaryReleaseId>d79d8f39b4da6</BinaryReleaseId>
</PropertyGroup>

<PropertyGroup>
12 changes: 6 additions & 6 deletions LLama/LLamaStatelessExecutor.cs
@@ -23,14 +23,14 @@ public class StatelessExecutor
private readonly ILogger? _logger;
private readonly LLamaBatch _batch;

// LLava Section
/// <inheritdoc />
public bool IsMultiModal => false;

/// <inheritdoc />
public LLavaWeights? ClipModel { get; }
public LLavaWeights? ClipModel => default;

/// <inheritdoc />
public List<byte[]> Images { get; set; }
public List<byte[]> Images { get; }

/// <summary>
/// The context used by the executor when running the inference.
@@ -68,7 +68,7 @@ public async IAsyncEnumerable<string> InferAsync(string prompt, IInferenceParams
Context = context;

// Reset the sampling pipeline (if there is one)
inferenceParams?.SamplingPipeline?.Reset();
inferenceParams?.SamplingPipeline.Reset();

// Sanity check inference params
inferenceParams ??= new InferenceParams();
@@ -134,8 +134,8 @@ public async IAsyncEnumerable<string> InferAsync(string prompt, IInferenceParams
var n_left = n_past - tokensKeep;
var n_discard = n_left / 2;

NativeApi.llama_kv_cache_seq_rm(Context.NativeHandle, (LLamaSeqId)0, tokensKeep , tokensKeep + n_discard);
NativeApi.llama_kv_cache_seq_add(Context.NativeHandle, (LLamaSeqId)0, tokensKeep + n_discard, n_past, -n_discard);
NativeApi.llama_kv_cache_seq_rm(Context.NativeHandle, LLamaSeqId.Zero, tokensKeep , tokensKeep + n_discard);
NativeApi.llama_kv_cache_seq_add(Context.NativeHandle, LLamaSeqId.Zero, tokensKeep + n_discard, n_past, -n_discard);

n_past -= n_discard;
}
14 changes: 7 additions & 7 deletions LLama/Native/LLamaModelParams.cs
@@ -10,9 +10,9 @@ public unsafe struct LLamaModelParams
{
/// <summary>
/// NULL-terminated list of devices to use for offloading (if NULL, all available devices are used)
/// todo: add support for llama_model_params.devices
/// </summary>
//ggml_backend_dev_t* devices;
public IntPtr devices;
private IntPtr devices;

/// <summary>
/// // number of layers to store in VRAM
@@ -32,12 +32,12 @@
/// <summary>
/// how to split layers across multiple GPUs (size: <see cref="NativeApi.llama_max_devices"/>)
/// </summary>
public float* tensor_split;

/// <summary>
/// comma separated list of RPC servers to use for offloading
public float* tensor_split;
/// <summary>
/// comma separated list of RPC servers to use for offloading
/// </summary>
public byte* rpc_servers;
public byte* rpc_servers;

/// <summary>
/// called with a progress value between 0 and 1, pass NULL to disable. If the provided progress_callback
4 changes: 2 additions & 2 deletions LLama/Native/Load/NativeLibraryConfig.cs
@@ -178,15 +178,15 @@ internal Description CheckAndGatherDescription()
_avxLevel,
_allowFallback,
_skipCheck,
_searchDirectories.Concat(new[] { "./" }).ToArray()
_searchDirectories.Concat([ "./" ]).ToArray()
);
}

internal static string AvxLevelToString(AvxLevel level)
{
return level switch
{
AvxLevel.None => string.Empty,
AvxLevel.None => "noavx",
AvxLevel.Avx => "avx",
AvxLevel.Avx2 => "avx2",
AvxLevel.Avx512 => "avx512",
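Since AvxLevel.None now resolves to the noavx folder (matching the Runtime.targets changes earlier in this commit), forcing the plain CPU build would look roughly like this; WithAvx is the pre-existing configuration method, used here as an assumption rather than something touched in this hunk:

using LLama.Native;

// Force the no-AVX CPU binaries; the loader now searches the "noavx" subfolder
// rather than the bare runtimes directory.
NativeLibraryConfig
    .All
    .WithAvx(AvxLevel.None);

NativeApi.llama_empty_call();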
56 changes: 3 additions & 53 deletions LLama/Sampling/DefaultSamplingPipeline.cs
@@ -20,44 +20,6 @@ public sealed class DefaultSamplingPipeline
/// </summary>
public float RepeatPenalty { get; init; } = 1;

/// <summary>
/// Frequency penalty as described by OpenAI: https://platform.openai.com/docs/api-reference/chat/create<br />
/// Number between -2.0 and 2.0. Positive values penalize new tokens based on their existing frequency in the text
/// so far, decreasing the model's likelihood to repeat the same line verbatim.
/// </summary>
[Obsolete($"Use {nameof(FrequencyPenalty)} instead.")]
public float AlphaFrequency
{
get => _frequencyPenalty;
init
{
if (value < -2)
throw new ArgumentOutOfRangeException(nameof(value), $"{nameof(AlphaFrequency)} must be greater than -2");
if (value > 2)
throw new ArgumentOutOfRangeException(nameof(value), $"{nameof(AlphaFrequency)} must be less than 2");
_frequencyPenalty = value;
}
}

/// <summary>
/// Presence penalty as described by OpenAI: https://platform.openai.com/docs/api-reference/chat/create<br />
/// Number between -2.0 and 2.0. Positive values penalize new tokens based on whether they appear in the
/// text so far, increasing the model's likelihood to talk about new topics.
/// </summary>
[Obsolete($"Use {nameof(PresencePenalty)} instead.")]
public float AlphaPresence
{
get => _presencePenalty;
init
{
if (value < -2)
throw new ArgumentOutOfRangeException(nameof(value), $"{nameof(AlphaPresence)} must be greater than -2");
if (value > 2)
throw new ArgumentOutOfRangeException(nameof(value), $"{nameof(AlphaPresence)} must be less than 2");
_presencePenalty = value;
}
}

/// <summary>
/// Frequency penalty as described by OpenAI: https://platform.openai.com/docs/api-reference/chat/create<br />
/// Number between -2.0 and 2.0. Positive values penalize new tokens based on their existing frequency in the text
@@ -97,21 +59,15 @@ public float PresencePenalty
private readonly float _presencePenalty;

/// <summary>
/// How many tokens should be considered for penalizing repetition
/// How many tokens should be considered for penalties
/// </summary>
public int RepeatPenaltyCount { get; init; } = 64;
public int PenaltyCount { get; init; } = 64;

/// <summary>
/// Whether the newline token should be protected from being modified by penalty
/// </summary>
public bool PenalizeNewline { get; init; } = false;

/// <summary>
/// Whether the EOS token should be protected from being modified by penalty
/// </summary>
[Obsolete($"This doesn't do what the name implies. If you're sure you want to use it, use {nameof(PreventEOS)}.")]
public bool PenalizeEOS { get; init; } = false;

/// <summary>
/// Whether the EOS token should be suppressed. Setting this to 'true' prevents EOS from being sampled
/// </summary>
@@ -196,13 +152,7 @@ protected override SafeLLamaSamplerChainHandle CreateChain(SafeLLamaContextHandl
if (Grammar != null)
chain.AddGrammar(context.ModelHandle, Grammar.Gbnf, Grammar.Root);

chain.AddPenalties(
context.VocabCount,
context.ModelHandle.Tokens.EOS, context.ModelHandle.Tokens.Newline ?? 0,
RepeatPenaltyCount, RepeatPenalty,
FrequencyPenalty, PresencePenalty,
PenalizeNewline, PreventEOS
);
chain.AddPenalties(PenaltyCount, RepeatPenalty, FrequencyPenalty, PresencePenalty);

chain.AddTopK(TopK);
chain.AddTypical(TypicalP, MinKeep);
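With the obsolete aliases removed, sampling configuration goes through the single set of OpenAI-style names. A short sketch; the numeric values are purely illustrative:

using LLama.Sampling;

var pipeline = new DefaultSamplingPipeline
{
    RepeatPenalty = 1.1f,
    FrequencyPenalty = 0.1f,  // replaces the removed AlphaFrequency alias
    PresencePenalty = 0.1f,   // replaces the removed AlphaPresence alias
    PenaltyCount = 64,        // renamed from RepeatPenaltyCount
    PreventEOS = false,       // PenalizeEOS was removed in favour of this
};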
15 changes: 11 additions & 4 deletions LLama/runtimes/build/LLamaSharp.Backend.Cpu.nuspec
@@ -18,13 +18,20 @@
<files>
<file src="LLamaSharpBackend.props" target="build/netstandard2.0/LLamaSharp.Backend.Cpu.props" />

<file src="runtimes/deps/ggml.dll" target="runtimes\win-x64\native\ggml.dll" />
<file src="runtimes/deps/ggml-base.dll" target="runtimes\win-x64\native\ggml-base.dll" />
<file src="runtimes/deps/ggml-cpu.dll" target="runtimes\win-x64\native\ggml-cpu.dll" />
<file src="runtimes/deps/llama.dll" target="runtimes\win-x64\native\llama.dll" />
<file src="runtimes/deps/avx/ggml.dll" target="runtimes\win-x64\native\avx\ggml.dll" />

<file src="runtimes/deps/avx/ggml-base.dll" target="runtimes\win-x64\native\avx\ggml-base.dll" />
<file src="runtimes/deps/avx/ggml-cpu.dll" target="runtimes\win-x64\native\avx\ggml-cpu.dll" />
<file src="runtimes/deps/avx/llama.dll" target="runtimes\win-x64\native\avx\llama.dll" />
<file src="runtimes/deps/avx2/ggml.dll" target="runtimes\win-x64\native\avx2\ggml.dll" />

<file src="runtimes/deps/avx2/ggml-base.dll" target="runtimes\win-x64\native\avx2\ggml-base.dll" />
<file src="runtimes/deps/avx2/ggml-cpu.dll" target="runtimes\win-x64\native\avx2\ggml-cpu.dll" />
<file src="runtimes/deps/avx2/llama.dll" target="runtimes\win-x64\native\avx2\llama.dll" />
<file src="runtimes/deps/avx512/ggml.dll" target="runtimes\win-x64\native\avx512\ggml.dll" />

<file src="runtimes/deps/avx512/ggml-base.dll" target="runtimes\win-x64\native\avx512\ggml-base.dll" />
<file src="runtimes/deps/avx512/ggml-cpu.dll" target="runtimes\win-x64\native\avx512\ggml-cpu.dll" />
<file src="runtimes/deps/avx512/llama.dll" target="runtimes\win-x64\native\avx512\llama.dll" />

<file src="runtimes/deps/libggml.so" target="runtimes\linux-x64\native\libggml.so" />