January 2025 Update #1036

Merged
merged 25 commits on Jan 21, 2025
Changes from 1 commit
Commits
25 commits
ac29c34
code changes for december update (not working yet)
martindevans Dec 3, 2024
e4f4fed
Changes to support up to https://github.com/ggerganov/llama.cpp/commi…
martindevans Dec 20, 2024
c90ddd9
Updated to latest llama.cpp binaries, this works on Windows CPU but n…
martindevans Dec 27, 2024
c27cfde
Updated to latest deps, fixed kernel memory failing to load
martindevans Jan 4, 2025
a5c9759
Copy missing Mac libraries libggml-base and libggml-cpu
SignalRT Jan 4, 2025
34198f9
Removed any mention of AVX in MacOS loading
martindevans Jan 4, 2025
3d93174
Added file copying for some more targets (still missing macos)
martindevans Jan 11, 2025
0647df9
Updated to latest set of binaries
martindevans Jan 11, 2025
756a88f
Fixed copy path for CUDA12 DLLs
martindevans Jan 11, 2025
4950e0d
Compatibility with llama.cpp backend split (PR #10256) on all platforms
m0nsky Jan 17, 2025
40a8c6c
Restore original comment
m0nsky Jan 17, 2025
3521b27
Merge pull request #5 from m0nsky/wip_december_update_fixes
martindevans Jan 17, 2025
dc3dff1
Update the dependency loader for ggml-metal and ggml-blas
m0nsky Jan 18, 2025
7b558ce
Update the runtime targets for ggml-metal and ggml-blas
m0nsky Jan 18, 2025
6d0b421
Add CPU backend (fallback) dependency for the GPU backends
m0nsky Jan 18, 2025
4dbdc82
Fix icons for the nuget backends
m0nsky Jan 18, 2025
556a7c1
Update nuspec files for the GPU backends
m0nsky Jan 19, 2025
f526cbe
Update BinaryReleaseId
m0nsky Jan 19, 2025
91effe9
Update nuspec for CPU & OSX
m0nsky Jan 19, 2025
695a4da
Merge pull request #6 from m0nsky/wip_december_update_fixes_v2
martindevans Jan 19, 2025
3be20b1
Update CPU nuspec to use noavx folder
m0nsky Jan 19, 2025
686627c
Update Runtime.targets to use noavx folder
m0nsky Jan 19, 2025
1913966
Update BinaryReleaseId
m0nsky Jan 19, 2025
014ef78
CUDA & Vulkan native libraries now correctly store the detected or us…
m0nsky Jan 20, 2025
830a078
Merge pull request #7 from m0nsky/wip_december_update_fixes_v3
martindevans Jan 20, 2025
Updated to latest llama.cpp binaries, this works on Windows CPU but needs more changes for other backends
martindevans committed Dec 27, 2024
commit c90ddd989a47eb0c36f5e49c8d9f415b7681056d
6 changes: 2 additions & 4 deletions LLama.Examples/Program.cs
@@ -1,6 +1,5 @@
using LLama.Native;
using Spectre.Console;
using System.Runtime.InteropServices;

AnsiConsole.MarkupLineInterpolated(
$"""
@@ -30,9 +29,8 @@ __ __ ____ __
// Configure native library to use. This must be done before any other llama.cpp methods are called!
NativeLibraryConfig
.All
.WithCuda(false)
.WithVulkan(false)
.DryRun(out var loadedllamaLibrary, out var loadedLLavaLibrary);
.WithCuda()
.WithVulkan();

// Calling this method forces loading to occur now.
NativeApi.llama_empty_call();
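For reference, the updated example from this hunk boils down to the following minimal sketch; the comments are added here, and whether CUDA or Vulkan is actually used still depends on what the loader detects at runtime:

using LLama.Native;

// Configure native library loading. This must happen before any other llama.cpp call.
// WithCuda()/WithVulkan() now simply enable those backends; the DryRun overload used
// previously has been dropped from the example.
NativeLibraryConfig
    .All
    .WithCuda()
    .WithVulkan();

// Force the native library to load now rather than on first use.
NativeApi.llama_empty_call();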
5 changes: 4 additions & 1 deletion LLama.Web/Common/ModelOptions.cs
@@ -24,7 +24,7 @@ public class ModelOptions
public int MainGpu { get; set; } = 0;

/// <inheritdoc />
public GPUSplitMode SplitMode { get; set; } = GPUSplitMode.None;
public GPUSplitMode? SplitMode { get; set; }

/// <inheritdoc />
public int GpuLayerCount { get; set; } = 20;
@@ -59,6 +59,9 @@ public class ModelOptions
/// <inheritdoc />
public TensorSplitsCollection TensorSplits { get; set; } = new();

/// <inheritdoc />
public bool CheckTensors { get; }

/// <inheritdoc />
public List<MetadataOverride> MetadataOverrides { get; } = new();

7 changes: 6 additions & 1 deletion LLama/Abstractions/IModelParams.cs
@@ -36,7 +36,7 @@ public interface IModelParams
/// <summary>
/// How to split the model across multiple GPUs
/// </summary>
GPUSplitMode SplitMode { get; }
GPUSplitMode? SplitMode { get; }

/// <summary>
/// Number of layers to run in VRAM / GPU memory (n_gpu_layers)
@@ -68,6 +68,11 @@ public interface IModelParams
/// </summary>
bool VocabOnly { get; }

/// <summary>
/// Validate model tensor data before loading
/// </summary>
bool CheckTensors { get; }

/// <summary>
/// Override specific metadata items in the model
/// </summary>
5 changes: 4 additions & 1 deletion LLama/Common/ModelParams.cs
@@ -19,7 +19,7 @@ public record ModelParams
public int MainGpu { get; set; } = 0;

/// <inheritdoc />
public GPUSplitMode SplitMode { get; set; } = GPUSplitMode.None;
public GPUSplitMode? SplitMode { get; set; }

/// <inheritdoc />
public int GpuLayerCount { get; set; } = 20;
@@ -54,6 +54,9 @@ public record ModelParams
/// <inheritdoc />
public TensorSplitsCollection TensorSplits { get; set; } = new();

/// <inheritdoc />
public bool CheckTensors { get; }

/// <inheritdoc />
public List<MetadataOverride> MetadataOverrides { get; set; } = new();

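To illustrate the nullable SplitMode from the caller's side, a small sketch: the model path is a placeholder, and GPUSplitMode.Layer is assumed from the existing enum rather than something changed in this commit.

using LLama.Common;
using LLama.Native;

// Leaving SplitMode unset (null) now defers to llama.cpp's own default
// instead of always forcing GPUSplitMode.None.
var defaults = new ModelParams("model.gguf")
{
    GpuLayerCount = 20,
};

// Explicitly request splitting layers across GPUs (assumed enum member).
var layerSplit = new ModelParams("model.gguf")
{
    GpuLayerCount = 20,
    SplitMode = GPUSplitMode.Layer,
};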
6 changes: 4 additions & 2 deletions LLama/Extensions/IModelParamsExtensions.cs
@@ -30,13 +30,15 @@ public static IDisposable ToLlamaModelParams(this IModelParams @params, out LLam

result = LLamaModelParams.Default();

result.devices = IntPtr.Zero;
result.main_gpu = @params.MainGpu;
result.split_mode = @params.SplitMode;
result.n_gpu_layers = @params.GpuLayerCount < 0 ? int.MaxValue : @params.GpuLayerCount;
if (@params.SplitMode.HasValue)
result.split_mode = @params.SplitMode.Value;

result.use_mlock = @params.UseMemoryLock;
result.use_mmap = @params.UseMemorymap;
result.vocab_only = @params.VocabOnly;
result.check_tensors = @params.CheckTensors;

unsafe
{
2 changes: 1 addition & 1 deletion LLama/Extensions/LLamaExecutorExtensions.cs
@@ -147,7 +147,7 @@ private string CreatePrompt(IList<ChatMessage> messages)
PreventEOS = options?.AdditionalProperties?.TryGetValue(nameof(DefaultSamplingPipeline.PreventEOS), out bool eos) is true ? eos : s_defaultPipeline.PreventEOS,
PenalizeNewline = options?.AdditionalProperties?.TryGetValue(nameof(DefaultSamplingPipeline.PenalizeNewline), out bool pnl) is true ? pnl : s_defaultPipeline.PenalizeNewline,
RepeatPenalty = options?.AdditionalProperties?.TryGetValue(nameof(DefaultSamplingPipeline.RepeatPenalty), out float rp) is true ? rp : s_defaultPipeline.RepeatPenalty,
RepeatPenaltyCount = options?.AdditionalProperties?.TryGetValue(nameof(DefaultSamplingPipeline.RepeatPenaltyCount), out int rpc) is true ? rpc : s_defaultPipeline.RepeatPenaltyCount,
PenaltyCount = options?.AdditionalProperties?.TryGetValue(nameof(DefaultSamplingPipeline.PenaltyCount), out int rpc) is true ? rpc : s_defaultPipeline.PenaltyCount,
Grammar = options?.AdditionalProperties?.TryGetValue(nameof(DefaultSamplingPipeline.Grammar), out Grammar? g) is true ? g : s_defaultPipeline.Grammar,
MinKeep = options?.AdditionalProperties?.TryGetValue(nameof(DefaultSamplingPipeline.MinKeep), out int mk) is true ? mk : s_defaultPipeline.MinKeep,
MinP = options?.AdditionalProperties?.TryGetValue(nameof(DefaultSamplingPipeline.MinP), out float mp) is true ? mp : s_defaultPipeline.MinP,
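Because the lookup key changes with the property rename, callers feeding options through this extension need the new name. A hedged sketch, assuming the Microsoft.Extensions.AI ChatOptions/AdditionalPropertiesDictionary types that this code reads from:

using LLama.Sampling;
using Microsoft.Extensions.AI;

// The extension now looks up "PenaltyCount" instead of "RepeatPenaltyCount".
var options = new ChatOptions
{
    AdditionalProperties = new AdditionalPropertiesDictionary
    {
        [nameof(DefaultSamplingPipeline.PenaltyCount)] = 64, // illustrative value
    },
};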
3 changes: 0 additions & 3 deletions LLama/LLamaQuantizer.cs
@@ -106,9 +106,6 @@ private static bool ValidateFtype(LLamaFtype ftype)
case LLamaFtype.MOSTLY_IQ3_S:
case LLamaFtype.MOSTLY_IQ3_M:

case LLamaFtype.MOSTLY_Q4_0_4_4:
case LLamaFtype.MOSTLY_Q4_0_4_8:
case LLamaFtype.MOSTLY_Q4_0_8_8:
return true;

case LLamaFtype.GUESSED:
36 changes: 36 additions & 0 deletions LLama/LLamaSharp.Runtime.targets
@@ -12,6 +12,15 @@
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
<Link>runtimes/win-x64/native/noavx/ggml.dll</Link>
</None>
<None Include="$(MSBuildThisFileDirectory)runtimes/deps/ggml-base.dll">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
<Link>runtimes/win-x64/native/noavx/ggml-base.dll</Link>
</None>
<None Include="$(MSBuildThisFileDirectory)runtimes/deps/ggml-cpu.dll">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
<Link>runtimes/win-x64/native/noavx/ggml-cpu.dll</Link>
</None>

<None Include="$(MSBuildThisFileDirectory)runtimes/deps/avx/llama.dll">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
<Link>runtimes/win-x64/native/avx/llama.dll</Link>
@@ -20,22 +29,49 @@
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
<Link>runtimes/win-x64/native/avx/ggml.dll</Link>
</None>
<None Include="$(MSBuildThisFileDirectory)runtimes/deps/avx/ggml-base.dll">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
<Link>runtimes/win-x64/native/avx/ggml-base.dll</Link>
</None>
<None Include="$(MSBuildThisFileDirectory)runtimes/deps/avx/ggml-cpu.dll">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
<Link>runtimes/win-x64/native/avx/ggml-cpu.dll</Link>
</None>

<None Include="$(MSBuildThisFileDirectory)runtimes/deps/avx2/llama.dll">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
<Link>runtimes/win-x64/native/avx2/llama.dll</Link>
</None>
<None Include="$(MSBuildThisFileDirectory)runtimes/deps/avx2/ggml-base.dll">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
<Link>runtimes/win-x64/native/avx2/ggml-base.dll</Link>
</None>
<None Include="$(MSBuildThisFileDirectory)runtimes/deps/avx2/ggml.dll">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
<Link>runtimes/win-x64/native/avx2/ggml.dll</Link>
</None>
<None Include="$(MSBuildThisFileDirectory)runtimes/deps/avx2/ggml-cpu.dll">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
<Link>runtimes/win-x64/native/avx2/ggml-cpu.dll</Link>
</None>

<None Include="$(MSBuildThisFileDirectory)runtimes/deps/avx512/llama.dll">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
<Link>runtimes/win-x64/native/avx512/llama.dll</Link>
</None>
<None Include="$(MSBuildThisFileDirectory)runtimes/deps/avx512/ggml-base.dll">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
<Link>runtimes/win-x64/native/avx512/ggml-base.dll</Link>
</None>
<None Include="$(MSBuildThisFileDirectory)runtimes/deps/avx512/ggml.dll">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
<Link>runtimes/win-x64/native/avx512/ggml.dll</Link>
</None>
<None Include="$(MSBuildThisFileDirectory)runtimes/deps/avx512/ggml-cpu.dll">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
<Link>runtimes/win-x64/native/avx512/ggml-cpu.dll</Link>
</None>

<None Include="$(MSBuildThisFileDirectory)runtimes/deps/cu11.7.1/llama.dll">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
<Link>runtimes/win-x64/native/cuda11/llama.dll</Link>
2 changes: 1 addition & 1 deletion LLama/LLamaSharp.csproj
@@ -56,7 +56,7 @@
</ItemGroup>

<PropertyGroup>
<BinaryReleaseId>c9b00a70b080d</BinaryReleaseId>
<BinaryReleaseId>d79d8f39b4da6</BinaryReleaseId>
</PropertyGroup>

<PropertyGroup>
12 changes: 6 additions & 6 deletions LLama/LLamaStatelessExecutor.cs
@@ -23,14 +23,14 @@ public class StatelessExecutor
private readonly ILogger? _logger;
private readonly LLamaBatch _batch;

// LLava Section
/// <inheritdoc />
public bool IsMultiModal => false;

/// <inheritdoc />
public LLavaWeights? ClipModel { get; }
public LLavaWeights? ClipModel => default;

/// <inheritdoc />
public List<byte[]> Images { get; set; }
public List<byte[]> Images { get; }

/// <summary>
/// The context used by the executor when running the inference.
@@ -68,7 +68,7 @@ public async IAsyncEnumerable<string> InferAsync(string prompt, IInferenceParams
Context = context;

// Reset the sampling pipeline (if there is one)
inferenceParams?.SamplingPipeline?.Reset();
inferenceParams?.SamplingPipeline.Reset();

// Sanity check inference params
inferenceParams ??= new InferenceParams();
@@ -134,8 +134,8 @@ public async IAsyncEnumerable<string> InferAsync(string prompt, IInferenceParams
var n_left = n_past - tokensKeep;
var n_discard = n_left / 2;

NativeApi.llama_kv_cache_seq_rm(Context.NativeHandle, (LLamaSeqId)0, tokensKeep , tokensKeep + n_discard);
NativeApi.llama_kv_cache_seq_add(Context.NativeHandle, (LLamaSeqId)0, tokensKeep + n_discard, n_past, -n_discard);
NativeApi.llama_kv_cache_seq_rm(Context.NativeHandle, LLamaSeqId.Zero, tokensKeep , tokensKeep + n_discard);
NativeApi.llama_kv_cache_seq_add(Context.NativeHandle, LLamaSeqId.Zero, tokensKeep + n_discard, n_past, -n_discard);

n_past -= n_discard;
}
14 changes: 7 additions & 7 deletions LLama/Native/LLamaModelParams.cs
@@ -10,9 +10,9 @@ public unsafe struct LLamaModelParams
{
/// <summary>
/// NULL-terminated list of devices to use for offloading (if NULL, all available devices are used)
/// todo: add support for llama_model_params.devices
/// </summary>
//ggml_backend_dev_t* devices;
public IntPtr devices;
private IntPtr devices;

/// <summary>
/// // number of layers to store in VRAM
@@ -32,12 +32,12 @@
/// <summary>
/// how to split layers across multiple GPUs (size: <see cref="NativeApi.llama_max_devices"/>)
/// </summary>
public float* tensor_split;

/// <summary>
/// comma separated list of RPC servers to use for offloading
public float* tensor_split;
/// <summary>
/// comma separated list of RPC servers to use for offloading
/// </summary>
public byte* rpc_servers;
public byte* rpc_servers;

/// <summary>
/// called with a progress value between 0 and 1, pass NULL to disable. If the provided progress_callback
4 changes: 2 additions & 2 deletions LLama/Native/Load/NativeLibraryConfig.cs
@@ -178,15 +178,15 @@ internal Description CheckAndGatherDescription()
_avxLevel,
_allowFallback,
_skipCheck,
_searchDirectories.Concat(new[] { "./" }).ToArray()
_searchDirectories.Concat([ "./" ]).ToArray()
);
}

internal static string AvxLevelToString(AvxLevel level)
{
return level switch
{
AvxLevel.None => string.Empty,
AvxLevel.None => "noavx",
AvxLevel.Avx => "avx",
AvxLevel.Avx2 => "avx2",
AvxLevel.Avx512 => "avx512",
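Since AvxLevel.None now resolves to the noavx folder (matching the Runtime.targets changes earlier in this commit), forcing the plain CPU build would look roughly like this; WithAvx is the pre-existing configuration method, used here as an assumption rather than something touched in this hunk:

using LLama.Native;

// Force the no-AVX CPU binaries; the loader now searches the "noavx" subfolder
// rather than the bare runtimes directory.
NativeLibraryConfig
    .All
    .WithAvx(AvxLevel.None);

NativeApi.llama_empty_call();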
56 changes: 3 additions & 53 deletions LLama/Sampling/DefaultSamplingPipeline.cs
@@ -20,44 +20,6 @@ public sealed class DefaultSamplingPipeline
/// </summary>
public float RepeatPenalty { get; init; } = 1;

/// <summary>
/// Frequency penalty as described by OpenAI: https://platform.openai.com/docs/api-reference/chat/create<br />
/// Number between -2.0 and 2.0. Positive values penalize new tokens based on their existing frequency in the text
/// so far, decreasing the model's likelihood to repeat the same line verbatim.
/// </summary>
[Obsolete($"Use {nameof(FrequencyPenalty)} instead.")]
public float AlphaFrequency
{
get => _frequencyPenalty;
init
{
if (value < -2)
throw new ArgumentOutOfRangeException(nameof(value), $"{nameof(AlphaFrequency)} must be greater than -2");
if (value > 2)
throw new ArgumentOutOfRangeException(nameof(value), $"{nameof(AlphaFrequency)} must be less than 2");
_frequencyPenalty = value;
}
}

/// <summary>
/// Presence penalty as described by OpenAI: https://platform.openai.com/docs/api-reference/chat/create<br />
/// Number between -2.0 and 2.0. Positive values penalize new tokens based on whether they appear in the
/// text so far, increasing the model's likelihood to talk about new topics.
/// </summary>
[Obsolete($"Use {nameof(PresencePenalty)} instead.")]
public float AlphaPresence
{
get => _presencePenalty;
init
{
if (value < -2)
throw new ArgumentOutOfRangeException(nameof(value), $"{nameof(AlphaPresence)} must be greater than -2");
if (value > 2)
throw new ArgumentOutOfRangeException(nameof(value), $"{nameof(AlphaPresence)} must be less than 2");
_presencePenalty = value;
}
}

/// <summary>
/// Frequency penalty as described by OpenAI: https://platform.openai.com/docs/api-reference/chat/create<br />
/// Number between -2.0 and 2.0. Positive values penalize new tokens based on their existing frequency in the text
@@ -97,21 +59,15 @@ public float PresencePenalty
private readonly float _presencePenalty;

/// <summary>
/// How many tokens should be considered for penalizing repetition
/// How many tokens should be considered for penalties
/// </summary>
public int RepeatPenaltyCount { get; init; } = 64;
public int PenaltyCount { get; init; } = 64;

/// <summary>
/// Whether the newline token should be protected from being modified by penalty
/// </summary>
public bool PenalizeNewline { get; init; } = false;

/// <summary>
/// Whether the EOS token should be protected from being modified by penalty
/// </summary>
[Obsolete($"This doesn't do what the name implies. If you're sure you want to use it, use {nameof(PreventEOS)}.")]
public bool PenalizeEOS { get; init; } = false;

/// <summary>
/// Whether the EOS token should be suppressed. Setting this to 'true' prevents EOS from being sampled
/// </summary>
@@ -196,13 +152,7 @@ protected override SafeLLamaSamplerChainHandle CreateChain(SafeLLamaContextHandl
if (Grammar != null)
chain.AddGrammar(context.ModelHandle, Grammar.Gbnf, Grammar.Root);

chain.AddPenalties(
context.VocabCount,
context.ModelHandle.Tokens.EOS, context.ModelHandle.Tokens.Newline ?? 0,
RepeatPenaltyCount, RepeatPenalty,
FrequencyPenalty, PresencePenalty,
PenalizeNewline, PreventEOS
);
chain.AddPenalties(PenaltyCount, RepeatPenalty, FrequencyPenalty, PresencePenalty);

chain.AddTopK(TopK);
chain.AddTypical(TypicalP, MinKeep);
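With the obsolete aliases removed, sampling configuration goes through the single set of OpenAI-style names. A short sketch; the numeric values are purely illustrative:

using LLama.Sampling;

var pipeline = new DefaultSamplingPipeline
{
    RepeatPenalty = 1.1f,
    FrequencyPenalty = 0.1f,  // replaces the removed AlphaFrequency alias
    PresencePenalty = 0.1f,   // replaces the removed AlphaPresence alias
    PenaltyCount = 64,        // renamed from RepeatPenaltyCount
    PreventEOS = false,       // PenalizeEOS was removed in favour of this
};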
15 changes: 11 additions & 4 deletions LLama/runtimes/build/LLamaSharp.Backend.Cpu.nuspec
@@ -18,13 +18,20 @@
<files>
<file src="LLamaSharpBackend.props" target="build/netstandard2.0/LLamaSharp.Backend.Cpu.props" />

<file src="runtimes/deps/ggml.dll" target="runtimes\win-x64\native\ggml.dll" />
<file src="runtimes/deps/ggml-base.dll" target="runtimes\win-x64\native\ggml-base.dll" />
<file src="runtimes/deps/ggml-cpu.dll" target="runtimes\win-x64\native\ggml-cpu.dll" />
<file src="runtimes/deps/llama.dll" target="runtimes\win-x64\native\llama.dll" />
<file src="runtimes/deps/avx/ggml.dll" target="runtimes\win-x64\native\avx\ggml.dll" />

<file src="runtimes/deps/avx/ggml-base.dll" target="runtimes\win-x64\native\avx\ggml-base.dll" />
<file src="runtimes/deps/avx/ggml-cpu.dll" target="runtimes\win-x64\native\avx\ggml-cpu.dll" />
<file src="runtimes/deps/avx/llama.dll" target="runtimes\win-x64\native\avx\llama.dll" />
<file src="runtimes/deps/avx2/ggml.dll" target="runtimes\win-x64\native\avx2\ggml.dll" />

<file src="runtimes/deps/avx2/ggml-base.dll" target="runtimes\win-x64\native\avx2\ggml-base.dll" />
<file src="runtimes/deps/avx2/ggml-cpu.dll" target="runtimes\win-x64\native\avx2\ggml-cpu.dll" />
<file src="runtimes/deps/avx2/llama.dll" target="runtimes\win-x64\native\avx2\llama.dll" />
<file src="runtimes/deps/avx512/ggml.dll" target="runtimes\win-x64\native\avx512\ggml.dll" />

<file src="runtimes/deps/avx512/ggml-base.dll" target="runtimes\win-x64\native\avx512\ggml-base.dll" />
<file src="runtimes/deps/avx512/ggml-cpu.dll" target="runtimes\win-x64\native\avx512\ggml-cpu.dll" />
<file src="runtimes/deps/avx512/llama.dll" target="runtimes\win-x64\native\avx512\llama.dll" />

<file src="runtimes/deps/libggml.so" target="runtimes\linux-x64\native\libggml.so" />