Skip to content

May 2024 Binary Update (Take 2) #712

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 12 commits into from
May 12, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion LLama.Examples/Examples/BatchedExecutorGuidance.cs
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ await AnsiConsole
guidance.Prompt(g);

// Early exit if we reach the natural end of the guided sentence
if (g == model.Tokens.EOS)
if (model.Tokens.IsEndOfGeneration(g))
break;

// Update progress bar
Expand Down
4 changes: 2 additions & 2 deletions LLama.KernelMemory/LLamaSharp.KernelMemory.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
<TargetFrameworks>net6.0;net7.0;net8.0</TargetFrameworks>
<ImplicitUsings>enable</ImplicitUsings>
<Nullable>enable</Nullable>
<Version>0.11.2</Version>
<Version>0.12.0</Version>
<Authors>Xbotter</Authors>
<Company>SciSharp STACK</Company>
<GeneratePackageOnBuild>true</GeneratePackageOnBuild>
Expand All @@ -17,7 +17,7 @@
The integration of LLamaSharp and Microsoft kernel-memory. It could make it easy to support document search for LLamaSharp model inference.
</Description>
<PackageReleaseNotes>
v0.11.2 followed the updating of LLamaSharp.
v0.12.0 released with v0.12.0 of LLamaSharp.
</PackageReleaseNotes>
<PackageLicenseExpression>MIT</PackageLicenseExpression>
<PackageOutputPath>packages</PackageOutputPath>
Expand Down
4 changes: 2 additions & 2 deletions LLama.SemanticKernel/LLamaSharp.SemanticKernel.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
<ImplicitUsings>enable</ImplicitUsings>
<Nullable>enable</Nullable>

<Version>0.11.2</Version>
<Version>0.12.0</Version>
<Authors>Tim Miller, Xbotter</Authors>
<Company>SciSharp STACK</Company>
<GeneratePackageOnBuild>true</GeneratePackageOnBuild>
Expand All @@ -23,7 +23,7 @@
The integration of LLamaSharp and Microsoft semantic-kernel.
</Description>
<PackageReleaseNotes>
v0.11.2 followed the updating of LLamaSharp.
v0.12.0 released with v0.12.0 of LLamaSharp.
</PackageReleaseNotes>
<PackageLicenseExpression>MIT</PackageLicenseExpression>
<PackageOutputPath>packages</PackageOutputPath>
Expand Down
9 changes: 7 additions & 2 deletions LLama.Web/Common/ModelOptions.cs
Original file line number Diff line number Diff line change
Expand Up @@ -29,12 +29,14 @@ public class ModelOptions
/// <inheritdoc />
public int GpuLayerCount { get; set; } = 20;

public uint SeqMax { get; }
/// <inheritdoc />
public uint SeqMax { get; set; }

/// <inheritdoc />
public uint? Seed { get; set; } = 1686349486;

public bool Embeddings { get; }
/// <inheritdoc />
public bool Embeddings { get; set; }

/// <inheritdoc />
public bool UseMemorymap { get; set; } = true;
Expand Down Expand Up @@ -102,6 +104,9 @@ public class ModelOptions
/// <inheritdoc />
public bool NoKqvOffload { get; set; }

/// <inheritdoc />
public bool FlashAttention { get; set; }

/// <inheritdoc />
public Encoding Encoding { get; set; } = Encoding.UTF8;

Expand Down
7 changes: 7 additions & 0 deletions LLama/Abstractions/IContextParams.cs
Original file line number Diff line number Diff line change
Expand Up @@ -109,8 +109,15 @@
bool NoKqvOffload { get; }

/// <summary>
/// Whether to use flash attention
/// </summary>
bool FlashAttention { get; }

/// <summary>
/// defragment the KV cache if holes/size &gt; defrag_threshold, Set to &lt; 0 to disable (default)
/// defragment the KV cache if holes/size &gt; defrag_threshold, Set to <see langword="null"/> or &lt; 0 to disable (default)

Check warning on line 119 in LLama/Abstractions/IContextParams.cs

View workflow job for this annotation

GitHub Actions / Test (linux-release)

XML comment has badly formed XML -- 'Expected an end tag for element 'summary'.'

Check warning on line 119 in LLama/Abstractions/IContextParams.cs

View workflow job for this annotation

GitHub Actions / Test (windows-release)

XML comment has badly formed XML -- 'Expected an end tag for element 'summary'.'

Check warning on line 119 in LLama/Abstractions/IContextParams.cs

View workflow job for this annotation

GitHub Actions / Test (osx-release)

XML comment has badly formed XML -- 'Expected an end tag for element 'summary'.'
/// </summary>

Check warning on line 120 in LLama/Abstractions/IContextParams.cs

View workflow job for this annotation

GitHub Actions / Test (linux-release)

XML comment has badly formed XML -- 'End tag was not expected at this location.'

Check warning on line 120 in LLama/Abstractions/IContextParams.cs

View workflow job for this annotation

GitHub Actions / Test (windows-release)

XML comment has badly formed XML -- 'End tag was not expected at this location.'

Check warning on line 120 in LLama/Abstractions/IContextParams.cs

View workflow job for this annotation

GitHub Actions / Test (osx-release)

XML comment has badly formed XML -- 'End tag was not expected at this location.'
float? DefragThreshold { get; }

/// <summary>
Expand Down
24 changes: 24 additions & 0 deletions LLama/Abstractions/IModelParams.cs
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
using System.Collections.Generic;
using System.ComponentModel;
using System.Linq;
using System.Text;
using System.Text.Json;
using System.Text.Json.Serialization;
using LLama.Native;
Expand Down Expand Up @@ -241,6 +242,7 @@ public sealed record MetadataOverride
private readonly int _valueInt;
private readonly float _valueFloat;
private readonly bool _valueBool;
private readonly byte[]? _valueString;

/// <summary>
/// Create a new override for an int key
Expand Down Expand Up @@ -278,6 +280,21 @@ public MetadataOverride(string key, bool value)
Type = LLamaModelKvOverrideType.Bool;
}

/// <summary>
/// Create a new override for a string key
/// </summary>
/// <param name="key">The metadata key to override</param>
/// <param name="value">The replacement string value. Must encode to fewer than 128 UTF8 bytes,
/// since the native side stores it in a fixed 128 byte buffer which needs a null terminator.</param>
/// <exception cref="ArgumentException">Thrown if <paramref name="value"/> encodes to 128 or more UTF8 bytes</exception>
public MetadataOverride(string key, string value)
{
    Key = key;
    _valueString = Encoding.UTF8.GetBytes(value);
    Type = LLamaModelKvOverrideType.String;

    // Reject 128 bytes as well as longer values: a 128 byte payload would exactly fill the
    // native 128 byte buffer, leaving no room for a null terminator (and the previous
    // `> 128` check contradicted the "< 128" wording of the error message).
    if (_valueString.Length >= 128)
        throw new ArgumentException("Value string is too long, must be < 128 UTF8 bytes", nameof(value));
}

internal void WriteValue(ref LLamaModelMetadataOverride dest)
{
switch (Type)
Expand All @@ -291,6 +308,13 @@ internal void WriteValue(ref LLamaModelMetadataOverride dest)
case LLamaModelKvOverrideType.Bool:
dest.BoolValue = _valueBool ? -1L : 0;
break;
case LLamaModelKvOverrideType.String:
unsafe
{
fixed (byte* strValPtr = dest.StringValue)
new Span<byte>(_valueString!).CopyTo(new Span<byte>(strValPtr, 128));
}
break;
default:
throw new InvalidEnumArgumentException($"Unknown {nameof(LLamaModelKvOverrideType)} value: {Type}");
}
Expand Down
4 changes: 4 additions & 0 deletions LLama/Common/ModelParams.cs
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,10 @@ public record ModelParams
/// <inheritdoc />
public bool NoKqvOffload { get; set; }

/// <inheritdoc />
public bool FlashAttention { get; set; }

/// <inheritdoc />
public float? DefragThreshold { get; set; }

Expand Down
1 change: 1 addition & 0 deletions LLama/Extensions/IContextParamsExtensions.cs
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ public static void ToLlamaContextParams(this IContextParams @params, out LLamaCo
result.type_k = @params.TypeK ?? GGMLType.GGML_TYPE_F16;
result.type_v = @params.TypeV ?? GGMLType.GGML_TYPE_F16;
result.offload_kqv = !@params.NoKqvOffload;
result.flash_attention = @params.FlashAttention;
result.llama_pooling_type = @params.PoolingType;

result.n_threads = Threads(@params.Threads);
Expand Down
4 changes: 2 additions & 2 deletions LLama/LLamaSharp.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
<Platforms>AnyCPU;x64;Arm64</Platforms>
<AllowUnsafeBlocks>True</AllowUnsafeBlocks>

<Version>0.11.2</Version>
<Version>0.12.0</Version>
<Authors>Rinne, Martin Evans, jlsantiago and all the other contributors in https://github.com/SciSharp/LLamaSharp/graphs/contributors.</Authors>
<Company>SciSharp STACK</Company>
<GeneratePackageOnBuild>true</GeneratePackageOnBuild>
Expand All @@ -22,7 +22,7 @@
With the higher-level APIs and RAG support, it's convenient to deploy LLM (Large Language Model) in your application with LLamaSharp.
</Description>
<PackageReleaseNotes>
LLamaSharp 0.11.2 fixed the performance issue of LLaVA on GPU and improved the log suppression.
Updated llama.cpp version to include better support for LLama3 tokenization.
</PackageReleaseNotes>
<PackageLicenseExpression>MIT</PackageLicenseExpression>
<PackageOutputPath>packages</PackageOutputPath>
Expand Down
5 changes: 3 additions & 2 deletions LLama/LLamaStatelessExecutor.cs
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
using System.Collections.Generic;
using System.Linq;
using System.Runtime.CompilerServices;
using System.Text;
using System.Threading;
using LLama.Exceptions;
using LLama.Native;
Expand Down Expand Up @@ -123,8 +124,8 @@ public async IAsyncEnumerable<string> InferAsync(string prompt, IInferenceParams
);
}

// Check if this is the EOS token
if (id == _weights.Tokens.EOS)
// Check if this token should end generation
if (_weights.Tokens.IsEndOfGeneration(id))
break;

// Decode this token into text
Expand Down
10 changes: 10 additions & 0 deletions LLama/Native/LLamaContextParams.cs
Original file line number Diff line number Diff line change
Expand Up @@ -151,6 +151,16 @@ public bool offload_kqv
}
private sbyte _offload_kqv;

/// <summary>
/// whether to use flash attention
/// </summary>
public bool flash_attention
{
readonly get => Convert.ToBoolean(_flash_attention);
set => _flash_attention = Convert.ToSByte(value);
}
private sbyte _flash_attention;

//todo: implement abort callback support
/// <summary>
/// ggml_abort_callback
Expand Down
5 changes: 5 additions & 0 deletions LLama/Native/LLamaFtype.cs
Original file line number Diff line number Diff line change
Expand Up @@ -171,6 +171,11 @@ public enum LLamaFtype
/// </summary>
LLAMA_FTYPE_MOSTLY_IQ1_M = 31,

/// <summary>
/// except 1d tensors
/// </summary>
LLAMA_FTYPE_MOSTLY_BF16 = 32,

/// <summary>
/// File type was not specified
/// </summary>
Expand Down
11 changes: 11 additions & 0 deletions LLama/Native/LLamaModelMetadataOverride.cs
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,12 @@ public unsafe struct LLamaModelMetadataOverride
/// </summary>
[FieldOffset(136)]
public long BoolValue;

/// <summary>
/// Value, **must** only be used if Tag == String
/// </summary>
[FieldOffset(136)]
public fixed byte StringValue[128];
}

/// <summary>
Expand All @@ -65,4 +71,9 @@ public enum LLamaModelKvOverrideType
/// Overriding a bool value
/// </summary>
Bool = 2,

/// <summary>
/// Overriding a string value
/// </summary>
String = 3,
}
10 changes: 10 additions & 0 deletions LLama/Native/LLamaModelParams.cs
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,16 @@ public bool use_mlock
}
private sbyte _use_mlock;

/// <summary>
/// validate model tensor data
/// </summary>
public bool check_tensors
{
readonly get => Convert.ToBoolean(_check_tensors);
set => _check_tensors = Convert.ToSByte(value);
}
private sbyte _check_tensors;

/// <summary>
/// Create a LLamaModelParams with default values
/// </summary>
Expand Down
10 changes: 10 additions & 0 deletions LLama/Native/LLamaModelQuantizeParams.cs
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,16 @@ public bool pure
}
private sbyte _pure;

/// <summary>
/// quantize to the same number of shards
/// </summary>
public bool keep_split
{
    // `readonly` getter for consistency with the other bool-over-sbyte marshalling
    // properties in these native structs (e.g. flash_attention, check_tensors)
    readonly get => Convert.ToBoolean(_keep_split);
    set => _keep_split = Convert.ToSByte(value);
}
private sbyte _keep_split;

/// <summary>
/// pointer to importance matrix data
/// </summary>
Expand Down
17 changes: 17 additions & 0 deletions LLama/Native/LLamaVocabPreType.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
namespace LLama.Native;

/// <summary>
/// Pre-tokenization scheme used by a model's vocabulary (e.g. which BPE variant to apply).
/// Values mirror the native <c>llama_vocab_pre_type</c> enum and must stay in sync with it.
/// </summary>
/// <remarks>llama_vocab_pre_type</remarks>
internal enum LLamaVocabPreType
{
    LLAMA_VOCAB_PRE_TYPE_DEFAULT = 0,
    LLAMA_VOCAB_PRE_TYPE_LLAMA3 = 1,
    LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM = 2,
    LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER = 3,
    LLAMA_VOCAB_PRE_TYPE_FALCON = 4,
    LLAMA_VOCAB_PRE_TYPE_MPT = 5,
    LLAMA_VOCAB_PRE_TYPE_STARCODER = 6,
    LLAMA_VOCAB_PRE_TYPE_GPT2 = 7,
}
5 changes: 3 additions & 2 deletions LLama/Native/NativeApi.LLava.cs
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ public static unsafe partial class NativeApi
/// <param name="ctxClip">Llava Model</param>
/// <returns>True if validate successfully</returns>
[DllImport(llavaLibraryName, EntryPoint = "llava_validate_embed_size", CallingConvention = CallingConvention.Cdecl)]
[return: MarshalAs(UnmanagedType.U1)]
public static extern bool llava_validate_embed_size( SafeLLamaContextHandle ctxLlama, SafeLlavaModelHandle ctxClip);

/// <summary>
Expand Down Expand Up @@ -56,7 +57,7 @@ SafeLlavaImageEmbedHandle llava_image_embed_make_with_filename(SafeLlavaModelHan
/// <param name="embed">Embedding handle</param>
/// <returns>True on success</returns>
[DllImport(llavaLibraryName, EntryPoint = "llava_eval_image_embed", CallingConvention = CallingConvention.Cdecl)]
public static extern bool llava_eval_image_embed(SafeLLamaContextHandle ctx_llama, SafeLlavaImageEmbedHandle embed,
int n_batch, ref int n_past);
[return: MarshalAs(UnmanagedType.U1)]
public static extern bool llava_eval_image_embed(SafeLLamaContextHandle ctx_llama, SafeLlavaImageEmbedHandle embed, int n_batch, ref int n_past);

}
2 changes: 1 addition & 1 deletion LLama/Native/NativeApi.Sampling.cs
Original file line number Diff line number Diff line change
Expand Up @@ -176,7 +176,7 @@ public static void llama_sample_apply_guidance(SafeLLamaContextHandle ctx, Span<
public static extern LLamaToken llama_sample_token_greedy(SafeLLamaContextHandle ctx, ref LLamaTokenDataArrayNative candidates);

/// <summary>
/// Randomly selects a token from the candidates based on their probabilities.
/// Randomly selects a token from the candidates based on their probabilities using the RNG of ctx.
/// </summary>
/// <param name="ctx"></param>
/// <param name="candidates">Pointer to LLamaTokenDataArray</param>
Expand Down
Loading
Loading