Skip to content

Commit

Permalink
Merge pull request #712 from martindevans/may-2024-binary-update
Browse files Browse the repository at this point in the history
May 2024 Binary Update (Take 2)
  • Loading branch information
martindevans authored May 12, 2024
2 parents d8514b3 + 52e4607 commit 9a6e8b5
Show file tree
Hide file tree
Showing 60 changed files with 1,627 additions and 1,783 deletions.
2 changes: 1 addition & 1 deletion LLama.Examples/Examples/BatchedExecutorGuidance.cs
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ await AnsiConsole
guidance.Prompt(g);

// Early exit if we reach the natural end of the guided sentence
if (g == model.Tokens.EOS)
if (model.Tokens.IsEndOfGeneration(g))
break;

// Update progress bar
Expand Down
4 changes: 2 additions & 2 deletions LLama.KernelMemory/LLamaSharp.KernelMemory.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
<TargetFrameworks>net6.0;net7.0;net8.0</TargetFrameworks>
<ImplicitUsings>enable</ImplicitUsings>
<Nullable>enable</Nullable>
<Version>0.11.2</Version>
<Version>0.12.0</Version>
<Authors>Xbotter</Authors>
<Company>SciSharp STACK</Company>
<GeneratePackageOnBuild>true</GeneratePackageOnBuild>
Expand All @@ -17,7 +17,7 @@
The integration of LLamaSharp and Microsoft kernel-memory. It makes it easy to add document search to LLamaSharp model inference.
</Description>
<PackageReleaseNotes>
v0.11.2 followed the updating of LLamaSharp.
v0.12.0 released with v0.12.0 of LLamaSharp.
</PackageReleaseNotes>
<PackageLicenseExpression>MIT</PackageLicenseExpression>
<PackageOutputPath>packages</PackageOutputPath>
Expand Down
4 changes: 2 additions & 2 deletions LLama.SemanticKernel/LLamaSharp.SemanticKernel.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
<ImplicitUsings>enable</ImplicitUsings>
<Nullable>enable</Nullable>

<Version>0.11.2</Version>
<Version>0.12.0</Version>
<Authors>Tim Miller, Xbotter</Authors>
<Company>SciSharp STACK</Company>
<GeneratePackageOnBuild>true</GeneratePackageOnBuild>
Expand All @@ -23,7 +23,7 @@
The integration of LLamaSharp and Microsoft semantic-kernel.
</Description>
<PackageReleaseNotes>
v0.11.2 followed the updating of LLamaSharp.
v0.12.0 released with v0.12.0 of LLamaSharp.
</PackageReleaseNotes>
<PackageLicenseExpression>MIT</PackageLicenseExpression>
<PackageOutputPath>packages</PackageOutputPath>
Expand Down
9 changes: 7 additions & 2 deletions LLama.Web/Common/ModelOptions.cs
Original file line number Diff line number Diff line change
Expand Up @@ -29,12 +29,14 @@ public class ModelOptions
/// <inheritdoc />
public int GpuLayerCount { get; set; } = 20;

public uint SeqMax { get; }
/// <inheritdoc />
public uint SeqMax { get; set; }

/// <inheritdoc />
public uint? Seed { get; set; } = 1686349486;

public bool Embeddings { get; }
/// <inheritdoc />
public bool Embeddings { get; set; }

/// <inheritdoc />
public bool UseMemorymap { get; set; } = true;
Expand Down Expand Up @@ -102,6 +104,9 @@ public class ModelOptions
/// <inheritdoc />
public bool NoKqvOffload { get; set; }

/// <inheritdoc />
public bool FlashAttention { get; set; }

/// <inheritdoc />
public Encoding Encoding { get; set; } = Encoding.UTF8;

Expand Down
7 changes: 7 additions & 0 deletions LLama/Abstractions/IContextParams.cs
Original file line number Diff line number Diff line change
Expand Up @@ -109,7 +109,14 @@ public interface IContextParams
bool NoKqvOffload { get; }

/// <summary>
/// Whether to use flash attention
/// </summary>
bool FlashAttention { get; }

/// <summary>
/// defragment the KV cache if holes/size &gt; defrag_threshold, Set to &lt; 0 to disable (default)
/// defragment the KV cache if holes/size &gt; defrag_threshold, Set to <see langword="null"/> or &lt; 0 to disable (default)

Check warning on line 119 in LLama/Abstractions/IContextParams.cs

View workflow job for this annotation

GitHub Actions / Test (linux-release)

XML comment has badly formed XML -- 'Expected an end tag for element 'summary'.'

Check warning on line 119 in LLama/Abstractions/IContextParams.cs

View workflow job for this annotation

GitHub Actions / Test (windows-release)

XML comment has badly formed XML -- 'Expected an end tag for element 'summary'.'

Check warning on line 119 in LLama/Abstractions/IContextParams.cs

View workflow job for this annotation

GitHub Actions / Test (osx-release)

XML comment has badly formed XML -- 'Expected an end tag for element 'summary'.'
/// </summary>

Check warning on line 120 in LLama/Abstractions/IContextParams.cs

View workflow job for this annotation

GitHub Actions / Test (linux-release)

XML comment has badly formed XML -- 'End tag was not expected at this location.'

Check warning on line 120 in LLama/Abstractions/IContextParams.cs

View workflow job for this annotation

GitHub Actions / Test (windows-release)

XML comment has badly formed XML -- 'End tag was not expected at this location.'

Check warning on line 120 in LLama/Abstractions/IContextParams.cs

View workflow job for this annotation

GitHub Actions / Test (osx-release)

XML comment has badly formed XML -- 'End tag was not expected at this location.'
float? DefragThreshold { get; }

Expand Down
24 changes: 24 additions & 0 deletions LLama/Abstractions/IModelParams.cs
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
using System.Collections.Generic;
using System.ComponentModel;
using System.Linq;
using System.Text;
using System.Text.Json;
using System.Text.Json.Serialization;
using LLama.Native;
Expand Down Expand Up @@ -241,6 +242,7 @@ public sealed record MetadataOverride
private readonly int _valueInt;
private readonly float _valueFloat;
private readonly bool _valueBool;
private readonly byte[]? _valueString;

/// <summary>
/// Create a new override for an int key
Expand Down Expand Up @@ -278,6 +280,21 @@ public MetadataOverride(string key, bool value)
Type = LLamaModelKvOverrideType.Bool;
}

/// <summary>
/// Create a new override for a string key
/// </summary>
/// <param name="key">Metadata key to override</param>
/// <param name="value">Replacement value. Its UTF8 encoding must be shorter than 128 bytes.</param>
/// <exception cref="ArgumentException">Thrown if the UTF8 encoding of <paramref name="value"/> is 128 bytes or longer</exception>
public MetadataOverride(string key, string value)
{
    var encoded = Encoding.UTF8.GetBytes(value);

    // The destination (LLamaModelMetadataOverride.StringValue) is a fixed 128 byte buffer.
    // A value of exactly 128 bytes would fill it completely and leave no room for a
    // terminating zero byte, so the limit is strictly less than 128 (as the message states).
    // NOTE(review): assumes the native side reads the buffer as a NUL-terminated C string - confirm against llama.h.
    if (encoded.Length >= 128)
        throw new ArgumentException("Value string is too long, must be < 128 UTF8 bytes", nameof(value));

    Key = key;
    _valueString = encoded;
    Type = LLamaModelKvOverrideType.String;
}

internal void WriteValue(ref LLamaModelMetadataOverride dest)
{
switch (Type)
Expand All @@ -291,6 +308,13 @@ internal void WriteValue(ref LLamaModelMetadataOverride dest)
case LLamaModelKvOverrideType.Bool:
dest.BoolValue = _valueBool ? -1L : 0;
break;
case LLamaModelKvOverrideType.String:
unsafe
{
fixed (byte* strValPtr = dest.StringValue)
new Span<byte>(_valueString!).CopyTo(new Span<byte>(strValPtr, 128));
}
break;
default:
throw new InvalidEnumArgumentException($"Unknown {nameof(LLamaModelKvOverrideType)} value: {Type}");
}
Expand Down
4 changes: 4 additions & 0 deletions LLama/Common/ModelParams.cs
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,10 @@ public record ModelParams
/// <inheritdoc />
public bool NoKqvOffload { get; set; }

/// <inheritdoc />
public bool FlashAttention { get; set; }

/// <inheritdoc />
public float? DefragThreshold { get; set; }

Expand Down
1 change: 1 addition & 0 deletions LLama/Extensions/IContextParamsExtensions.cs
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ public static void ToLlamaContextParams(this IContextParams @params, out LLamaCo
result.type_k = @params.TypeK ?? GGMLType.GGML_TYPE_F16;
// TypeV must be written to type_v; assigning it to type_k would overwrite the K setting and leave V unset.
result.type_v = @params.TypeV ?? GGMLType.GGML_TYPE_F16;
result.offload_kqv = !@params.NoKqvOffload;
result.flash_attention = @params.FlashAttention;
result.llama_pooling_type = @params.PoolingType;

result.n_threads = Threads(@params.Threads);
Expand Down
4 changes: 2 additions & 2 deletions LLama/LLamaSharp.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
<Platforms>AnyCPU;x64;Arm64</Platforms>
<AllowUnsafeBlocks>True</AllowUnsafeBlocks>

<Version>0.11.2</Version>
<Version>0.12.0</Version>
<Authors>Rinne, Martin Evans, jlsantiago and all the other contributors in https://github.com/SciSharp/LLamaSharp/graphs/contributors.</Authors>
<Company>SciSharp STACK</Company>
<GeneratePackageOnBuild>true</GeneratePackageOnBuild>
Expand All @@ -22,7 +22,7 @@
With the higher-level APIs and RAG support, it's convenient to deploy LLM (Large Language Model) in your application with LLamaSharp.
</Description>
<PackageReleaseNotes>
LLamaSharp 0.11.2 fixed the performance issue of LLaVA on GPU and improved the log suppression.
Updated llama.cpp version to include better support for LLama3 tokenization.
</PackageReleaseNotes>
<PackageLicenseExpression>MIT</PackageLicenseExpression>
<PackageOutputPath>packages</PackageOutputPath>
Expand Down
5 changes: 3 additions & 2 deletions LLama/LLamaStatelessExecutor.cs
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
using System.Collections.Generic;
using System.Linq;
using System.Runtime.CompilerServices;
using System.Text;
using System.Threading;
using LLama.Exceptions;
using LLama.Native;
Expand Down Expand Up @@ -123,8 +124,8 @@ public async IAsyncEnumerable<string> InferAsync(string prompt, IInferenceParams
);
}

// Check if this is the EOS token
if (id == _weights.Tokens.EOS)
// Check if this token should end generation
if (_weights.Tokens.IsEndOfGeneration(id))
break;

// Decode this token into text
Expand Down
10 changes: 10 additions & 0 deletions LLama/Native/LLamaContextParams.cs
Original file line number Diff line number Diff line change
Expand Up @@ -151,6 +151,16 @@ public bool offload_kqv
}
private sbyte _offload_kqv;

/// <summary>
/// whether to use flash attention (stored in an sbyte backing field, nonzero means true)
/// </summary>
public bool flash_attention
{
    readonly get
    {
        return _flash_attention != 0;
    }
    set
    {
        _flash_attention = (sbyte)(value ? 1 : 0);
    }
}
private sbyte _flash_attention;

//todo: implement abort callback support
/// <summary>
/// ggml_abort_callback
Expand Down
5 changes: 5 additions & 0 deletions LLama/Native/LLamaFtype.cs
Original file line number Diff line number Diff line change
Expand Up @@ -171,6 +171,11 @@ public enum LLamaFtype
/// </summary>
LLAMA_FTYPE_MOSTLY_IQ1_M = 31,

/// <summary>
/// except 1d tensors
/// </summary>
LLAMA_FTYPE_MOSTLY_BF16 = 32,

/// <summary>
/// File type was not specified
/// </summary>
Expand Down
11 changes: 11 additions & 0 deletions LLama/Native/LLamaModelMetadataOverride.cs
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,12 @@ public unsafe struct LLamaModelMetadataOverride
/// </summary>
[FieldOffset(136)]
public long BoolValue;

/// <summary>
/// Value, **must** only be used if Tag == String
/// </summary>
[FieldOffset(136)]
public fixed byte StringValue[128];
}

/// <summary>
Expand All @@ -65,4 +71,9 @@ public enum LLamaModelKvOverrideType
/// Overriding a bool value
/// </summary>
Bool = 2,

/// <summary>
/// Overriding a string value
/// </summary>
String = 3,
}
10 changes: 10 additions & 0 deletions LLama/Native/LLamaModelParams.cs
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,16 @@ public bool use_mlock
}
private sbyte _use_mlock;

/// <summary>
/// validate model tensor data (stored in an sbyte backing field, nonzero means true)
/// </summary>
public bool check_tensors
{
    readonly get
    {
        return _check_tensors != 0;
    }
    set
    {
        _check_tensors = (sbyte)(value ? 1 : 0);
    }
}
private sbyte _check_tensors;

/// <summary>
/// Create a LLamaModelParams with default values
/// </summary>
Expand Down
10 changes: 10 additions & 0 deletions LLama/Native/LLamaModelQuantizeParams.cs
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,16 @@ public bool pure
}
private sbyte _pure;

/// <summary>
/// quantize to the same number of shards (stored in an sbyte backing field, nonzero means true)
/// </summary>
public bool keep_split
{
    get
    {
        return _keep_split != 0;
    }
    set
    {
        _keep_split = (sbyte)(value ? 1 : 0);
    }
}
private sbyte _keep_split;

/// <summary>
/// pointer to importance matrix data
/// </summary>
Expand Down
17 changes: 17 additions & 0 deletions LLama/Native/LLamaVocabPreType.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
namespace LLama.Native;

/// <summary>
/// Managed counterpart of the native <c>llama_vocab_pre_type</c> enum.
/// NOTE(review): presumably identifies which pre-tokenization scheme a model vocabulary uses - confirm against llama.h.
/// </summary>
/// <remarks>llama_vocab_pre_type</remarks>
internal enum LLamaVocabPreType
{
// NOTE(review): numeric values are assumed to match the native header one-to-one - re-check on every binary update.
LLAMA_VOCAB_PRE_TYPE_DEFAULT = 0,
LLAMA_VOCAB_PRE_TYPE_LLAMA3 = 1,
LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM = 2,
LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER = 3,
LLAMA_VOCAB_PRE_TYPE_FALCON = 4,
LLAMA_VOCAB_PRE_TYPE_MPT = 5,
LLAMA_VOCAB_PRE_TYPE_STARCODER = 6,
LLAMA_VOCAB_PRE_TYPE_GPT2 = 7,
}
5 changes: 3 additions & 2 deletions LLama/Native/NativeApi.LLava.cs
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ public static unsafe partial class NativeApi
/// <param name="ctxClip">Llava Model</param>
/// <returns>True if validate successfully</returns>
[DllImport(llavaLibraryName, EntryPoint = "llava_validate_embed_size", CallingConvention = CallingConvention.Cdecl)]
[return: MarshalAs(UnmanagedType.U1)]
public static extern bool llava_validate_embed_size( SafeLLamaContextHandle ctxLlama, SafeLlavaModelHandle ctxClip);

/// <summary>
Expand Down Expand Up @@ -56,7 +57,7 @@ SafeLlavaImageEmbedHandle llava_image_embed_make_with_filename(SafeLlavaModelHan
/// <param name="embed">Embedding handle</param>
/// <returns>True on success</returns>
[DllImport(llavaLibraryName, EntryPoint = "llava_eval_image_embed", CallingConvention = CallingConvention.Cdecl)]
public static extern bool llava_eval_image_embed(SafeLLamaContextHandle ctx_llama, SafeLlavaImageEmbedHandle embed,
int n_batch, ref int n_past);
[return: MarshalAs(UnmanagedType.U1)]
public static extern bool llava_eval_image_embed(SafeLLamaContextHandle ctx_llama, SafeLlavaImageEmbedHandle embed, int n_batch, ref int n_past);

}
2 changes: 1 addition & 1 deletion LLama/Native/NativeApi.Sampling.cs
Original file line number Diff line number Diff line change
Expand Up @@ -176,7 +176,7 @@ public static void llama_sample_apply_guidance(SafeLLamaContextHandle ctx, Span<
public static extern LLamaToken llama_sample_token_greedy(SafeLLamaContextHandle ctx, ref LLamaTokenDataArrayNative candidates);

/// <summary>
/// Randomly selects a token from the candidates based on their probabilities.
/// Randomly selects a token from the candidates based on their probabilities using the RNG of ctx.
/// </summary>
/// <param name="ctx"></param>
/// <param name="candidates">Pointer to LLamaTokenDataArray</param>
Expand Down
Loading

0 comments on commit 9a6e8b5

Please sign in to comment.