diff --git a/LLama.Benchmark/LLamaExecutorBenchmark/Prefill.cs b/LLama.Benchmark/LLamaExecutorBenchmark/Prefill.cs
index d7475c6e7..fca00d3e9 100644
--- a/LLama.Benchmark/LLamaExecutorBenchmark/Prefill.cs
+++ b/LLama.Benchmark/LLamaExecutorBenchmark/Prefill.cs
@@ -103,7 +103,7 @@ public void GlobalSetup()
     {
         var showLLamaCppLogs = true;
         NativeLibraryConfig
-            .Instance
+            .All
             .WithLogCallback((level, message) =>
             {
                 if (showLLamaCppLogs)
diff --git a/LLama.Experimental/LLama.Experimental.csproj b/LLama.Experimental/LLama.Experimental.csproj
index 80d84084d..e313a368f 100644
--- a/LLama.Experimental/LLama.Experimental.csproj
+++ b/LLama.Experimental/LLama.Experimental.csproj
@@ -7,7 +7,7 @@
     12
     LLama
 
-    0.12.0
+    0.13.0
     Rinne
     SciSharp STACK
     true
diff --git a/LLama.KernelMemory/LLamaSharp.KernelMemory.csproj b/LLama.KernelMemory/LLamaSharp.KernelMemory.csproj
index 5f267c0dd..240a928bf 100644
--- a/LLama.KernelMemory/LLamaSharp.KernelMemory.csproj
+++ b/LLama.KernelMemory/LLamaSharp.KernelMemory.csproj
@@ -4,7 +4,7 @@
     net6.0;net8.0
     enable
     enable
-    0.12.0
+    0.13.0
     Xbotter
     SciSharp STACK
     true
diff --git a/LLama.KernelMemory/LlamaSharpConfig.cs b/LLama.KernelMemory/LlamaSharpConfig.cs
index e5fc4bf16..78e6ba379 100644
--- a/LLama.KernelMemory/LlamaSharpConfig.cs
+++ b/LLama.KernelMemory/LlamaSharpConfig.cs
@@ -1,10 +1,5 @@
-using LLama.Common;
+using LLama.Common;
 using LLama.Native;
-using System;
-using System.Collections.Generic;
-using System.Linq;
-using System.Text;
-using System.Threading.Tasks;
 
 namespace LLamaSharp.KernelMemory
 {
diff --git a/LLama.KernelMemory/LlamaSharpTextGenerator.cs b/LLama.KernelMemory/LlamaSharpTextGenerator.cs
index e3d18b3c0..1d80d7c57 100644
--- a/LLama.KernelMemory/LlamaSharpTextGenerator.cs
+++ b/LLama.KernelMemory/LlamaSharpTextGenerator.cs
@@ -1,4 +1,4 @@
-using LLama;
+using LLama;
 using LLama.Common;
 using LLama.Native;
 using Microsoft.KernelMemory.AI;
diff --git a/LLama.SemanticKernel/LLamaSharp.SemanticKernel.csproj b/LLama.SemanticKernel/LLamaSharp.SemanticKernel.csproj
index f6f91673f..fb85285f2 100644
--- a/LLama.SemanticKernel/LLamaSharp.SemanticKernel.csproj
+++ b/LLama.SemanticKernel/LLamaSharp.SemanticKernel.csproj
@@ -10,7 +10,7 @@
     enable
     enable
 
-    0.12.0
+    0.13.0
     Tim Miller, Xbotter
     SciSharp STACK
     true
@@ -45,7 +45,7 @@
 
-
+
 
diff --git a/LLama/Abstractions/IContextParams.cs b/LLama/Abstractions/IContextParams.cs
index f93b2145b..8aa7d52b7 100644
--- a/LLama/Abstractions/IContextParams.cs
+++ b/LLama/Abstractions/IContextParams.cs
@@ -116,7 +116,6 @@ public interface IContextParams
     /// <summary>
    /// defragment the KV cache if holes/size > defrag_threshold, Set to < 0 to disable (default)
     /// defragment the KV cache if holes/size > defrag_threshold, Set to or < 0 to disable (default)
-    ///
     float? DefragThreshold { get; }
 
diff --git a/LLama/Abstractions/IModelParams.cs b/LLama/Abstractions/IModelParams.cs
index 2b1e1679d..25280e71b 100644
--- a/LLama/Abstractions/IModelParams.cs
+++ b/LLama/Abstractions/IModelParams.cs
@@ -1,4 +1,4 @@
-using System;
+using System;
 using System.Buffers;
 using System.Collections;
 using System.Collections.Generic;
diff --git a/LLama/LLamaContext.cs b/LLama/LLamaContext.cs
index b6eedfb20..05aeb5463 100644
--- a/LLama/LLamaContext.cs
+++ b/LLama/LLamaContext.cs
@@ -9,7 +9,6 @@
 using LLama.Common;
 using System.Runtime.InteropServices;
 using System.Threading.Tasks;
-using LLama.Extensions;
 using LLama.Abstractions;
 using LLama.Sampling;
 using Microsoft.Extensions.Logging;
@@ -56,20 +55,13 @@ public sealed class LLamaContext
     /// </summary>
     public Encoding Encoding { get; }
 
-    private uint _generationThreads;
-    private uint _batchThreads;
-
     /// <summary>
     /// Get or set the number of threads to use for generation
     /// </summary>
     public uint GenerationThreads
     {
-        get => _generationThreads;
-        set
-        {
-            _generationThreads = value;
-            NativeHandle.SetThreads(_generationThreads, _batchThreads);
-        }
+        get => NativeHandle.GenerationThreads;
+        set => NativeHandle.GenerationThreads = value;
     }
 
     /// <summary>
@@ -77,12 +69,8 @@ public uint GenerationThreads
     /// </summary>
     public uint BatchThreads
     {
-        get => _batchThreads;
-        set
-        {
-            _batchThreads = value;
-            NativeHandle.SetThreads(_generationThreads, _batchThreads);
-        }
+        get => NativeHandle.BatchThreads;
+        set => NativeHandle.BatchThreads = value;
     }
 
     /// <summary>
@@ -111,10 +99,6 @@ public LLamaContext(LLamaWeights model, IContextParams @params, ILogger? logger
         @params.ToLlamaContextParams(out var lparams);
 
         NativeHandle = SafeLLamaContextHandle.Create(model.NativeHandle, lparams);
-
-        // It's not possible to get these values from llama.cpp, store a copy of them here.
-        _generationThreads = lparams.n_threads;
-        _batchThreads = lparams.n_threads_batch;
     }
 
     /// <summary>
diff --git a/LLama/LLamaSharp.csproj b/LLama/LLamaSharp.csproj
index f6e18e21a..0f59807b4 100644
--- a/LLama/LLamaSharp.csproj
+++ b/LLama/LLamaSharp.csproj
@@ -7,7 +7,7 @@
     AnyCPU;x64;Arm64
     True
 
-    0.12.0
+    0.13.0
     Rinne, Martin Evans, jlsantiago and all the other contributors in https://github.com/SciSharp/LLamaSharp/graphs/contributors.
     SciSharp STACK
     true
diff --git a/LLama/Native/GPUSplitMode.cs b/LLama/Native/GPUSplitMode.cs
index 96957d0f2..54fa095c1 100644
--- a/LLama/Native/GPUSplitMode.cs
+++ b/LLama/Native/GPUSplitMode.cs
@@ -1,4 +1,4 @@
-namespace LLama.Native;
+namespace LLama.Native;
 
 /// <summary>
 ///
diff --git a/LLama/Native/LLamaModelParams.cs b/LLama/Native/LLamaModelParams.cs
index bbece4648..cf0861f56 100644
--- a/LLama/Native/LLamaModelParams.cs
+++ b/LLama/Native/LLamaModelParams.cs
@@ -1,4 +1,4 @@
-using System;
+using System;
 using System.Runtime.InteropServices;
 
 namespace LLama.Native
 {
@@ -27,7 +27,12 @@ public unsafe struct LLamaModelParams
        /// <summary>
        /// how to split layers across multiple GPUs (size: )
        /// </summary>
-        public float* tensor_split;
+        public float* tensor_split;
+
+        /// <summary>
+        /// comma separated list of RPC servers to use for offloading
+        /// </summary>
+        public byte* rpc_servers;
 
        /// <summary>
        /// called with a progress value between 0 and 1, pass NULL to disable. If the provided progress_callback
diff --git a/LLama/Native/LLamaVocabPreType.cs b/LLama/Native/LLamaVocabPreType.cs
index 0d31d4347..b22500772 100644
--- a/LLama/Native/LLamaVocabPreType.cs
+++ b/LLama/Native/LLamaVocabPreType.cs
@@ -1,4 +1,4 @@
-namespace LLama.Native;
+namespace LLama.Native;
 
 /// <summary>
 ///
@@ -6,12 +6,19 @@
 /// llama_vocab_pre_type
 internal enum LLamaVocabPreType
 {
-    LLAMA_VOCAB_PRE_TYPE_DEFAULT = 0,
-    LLAMA_VOCAB_PRE_TYPE_LLAMA3 = 1,
-    LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM = 2,
-    LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER = 3,
-    LLAMA_VOCAB_PRE_TYPE_FALCON = 4,
-    LLAMA_VOCAB_PRE_TYPE_MPT = 5,
-    LLAMA_VOCAB_PRE_TYPE_STARCODER = 6,
-    LLAMA_VOCAB_PRE_TYPE_GPT2 = 7,
+    Default = 0,
+
+    LLAMA3 = 1,
+    DEEPSEEK_LLM = 2,
+    DEEPSEEK_CODER = 3,
+    FALCON = 4,
+    MPT = 5,
+    STARCODER = 6,
+    GPT2 = 7,
+    REFACT = 8,
+    COMMAND_R = 9,
+    STABLELM2 = 10,
+    QWEN2 = 11,
+    OLMO = 12,
+    DBRX = 13,
 }
\ No newline at end of file
diff --git a/LLama/Native/Load/NativeLibraryMetadata.cs b/LLama/Native/Load/NativeLibraryMetadata.cs
index 654c9002f..66c546e6b 100644
--- a/LLama/Native/Load/NativeLibraryMetadata.cs
+++ b/LLama/Native/Load/NativeLibraryMetadata.cs
@@ -1,4 +1,4 @@
-
+
 namespace LLama.Native
 {
     ///
diff --git a/LLama/Native/SafeLLamaContextHandle.cs b/LLama/Native/SafeLLamaContextHandle.cs
index 96453cb4f..f54a8680b 100644
--- a/LLama/Native/SafeLLamaContextHandle.cs
+++ b/LLama/Native/SafeLLamaContextHandle.cs
@@ -41,6 +41,24 @@ public sealed class SafeLLamaContextHandle
     /// </summary>
     public uint UBatchSize => llama_n_ubatch(this);
 
+    /// <summary>
+    /// Get or set the number of threads used for generation of a single token.
+    /// </summary>
+    public uint GenerationThreads
+    {
+        get => llama_n_threads(this);
+        set => llama_set_n_threads(this, value, BatchThreads);
+    }
+
+    /// <summary>
+    /// Get or set the number of threads used for prompt and batch processing (multiple token).
+    /// </summary>
+    public uint BatchThreads
+    {
+        get => llama_n_threads_batch(this);
+        set => llama_set_n_threads(this, GenerationThreads, value);
+    }
+
     /// <summary>
     /// Get the model which this context is using
     /// </summary>
@@ -157,6 +175,22 @@ static SafeLLamaContextHandle()
     [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)]
     private static extern void llama_set_n_threads(SafeLLamaContextHandle ctx, uint n_threads, uint n_threads_batch);
 
+    /// <summary>
+    /// Get the number of threads used for generation of a single token.
+    /// </summary>
+    /// <param name="ctx"></param>
+    /// <returns></returns>
+    [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)]
+    private static extern uint llama_n_threads(SafeLLamaContextHandle ctx);
+
+    /// <summary>
+    /// Get the number of threads used for prompt and batch processing (multiple token).
+    /// </summary>
+    /// <param name="ctx"></param>
+    /// <returns></returns>
+    [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)]
+    private static extern uint llama_n_threads_batch(SafeLLamaContextHandle ctx);
+
     /// <summary>
     /// Token logits obtained from the last call to llama_decode
     /// The logits for the last token are stored in the last row
@@ -538,6 +572,7 @@ public void SetSeed(uint seed)
     /// </summary>
     /// n_threads is the number of threads used for generation (single token)
     /// n_threads_batch is the number of threads used for prompt and batch processing (multiple tokens)
+    [Obsolete("Use `GenerationThreads` and `BatchThreads` properties")]
    public void SetThreads(uint threads, uint threadsBatch)
    {
        llama_set_n_threads(this, threads, threadsBatch);
@@ -613,7 +648,7 @@ public int KvCacheCountTokens()
     }
 
     /// <summary>
-    /// Clear the KV cache
+    /// Clear the KV cache - both cell info is erased and KV data is zeroed
     /// </summary>
     public void KvCacheClear()
     {
diff --git a/LLama/Native/llama_vocab_pre_type.cs b/LLama/Native/llama_vocab_pre_type.cs
deleted file mode 100644
index 7f08fb35c..000000000
--- a/LLama/Native/llama_vocab_pre_type.cs
+++ /dev/null
@@ -1,27 +0,0 @@
-namespace LLama.Native;
-
-/////
-///// pre-tokenization type
-/////
-///// llama_vocab_pre_type
-//public enum llama_vocab_pre_type
-//{
-//    ///
-//    /// Default pre tokenization type
-//    ///
-//    /// LLAMA_VOCAB_PRE_TYPE_DEFAULT
-//    Default = 0,
-//
-//    LLAMA_VOCAB_PRE_TYPE_LLAMA3 = 1,
-//    LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM = 2,
-//    LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER = 3,
-//    LLAMA_VOCAB_PRE_TYPE_FALCON = 4,
-//    LLAMA_VOCAB_PRE_TYPE_MPT = 5,
-//    LLAMA_VOCAB_PRE_TYPE_STARCODER = 6,
-//    LLAMA_VOCAB_PRE_TYPE_GPT2 = 7,
-//    LLAMA_VOCAB_PRE_TYPE_REFACT = 8,
-//    LLAMA_VOCAB_PRE_TYPE_COMMAND_R = 9,
-//    LLAMA_VOCAB_PRE_TYPE_QWEN2 = 10,
-//    LLAMA_VOCAB_PRE_TYPE_OLMO = 11,
-//    LLAMA_VOCAB_PRE_TYPE_DBRX = 12,
-//}
\ No newline at end of file
diff --git a/LLama/runtimes/deps/avx/libllama.dll b/LLama/runtimes/deps/avx/libllama.dll
index 6eac478ab..da50eef3b 100644
Binary files a/LLama/runtimes/deps/avx/libllama.dll and b/LLama/runtimes/deps/avx/libllama.dll differ
diff --git a/LLama/runtimes/deps/avx/libllama.so b/LLama/runtimes/deps/avx/libllama.so
index b4d2fb9c5..367a44294 100644
Binary files a/LLama/runtimes/deps/avx/libllama.so and b/LLama/runtimes/deps/avx/libllama.so differ
diff --git a/LLama/runtimes/deps/avx/libllava_shared.so b/LLama/runtimes/deps/avx/libllava_shared.so
index d1ef24e17..4164ad326 100644
Binary files a/LLama/runtimes/deps/avx/libllava_shared.so and b/LLama/runtimes/deps/avx/libllava_shared.so differ
diff --git a/LLama/runtimes/deps/avx/llama.dll b/LLama/runtimes/deps/avx/llama.dll
index 6eac478ab..da50eef3b 100644
Binary files a/LLama/runtimes/deps/avx/llama.dll and b/LLama/runtimes/deps/avx/llama.dll differ
diff --git a/LLama/runtimes/deps/avx/llava_shared.dll b/LLama/runtimes/deps/avx/llava_shared.dll
index 5d1b67a93..c8241d338 100644
Binary files a/LLama/runtimes/deps/avx/llava_shared.dll and b/LLama/runtimes/deps/avx/llava_shared.dll differ
diff --git a/LLama/runtimes/deps/avx2/libllama.dll b/LLama/runtimes/deps/avx2/libllama.dll
index 23de7074c..62a95e4c6 100644
Binary files a/LLama/runtimes/deps/avx2/libllama.dll and b/LLama/runtimes/deps/avx2/libllama.dll differ
diff --git a/LLama/runtimes/deps/avx2/libllama.so b/LLama/runtimes/deps/avx2/libllama.so
index f3eea88b4..448f43356 100644
Binary files a/LLama/runtimes/deps/avx2/libllama.so and b/LLama/runtimes/deps/avx2/libllama.so differ
diff --git a/LLama/runtimes/deps/avx2/libllava_shared.so
b/LLama/runtimes/deps/avx2/libllava_shared.so
index 5d55bfa5d..f187a613b 100644
Binary files a/LLama/runtimes/deps/avx2/libllava_shared.so and b/LLama/runtimes/deps/avx2/libllava_shared.so differ
diff --git a/LLama/runtimes/deps/avx2/llama.dll b/LLama/runtimes/deps/avx2/llama.dll
index 23de7074c..62a95e4c6 100644
Binary files a/LLama/runtimes/deps/avx2/llama.dll and b/LLama/runtimes/deps/avx2/llama.dll differ
diff --git a/LLama/runtimes/deps/avx2/llava_shared.dll b/LLama/runtimes/deps/avx2/llava_shared.dll
index b286c4e54..d1e299a88 100644
Binary files a/LLama/runtimes/deps/avx2/llava_shared.dll and b/LLama/runtimes/deps/avx2/llava_shared.dll differ
diff --git a/LLama/runtimes/deps/avx512/libllama.dll b/LLama/runtimes/deps/avx512/libllama.dll
index d29a14f20..303abf0ee 100644
Binary files a/LLama/runtimes/deps/avx512/libllama.dll and b/LLama/runtimes/deps/avx512/libllama.dll differ
diff --git a/LLama/runtimes/deps/avx512/libllama.so b/LLama/runtimes/deps/avx512/libllama.so
index abfe110d3..a7d574697 100644
Binary files a/LLama/runtimes/deps/avx512/libllama.so and b/LLama/runtimes/deps/avx512/libllama.so differ
diff --git a/LLama/runtimes/deps/avx512/libllava_shared.so b/LLama/runtimes/deps/avx512/libllava_shared.so
index 4ff11d280..342d8b674 100644
Binary files a/LLama/runtimes/deps/avx512/libllava_shared.so and b/LLama/runtimes/deps/avx512/libllava_shared.so differ
diff --git a/LLama/runtimes/deps/avx512/llama.dll b/LLama/runtimes/deps/avx512/llama.dll
index d29a14f20..303abf0ee 100644
Binary files a/LLama/runtimes/deps/avx512/llama.dll and b/LLama/runtimes/deps/avx512/llama.dll differ
diff --git a/LLama/runtimes/deps/avx512/llava_shared.dll b/LLama/runtimes/deps/avx512/llava_shared.dll
index 088b1b8d2..3c102d9bd 100644
Binary files a/LLama/runtimes/deps/avx512/llava_shared.dll and b/LLama/runtimes/deps/avx512/llava_shared.dll differ
diff --git a/LLama/runtimes/deps/clblast/libllama.so b/LLama/runtimes/deps/clblast/libllama.so
index c3e6eb39d..ed1363fee 100644
Binary files a/LLama/runtimes/deps/clblast/libllama.so and b/LLama/runtimes/deps/clblast/libllama.so differ
diff --git a/LLama/runtimes/deps/clblast/libllava_shared.so b/LLama/runtimes/deps/clblast/libllava_shared.so
index 52b2483b2..df49fb9e9 100644
Binary files a/LLama/runtimes/deps/clblast/libllava_shared.so and b/LLama/runtimes/deps/clblast/libllava_shared.so differ
diff --git a/LLama/runtimes/deps/clblast/llama.dll b/LLama/runtimes/deps/clblast/llama.dll
index d7158fcd8..f2e0b1f6d 100644
Binary files a/LLama/runtimes/deps/clblast/llama.dll and b/LLama/runtimes/deps/clblast/llama.dll differ
diff --git a/LLama/runtimes/deps/clblast/llava_shared.dll b/LLama/runtimes/deps/clblast/llava_shared.dll
index 2eb43fd15..27a059926 100644
Binary files a/LLama/runtimes/deps/clblast/llava_shared.dll and b/LLama/runtimes/deps/clblast/llava_shared.dll differ
diff --git a/LLama/runtimes/deps/cu11.7.1/libllama.so b/LLama/runtimes/deps/cu11.7.1/libllama.so
index 955355d2e..6abe443ba 100644
Binary files a/LLama/runtimes/deps/cu11.7.1/libllama.so and b/LLama/runtimes/deps/cu11.7.1/libllama.so differ
diff --git a/LLama/runtimes/deps/cu11.7.1/libllava_shared.so b/LLama/runtimes/deps/cu11.7.1/libllava_shared.so
index a9fe23026..bca6fa5e3 100644
Binary files a/LLama/runtimes/deps/cu11.7.1/libllava_shared.so and b/LLama/runtimes/deps/cu11.7.1/libllava_shared.so differ
diff --git a/LLama/runtimes/deps/cu11.7.1/llama.dll b/LLama/runtimes/deps/cu11.7.1/llama.dll
index 0d18a43d0..8f9c8cfd3 100644
Binary files a/LLama/runtimes/deps/cu11.7.1/llama.dll and b/LLama/runtimes/deps/cu11.7.1/llama.dll differ
diff --git a/LLama/runtimes/deps/cu11.7.1/llava_shared.dll b/LLama/runtimes/deps/cu11.7.1/llava_shared.dll
index c93b5461f..e77a5a178 100644
Binary files a/LLama/runtimes/deps/cu11.7.1/llava_shared.dll and b/LLama/runtimes/deps/cu11.7.1/llava_shared.dll differ
diff --git a/LLama/runtimes/deps/cu12.1.0/libllama.so b/LLama/runtimes/deps/cu12.1.0/libllama.so
index 2f0311492..60526a4c0 100644
Binary files a/LLama/runtimes/deps/cu12.1.0/libllama.so and b/LLama/runtimes/deps/cu12.1.0/libllama.so differ
diff --git a/LLama/runtimes/deps/cu12.1.0/libllava_shared.so b/LLama/runtimes/deps/cu12.1.0/libllava_shared.so
index e7948cf79..cb18d6175 100644
Binary files a/LLama/runtimes/deps/cu12.1.0/libllava_shared.so and b/LLama/runtimes/deps/cu12.1.0/libllava_shared.so differ
diff --git a/LLama/runtimes/deps/cu12.1.0/llama.dll b/LLama/runtimes/deps/cu12.1.0/llama.dll
index ba15b7677..d41fc409b 100644
Binary files a/LLama/runtimes/deps/cu12.1.0/llama.dll and b/LLama/runtimes/deps/cu12.1.0/llama.dll differ
diff --git a/LLama/runtimes/deps/cu12.1.0/llava_shared.dll b/LLama/runtimes/deps/cu12.1.0/llava_shared.dll
index 5c5962104..e07582364 100644
Binary files a/LLama/runtimes/deps/cu12.1.0/llava_shared.dll and b/LLama/runtimes/deps/cu12.1.0/llava_shared.dll differ
diff --git a/LLama/runtimes/deps/libllama.dll b/LLama/runtimes/deps/libllama.dll
index b2d85078e..e0f45ce35 100644
Binary files a/LLama/runtimes/deps/libllama.dll and b/LLama/runtimes/deps/libllama.dll differ
diff --git a/LLama/runtimes/deps/libllama.so b/LLama/runtimes/deps/libllama.so
index 09a78f8f7..cbd91a15e 100644
Binary files a/LLama/runtimes/deps/libllama.so and b/LLama/runtimes/deps/libllama.so differ
diff --git a/LLama/runtimes/deps/libllava_shared.so b/LLama/runtimes/deps/libllava_shared.so
index 7ff06062d..58d93187f 100644
Binary files a/LLama/runtimes/deps/libllava_shared.so and b/LLama/runtimes/deps/libllava_shared.so differ
diff --git a/LLama/runtimes/deps/llama.dll b/LLama/runtimes/deps/llama.dll
index b2d85078e..e0f45ce35 100644
Binary files a/LLama/runtimes/deps/llama.dll and b/LLama/runtimes/deps/llama.dll differ
diff --git a/LLama/runtimes/deps/llava_shared.dll b/LLama/runtimes/deps/llava_shared.dll
index 43d55ab8d..a8e5e9d2e 100644
Binary files a/LLama/runtimes/deps/llava_shared.dll and b/LLama/runtimes/deps/llava_shared.dll differ
diff --git a/LLama/runtimes/deps/osx-arm64/ggml-metal.metal b/LLama/runtimes/deps/osx-arm64/ggml-metal.metal
index 46c7d5039..8ff70d7a7 100644
--- a/LLama/runtimes/deps/osx-arm64/ggml-metal.metal
+++ b/LLama/runtimes/deps/osx-arm64/ggml-metal.metal
@@ -229,6 +229,13 @@ kernel void kernel_relu(
     dst[tpig] = max(0.0f, src0[tpig]);
 }
 
+kernel void kernel_sigmoid(
+        device const float * src0,
+        device       float * dst,
+        uint tpig[[thread_position_in_grid]]) {
+    dst[tpig] = 1.0f / (1.0f + exp(-src0[tpig]));
+}
+
 kernel void kernel_tanh(
         device const float * src0,
         device       float * dst,
@@ -356,7 +363,6 @@ template
 kernel void kernel_soft_max(
         device const char * src0,
         device const char * src1,
-        device const char * src2,
         device       char * dst,
         constant int64_t & ne00,
         constant int64_t & ne01,
@@ -378,10 +384,9 @@ kernel void kernel_soft_max(
     device const float * psrc0 = (device const float *) src0 + (i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00);
     device const T * pmask = src1 != src0 ? (device const T *) src1 + i01*ne00 : nullptr;
-    device const T * ppos = src2 != src0 ?
(device const T *) src2 : nullptr; device float * pdst = (device float *) dst + (i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00); - float slope = 0.0f; + float slope = 1.0f; // ALiBi if (max_bias > 0.0f) { @@ -397,7 +402,7 @@ kernel void kernel_soft_max( float lmax = -INFINITY; for (int i00 = tpitg; i00 < ne00; i00 += ntg) { - lmax = MAX(lmax, psrc0[i00]*scale + (pmask ? pmask[i00] : 0.0f) + (ppos ? slope*ppos[i00] : 0.0f)); + lmax = MAX(lmax, psrc0[i00]*scale + (pmask ? slope*pmask[i00] : 0.0f)); } // find the max value in the block @@ -422,7 +427,7 @@ kernel void kernel_soft_max( // parallel sum float lsum = 0.0f; for (int i00 = tpitg; i00 < ne00; i00 += ntg) { - const float exp_psrc0 = exp((psrc0[i00]*scale + (pmask ? pmask[i00] : 0.0f) + (ppos ? slope*ppos[i00] : 0.0f)) - max_val); + const float exp_psrc0 = exp((psrc0[i00]*scale + (pmask ? slope*pmask[i00] : 0.0f)) - max_val); lsum += exp_psrc0; pdst[i00] = exp_psrc0; } @@ -461,7 +466,6 @@ template kernel void kernel_soft_max_4( device const char * src0, device const char * src1, - device const char * src2, device char * dst, constant int64_t & ne00, constant int64_t & ne01, @@ -483,10 +487,9 @@ kernel void kernel_soft_max_4( device const float4 * psrc4 = (device const float4 *) src0 + (i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00)/4; device const T * pmask = src1 != src0 ? (device const T *) src1 + i01*ne00/4 : nullptr; - device const T * ppos = src2 != src0 ? (device const T *) src2 : nullptr; device float4 * pdst4 = (device float4 *) dst + (i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00)/4; - float slope = 0.0f; + float slope = 1.0f; if (max_bias > 0.0f) { const int64_t h = i02; @@ -501,7 +504,7 @@ kernel void kernel_soft_max_4( float4 lmax4 = -INFINITY; for (int i00 = tpitg; i00 < ne00/4; i00 += ntg) { - lmax4 = fmax(lmax4, psrc4[i00]*scale + (float4)((pmask ? pmask[i00] : 0.0f) + (ppos ? slope*ppos[i00] : 0.0f))); + lmax4 = fmax(lmax4, psrc4[i00]*scale + (float4)((pmask ? slope*pmask[i00] : 0.0f))); } const float lmax = MAX(MAX(lmax4[0], lmax4[1]), MAX(lmax4[2], lmax4[3])); @@ -527,7 +530,7 @@ kernel void kernel_soft_max_4( // parallel sum float4 lsum4 = 0.0f; for (int i00 = tpitg; i00 < ne00/4; i00 += ntg) { - const float4 exp_psrc4 = exp((psrc4[i00]*scale + (float4)((pmask ? pmask[i00] : 0.0f) + (ppos ? slope*ppos[i00] : 0.0f))) - max_val); + const float4 exp_psrc4 = exp((psrc4[i00]*scale + (float4)((pmask ? 
slope*pmask[i00] : 0.0f))) - max_val); lsum4 += exp_psrc4; pdst4[i00] = exp_psrc4; } @@ -1595,60 +1598,6 @@ kernel void kernel_mul_mv_f16_f32_l4( } } -kernel void kernel_alibi_f32( - device const float * src0, - device float * dst, - constant int64_t & ne00, - constant int64_t & ne01, - constant int64_t & ne02, - constant int64_t & ne03, - constant uint64_t & nb00, - constant uint64_t & nb01, - constant uint64_t & nb02, - constant uint64_t & nb03, - constant int64_t & ne0, - constant int64_t & ne1, - constant int64_t & ne2, - constant int64_t & ne3, - constant uint64_t & nb0, - constant uint64_t & nb1, - constant uint64_t & nb2, - constant uint64_t & nb3, - constant float & m0, - constant float & m1, - constant int & n_heads_log2_floor, - uint3 tgpig[[threadgroup_position_in_grid]], - uint3 tpitg[[thread_position_in_threadgroup]], - uint3 ntg[[threads_per_threadgroup]]) { - const int64_t i03 = tgpig[2]; - const int64_t i02 = tgpig[1]; - const int64_t i01 = tgpig[0]; - - const int64_t n = i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00; - - const int64_t i3 = n / (ne2*ne1*ne0); - const int64_t i2 = (n - i3*ne2*ne1*ne0) / (ne1*ne0); - const int64_t i1 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0) / ne0; - //const int64_t i0 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0 - i1*ne0); - - const int64_t k = i3*ne3 + i2; - - float m_k; - if (k < n_heads_log2_floor) { - m_k = pow(m0, k + 1); - } else { - m_k = pow(m1, 2 * (k - n_heads_log2_floor) + 1); - } - - device char * dst_row = (device char *) dst + i3*nb3 + i2*nb2 + i1*nb1; - device const char * src_row = (device char *) src0 + i03*nb03 + i02*nb02 + i01*nb01; - for (int64_t i00 = tpitg.x; i00 < ne00; i00 += ntg.x) { - const float src_v = *(device float *)(src_row + i00*nb00); - device float * dst_v = (device float *)(dst_row + i00*nb0); - *dst_v = i00 * m_k + src_v; - } -} - static float rope_yarn_ramp(const float low, const float high, const int i0) { const float y = (i0 / 2 - low) / max(0.001f, high - low); return 1.0f - min(1.0f, max(0.0f, y)); @@ -1691,6 +1640,7 @@ static void rope_yarn_corr_dims( typedef void (rope_t)( device const void * src0, device const int32_t * src1, + device const float * src2, device float * dst, constant int64_t & ne00, constant int64_t & ne01, @@ -1726,6 +1676,7 @@ template kernel void kernel_rope( device const void * src0, device const int32_t * src1, + device const float * src2, device float * dst, constant int64_t & ne00, constant int64_t & ne01, @@ -1795,8 +1746,10 @@ kernel void kernel_rope( // simplified from `(ib * n_dims + ic) * inv_ndims` const float cur_rot = inv_ndims*ic - ib; + const float freq_factor = src2 != src0 ? 
src2[ic/2] : 1.0f; + + const float theta = theta_0 * pow(freq_base, cur_rot) / freq_factor; - const float theta = theta_0 * pow(freq_base, cur_rot); float cos_theta, sin_theta; rope_yarn(theta, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor, &cos_theta, &sin_theta); @@ -1903,7 +1856,10 @@ kernel void kernel_upscale_f32( constant uint64_t & nb1, constant uint64_t & nb2, constant uint64_t & nb3, - constant int32_t & sf, + constant float & sf0, + constant float & sf1, + constant float & sf2, + constant float & sf3, uint3 tgpig[[threadgroup_position_in_grid]], uint3 tpitg[[thread_position_in_threadgroup]], uint3 ntg[[threads_per_threadgroup]]) { @@ -1912,15 +1868,17 @@ kernel void kernel_upscale_f32( const int64_t i2 = tgpig.y; const int64_t i1 = tgpig.x; - const int64_t i03 = i3; - const int64_t i02 = i2; - const int64_t i01 = i1/sf; - - device const float * src0_ptr = (device const float *) (src0 + i03*nb03 + i02*nb02 + i01*nb01); - device float * dst_ptr = (device float *) (dst + i3*nb3 + i2*nb2 + i1*nb1); + const int64_t i03 = i3/sf3; + const int64_t i02 = i2/sf2; + const int64_t i01 = i1/sf1; for (int i0 = tpitg.x; i0 < ne0; i0 += ntg.x) { - dst_ptr[i0] = src0_ptr[i0/sf]; + const int64_t i00 = i0/sf0; + + device const float * src0_ptr = (device const float *) (src0 + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00); + device float * dst_ptr = (device float *) (dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); + + dst_ptr[0] = src0_ptr[0]; } } @@ -2100,29 +2058,29 @@ typedef void (flash_attn_ext_f16_t)( device const char * v, device const char * mask, device float * dst, - constant int64_t & ne00, constant int64_t & ne01, constant int64_t & ne02, constant int64_t & ne03, - constant uint64_t & nb00, constant uint64_t & nb01, constant uint64_t & nb02, constant uint64_t & nb03, - constant int64_t & ne10, constant int64_t & ne11, constant int64_t & ne12, constant int64_t & ne13, - constant uint64_t & nb10, constant uint64_t & nb11, constant uint64_t & nb12, constant uint64_t & nb13, - constant int64_t & ne31, + constant uint64_t & nb21, + constant uint64_t & nb22, + constant uint64_t & nb23, constant uint64_t & nb31, - constant int64_t & ne0, constant int64_t & ne1, constant int64_t & ne2, - constant int64_t & ne3, constant float & scale, + constant float & max_bias, + constant float & m0, + constant float & m1, + constant uint32_t & n_head_log2, threadgroup half * shared, uint3 tgpig[[threadgroup_position_in_grid]], uint3 tpitg[[thread_position_in_threadgroup]], @@ -2138,29 +2096,29 @@ kernel void kernel_flash_attn_ext_f16( device const char * v, device const char * mask, device float * dst, - constant int64_t & ne00, constant int64_t & ne01, constant int64_t & ne02, constant int64_t & ne03, - constant uint64_t & nb00, constant uint64_t & nb01, constant uint64_t & nb02, constant uint64_t & nb03, - constant int64_t & ne10, constant int64_t & ne11, constant int64_t & ne12, constant int64_t & ne13, - constant uint64_t & nb10, constant uint64_t & nb11, constant uint64_t & nb12, constant uint64_t & nb13, - constant int64_t & ne31, + constant uint64_t & nb21, + constant uint64_t & nb22, + constant uint64_t & nb23, constant uint64_t & nb31, - constant int64_t & ne0, constant int64_t & ne1, constant int64_t & ne2, - constant int64_t & ne3, constant float & scale, + constant float & max_bias, + constant float & m0, + constant float & m1, + constant uint32_t & n_head_log2, threadgroup half * shared [[threadgroup(0)]], uint3 tgpig[[threadgroup_position_in_grid]], uint3 
tpitg[[thread_position_in_threadgroup]], @@ -2225,10 +2183,6 @@ kernel void kernel_flash_attn_ext_f16( const short ne22 = ne12; const short ne23 = ne13; - const uint nb21 = nb11; - const uint nb22 = nb12; - const uint nb23 = nb13; - // broadcast const short rk2 = ne02/ne12; const short rk3 = ne03/ne13; @@ -2254,8 +2208,17 @@ kernel void kernel_flash_attn_ext_f16( // pointer to the mask device const half * mp = (device const half *) (mask + iq1*nb31); - // prepare diagonal scale matrix - simdgroup_float8x8 mscale(scale); + float slope = 1.0f; + + // ALiBi + if (max_bias > 0.0f) { + const uint32_t h = iq2; + + const float base = h < n_head_log2 ? m0 : m1; + const int exph = h < n_head_log2 ? h + 1 : 2*(h - n_head_log2) + 1; + + slope = pow(base, exph); + } // loop over the KV cache // each simdgroup handles blocks of Q rows and C columns @@ -2279,12 +2242,20 @@ kernel void kernel_flash_attn_ext_f16( simdgroup_multiply_accumulate(mqk, mq[i], mk, mqk); } - // mqk = mqk*scale + mask - simdgroup_half8x8 mm; - simdgroup_load(mm, mp + ic + 8*cc, nb31/sizeof(half), 0, false); - simdgroup_multiply_accumulate(mqk, mqk, mscale, mm); - simdgroup_store(mqk, ss + 8*cc, TF, 0, false); + + const short tx = tiisg%4; + const short ty = tiisg/4; + + if (mask != q) { + // mqk = mqk*scale + mask*slope + ss[8*cc + ty*TF + 2*tx + 0] = scale*ss[8*cc + ty*TF + 2*tx + 0] + slope*mp[ic + 8*cc + ty*nb31/sizeof(half) + 2*tx + 0]; + ss[8*cc + ty*TF + 2*tx + 1] = scale*ss[8*cc + ty*TF + 2*tx + 1] + slope*mp[ic + 8*cc + ty*nb31/sizeof(half) + 2*tx + 1]; + } else { + // mqk = mqk*scale + ss[8*cc + ty*TF + 2*tx + 0] *= scale; + ss[8*cc + ty*TF + 2*tx + 1] *= scale; + } } } @@ -2456,29 +2427,29 @@ kernel void kernel_flash_attn_ext_vec_f16( device const char * v, device const char * mask, device float * dst, - constant int64_t & ne00, constant int64_t & ne01, constant int64_t & ne02, constant int64_t & ne03, - constant uint64_t & nb00, constant uint64_t & nb01, constant uint64_t & nb02, constant uint64_t & nb03, - constant int64_t & ne10, constant int64_t & ne11, constant int64_t & ne12, constant int64_t & ne13, - constant uint64_t & nb10, constant uint64_t & nb11, constant uint64_t & nb12, constant uint64_t & nb13, - constant int64_t & ne31, + constant uint64_t & nb21, + constant uint64_t & nb22, + constant uint64_t & nb23, constant uint64_t & nb31, - constant int64_t & ne0, constant int64_t & ne1, constant int64_t & ne2, - constant int64_t & ne3, constant float & scale, + constant float & max_bias, + constant float & m0, + constant float & m1, + constant uint32_t & n_head_log2, threadgroup half * shared [[threadgroup(0)]], uint3 tgpig[[threadgroup_position_in_grid]], uint3 tpitg[[thread_position_in_threadgroup]], @@ -2497,6 +2468,18 @@ kernel void kernel_flash_attn_ext_vec_f16( const short T = D + 2*nsg*SH; // shared memory size per query in (half) + float slope = 1.0f; + + // ALiBi + if (max_bias > 0.0f) { + const uint32_t h = iq2; + + const float base = h < n_head_log2 ? m0 : m1; + const int exp = h < n_head_log2 ? 
h + 1 : 2*(h - n_head_log2) + 1; + + slope = pow(base, exp); + } + //threadgroup half * sq = (threadgroup half *) (shared + 0*D); // holds the query data threadgroup half4 * sq4 = (threadgroup half4 *) (shared + 0*D); // same as above but in half4 threadgroup float * ss = (threadgroup float *) (shared + 2*sgitg*SH + 1*D); // scratch buffer for attention and diagonal matrix @@ -2537,10 +2520,6 @@ kernel void kernel_flash_attn_ext_vec_f16( const short ne22 = ne12; const short ne23 = ne13; - const uint nb21 = nb11; - const uint nb22 = nb12; - const uint nb23 = nb13; - // broadcast const short rk2 = ne02/ne12; const short rk3 = ne03/ne13; @@ -2603,10 +2582,9 @@ kernel void kernel_flash_attn_ext_vec_f16( mqk += simd_shuffle_down(mqk, 2); mqk += simd_shuffle_down(mqk, 1); - // mqk = mqk*scale + mask + // mqk = mqk*scale + mask*slope if (tiisg == 0) { - float4 mm = (float4) mp4[ic/4 + cc]; - mqk = mqk*scale + mm; + mqk = mqk*scale + ((mask != q) ? ((float4) mp4[ic/4 + cc])*slope : (float4) 0.0f); ss4[cc] = mqk; } @@ -3408,7 +3386,6 @@ void kernel_mul_mv_q2_K_f32_impl( const int step = sizeof(block_q2_K) * nb; -#if QK_K == 256 const int ix = tiisg/8; // 0...3 const int it = tiisg%8; // 0...7 const int iq = it/4; // 0 or 1 @@ -3460,57 +3437,6 @@ void kernel_mul_mv_q2_K_f32_impl( y4 += 4 * QK_K; } -#else - const int ix = tiisg/2; // 0...15 - const int it = tiisg%2; // 0...1 - - device const float * y4 = y + ix * QK_K + 8 * it; - - for (int ib = ix; ib < nb; ib += 16) { - - float4 sumy = {0.f, 0.f, 0.f, 0.f}; - for (int i = 0; i < 8; ++i) { - yl[i+ 0] = y4[i+ 0]; sumy[0] += yl[i+ 0]; - yl[i+ 8] = y4[i+16]; sumy[1] += yl[i+ 8]; - yl[i+16] = y4[i+32]; sumy[2] += yl[i+16]; - yl[i+24] = y4[i+48]; sumy[3] += yl[i+24]; - } - - device const uint8_t * sc = (device const uint8_t *)x[ib].scales; - device const uint16_t * qs = (device const uint16_t *)x[ib].qs + 4 * it; - device const half * dh = &x[ib].d; - - for (int row = 0; row < N_DST; row++) { - - float4 acc1 = {0.f, 0.f, 0.f, 0.f}; - float4 acc2 = {0.f, 0.f, 0.f, 0.f}; - for (int i = 0; i < 8; i += 2) { - acc1[0] += yl[i+ 0] * (qs[i/2] & 0x0003); - acc2[0] += yl[i+ 1] * (qs[i/2] & 0x0300); - acc1[1] += yl[i+ 8] * (qs[i/2] & 0x000c); - acc2[1] += yl[i+ 9] * (qs[i/2] & 0x0c00); - acc1[2] += yl[i+16] * (qs[i/2] & 0x0030); - acc2[2] += yl[i+17] * (qs[i/2] & 0x3000); - acc1[3] += yl[i+24] * (qs[i/2] & 0x00c0); - acc2[3] += yl[i+25] * (qs[i/2] & 0xc000); - } - - float dall = dh[0]; - float dmin = dh[1]; - sumf[row] += dall * ((acc1[0] + 1.f/256.f * acc2[0]) * (sc[0] & 0xF) * 1.f/ 1.f + - (acc1[1] + 1.f/256.f * acc2[1]) * (sc[1] & 0xF) * 1.f/ 4.f + - (acc1[2] + 1.f/256.f * acc2[2]) * (sc[2] & 0xF) * 1.f/16.f + - (acc1[3] + 1.f/256.f * acc2[3]) * (sc[3] & 0xF) * 1.f/64.f) - - dmin * (sumy[0] * (sc[0] >> 4) + sumy[1] * (sc[1] >> 4) + sumy[2] * (sc[2] >> 4) + sumy[3] * (sc[3] >> 4)); - - qs += step/2; - sc += step; - dh += step/2; - } - - y4 += 16 * QK_K; - } -#endif for (int row = 0; row < N_DST; ++row) { all_sum = simd_sum(sumf[row]); @@ -3548,7 +3474,6 @@ kernel void kernel_mul_mv_q2_K_f32( kernel_mul_mv_q2_K_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, nullptr, tgpig, tiisg, sgitg); } -#if QK_K == 256 void kernel_mul_mv_q3_K_f32_impl( device const void * src0, device const float * src1, @@ -3707,84 +3632,6 @@ void kernel_mul_mv_q3_K_f32_impl( } } } -#else -void kernel_mul_mv_q3_K_f32_impl( - device const void * src0, - device const float * src1, - device float * dst, - constant int64_t & ne00, - constant int64_t & ne01, - constant 
int64_t & ne02, - constant int64_t & ne10, - constant int64_t & ne12, - constant int64_t & ne0, - constant int64_t & ne1, - constant uint & r2, - constant uint & r3, - threadgroup int8_t * shared_values [[threadgroup(0)]], - uint3 tgpig[[threadgroup_position_in_grid]], - uint tiisg[[thread_index_in_simdgroup]], - uint sgitg[[simdgroup_index_in_threadgroup]]) { - - const int nb = ne00/QK_K; - - const int64_t r0 = tgpig.x; - const int64_t r1 = tgpig.y; - const int64_t im = tgpig.z; - - const int row = 2 * r0 + sgitg; - - const uint i12 = im%ne12; - const uint i13 = im/ne12; - - const uint offset0 = (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02); - - device const block_q3_K * x = (device const block_q3_K *) src0 + row*nb + offset0; - device const float * yy = (device const float *) src1 + r1*ne10 + im*ne00*ne1; - - const int ix = tiisg/4; - const int il = 4 * (tiisg%4);// 0, 4, 8, 12 - const int iq = il/8; // 0, 0, 1, 1 - const int in = il%8; // 0, 4, 0, 4 - - float2 sum = {0.f, 0.f}; - - for (int i = ix; i < nb; i += 8) { - - const float d_all = (float)(x[i].d); - - device const uint16_t * q = (device const uint16_t *)(x[i].qs + il); - device const uint16_t * h = (device const uint16_t *)(x[i].hmask + in); - device const uint16_t * s = (device const uint16_t *)(x[i].scales); - device const float * y = yy + i * QK_K + il; - - const float d1 = d_all * ((int32_t)(s[0] & 0x000F) - 8); - const float d2 = d_all * ((int32_t)(s[0] & 0x00F0) - 128) * 1.f/64.f; - const float d3 = d_all * ((int32_t)(s[0] & 0x0F00) - 2048) * 1.f/4096.f; - const float d4 = d_all * ((int32_t)(s[0] & 0xF000) - 32768) * 1.f/262144.f; - - for (int l = 0; l < 4; l += 2) { - const uint16_t hm = h[l/2] >> iq; - sum[0] += y[l+ 0] * d1 * ((int32_t)(q[l/2] & 0x0003) - ((hm & 0x0001) ? 0 : 4)) - + y[l+16] * d2 * ((int32_t)(q[l/2] & 0x000c) - ((hm & 0x0004) ? 0 : 16)) - + y[l+32] * d3 * ((int32_t)(q[l/2] & 0x0030) - ((hm & 0x0010) ? 0 : 64)) - + y[l+48] * d4 * ((int32_t)(q[l/2] & 0x00c0) - ((hm & 0x0040) ? 0 : 256)); - sum[1] += y[l+ 1] * d1 * ((int32_t)(q[l/2] & 0x0300) - ((hm & 0x0100) ? 0 : 1024)) - + y[l+17] * d2 * ((int32_t)(q[l/2] & 0x0c00) - ((hm & 0x0400) ? 0 : 4096)) - + y[l+33] * d3 * ((int32_t)(q[l/2] & 0x3000) - ((hm & 0x1000) ? 0 : 16384)) - + y[l+49] * d4 * ((int32_t)(q[l/2] & 0xc000) - ((hm & 0x4000) ? 
0 : 65536)); - } - - } - const float sumf = sum[0] + sum[1] * 1.f/256.f; - - const float tot = simd_sum(sumf); - if (tiisg == 0) { - dst[r1*ne0 + im*ne0*ne1 + row] = tot; - } - -} -#endif [[host_name("kernel_mul_mv_q3_K_f32")]] kernel void kernel_mul_mv_q3_K_f32( @@ -3814,7 +3661,6 @@ kernel void kernel_mul_mv_q3_K_f32( kernel_mul_mv_q3_K_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, nullptr, tgpig, tiisg, sgitg); } -#if QK_K == 256 void kernel_mul_mv_q4_K_f32_impl( device const void * src0, device const float * src1, @@ -3928,103 +3774,6 @@ void kernel_mul_mv_q4_K_f32_impl( } } } -#else -void kernel_mul_mv_q4_K_f32_impl( - device const void * src0, - device const float * src1, - device float * dst, - constant int64_t & ne00, - constant int64_t & ne01, - constant int64_t & ne02, - constant int64_t & ne10, - constant int64_t & ne12, - constant int64_t & ne0, - constant int64_t & ne1, - constant uint & r2, - constant uint & r3, - threadgroup int8_t * shared_values [[threadgroup(0)]], - uint3 tgpig[[threadgroup_position_in_grid]], - uint tiisg[[thread_index_in_simdgroup]], - uint sgitg[[simdgroup_index_in_threadgroup]]) { - - const int ix = tiisg/4; // 0...7 - const int it = tiisg%4; // 0...3 - - const int nb = ne00/QK_K; - const int r0 = tgpig.x; - const int r1 = tgpig.y; - const int im = tgpig.z; - const int first_row = r0 * N_DST; - const int ib_row = first_row * nb; - - const uint i12 = im%ne12; - const uint i13 = im/ne12; - - const uint offset0 = (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02); - - device const block_q4_K * x = (device const block_q4_K *) src0 + ib_row + offset0; - device const float * y = (device const float *) src1 + r1*ne10 + im*ne00*ne1; - - float yl[8]; - float yh[8]; - float sumf[N_DST]={0.f}, all_sum; - - const int step = sizeof(block_q4_K) * nb / 2; - - device const float * y4 = y + ix * QK_K + 8 * it; - - uint16_t sc16[4]; - - for (int ib = ix; ib < nb; ib += 8) { - - float2 sumy = {0.f, 0.f}; - for (int i = 0; i < 8; ++i) { - yl[i] = y4[i+ 0]; sumy[0] += yl[i]; - yh[i] = y4[i+32]; sumy[1] += yh[i]; - } - - device const uint16_t * sc = (device const uint16_t *)x[ib].scales; - device const uint16_t * qs = (device const uint16_t *)x[ib].qs + 4 * it; - device const half * dh = x[ib].d; - - for (int row = 0; row < N_DST; row++) { - - sc16[0] = sc[0] & 0x000f; - sc16[1] = sc[0] & 0x0f00; - sc16[2] = sc[0] & 0x00f0; - sc16[3] = sc[0] & 0xf000; - - float2 acc1 = {0.f, 0.f}; - float2 acc2 = {0.f, 0.f}; - for (int i = 0; i < 8; i += 2) { - acc1[0] += yl[i+0] * (qs[i/2] & 0x000F); - acc1[1] += yl[i+1] * (qs[i/2] & 0x0F00); - acc2[0] += yh[i+0] * (qs[i/2] & 0x00F0); - acc2[1] += yh[i+1] * (qs[i/2] & 0xF000); - } - - float dall = dh[0]; - float dmin = dh[1]; - sumf[row] += dall * ((acc1[0] + 1.f/256.f * acc1[1]) * sc16[0] + - (acc2[0] + 1.f/256.f * acc2[1]) * sc16[1] * 1.f/4096.f) - - dmin * 1.f/16.f * (sumy[0] * sc16[2] + sumy[1] * sc16[3] * 1.f/256.f); - - qs += step; - sc += step; - dh += step; - } - - y4 += 8 * QK_K; - } - - for (int row = 0; row < N_DST; ++row) { - all_sum = simd_sum(sumf[row]); - if (tiisg == 0) { - dst[r1*ne0 + im*ne0*ne1 + first_row + row] = all_sum; - } - } -} -#endif [[host_name("kernel_mul_mv_q4_K_f32")]] kernel void kernel_mul_mv_q4_K_f32( @@ -4092,8 +3841,6 @@ void kernel_mul_mv_q5_K_f32_impl( const int step = sizeof(block_q5_K) * nb; -#if QK_K == 256 -# float yl[16], yh[16]; const uint16_t kmask1 = 0x3f3f; @@ -4176,54 +3923,6 @@ void kernel_mul_mv_q5_K_f32_impl( y1 += 4 * QK_K; } -#else - float yl[8], yh[8]; - - const 
int il = 4 * (tiisg/8); // 0, 4, 8, 12 - const int ix = tiisg%8; - const int iq = il/8; // 0, 0, 1, 1 - const int in = il%8; // 0, 4, 0, 4 - - device const float * y = yy + ix*QK_K + il; - - for (int i = ix; i < nb; i += 8) { - - for (int l = 0; l < 4; ++l) { - yl[l+0] = y[l+ 0]; - yl[l+4] = y[l+16]; - yh[l+0] = y[l+32]; - yh[l+4] = y[l+48]; - } - - device const half * dh = &x[i].d; - device const uint8_t * q = x[i].qs + il; - device const uint8_t * h = x[i].qh + in; - device const int8_t * s = x[i].scales; - - for (int row = 0; row < 2; ++row) { - - const float d = dh[0]; - - float2 acc = {0.f, 0.f}; - for (int l = 0; l < 4; ++l) { - const uint8_t hl = h[l] >> iq; - acc[0] += yl[l+0] * s[0] * ((int16_t)(q[l+ 0] & 0x0F) - (hl & 0x01 ? 0 : 16)) - + yl[l+4] * s[1] * ((int16_t)(q[l+16] & 0x0F) - (hl & 0x04 ? 0 : 16)); - acc[1] += yh[l+0] * s[2] * ((int16_t)(q[l+ 0] & 0xF0) - (hl & 0x10 ? 0 : 256)) - + yh[l+4] * s[3] * ((int16_t)(q[l+16] & 0xF0) - (hl & 0x40 ? 0 : 256)); - } - sumf[row] += d * (acc[0] + 1.f/16.f * acc[1]); - - q += step; - h += step; - s += step; - dh += step/2; - - } - - y += 8 * QK_K; - } -#endif for (int row = 0; row < 2; ++row) { const float tot = simd_sum(sumf[row]); @@ -4302,7 +4001,6 @@ void kernel_mul_mv_q6_K_f32_impl( float sumf = 0; -#if QK_K == 256 const int tid = tiisg/2; const int ix = tiisg%2; const int ip = tid/8; // 0 or 1 @@ -4338,30 +4036,6 @@ void kernel_mul_mv_q6_K_f32_impl( } -#else - const int ix = tiisg/4; - const int il = 4*(tiisg%4); - - for (int i = ix; i < nb; i += 8) { - device const float * y = yy + i * QK_K + il; - device const uint8_t * ql = x[i].ql + il; - device const uint8_t * qh = x[i].qh + il; - device const int8_t * s = x[i].scales; - - const float d = x[i].d; - - float4 sums = {0.f, 0.f, 0.f, 0.f}; - for (int l = 0; l < 4; ++l) { - sums[0] += y[l+ 0] * ((int8_t)((ql[l+ 0] & 0xF) | ((qh[l] & kmask1) << 4)) - 32); - sums[1] += y[l+16] * ((int8_t)((ql[l+16] & 0xF) | ((qh[l] & kmask2) << 2)) - 32); - sums[2] += y[l+32] * ((int8_t)((ql[l+ 0] >> 4) | ((qh[l] & kmask3) >> 0)) - 32); - sums[3] += y[l+48] * ((int8_t)((ql[l+16] >> 4) | ((qh[l] & kmask4) >> 2)) - 32); - } - sumf += d * (sums[0] * s[0] + sums[1] * s[1] + sums[2] * s[2] + sums[3] * s[3]); - } - -#endif - const float tot = simd_sum(sumf); if (tiisg == 0) { dst[r1*ne0 + im*ne0*ne1 + row] = tot; @@ -5195,9 +4869,7 @@ void kernel_mul_mv_iq1_m_f32_impl( device const float * y4 = y + 32 * ix; -#if QK_K != 64 iq1m_scale_t scale; -#endif for (int ib32 = ix; ib32 < nb32; ib32 += 32) { @@ -5218,10 +4890,7 @@ void kernel_mul_mv_iq1_m_f32_impl( device const uint16_t * sc = (device const uint16_t *)xr->scales; for (int row = 0; row < N_DST; row++) { - -#if QK_K != 64 scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000); -#endif constant uint8_t * grid1 = (constant uint8_t *)(iq1s_grid_gpu + (qs[0] | ((qh[0] << 8) & 0x700))); constant uint8_t * grid2 = (constant uint8_t *)(iq1s_grid_gpu + (qs[1] | ((qh[0] << 4) & 0x700))); @@ -5237,14 +4906,9 @@ void kernel_mul_mv_iq1_m_f32_impl( } const float delta1 = sumy[0] * (qh[0] & 0x08 ? -1 - IQ1M_DELTA : -1 + IQ1M_DELTA) + sumy[1] * (qh[0] & 0x80 ? -1 - IQ1M_DELTA : -1 + IQ1M_DELTA); const float delta2 = sumy[2] * (qh[1] & 0x08 ? -1 - IQ1M_DELTA : -1 + IQ1M_DELTA) + sumy[3] * (qh[1] & 0x80 ? 
-1 - IQ1M_DELTA : -1 + IQ1M_DELTA); -#if QK_K == 64 - const float d = (float) *((device const half *)(sc - 1)); - sumf[row] += d * ((sum[0] + delta1) * (2*((sc[0] >> (8*(ib%2)+0)) & 0xf) + 1) + - (sum[1] + delta2) * (2*((sc[0] >> (8*(ib%2)+4)) & 0xf) + 1)); -#else + sumf[row] += (float)scale.f16 * ((sum[0] + delta1) * (2*((sc[ib/2] >> (6*(ib%2)+0)) & 7) + 1) + (sum[1] + delta2) * (2*((sc[ib/2] >> (6*(ib%2)+3)) & 7) + 1)); -#endif sc += nb*sizeof(block_iq1_m)/2; qs += nb*sizeof(block_iq1_m); @@ -5356,7 +5020,6 @@ void kernel_mul_mv_iq4_nl_f32_impl( } } -#if QK_K != 64 void kernel_mul_mv_iq4_xs_f32_impl( device const void * src0, device const float * src1, @@ -5451,7 +5114,6 @@ void kernel_mul_mv_iq4_xs_f32_impl( } } } -#endif [[host_name("kernel_mul_mv_iq1_s_f32")]] kernel void kernel_mul_mv_iq1_s_f32( @@ -5564,11 +5226,7 @@ kernel void kernel_mul_mv_iq4_xs_f32( uint tiisg[[thread_index_in_simdgroup]], uint sgitg[[simdgroup_index_in_threadgroup]]) { -#if QK_K == 64 - kernel_mul_mv_iq4_nl_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, shared_values, tgpig, tiisg, sgitg); -#else kernel_mul_mv_iq4_xs_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, shared_values, tgpig, tiisg, sgitg); -#endif } //============================= templates and their specializations ============================= @@ -5694,10 +5352,9 @@ void dequantize_q2_K(device const block_q2_K *xb, short il, thread type4x4 & reg float dl, ml; uint8_t sc = xb->scales[il]; -#if QK_K == 256 q = q + 32*(il/8) + 16*(il&1); il = (il/2)%4; -#endif + half coef = il>1 ? (il>2 ? 1/64.h : 1/16.h) : (il>0 ? 1/4.h : 1.h); uchar mask = il>1 ? (il>2 ? 192 : 48) : (il>0 ? 12 : 3); dl = d * (sc & 0xF) * coef, ml = min * (sc >> 4); @@ -5713,7 +5370,6 @@ void dequantize_q3_K(device const block_q3_K *xb, short il, thread type4x4 & reg device const uint8_t * h = (device const uint8_t *)xb->hmask; device const int8_t * scales = (device const int8_t *)xb->scales; -#if QK_K == 256 q = q + 32 * (il/8) + 16 * (il&1); h = h + 16 * (il&1); uint8_t m = 1 << (il/2); @@ -5734,17 +5390,6 @@ void dequantize_q3_K(device const block_q3_K *xb, short il, thread type4x4 & reg for (int i = 0; i < 16; ++i) { reg[i/4][i%4] = dl * (q[i] & mask) - (h[i] & m ? 0 : ml); } -#else - float kcoef = il&1 ? 1.f/16.f : 1.f; - uint16_t kmask = il&1 ? 0xF0 : 0x0F; - float dl = d_all * ((scales[il/2] & kmask) * kcoef - 8); - float coef = il>1 ? (il>2 ? 1/64.h : 1/16.h) : (il>0 ? 1/4.h : 1.h); - uint8_t mask = il>1 ? (il>2 ? 192 : 48) : (il>0 ? 12 : 3); - uint8_t m = 1<<(il*2); - for (int i = 0; i < 16; ++i) { - reg[i/4][i%4] = coef * dl * ((q[i] & mask) - ((h[i%8] & (m * (1 + i/8))) ? 0 : 4.f/coef)); - } -#endif } static inline uchar2 get_scale_min_k4_just2(int j, int k, device const uchar * q) { @@ -5756,7 +5401,6 @@ template void dequantize_q4_K(device const block_q4_K *xb, short il, thread type4x4 & reg) { device const uchar * q = xb->qs; -#if QK_K == 256 short is = (il/4) * 2; q = q + (il/4) * 32 + 16 * (il&1); il = il & 3; @@ -5765,16 +5409,7 @@ void dequantize_q4_K(device const block_q4_K *xb, short il, thread type4x4 & reg const float min = xb->dmin; const float dl = d * sc[0]; const float ml = min * sc[1]; -#else - (void) get_scale_min_k4_just2; - - q = q + 16 * (il&1); - device const uint8_t * s = xb->scales; - device const half2 * dh = (device const half2 *)xb->d; - const float2 d = (float2)dh[0]; - const float dl = il<2 ? d[0] * (s[0]&0xF) : d[0] * (s[1]&0xF)/16.h; - const float ml = il<2 ? 
d[1] * (s[0]>>4) : d[1] * (s[1]>>4); -#endif + const ushort mask = il<2 ? 0x0F : 0xF0; for (int i = 0; i < 16; ++i) { reg[i/4][i%4] = dl * (q[i] & mask) - ml; @@ -5786,7 +5421,6 @@ void dequantize_q5_K(device const block_q5_K *xb, short il, thread type4x4 & reg device const uint8_t * q = xb->qs; device const uint8_t * qh = xb->qh; -#if QK_K == 256 short is = (il/4) * 2; q = q + 32 * (il/4) + 16 * (il&1); qh = qh + 16 * (il&1); @@ -5803,17 +5437,6 @@ void dequantize_q5_K(device const block_q5_K *xb, short il, thread type4x4 & reg for (int i = 0; i < 16; ++i) { reg[i/4][i%4] = dl * ((q[i] & mask) + (qh[i] & ul ? qh_val : 0)) - ml; } -#else - q = q + 16 * (il&1); - device const int8_t * s = xb->scales; - const float dl = xb->d * s[il]; - uint8_t m = 1<<(il*2); - const float coef = il<2 ? 1.f : 1.f/16.f; - const ushort mask = il<2 ? 0x0F : 0xF0; - for (int i = 0; i < 16; ++i) { - reg[i/4][i%4] = coef * dl * ((q[i] & mask) - (qh[i%8] & (m*(1+i/8)) ? 0.f : 16.f/coef)); - } -#endif } template @@ -5823,15 +5446,11 @@ void dequantize_q6_K(device const block_q6_K *xb, short il, thread type4x4 & reg device const uint8_t * qh = (device const uint8_t *)xb->qh; device const int8_t * scales = (device const int8_t *)xb->scales; -#if QK_K == 256 ql = ql + 64*(il/8) + 32*((il/2)&1) + 16*(il&1); qh = qh + 32*(il/8) + 16*(il&1); float sc = scales[(il%2) + 2 * ((il/2))]; il = (il/2) & 3; -#else - ql = ql + 16 * (il&1); - float sc = scales[il]; -#endif + const uint16_t kmask1 = il>1 ? (il>2 ? 192 : 48) : (il>0 ? 12 : 3); const uint16_t kmask2 = il>1 ? 0xF0 : 0x0F; const float coef = il>1 ? 1.f/16.f : 1.f; @@ -5988,20 +5607,15 @@ void dequantize_iq1_m(device const block_iq1_m * xb, short il, thread type4x4 & const int ib32 = il/2; il = il%2; device const uint16_t * sc = (device const uint16_t *)xb->scales; -#if QK_K == 64 - const float d = xb->d; -#else + iq1m_scale_t scale; scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000); const float d = scale.f16; -#endif + device const uint8_t * qs = xb->qs + 4*ib32 + 2*il; device const uint8_t * qh = xb->qh + 2*ib32 + il; -#if QK_K == 64 - const float dl = d * (2*((sc[ib32/2] >> (8*(ib32%2)+4*il)) & 0xf) + 1); -#else + const float dl = d * (2*((sc[ib32/2] >> (6*(ib32%2)+3*il)) & 7) + 1); -#endif const float ml1 = dl * (qh[0] & 0x08 ? -1 - IQ1M_DELTA : -1 + IQ1M_DELTA); const float ml2 = dl * (qh[0] & 0x80 ? 
-1 - IQ1M_DELTA : -1 + IQ1M_DELTA); constant uint8_t * grid1 = (constant uint8_t *)(iq1s_grid_gpu + (qs[0] | ((qh[0] << 8) & 0x700))); @@ -6031,9 +5645,6 @@ void dequantize_iq4_nl(device const block_iq4_nl * xb, short il, thread type4x4 template void dequantize_iq4_xs(device const block_iq4_xs * xb, short il, thread type4x4 & reg) { -#if QK_K == 64 - dequantize_iq4_nl(xb, il, reg); -#else // il is 0...15 for QK_K = 256 => index of block of 32 is il/2 const int ib32 = il/2; il = il%2; @@ -6050,7 +5661,6 @@ void dequantize_iq4_xs(device const block_iq4_xs * xb, short il, thread type4x4 reg[i][2] = d * kvalues_iq4nl_f[q8[2]]; reg[i][3] = d * kvalues_iq4nl_f[q8[3]]; } -#endif } template @@ -6555,11 +6165,7 @@ kernel void kernel_mul_mm_id( sgitg); } -#if QK_K == 256 #define QK_NL 16 -#else -#define QK_NL 4 -#endif // // get rows @@ -6599,11 +6205,7 @@ template [[host_name("kernel_get_rows_iq2_s")]] kernel get_rows_t kernel_get_r template [[host_name("kernel_get_rows_iq1_s")]] kernel get_rows_t kernel_get_rows; template [[host_name("kernel_get_rows_iq1_m")]] kernel get_rows_t kernel_get_rows; template [[host_name("kernel_get_rows_iq4_nl")]] kernel get_rows_t kernel_get_rows; -#if QK_K == 64 -template [[host_name("kernel_get_rows_iq4_xs")]] kernel get_rows_t kernel_get_rows; -#else template [[host_name("kernel_get_rows_iq4_xs")]] kernel get_rows_t kernel_get_rows; -#endif // // matrix-matrix multiplication @@ -6631,11 +6233,7 @@ template [[host_name("kernel_mul_mm_iq2_s_f32")]] kernel mat_mm_t kernel_mul_m template [[host_name("kernel_mul_mm_iq1_s_f32")]] kernel mat_mm_t kernel_mul_mm; template [[host_name("kernel_mul_mm_iq1_m_f32")]] kernel mat_mm_t kernel_mul_mm; template [[host_name("kernel_mul_mm_iq4_nl_f32")]] kernel mat_mm_t kernel_mul_mm; -#if QK_K == 64 -template [[host_name("kernel_mul_mm_iq4_xs_f32")]] kernel mat_mm_t kernel_mul_mm; -#else template [[host_name("kernel_mul_mm_iq4_xs_f32")]] kernel mat_mm_t kernel_mul_mm; -#endif // // indirect matrix-matrix multiplication @@ -6663,11 +6261,7 @@ template [[host_name("kernel_mul_mm_id_iq2_s_f32")]] kernel mat_mm_id_t kernel template [[host_name("kernel_mul_mm_id_iq1_s_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; template [[host_name("kernel_mul_mm_id_iq1_m_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; template [[host_name("kernel_mul_mm_id_iq4_nl_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; -#if QK_K == 64 -template [[host_name("kernel_mul_mm_id_iq4_xs_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; -#else template [[host_name("kernel_mul_mm_id_iq4_xs_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; -#endif // // matrix-vector multiplication @@ -6876,7 +6470,5 @@ template [[host_name("kernel_mul_mv_id_iq3_xxs_f32")]] kernel kernel_mul_mv_id_t template [[host_name("kernel_mul_mv_id_iq3_s_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id>; template [[host_name("kernel_mul_mv_id_iq2_s_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id>; template [[host_name("kernel_mul_mv_id_iq4_nl_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id>; -#if QK_K != 64 template [[host_name("kernel_mul_mv_id_iq4_xs_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id>; -#endif diff --git a/LLama/runtimes/deps/osx-arm64/libllama.dylib b/LLama/runtimes/deps/osx-arm64/libllama.dylib index 61bc4fae2..93c632380 100644 Binary files a/LLama/runtimes/deps/osx-arm64/libllama.dylib and b/LLama/runtimes/deps/osx-arm64/libllama.dylib differ diff --git a/LLama/runtimes/deps/osx-arm64/libllava_shared.dylib b/LLama/runtimes/deps/osx-arm64/libllava_shared.dylib index 404833398..23026f8b0 
100644
Binary files a/LLama/runtimes/deps/osx-arm64/libllava_shared.dylib and b/LLama/runtimes/deps/osx-arm64/libllava_shared.dylib differ
diff --git a/LLama/runtimes/deps/osx-x64-rosetta2/libllama.dylib b/LLama/runtimes/deps/osx-x64-rosetta2/libllama.dylib
new file mode 100644
index 000000000..27d21b2b0
Binary files /dev/null and b/LLama/runtimes/deps/osx-x64-rosetta2/libllama.dylib differ
diff --git a/LLama/runtimes/deps/osx-x64-rosetta2/libllava_shared.dylib b/LLama/runtimes/deps/osx-x64-rosetta2/libllava_shared.dylib
new file mode 100644
index 000000000..196ce165c
Binary files /dev/null and b/LLama/runtimes/deps/osx-x64-rosetta2/libllava_shared.dylib differ
diff --git a/LLama/runtimes/deps/osx-x64/libllama.dylib b/LLama/runtimes/deps/osx-x64/libllama.dylib
index c803fd588..51efead48 100644
Binary files a/LLama/runtimes/deps/osx-x64/libllama.dylib and b/LLama/runtimes/deps/osx-x64/libllama.dylib differ
diff --git a/LLama/runtimes/deps/osx-x64/libllava_shared.dylib b/LLama/runtimes/deps/osx-x64/libllava_shared.dylib
index 922d4cd73..cd3604e26 100644
Binary files a/LLama/runtimes/deps/osx-x64/libllava_shared.dylib and b/LLama/runtimes/deps/osx-x64/libllava_shared.dylib differ
diff --git a/README.md b/README.md
index e76b5133d..8f6f33a7e 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,4 @@
-![logo](Assets/LLamaSharpLogo.png)
+![logo](Assets/LLamaSharpLogo.png)
 
 [![Discord](https://img.shields.io/discord/1106946823282761851?label=Discord)](https://discord.gg/7wNVU65ZDY)
 [![QQ Group](https://img.shields.io/static/v1?label=QQ&message=加入QQ群&color=brightgreen)](http://qm.qq.com/cgi-bin/qm/qr?_wv=1027&k=sN9VVMwbWjs5L0ATpizKKxOcZdEPMrp8&authKey=RLDw41bLTrEyEgZZi%2FzT4pYk%2BwmEFgFcrhs8ZbkiVY7a4JFckzJefaYNW6Lk4yPX&noverify=0&group_code=985366726)
@@ -249,7 +249,8 @@ If you want to compile llama.cpp yourself you **must** use the exact commit ID l
 | v0.9.0, v0.9.1 | [Mixtral-8x7B](https://huggingface.co/TheBloke/Mixtral-8x7B-v0.1-GGUF) | [`9fb13f9`](https://github.com/ggerganov/llama.cpp/blob/9fb13f95840c722ad419f390dc8a9c86080a3700) |
 | v0.10.0 | [Phi2](https://huggingface.co/TheBloke/phi-2-GGUF) | [`d71ac90`](https://github.com/ggerganov/llama.cpp/tree/d71ac90985854b0905e1abba778e407e17f9f887) |
 | v0.11.1, v0.11.2 | [LLaVA-v1.5](https://hf-mirror.com/jartine/llava-v1.5-7B-GGUF/blob/main/llava-v1.5-7b-mmproj-Q4_0.gguf), [Phi2](https://huggingface.co/TheBloke/phi-2-GGUF)| [`3ab8b3a`](https://github.com/ggerganov/llama.cpp/tree/3ab8b3a92ede46df88bc5a2dfca3777de4a2b2b6) |
-| v0.12.0 | LLama3 | [`a743d76`](https://github.com/ggerganov/llama.cpp/tree/a743d76a01f23038b2c85af1e9048ee836767b44)
+| v0.12.0 | LLama3 | [`a743d76`](https://github.com/ggerganov/llama.cpp/tree/a743d76a01f23038b2c85af1e9048ee836767b44) |
+| v0.13.0 | | [`1debe72`](https://github.com/ggerganov/llama.cpp/tree/1debe72737ea131cb52975da3d53ed3a835df3a6) |
 
 ## License
diff --git a/llama.cpp b/llama.cpp
index a743d76a0..1debe7273 160000
--- a/llama.cpp
+++ b/llama.cpp
@@ -1 +1 @@
-Subproject commit a743d76a01f23038b2c85af1e9048ee836767b44
+Subproject commit 1debe72737ea131cb52975da3d53ed3a835df3a6
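
Note for reviewers (not part of the patch): a minimal C# sketch of the managed API surface this update changes. It assumes only what the diff above shows: the NativeLibraryConfig.All entry point used in place of .Instance, and the new GenerationThreads/BatchThreads properties backed by llama_n_threads/llama_n_threads_batch (SetThreads is now [Obsolete]). The model path is a placeholder, not a file shipped with this change.

using System;
using LLama;
using LLama.Common;
using LLama.Native;

class ThreadCountSketch
{
    static void Main()
    {
        // The native log callback now goes through the .All container rather than
        // .Instance, matching the change to Prefill.cs in this patch.
        NativeLibraryConfig
            .All
            .WithLogCallback((level, message) => Console.Write($"[{level}] {message}"));

        // "model.gguf" is a placeholder path.
        var parameters = new ModelParams("model.gguf");
        using var weights = LLamaWeights.LoadFromFile(parameters);
        using var context = new LLamaContext(weights, parameters);

        // Setters forward to llama_set_n_threads; getters read back through
        // llama_n_threads/llama_n_threads_batch instead of the removed cached fields,
        // so these values can no longer drift from what the native context actually uses.
        context.GenerationThreads = 8;
        context.BatchThreads = 4;
        Console.WriteLine($"generation={context.GenerationThreads}, batch={context.BatchThreads}");
    }
}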