June 2024 Binary Update #751

Merged: 3 commits, merged on Jun 3, 2024.

Changes from all commits:
2 changes: 1 addition & 1 deletion LLama.Benchmark/LLamaExecutorBenchmark/Prefill.cs
@@ -103,7 +103,7 @@ public void GlobalSetup()
     {
         var showLLamaCppLogs = true;
         NativeLibraryConfig
-            .Instance
+            .All
             .WithLogCallback((level, message) =>
             {
                 if (showLLamaCppLogs)
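Note on this hunk: `NativeLibraryConfig.Instance` is replaced by `NativeLibraryConfig.All`. A minimal sketch of the updated call site as standalone code (hedged: assumes the 0.13.0 API, where `All` applies the configuration to every native library LLamaSharp loads):

    using System;
    using LLama.Native;

    // Forward native llama.cpp log output to the console.
    NativeLibraryConfig
        .All                                   // 0.13.0 accessor; replaces `.Instance`
        .WithLogCallback((level, message) =>
            Console.Write($"[{level}] {message}"));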
2 changes: 1 addition & 1 deletion LLama.Experimental/LLama.Experimental.csproj
@@ -7,7 +7,7 @@
     <langversion>12</langversion>
     <RootNamespace>LLama</RootNamespace>
 
-    <Version>0.12.0</Version>
+    <Version>0.13.0</Version>
     <Authors>Rinne</Authors>
     <Company>SciSharp STACK</Company>
     <GeneratePackageOnBuild>true</GeneratePackageOnBuild>
2 changes: 1 addition & 1 deletion LLama.KernelMemory/LLamaSharp.KernelMemory.csproj
@@ -4,7 +4,7 @@
     <TargetFrameworks>net6.0;net8.0</TargetFrameworks>
     <ImplicitUsings>enable</ImplicitUsings>
     <Nullable>enable</Nullable>
-    <Version>0.12.0</Version>
+    <Version>0.13.0</Version>
     <Authors>Xbotter</Authors>
     <Company>SciSharp STACK</Company>
     <GeneratePackageOnBuild>true</GeneratePackageOnBuild>
7 changes: 1 addition & 6 deletions LLama.KernelMemory/LlamaSharpConfig.cs
@@ -1,10 +1,5 @@
-using LLama.Common;
+using LLama.Common;
 using LLama.Native;
-using System;
-using System.Collections.Generic;
-using System.Linq;
-using System.Text;
-using System.Threading.Tasks;
 
 namespace LLamaSharp.KernelMemory
 {

(The deleted and re-added `using LLama.Common;` lines differ only in invisible characters, most likely a BOM or line-ending normalization; the same pattern recurs in several files below.)
2 changes: 1 addition & 1 deletion LLama.KernelMemory/LlamaSharpTextGenerator.cs
@@ -1,4 +1,4 @@
-using LLama;
+using LLama;
 using LLama.Common;
 using LLama.Native;
 using Microsoft.KernelMemory.AI;
4 changes: 2 additions & 2 deletions LLama.SemanticKernel/LLamaSharp.SemanticKernel.csproj
@@ -10,7 +10,7 @@
     <ImplicitUsings>enable</ImplicitUsings>
     <Nullable>enable</Nullable>
 
-    <Version>0.12.0</Version>
+    <Version>0.13.0</Version>
     <Authors>Tim Miller, Xbotter</Authors>
     <Company>SciSharp STACK</Company>
     <GeneratePackageOnBuild>true</GeneratePackageOnBuild>
@@ -45,7 +45,7 @@
     <ProjectReference Include="..\LLama\LLamaSharp.csproj" />
   </ItemGroup>
   <ItemGroup>
-    <InternalsVisibleTo Include="Llama.Unittest"/>
+    <InternalsVisibleTo Include="Llama.Unittest" />
   </ItemGroup>
 
 </Project>
1 change: 0 additions & 1 deletion LLama/Abstractions/IContextParams.cs
@@ -116,7 +116,6 @@ public interface IContextParams
     /// <summary>
-    /// defragment the KV cache if holes/size &gt; defrag_threshold, Set to &lt; 0 to disable (default)
     /// defragment the KV cache if holes/size &gt; defrag_threshold, Set to <see langword="null"/> or &lt; 0 to disable (default)
     /// </summary>
     float? DefragThreshold { get; }
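For reference, a hedged sketch of how this nullable threshold is set from user code (assumes `ModelParams`, the stock implementation of `IContextParams`; the model path is a placeholder):

    using LLama.Common;

    var @params = new ModelParams("model.gguf")    // placeholder path
    {
        // Defragment the KV cache when holes/size > 0.1;
        // null or a negative value leaves defragmentation disabled.
        DefragThreshold = 0.1f,
    };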
2 changes: 1 addition & 1 deletion LLama/Abstractions/IModelParams.cs
@@ -1,4 +1,4 @@
-using System;
+using System;
 using System.Buffers;
 using System.Collections;
 using System.Collections.Generic;
24 changes: 4 additions & 20 deletions LLama/LLamaContext.cs
@@ -9,7 +9,6 @@
 using LLama.Common;
 using System.Runtime.InteropServices;
 using System.Threading.Tasks;
-using LLama.Extensions;
 using LLama.Abstractions;
 using LLama.Sampling;
 using Microsoft.Extensions.Logging;
@@ -56,33 +55,22 @@ public sealed class LLamaContext
     /// </summary>
     public Encoding Encoding { get; }
 
-    private uint _generationThreads;
-    private uint _batchThreads;
-
     /// <summary>
     /// Get or set the number of threads to use for generation
     /// </summary>
     public uint GenerationThreads
     {
-        get => _generationThreads;
-        set
-        {
-            _generationThreads = value;
-            NativeHandle.SetThreads(_generationThreads, _batchThreads);
-        }
+        get => NativeHandle.GenerationThreads;
+        set => NativeHandle.GenerationThreads = value;
     }
 
     /// <summary>
     /// Get or set the number of threads to use for batch processing
     /// </summary>
     public uint BatchThreads
     {
-        get => _batchThreads;
-        set
-        {
-            _batchThreads = value;
-            NativeHandle.SetThreads(_generationThreads, _batchThreads);
-        }
+        get => NativeHandle.BatchThreads;
+        set => NativeHandle.BatchThreads = value;
     }
 
     /// <summary>
@@ -111,10 +99,6 @@ public LLamaContext(LLamaWeights model, IContextParams @params, ILogger? logger
 
         @params.ToLlamaContextParams(out var lparams);
         NativeHandle = SafeLLamaContextHandle.Create(model.NativeHandle, lparams);
-
-        // It's not possible to get these values from llama.cpp, store a copy of them here.
-        _generationThreads = lparams.n_threads;
-        _batchThreads = lparams.n_threads_batch;
     }
 
     /// <summary>
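The thread-count properties above are now plain pass-throughs to the native handle instead of cached copies, so they can no longer drift from llama.cpp's own state. A hedged usage sketch (the model path is a placeholder; `LoadFromFile` and `CreateContext` are the existing LLamaWeights helpers):

    using LLama;
    using LLama.Common;

    var @params = new ModelParams("model.gguf");   // placeholder path
    using var weights = LLamaWeights.LoadFromFile(@params);
    using var context = weights.CreateContext(@params);

    // Both of these now read and write llama.cpp state directly
    // through SafeLLamaContextHandle.
    context.GenerationThreads = 8;
    context.BatchThreads = 16;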
2 changes: 1 addition & 1 deletion LLama/LLamaSharp.csproj
@@ -7,7 +7,7 @@
     <Platforms>AnyCPU;x64;Arm64</Platforms>
     <AllowUnsafeBlocks>True</AllowUnsafeBlocks>
 
-    <Version>0.12.0</Version>
+    <Version>0.13.0</Version>
     <Authors>Rinne, Martin Evans, jlsantiago and all the other contributors in https://github.com/SciSharp/LLamaSharp/graphs/contributors.</Authors>
     <Company>SciSharp STACK</Company>
     <GeneratePackageOnBuild>true</GeneratePackageOnBuild>
2 changes: 1 addition & 1 deletion LLama/Native/GPUSplitMode.cs
@@ -1,4 +1,4 @@
-namespace LLama.Native;
+namespace LLama.Native;
 
 /// <summary>
 ///
9 changes: 7 additions & 2 deletions LLama/Native/LLamaModelParams.cs
@@ -1,4 +1,4 @@
-using System;
+using System;
 using System.Runtime.InteropServices;
 
 namespace LLama.Native
@@ -27,7 +27,12 @@ public unsafe struct LLamaModelParams
     /// <summary>
     /// how to split layers across multiple GPUs (size: <see cref="NativeApi.llama_max_devices"/>)
     /// </summary>
-    public float* tensor_split;
+    public float* tensor_split;
+
+    /// <summary>
+    /// comma separated list of RPC servers to use for offloading
+    /// </summary>
+    public byte* rpc_servers;
 
     /// <summary>
     /// called with a progress value between 0 and 1, pass NULL to disable. If the provided progress_callback
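A hedged interop sketch for the new field (the field name comes from this diff; the marshalling pattern is standard .NET interop, not something this PR adds). The native side presumably expects a NUL-terminated UTF-8 string, so the buffer must stay alive until the model has been created:

    using System;
    using System.Runtime.InteropServices;

    // Assumes an unsafe context and a populated LLamaModelParams `p`.
    IntPtr rpc = Marshal.StringToCoTaskMemUTF8("192.168.0.10:50052,192.168.0.11:50052");
    p.rpc_servers = (byte*)rpc;
    // ... create the model with `p`, then release the buffer:
    Marshal.FreeCoTaskMem(rpc);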
25 changes: 16 additions & 9 deletions LLama/Native/LLamaVocabPreType.cs
@@ -1,17 +1,24 @@
-namespace LLama.Native;
+namespace LLama.Native;
 
 /// <summary>
 ///
 /// </summary>
 /// <remarks>llama_vocab_pre_type</remarks>
 internal enum LLamaVocabPreType
 {
-    LLAMA_VOCAB_PRE_TYPE_DEFAULT = 0,
-    LLAMA_VOCAB_PRE_TYPE_LLAMA3 = 1,
-    LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM = 2,
-    LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER = 3,
-    LLAMA_VOCAB_PRE_TYPE_FALCON = 4,
-    LLAMA_VOCAB_PRE_TYPE_MPT = 5,
-    LLAMA_VOCAB_PRE_TYPE_STARCODER = 6,
-    LLAMA_VOCAB_PRE_TYPE_GPT2 = 7,
+    Default = 0,
+
+    LLAMA3 = 1,
+    DEEPSEEK_LLM = 2,
+    DEEPSEEK_CODER = 3,
+    FALCON = 4,
+    MPT = 5,
+    STARCODER = 6,
+    GPT2 = 7,
+    REFACT = 8,
+    COMMAND_R = 9,
+    STABLELM2 = 10,
+    QWEN2 = 11,
+    OLMO = 12,
+    DBRX = 13,
 }
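The renamed members keep their explicit numeric values, so they still line up with llama.cpp's `llama_vocab_pre_type` constants, which matters because the value crosses the P/Invoke boundary as a plain integer. A small hypothetical guard (usable only inside the LLama assembly, since the enum is internal):

    // Hypothetical sanity check against the native constant values.
    System.Diagnostics.Debug.Assert((int)LLamaVocabPreType.GPT2 == 7);
    System.Diagnostics.Debug.Assert((int)LLamaVocabPreType.DBRX == 13);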
2 changes: 1 addition & 1 deletion LLama/Native/Load/NativeLibraryMetadata.cs
@@ -1,4 +1,4 @@
-
+
 namespace LLama.Native
 {
     /// <summary>
37 changes: 36 additions & 1 deletion LLama/Native/SafeLLamaContextHandle.cs
@@ -41,6 +41,24 @@ public sealed class SafeLLamaContextHandle
     /// </summary>
     public uint UBatchSize => llama_n_ubatch(this);
 
+    /// <summary>
+    /// Get or set the number of threads used for generation of a single token.
+    /// </summary>
+    public uint GenerationThreads
+    {
+        get => llama_n_threads(this);
+        set => llama_set_n_threads(this, value, BatchThreads);
+    }
+
+    /// <summary>
+    /// Get or set the number of threads used for prompt and batch processing (multiple tokens).
+    /// </summary>
+    public uint BatchThreads
+    {
+        get => llama_n_threads_batch(this);
+        set => llama_set_n_threads(this, GenerationThreads, value);
+    }
+
     /// <summary>
     /// Get the model which this context is using
     /// </summary>
@@ -157,6 +175,22 @@ static SafeLLamaContextHandle()
     [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)]
     private static extern void llama_set_n_threads(SafeLLamaContextHandle ctx, uint n_threads, uint n_threads_batch);
 
+    /// <summary>
+    /// Get the number of threads used for generation of a single token.
+    /// </summary>
+    /// <param name="ctx"></param>
+    /// <returns></returns>
+    [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)]
+    private static extern uint llama_n_threads(SafeLLamaContextHandle ctx);
+
+    /// <summary>
+    /// Get the number of threads used for prompt and batch processing (multiple tokens).
+    /// </summary>
+    /// <param name="ctx"></param>
+    /// <returns></returns>
+    [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)]
+    private static extern uint llama_n_threads_batch(SafeLLamaContextHandle ctx);
+
     /// <summary>
     /// Token logits obtained from the last call to llama_decode
     /// The logits for the last token are stored in the last row
@@ -538,6 +572,7 @@ public void SetSeed(uint seed)
     /// </summary>
     /// <param name="threads">n_threads is the number of threads used for generation (single token)</param>
     /// <param name="threadsBatch">n_threads_batch is the number of threads used for prompt and batch processing (multiple tokens)</param>
+    [Obsolete("Use `GenerationThreads` and `BatchThreads` properties")]
     public void SetThreads(uint threads, uint threadsBatch)
     {
         llama_set_n_threads(this, threads, threadsBatch);
@@ -613,7 +648,7 @@ public int KvCacheCountTokens()
     }
 
     /// <summary>
-    /// Clear the KV cache
+    /// Clear the KV cache - both cell info is erased and KV data is zeroed
     /// </summary>
     public void KvCacheClear()
     {
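Migration note for the now-obsolete method, as a hedged sketch (`ctx` stands for any live SafeLLamaContextHandle):

    // Before (now [Obsolete]): one native call sets both counts.
    ctx.SetThreads(8, 16);

    // After: each property setter re-reads the other count and calls
    // llama_set_n_threads, so assigning both makes two native calls.
    ctx.GenerationThreads = 8;
    ctx.BatchThreads = 16;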
27 changes: 0 additions & 27 deletions LLama/Native/llama_vocab_pre_type.cs

This file was deleted (its contents appear to have been consolidated into the updated LLamaVocabPreType.cs above).
