SciSharp · martindevans · Mar 13, 2024 · Mar 12, 2024 · Mar 12, 2024
diff --git a/LLama/Native/SafeLLamaContextHandle.cs b/LLama/Native/SafeLLamaContextHandle.cs
@@ -192,6 +192,18 @@ public uint TokenToSpan(LLamaToken token, Span<byte> dest)
         #endregion
 
         #region infer
+        /// <summary>
+        /// This object exists to ensure there is only ever 1 inference running at a time. This is a workaround for thread safety issues in llama.cpp itself.
+        /// Most notably CUDA, which seems to use some global singleton resources and will crash if multiple inferences are run (even against different models).
+        /// 
+        /// For more information see these issues:
+        ///  - https://github.com/SciSharp/LLamaSharp/issues/596
+        ///  - https://github.com/ggerganov/llama.cpp/issues/3960
+        ///
+        /// If these are ever resolved this lock can probably be removed.
+        /// </summary>
+        private static readonly object GlobalInferenceLock = new();
+
         /// <summary>
         /// </summary>
         /// <param name="batch"></param>
@@ -202,8 +214,9 @@ public uint TokenToSpan(LLamaToken token, Span<byte> dest)
         /// </returns>
         public DecodeResult Decode(LLamaBatch batch)
         {
-            using (batch.ToNativeBatch(out var nb))
-                return (DecodeResult)NativeApi.llama_decode(this, nb);
+            lock (GlobalInferenceLock)
+                using (batch.ToNativeBatch(out var nb))
+                    return (DecodeResult)NativeApi.llama_decode(this, nb);
         }
 
         /// <summary>