spiceai · Jeadie · Jan 22, 2025 · Jan 22, 2025 · Jan 23, 2025 · Jan 28, 2025
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/Cargo.toml b/Cargo.toml
@@ -7,8 +7,8 @@ members = [
     "mistralrs-bench",
     "mistralrs-vision",
     "mistralrs-quant",
-    "mistralrs-paged-attn",
 ]
+exclude = ["mistralrs-paged_attn"]
 resolver = "2"
 
 [workspace.package]
@@ -23,9 +23,10 @@ license = "MIT"
 rust-version = "1.82"
 
 [workspace.dependencies]
+
+candle-core = { git = "https://github.com/spiceai/candle.git", version = "0.8.0", rev = "fd28f08dfd918e212231806bedb48bdd7d714976" }
 anyhow = "1.0.80"
-candle-core = { git = "https://github.com/EricLBuehler/candle.git", version = "0.8.0", rev = "3388a3c" }
-candle-nn = { git = "https://github.com/EricLBuehler/candle.git", version = "0.8.0", rev = "3388a3c" }
+candle-nn = { git = "https://github.com/spiceai/candle.git", version = "0.8.0", rev = "fd28f08dfd918e212231806bedb48bdd7d714976" }
 serde = "1.0.197"
 serde_json = "1.0.114"
 indexmap = { version = "2.2.5", features = ["serde"] }
@@ -40,7 +41,24 @@ pyo3 = { version = "0.22.4", features = ["full", "extension-module", "either"] }
 tokio = { version = "1.36.0", features = ["full", "rt-multi-thread"] }
 once_cell = "1.19.0"
 # All features but avif, avif increases the msrv dramatically
-image = { version = "0.25.1", default-features = false, features = ['bmp', 'dds', 'exr', 'ff', 'gif', 'hdr', 'ico', 'jpeg', 'png', 'pnm', 'qoi', 'tga', 'tiff', 'webp']}
+
+buildstructor = "0.5.4"
+image = { version = "0.25.1", default-features = false, features = [
+    'bmp',
+    'dds',
+    'exr',
+    'ff',
+    'gif',
+    'hdr',
+    'ico',
+    'jpeg',
+    'png',
+    'pnm',
+    'qoi',
+    'tga',
+    'tiff',
+    'webp',
+] }
 reqwest = { version = "0.12.4", features = ["blocking"] }
 base64 = "0.22.1"
 half = "2.4.0"

diff --git a/docs/UQFF/LAYOUT.md b/docs/UQFF/LAYOUT.md
@@ -1,6 +1,6 @@
 # UQFF internal structure
 
-The following describes the exact memory layout of HQFF tensors of version 0.1.0.
+The following describes the exact memory layout of UQFF tensors of version 0.1.0.
 
 ## ToC
 - [GGUF quantization](#gguf-quantization)
@@ -14,7 +14,7 @@ The following describes the exact memory layout of HQFF tensors of version 0.1.0
 
 | ID | Element type | Endianness |
 | -------- | -------- | -------- |
-| HQFF version | u32 | little endian  |
+| UQFF version | u32 | little endian  |
 | ISQ type (0) | u8 | little endian  |
 | Tensor data length in bytes | u32 | little endian  |
 | Whether bias data is included (boolean) | u8 | little endian  |
@@ -27,7 +27,7 @@ The following describes the exact memory layout of HQFF tensors of version 0.1.0
 ## Unquantized layers
 | ID | Element type | Endianness |
 | -------- | -------- | -------- |
-| HQFF version | u32 | little endian  |
+| UQFF version | u32 | little endian  |
 | ISQ type (1) | u8 | little endian  |
 | Whether bias data is included (boolean) | u8 | little endian  |
 | **Array** Weight tensor data, see [docs](#standard-tensors) | See [docs](#standard-tensors) | See [docs](#standard-tensors)  |
@@ -36,7 +36,7 @@ The following describes the exact memory layout of HQFF tensors of version 0.1.0
 ## FP8 layers
 | ID | Element type | Endianness |
 | -------- | -------- | -------- |
-| HQFF version | u32 | little endian  |
+| UQFF version | u32 | little endian  |
 | ISQ type (1) | u8 | little endian  |
 | Whether bias data is included (boolean) | u8 | little endian  |
 | **Array** Weight tensor data, see [docs](#standard-tensors) | See [docs](#standard-tensors) | See [docs](#standard-tensors)  |
@@ -49,7 +49,7 @@ The following describes the exact memory layout of HQFF tensors of version 0.1.0
 ## HQQ quantization
 | ID | Element type | Endianness |
 | -------- | -------- | -------- |
-| HQFF version | u32 | little endian  |
+| UQFF version | u32 | little endian  |
 | ISQ type (2) | u8 | little endian  |
 | Whether bias data is included (boolean) | u8 | little endian  |
 | **Array** Q weight, see [docs](#standard-tensors) | See [docs](#standard-tensors) | See [docs](#standard-tensors) |
@@ -67,7 +67,7 @@ The following describes the exact memory layout of HQFF tensors of version 0.1.0
 ## FP8 layers
 | ID | Element type | Endianness |
 | -------- | -------- | -------- |
-| HQFF version | u32 | little endian  |
+| UQFF version | u32 | little endian  |
 | ISQ type (3) | u8 | little endian  |
 | Whether bias data is included (boolean) | u8 | little endian  |
 | **Array** Weight tensor data, see [docs](#standard-tensors) | See [docs](#standard-tensors) | See [docs](#standard-tensors)  | 

diff --git a/mistralrs-core/Cargo.toml b/mistralrs-core/Cargo.toml
@@ -17,7 +17,7 @@ candle-core.workspace = true
 candle-nn.workspace = true
 serde.workspace = true
 serde_json.workspace = true
-candle-flash-attn = { git = "https://github.com/EricLBuehler/candle.git", version = "0.8.0", rev = "3388a3c", optional = true }
+candle-flash-attn = { git = "https://github.com/spiceai/candle.git", version = "0.8.0", rev = "fd28f08dfd918e212231806bedb48bdd7d714976", optional = true }
 dirs = "5.0.1"
 hf-hub = { version = "0.3.3", package = "candle-hf-hub" }
 thiserror = "1.0.57"
@@ -71,14 +71,17 @@ bytemuck_derive = "1.7.0"
 mistralrs-paged-attn = { version = "0.4.0", path = "../mistralrs-paged-attn", optional = true }
 mistralrs-quant = { version = "0.4.0", path = "../mistralrs-quant" }
 uuid = { version = "1.10.0", features = ["v4"] }
+ureq = "2.10"
 schemars = "0.8.21"
 serde_yaml = "0.9.34"
 regex.workspace = true
 safetensors = "0.4.5"
 serde_plain = "1.0.2"
 as-any = "0.3.1"
 float8.workspace = true
-llguidance = { git = "https://github.com/microsoft/llguidance", rev = "cfef3df97372a7b84d74976ff41cc9cb78bca6cc", default-features = false, features = ["lark"] }
+llguidance = { git = "https://github.com/microsoft/llguidance", rev = "cfef3df97372a7b84d74976ff41cc9cb78bca6cc", default-features = false, features = [
+    "lark",
+] }
 toktrie_hf_tokenizers = { git = "https://github.com/microsoft/llguidance", rev = "cfef3df97372a7b84d74976ff41cc9cb78bca6cc" }
 objc = { version = "0.2.7", optional = true }
 metal = { workspace = true, optional = true }
@@ -87,11 +90,31 @@ memmap2 = "0.9.5"
 
 [features]
 pyo3_macros = ["pyo3"]
-cuda = ["candle-core/cuda", "candle-nn/cuda", "dep:bindgen_cuda", "mistralrs-quant/cuda", "dep:mistralrs-paged-attn", "mistralrs-paged-attn/cuda", "float8/mistralrs_cudarc_fork"]
+cuda = [
+    "candle-core/cuda",
+    "candle-nn/cuda",
+    "dep:bindgen_cuda",
+    "mistralrs-quant/cuda",
+    "dep:mistralrs-paged-attn",
+    "mistralrs-paged-attn/cuda",
+    "float8/mistralrs_cudarc_fork",
+]
 cudnn = ["candle-core/cudnn"]
-metal = ["candle-core/metal", "candle-nn/metal", "mistralrs-quant/metal", "dep:objc", "dep:mistralrs-paged-attn", "mistralrs-paged-attn/metal", "dep:metal"]
+metal = [
+    "candle-core/metal",
+    "candle-nn/metal",
+    "mistralrs-quant/metal",
+    "dep:objc",
+    "dep:mistralrs-paged-attn",
+    "mistralrs-paged-attn/metal",
+    "dep:metal",
+]
 flash-attn = ["cuda", "dep:candle-flash-attn"]
-accelerate = ["candle-core/accelerate", "candle-nn/accelerate", "mistralrs-quant/accelerate"]
+accelerate = [
+    "candle-core/accelerate",
+    "candle-nn/accelerate",
+    "mistralrs-quant/accelerate",
+]
 mkl = ["candle-core/mkl", "candle-nn/mkl"]
 
 [build-dependencies]

diff --git a/mistralrs-core/build.rs b/mistralrs-core/build.rs
@@ -1,6 +1,8 @@
 #[cfg(feature = "cuda")]
 const CUDA_NVCC_FLAGS: Option<&'static str> = option_env!("CUDA_NVCC_FLAGS");
 
+const SUPPORTS_ATTN_SOFTMAX_FILE: &str = "src/utils/supports_attn_softmax.rs";
+
 fn main() {
     #[cfg(feature = "cuda")]
     {
@@ -59,4 +61,64 @@ fn main() {
             println!("cargo:rustc-link-lib=dylib=stdc++");
         }
     }
+
+    // Work out whether attn softmax is supported, based on the `__METAL_VERSION__` reported by
+    // the Metal compiler. Without the `metal` feature it is never supported.
+    #[cfg(feature = "metal")]
+    let supports_attn_softmax = {
+        use std::process::{Command, Stdio};
+
+        // Equivalent shell pipeline:
+        // echo "__METAL_VERSION__" | xcrun -sdk macosx metal -E -x metal -P -
+
+        // Spawn `echo` with a piped stdout so its output can be fed to `xcrun`.
+        let mut echo = Command::new("echo")
+            .arg("__METAL_VERSION__")
+            .stdout(Stdio::piped())
+            .spawn()
+            .expect("Failed to start echo command");
+
+        echo.wait().unwrap();
+
+        // Preprocess the macro with the Metal compiler, reading from `echo`'s stdout.
+        let output = Command::new("xcrun")
+            .args(["-sdk", "macosx", "metal", "-E", "-x", "metal", "-P", "-"])
+            .stdin(echo.stdout.unwrap())
+            .output()
+            .expect("Failed to run xcrun command");
+
+        // The expanded version is read from the second line of the preprocessor output.
+        // Default to false if anything goes wrong (command failure or unparsable output)
+        // instead of panicking.
+        let version = if output.status.success() {
+            String::from_utf8_lossy(&output.stdout)
+                .split('\n')
+                .nth(1)
+                .and_then(|line| line.trim().parse::<usize>().ok())
+        } else {
+            None
+        };
+
+        // Attn softmax is only supported for metal >= 310 because of the vectorized bfloat types
+        version.is_some_and(|v| v >= 310)
+    };
+    #[cfg(not(feature = "metal"))]
+    let supports_attn_softmax = false;
+
+    let contents =
+        format!("pub(crate) const SUPPORTS_ATTN_SOFTMAX: bool = {supports_attn_softmax};\n");
+
+    // Replace the whole file (creating it if needed) so no stale bytes from a previous, longer
+    // version remain, and skip the write when nothing changed so the source file is not
+    // touched on every build.
+    let up_to_date = std::fs::read_to_string(SUPPORTS_ATTN_SOFTMAX_FILE)
+        .is_ok_and(|existing| existing == contents);
+    if !up_to_date {
+        if let Err(e) = std::fs::write(SUPPORTS_ATTN_SOFTMAX_FILE, &contents) {
+            panic!(
+                "Error writing src/utils/supports_attn_softmax.rs: {:?}\n",
+                e
+            )
+        }
+    }
 }