Fix MSVC build; and associated merge #4

Merged · 225 commits · Jan 22, 2025

Commits
4a52aeb
bert attention mask (#1934)
lz1998 Aug 1, 2024
8696cf6
Enable the affine kernel for u8/u32. (#2376)
LaurentMazare Aug 1, 2024
fea46cb
Metal bgemm min changes (#2364)
ivarflakstad Aug 1, 2024
bd80078
Fix log_sum_exp to handle large positive/negative inputs (#2367)
yunjhongwu Aug 1, 2024
1ba87a9
Use BF16 on metal when possible. (#2378)
LaurentMazare Aug 1, 2024
ce90287
Add get_ids to GradStore (#2379)
spaghetti-source Aug 1, 2024
957d604
Enable BF16 on metal. (#2380)
LaurentMazare Aug 1, 2024
d4b6f6e
Add a minimal test for the metal bf16 matmul. (#2381)
LaurentMazare Aug 1, 2024
ac51f47
Add Hiera vision model. (#2382)
janimo Aug 1, 2024
2e9c010
Jina Bert Example fix and more configuration (#2191)
JoanFM Aug 1, 2024
9ca277a
Fix cargo fmt. (#2383)
LaurentMazare Aug 1, 2024
6991a37
update: LSTMState and GRUState fields to be public (#2384)
singjc Aug 1, 2024
0fcb40b
Revert the bf16 gemm metal changes for now. (#2386)
LaurentMazare Aug 1, 2024
19db6b9
Add the flux model for image generation. (#2390)
LaurentMazare Aug 4, 2024
aa7ac18
Simplify handling of flux modulations. (#2394)
LaurentMazare Aug 4, 2024
c0a559d
optimize gradient for silu a bit (#2393)
MilkFather Aug 4, 2024
89eae41
Support the flux-dev model too. (#2395)
LaurentMazare Aug 4, 2024
2be9bd2
Support for mistral-nemo. (#2396)
LaurentMazare Aug 4, 2024
1a48767
Add sdpa function with cublaslt
EricLBuehler Aug 4, 2024
7bbcf00
Update docs
EricLBuehler Aug 4, 2024
1bf7101
Add matmul_bias_and_scale
EricLBuehler Aug 4, 2024
d6d3d18
Rename
EricLBuehler Aug 4, 2024
e20d85a
Add a simple test and fix for cpu
EricLBuehler Aug 4, 2024
8d2f32a
Update sdpa function
EricLBuehler Aug 4, 2024
9f144d6
Add matmul_alpha
EricLBuehler Aug 4, 2024
c830f26
Use matmul_with_alpha in sdpa
EricLBuehler Aug 4, 2024
86d0876
Add it to mistral
EricLBuehler Aug 5, 2024
8d8889c
Add it to q llama
EricLBuehler Aug 5, 2024
d18eb13
Add attention benches
EricLBuehler Aug 5, 2024
500c9f2
add models support and example for THUDM/glm-4 (#2362)
donjuanplatinum Aug 5, 2024
d71b7d7
Fixes
EricLBuehler Aug 5, 2024
dfdce2b
Add the MMDiT model of Stable Diffusion 3 (#2397)
Czxck001 Aug 5, 2024
59bbc0d
Add the import script for the T5 tokenizer. (#2399)
LaurentMazare Aug 5, 2024
412e9f4
Merge commit 'd71b7d78396a944817876c56f1677bd17633234d'
EricLBuehler Aug 5, 2024
b7d9af0
fix: usage of `actions/checkout@v2` (#2403)
hamirmahal Aug 6, 2024
27ca77e
Simplify things a bit
EricLBuehler Aug 7, 2024
7ad6494
Mistral.rs GPTQ dev PR (#14)
EricLBuehler Aug 9, 2024
6e6c1c9
Fix issues in the encodec example README.md (#2407)
jnises Aug 10, 2024
14db029
Soft Non-Maximum Suppression (#2400)
onichmath Aug 10, 2024
d3fe989
Add documentation examples for `Tensor::i` and `Tensor::narrow` metho…
csicar Aug 10, 2024
35e5f31
Add Based LLM from Hazy Research. (#2411)
janimo Aug 12, 2024
68aa9c7
Fix the device for the bert attention mask. (#2414)
LaurentMazare Aug 14, 2024
53ce65f
Clippy fixes. (#2415)
LaurentMazare Aug 14, 2024
6f0e190
Fix on metal
EricLBuehler Aug 14, 2024
ec55f58
Add the flux model for image generation. (#2390)
LaurentMazare Aug 4, 2024
0a146d7
Simplify handling of flux modulations. (#2394)
LaurentMazare Aug 4, 2024
0f55c37
optimize gradient for silu a bit (#2393)
MilkFather Aug 4, 2024
aef4eba
Support the flux-dev model too. (#2395)
LaurentMazare Aug 4, 2024
c301efa
Support for mistral-nemo. (#2396)
LaurentMazare Aug 4, 2024
fd0e933
add models support and example for THUDM/glm-4 (#2362)
donjuanplatinum Aug 5, 2024
f8e2b36
Add the MMDiT model of Stable Diffusion 3 (#2397)
Czxck001 Aug 5, 2024
0e78d29
Add the import script for the T5 tokenizer. (#2399)
LaurentMazare Aug 5, 2024
1b796b9
fix: usage of `actions/checkout@v2` (#2403)
hamirmahal Aug 6, 2024
c9cdd54
Fix issues in the encodec example README.md (#2407)
jnises Aug 10, 2024
283a5cf
Soft Non-Maximum Suppression (#2400)
onichmath Aug 10, 2024
de719a2
Add documentation examples for `Tensor::i` and `Tensor::narrow` metho…
csicar Aug 10, 2024
2e72a3d
Add Based LLM from Hazy Research. (#2411)
janimo Aug 12, 2024
d7a9bd0
Fix the device for the bert attention mask. (#2414)
LaurentMazare Aug 14, 2024
3d40ffc
Clippy fixes. (#2415)
LaurentMazare Aug 14, 2024
c5c5d49
Update flash_fwd_launch_template.h with fix for kernels (#16)
joshpopelka20 Aug 14, 2024
2386e4e
Build fixes
EricLBuehler Aug 14, 2024
a38053f
Merge branch 'sdpa'
EricLBuehler Aug 14, 2024
2b75dd9
Fix build issue in EOS Token in llama-multiprocess (#2420)
hadilq Aug 16, 2024
69fdcfe
Apply rustfmt. (#2421)
LaurentMazare Aug 16, 2024
c1b9e07
Add support for gemma-2. (#2425)
LaurentMazare Aug 17, 2024
b75ef05
Fix the marian tokenizer importer. (#2426)
LaurentMazare Aug 17, 2024
7cff589
Support Minus(u) for arbitrary values of u, e.g. Minus(3). (#2428)
LaurentMazare Aug 17, 2024
736d8eb
Stream tensor (#2429)
LaurentMazare Aug 17, 2024
58197e1
parler-tts support (#2431)
LaurentMazare Aug 18, 2024
236b29f
Add the DAC model. (#2433)
LaurentMazare Aug 19, 2024
31a1075
onnx: implement LSTM op (#2268)
shua Aug 19, 2024
14fd2d9
Add a readme for the parler-tts example. (#2434)
LaurentMazare Aug 19, 2024
b47c0bc
Update README.md (#2435)
LaurentMazare Aug 19, 2024
1b1974e
Add GGUF BF16 support (#17)
EricLBuehler Aug 21, 2024
36bd9f9
Merge remote-tracking branch 'upstream/main'
EricLBuehler Aug 22, 2024
6fbddd6
Complete merge
EricLBuehler Aug 22, 2024
6070278
Bump the version to 0.6.1. (#2438)
LaurentMazare Aug 22, 2024
a8288b7
onnx: workaround pow with negative base (#2439)
shua Aug 22, 2024
1e96b8b
onnx: support negative index in Gather (#2440)
shua Aug 22, 2024
f706ef2
Add softcapping support to flash attention (#18)
EricLBuehler Aug 22, 2024
e3c146a
silero-vad v5 example (#2321)
shua Aug 22, 2024
2ec8729
Fix for parler-tts, do not add the last slice of padding tokens. (#2442)
LaurentMazare Aug 22, 2024
ccdbe87
Add FastViT model. (#2444)
janimo Aug 23, 2024
fdc2622
fix: qwen2 lm_head loading #2443 (#2445)
ilookee Aug 23, 2024
aafa24e
Update cudarc to 0.12. (#2451)
LaurentMazare Aug 27, 2024
29e25c4
FastViT fixes. (#2452)
janimo Aug 28, 2024
86613c0
MobileCLIP models S1 and S2 (#2454)
janimo Aug 29, 2024
c02b7c3
Fix FLUX.1 weights (#2457)
eugenehp Aug 29, 2024
3c8e120
Update kernels for metal bf16 (#19)
EricLBuehler Sep 2, 2024
014f140
fix(metal/accelerate): f64-f32 type mismatch (#20)
sammcj Sep 5, 2024
e326121
Clippy fixes for 1.81.0. (#2461)
LaurentMazare Sep 5, 2024
f317df8
Bump the version to 0.6.1. (#2438)
LaurentMazare Aug 22, 2024
8a9d2be
onnx: workaround pow with negative base (#2439)
shua Aug 22, 2024
a7142d3
onnx: support negative index in Gather (#2440)
shua Aug 22, 2024
f62d7e8
silero-vad v5 example (#2321)
shua Aug 22, 2024
ceab78e
Fix for parler-tts, do not add the last slice of padding tokens. (#2442)
LaurentMazare Aug 22, 2024
5b4c593
Add FastViT model. (#2444)
janimo Aug 23, 2024
ef9649c
fix: qwen2 lm_head loading #2443 (#2445)
ilookee Aug 23, 2024
7412bd0
Update cudarc to 0.12. (#2451)
LaurentMazare Aug 27, 2024
8e39086
FastViT fixes. (#2452)
janimo Aug 28, 2024
8632a2f
MobileCLIP models S1 and S2 (#2454)
janimo Aug 29, 2024
f492c04
Fix FLUX.1 weights (#2457)
eugenehp Aug 29, 2024
91e0c6e
Clippy fixes for 1.81.0. (#2461)
LaurentMazare Sep 5, 2024
ad84486
Improve candle_core::Error to make it more ergonomic (#21)
EricLBuehler Sep 11, 2024
7f5a470
Add API to get current device seed (#22)
EricLBuehler Sep 11, 2024
13b2a8a
Complete the missing backticks in the comments (#2469)
hongmengning Sep 11, 2024
5635650
Integrate the MLX gemm kernels (#2468)
LaurentMazare Sep 11, 2024
afb6575
Use the new MLX kernels to handle the BF16 matmul. (#2470)
LaurentMazare Sep 11, 2024
0cb0bd1
Add some metal gemm benchark. (#2471)
LaurentMazare Sep 11, 2024
72d6490
Hook the MLX matmul kernels in candle-core. (#2473)
LaurentMazare Sep 12, 2024
b60faeb
Missing metal kernels. (#2474)
LaurentMazare Sep 12, 2024
9240d03
Add QStorage::data for cuda and metal (#23)
EricLBuehler Sep 13, 2024
c09afc2
Fix for metal tanh. (#2475)
LaurentMazare Sep 13, 2024
8a99f7c
Fix build error with seed (#25)
EricLBuehler Sep 13, 2024
ebf722b
Export TensorIndexer public to candle users (#2477)
h1994st Sep 13, 2024
9e31a19
Add the i16 dtype (2) (#26)
ro99 Sep 15, 2024
6eea45a
Add a couple cast metal kernels. (#2479)
LaurentMazare Sep 15, 2024
382c6b5
Improve error message (#2485)
ivnsch Sep 20, 2024
c58c5d5
Add the mimi audio-tokenizer. (#2488)
LaurentMazare Sep 20, 2024
5fc4f17
Adding Granite 7b Instruct model example (#2487)
atilag Sep 21, 2024
af21040
Metal commands refactoring (#2489)
LaurentMazare Sep 21, 2024
844d45c
Bugfix for the metal elu kernel. (#2490)
LaurentMazare Sep 21, 2024
c2fca0c
Bump the crate version. (#2491)
LaurentMazare Sep 21, 2024
829dcfa
Update cudarc to 0.12.1. (#2494)
LaurentMazare Sep 22, 2024
8097559
Move the candle version to 0.7.1. (#2495)
LaurentMazare Sep 22, 2024
d01207d
Add a RotatingKVCache. (#2493)
LaurentMazare Sep 23, 2024
10d4718
Quantized version of flux. (#2500)
LaurentMazare Sep 26, 2024
a0184a4
move CI/Cuda runner
glegendre01 Sep 26, 2024
c3c392f
Merge pull request #2507 from huggingface/ci-move
glegendre01 Sep 26, 2024
ad8a4c5
Add some llama-3.2 examples. (#2508)
LaurentMazare Sep 26, 2024
ed48f54
Expand split ops (#2505)
stevenlovegrove Sep 26, 2024
2c25754
Clippy fixes for onnx + fix a broken test. (#2510)
LaurentMazare Sep 26, 2024
62525e8
Remove some extra whitelines. (#2513)
LaurentMazare Sep 28, 2024
261ed65
Add the SigLIP model. (#2515)
LaurentMazare Sep 28, 2024
3a3c48b
Bump the crate version to 0.7.2. (#2517)
LaurentMazare Sep 29, 2024
0ebb388
Paligemma siglip vision config (#2518)
LaurentMazare Sep 29, 2024
2f49e1b
Add PaliGemma. (#2519)
LaurentMazare Sep 29, 2024
683ab69
Add Pixtral. (#2521)
LaurentMazare Sep 30, 2024
dfe9a00
Pixtral polishing. (#2522)
LaurentMazare Sep 30, 2024
7246504
Yet another cuda qmm padding fix. (#2509)
LaurentMazare Sep 30, 2024
aa35bf2
Add/lstm direction (#2455)
singjc Sep 30, 2024
6110ad8
Refactor the whisper microphone example. (#2523)
LaurentMazare Sep 30, 2024
888d886
Add ColPali (#2524)
akshayballal95 Oct 1, 2024
def4c6c
Cuda quantized mmv bugfix. (#2526)
LaurentMazare Oct 1, 2024
a2bcc22
Efficient implementation of `Tensor::ones()` for `metal` (#2512)
AnubhabB Oct 1, 2024
d08212c
Merge remote-tracking branch 'upstream/main'
EricLBuehler Oct 2, 2024
fd08d3d
Tweak some metal tests. (#2528)
LaurentMazare Oct 2, 2024
f479840
Add a seed to the flux example. (#2529)
LaurentMazare Oct 2, 2024
c04861d
Should compile now on metal
EricLBuehler Oct 2, 2024
156ebd1
Fix dtype cast
EricLBuehler Oct 2, 2024
9363006
Add whisper large-v3 turbo to the example. (#2531)
LaurentMazare Oct 2, 2024
7b60bda
Add support for cuda streams. (#2532)
LaurentMazare Oct 2, 2024
90d04ff
Support whisper large-v3 turbo in the whisper-microphone example. (#2…
LaurentMazare Oct 2, 2024
6faecaa
Fix for cudnn bf16 conv2d. (#2535)
LaurentMazare Oct 2, 2024
20a57c4
Fix set_dtype
EricLBuehler Oct 3, 2024
56aacb0
Make the RNN configs accessible from the models. (#2541)
LaurentMazare Oct 4, 2024
410c89f
Add required feature for whisper example in Readme (#2539)
dengelt Oct 4, 2024
d2e4329
Tensor tools print all (#2543)
LaurentMazare Oct 5, 2024
f856b5c
pyo3 update. (#2545)
LaurentMazare Oct 6, 2024
e4a96f9
Switch to using the MLX matmul by default. (#2547)
LaurentMazare Oct 6, 2024
edf7668
improve (#2548)
jorgeantonio21 Oct 7, 2024
937e8ed
Add BertForMaskedLM to support SPLADE Models (#2550)
akshayballal95 Oct 7, 2024
0d96ec3
feat: intergrate chinese clip and add example (#2555)
SethWen Oct 10, 2024
ca7cf5c
Add Stable Diffusion 3 Example (#2558)
Czxck001 Oct 13, 2024
6eab6b5
Fix the guide to gain access to Stable Diffusion 3 Medium (#2559)
Czxck001 Oct 13, 2024
41ade77
fix: Allow marian configs to deserialize from json. (#2556)
Mikarific Oct 13, 2024
f553ab5
Adds support for Stella_en_v5 embedding model - 1.5B variant (#2551)
AnubhabB Oct 13, 2024
3d1dc06
Enable stable-diffusion 3 on metal. (#2560)
LaurentMazare Oct 14, 2024
a01aa89
onnx: ReduceMin/Max Ops (#2563)
AnubhabB Oct 15, 2024
dcd8333
Testcases (#2567)
AnubhabB Oct 17, 2024
fa4902f
Add initial f8 e4m3 dtype (#31)
EricLBuehler Oct 17, 2024
d050b60
Remove .vscode
EricLBuehler Oct 17, 2024
6287750
Fix some metal warnings
EricLBuehler Oct 17, 2024
7c09215
ONNX: GatherElements, Xor (#2568)
AnubhabB Oct 17, 2024
a2e9d41
use softmax_last_dim (metal and cuda kernel) in llama attention layer…
zackangelo Oct 23, 2024
1f8a28a
Sync ggml metal kernels (#33)
EricLBuehler Oct 25, 2024
3699c1a
Fix the repo name for llama 3.1. (#2576)
LaurentMazare Oct 26, 2024
07849aa
Update README.md (#2577)
sashaphmn Oct 26, 2024
522531d
Add some fast Metal MLX SDPA kernels (#32)
EricLBuehler Oct 26, 2024
37e0ab8
Stable diffusion 3.5 support. (#2578)
LaurentMazare Oct 27, 2024
594d984
Support for UG kernels. (#2579)
LaurentMazare Oct 27, 2024
0e2c8c1
UG metal integration. (#2580)
LaurentMazare Oct 27, 2024
41324ef
Merge remote-tracking branch 'upstream/main'
EricLBuehler Oct 27, 2024
aa93235
Conditional compilation for bf16
EricLBuehler Oct 28, 2024
629ec72
Conditional compilation for bf16
EricLBuehler Oct 28, 2024
498bc2c
Release the mmdit model earlier to reduce memory usage. (#2581)
LaurentMazare Oct 28, 2024
139ff56
Reduce memory usage for sd 3.5. (#2582)
LaurentMazare Oct 28, 2024
2d3df4a
Patch missing seed value
EricLBuehler Oct 29, 2024
d232e13
Support sd3.5 medium and MMDiT-X (#2587)
Czxck001 Oct 30, 2024
7ac0de1
Lazy upcasting for t5. (#2589)
LaurentMazare Oct 30, 2024
530ab96
Support Skip Layer Guidance (SLG) for Stable Diffusion 3.5 Medium (#2…
Czxck001 Nov 1, 2024
3fba2b5
Add the SmolLM2 models. (#2595)
LaurentMazare Nov 3, 2024
6454597
Improved launch config for layer-norm/rms-norm. (#2591)
LaurentMazare Nov 4, 2024
e2b6b36
Add some fast Metal MLX SDPA kernels (#2584)
EricLBuehler Nov 5, 2024
2e17ebd
Fix metal sdpa for v stride (#37)
EricLBuehler Nov 10, 2024
11495ab
Fix cpu map for i32
EricLBuehler Nov 11, 2024
3769206
Update docs (#2553)
zachcp Nov 11, 2024
9453cc3
Bump the crate version to 0.8.0. (#2612)
LaurentMazare Nov 12, 2024
06350c3
Add some missing index-select metal kernels. (#2613)
LaurentMazare Nov 12, 2024
be54a9a
Complete merge
EricLBuehler Nov 13, 2024
855fe38
Complete merge
EricLBuehler Nov 13, 2024
77a6cc6
Attention-optimized softmax for prompts (#38)
EricLBuehler Nov 14, 2024
6be03dd
Metal qmatmul mat-mat product (#39)
EricLBuehler Nov 14, 2024
cb8082b
Metal: Use mtl resource shared to avoid copy (#40)
EricLBuehler Nov 17, 2024
e97177b
Dont always compile fp8, bf16 for CUDA (#42)
EricLBuehler Nov 20, 2024
6b10eac
F8E4M3 support on Metal (#43)
EricLBuehler Nov 24, 2024
823a83a
Integrate fast MLX kernel for SDPA with long seqlen (#45)
EricLBuehler Nov 26, 2024
8742354
General Metal bf16 support (#46)
EricLBuehler Nov 28, 2024
e5685ce
Experimental imatrix and I- quants support (#47)
EricLBuehler Nov 29, 2024
84c89a6
Fix q3k imatrix quantization (#48)
EricLBuehler Dec 1, 2024
db0e646
Ensure support to cuda cc 53 (#49)
EricLBuehler Dec 1, 2024
ce17ba6
Fix Metal F8E4M3 impl (#50)
EricLBuehler Dec 2, 2024
a3814f5
Fix duplicate cuda cast instantiations (#51)
EricLBuehler Dec 2, 2024
df172aa
Add inplace softmax
EricLBuehler Dec 10, 2024
2c7408b
Add varbuilder get_unchecked (#52)
EricLBuehler Dec 10, 2024
394ec76
Merge branch 'main' into inplace_softmax
EricLBuehler Dec 10, 2024
c7bd96d
inplace_attn_softmax_last_dim
EricLBuehler Dec 10, 2024
e573895
Fix cuda
EricLBuehler Dec 11, 2024
0e8e8cb
Format
EricLBuehler Dec 11, 2024
6800496
Merge pull request #53 from EricLBuehler/inplace_softmax
EricLBuehler Dec 11, 2024
c0c2b23
Metal addmm support (#54)
EricLBuehler Dec 14, 2024
af655eb
Use cudarc fork to fix windows build (#58)
EricLBuehler Jan 7, 2025
f524bc6
Use float8 mistralrs_cudarc_fork feature (#59)
EricLBuehler Jan 7, 2025
f7d9f06
Begin to remove ug (#60)
EricLBuehler Jan 7, 2025
fd28f08
Fix Windows (msvc) build
sgrebnov Jan 18, 2025
3 changes: 2 additions & 1 deletion .github/workflows/ci_cuda.yaml
@@ -9,7 +9,8 @@ jobs:
     concurrency:
       group: ${{ github.workflow }}-${{ github.job }}-${{ github.head_ref || github.run_id }}
       cancel-in-progress: true
-    runs-on: [single-gpu, nvidia-gpu, t4, ci]
+    runs-on:
+      group: aws-g4dn-2xlarge
     container:
       image: nvidia/cuda:12.3.1-devel-ubuntu22.04
       options: --gpus 0
6 changes: 3 additions & 3 deletions .github/workflows/python.yml
@@ -18,9 +18,9 @@ jobs:
     strategy:
       matrix:
         os: [ubuntu-latest] # For now, only test on Linux
-    steps:
+    steps:
     - name: Checkout repository
-      uses: actions/checkout@v2
+      uses: actions/checkout@v4

    - name: Install Rust
      uses: actions-rs/toolchain@v1
@@ -65,4 +65,4 @@ jobs:
       working-directory: ./candle-pyo3
       run: |
         source .env/bin/activate
-        python -m pytest -s -v tests
+        python -m pytest -s -v tests
12 changes: 6 additions & 6 deletions .github/workflows/rust-ci.yml
@@ -1,6 +1,6 @@
-on:
+on:
   push:
-    branches:
+    branches:
       - main
   pull_request:

@@ -15,7 +15,7 @@ jobs:
         os: [ubuntu-latest, windows-latest, macOS-latest]
         rust: [stable]
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v4
       - uses: actions-rs/toolchain@v1
         with:
           profile: minimal
@@ -34,7 +34,7 @@ jobs:
         os: [ubuntu-latest, windows-latest, macOS-latest]
         rust: [stable]
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v4
       - uses: actions-rs/toolchain@v1
         with:
           profile: minimal
@@ -49,7 +49,7 @@ jobs:
     name: Rustfmt
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v4
       - uses: actions-rs/toolchain@v1
         with:
           profile: minimal
@@ -65,7 +65,7 @@ jobs:
     name: Clippy
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v4
       - uses: actions-rs/toolchain@v1
         with:
           profile: minimal
6 changes: 6 additions & 0 deletions .gitignore
@@ -40,3 +40,9 @@ candle-wasm-examples/*/package-lock.json
 candle-wasm-examples/**/config*.json
 .DS_Store
 .idea/*
+__pycache__
+out.safetensors
+out.wav
+bria.mp3
+bria.safetensors
+bria.wav
12 changes: 0 additions & 12 deletions .vscode/settings.json

This file was deleted.

23 changes: 12 additions & 11 deletions Cargo.toml
@@ -20,7 +20,7 @@ exclude = [
 resolver = "2"

 [workspace.package]
-version = "0.6.0"
+version = "0.8.0"
 edition = "2021"
 description = "Minimalist ML framework."
 repository = "https://github.com/huggingface/candle"
@@ -33,21 +33,22 @@ ab_glyph = "0.2.23"
 accelerate-src = { version = "0.3.2" }
 anyhow = { version = "1", features = ["backtrace"] }
 byteorder = "1.4.3"
-candle = { path = "./candle-core", package = "candle-core", version = "0.6.0" }
-candle-datasets = { path = "./candle-datasets", version = "0.6.0" }
-candle-flash-attn = { path = "./candle-flash-attn", version = "0.6.0" }
-candle-kernels = { path = "./candle-kernels", version = "0.6.0" }
-candle-metal-kernels = { path = "./candle-metal-kernels", version = "0.6.0" }
-candle-nn = { path = "./candle-nn", version = "0.6.0" }
-candle-onnx = { path = "./candle-onnx", version = "0.6.0" }
-candle-transformers = { path = "./candle-transformers", version = "0.6.0" }
+candle = { path = "./candle-core", package = "candle-core", version = "0.8.0" }
+candle-datasets = { path = "./candle-datasets", version = "0.8.0" }
+candle-flash-attn = { path = "./candle-flash-attn", version = "0.8.0" }
+candle-kernels = { path = "./candle-kernels", version = "0.8.0" }
+candle-metal-kernels = { path = "./candle-metal-kernels", version = "0.8.0" }
+candle-nn = { path = "./candle-nn", version = "0.8.0" }
+candle-onnx = { path = "./candle-onnx", version = "0.8.0" }
+candle-transformers = { path = "./candle-transformers", version = "0.8.0" }
 clap = { version = "4.2.4", features = ["derive"] }
 criterion = { version = "0.5.1", default-features=false }
-cudarc = { version = "=0.11.6", features = ["std", "cublas", "cublaslt", "curand", "driver", "nvrtc", "f16", "cuda-version-from-build-system", "dynamic-linking"], default-features=false }
+cudarc = { package = "mistralrs_cudarc_fork", version = "0.12.2", features = ["std", "cublas", "cublaslt", "curand", "driver", "nvrtc", "f16", "cuda-version-from-build-system", "dynamic-linking"], default-features=false }
 fancy-regex = "0.13.0"
 gemm = { version = "0.17.0", features = ["wasm-simd128-enable"] }
-hf-hub = "0.3.0"
+hf-hub = { version = "0.3.3", package = "candle-hf-hub" }
 half = { version = "2.3.1", features = ["num-traits", "use-intrinsics", "rand_distr"] }
+float8 = { version = "0.1.2", features = ["num-traits", "rand_distr"] }
 hound = "3.5.1"
 image = { version = "0.25.2", default-features = false, features = ["jpeg", "png"] }
 imageproc = { version = "0.24.0", default-features = false }
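
One thing keeps this diff small: both swaps use Cargo's dependency renaming (`package = "..."`), so the crates still compile under their original names and `use cudarc::...` call sites need no changes (the candle-book update further down is the exception, since it names the replacement crate explicitly). A minimal sketch of downstream code that is unaffected, assuming the fork preserves cudarc's public `driver` API:

```rust
// Cargo resolves `cudarc` to the fork because of the rename above:
//   cudarc = { package = "mistralrs_cudarc_fork", version = "0.12.2", ... }
// so an existing import like this keeps compiling unchanged.
use cudarc::driver::CudaDevice;

fn main() -> Result<(), Box<dyn std::error::Error>> {
    let dev = CudaDevice::new(0)?; // ordinal 0; assumes a CUDA-capable machine
    println!("initialized CUDA device ordinal {}", dev.ordinal());
    Ok(())
}
```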
15 changes: 11 additions & 4 deletions README.md
@@ -2,7 +2,8 @@
 [![discord server](https://dcbadge.vercel.app/api/server/hugging-face-879548962464493619)](https://discord.gg/hugging-face-879548962464493619)
 [![Latest version](https://img.shields.io/crates/v/candle-core.svg)](https://crates.io/crates/candle-core)
 [![Documentation](https://docs.rs/candle-core/badge.svg)](https://docs.rs/candle-core)
-![License](https://img.shields.io/crates/l/candle-core.svg)
+[![License](https://img.shields.io/github/license/base-org/node?color=blue)](https://github.com/huggingface/candle/blob/main/LICENSE-MIT)
+[![License](https://img.shields.io/badge/license-Apache%202.0-blue?style=flat-square)](https://github.com/huggingface/candle/blob/main/LICENSE-APACHE)

 **This is an optimized implementation by Eric Buehler.**

@@ -65,7 +66,9 @@ We also provide some command line based examples using state of the art models
 - [LLaMA v1, v2, and v3](./candle-examples/examples/llama/): general LLM, includes
   the SOLAR-10.7B variant.
 - [Falcon](./candle-examples/examples/falcon/): general LLM.
-- [Gemma](./candle-examples/examples/gemma/): 2b and 7b general LLMs from Google Deepmind.
+- [Codegeex4](./candle-examples/examples/codegeex4-9b/): code completion, code interpreter, web search, function calling, repository-level
+- [GLM4](./candle-examples/examples/glm4/): open multilingual multimodal chat LMs by THUDM
+- [Gemma v1 and v2](./candle-examples/examples/gemma/): 2b and 7b+/9b general LLMs from Google Deepmind.
 - [RecurrentGemma](./candle-examples/examples/recurrent-gemma/): 2b and 7b
   Griffin based models from Google that mix attention with a RNN like state.
 - [Phi-1, Phi-1.5, Phi-2, and Phi-3](./candle-examples/examples/phi/): 1.3b,
@@ -120,6 +123,8 @@ We also provide some command line based examples using state of the art models
   model using residual vector quantization.
 - [MetaVoice](./candle-examples/examples/metavoice/): foundational model for
   text-to-speech.
+- [Parler-TTS](./candle-examples/examples/parler-tts/): large text-to-speech
+  model.
 - [T5](./candle-examples/examples/t5), [Bert](./candle-examples/examples/bert/),
   [JinaBert](./candle-examples/examples/jina-bert/) : useful for sentence embeddings.
 - [DINOv2](./candle-examples/examples/dinov2/): computer vision model trained
@@ -185,6 +190,7 @@ And then head over to
 - [`candle-sampling`](https://github.com/EricLBuehler/candle-sampling): Sampling techniques for Candle.
 - [`gpt-from-scratch-rs`](https://github.com/jeroenvlek/gpt-from-scratch-rs): A port of Andrej Karpathy's _Let's build GPT_ tutorial on YouTube showcasing the Candle API on a toy problem.
 - [`candle-einops`](https://github.com/tomsanbear/candle-einops): A pure rust implementation of the python [einops](https://github.com/arogozhnikov/einops) library.
+- [`atoma-infer`](https://github.com/atoma-network/atoma-infer): A Rust library for fast inference at scale, leveraging FlashAttention2 for efficient attention computation, PagedAttention for efficient KV-cache memory management, and multi-GPU support. It is OpenAI api compatible.

 If you have an addition to this list, please submit a pull request.

@@ -208,7 +214,7 @@ If you have an addition to this list, please submit a pull request.
 - StarCoder, StarCoder2.
 - Phi 1, 1.5, 2, and 3.
 - Mamba, Minimal Mamba
-- Gemma 2b and 7b.
+- Gemma v1 2b and 7b+, v2 2b and 9b.
 - Mistral 7b v0.1.
 - Mixtral 8x7b v0.1.
 - StableLM-3B-4E1T, StableLM-2-1.6B, Stable-Code-3B.
@@ -236,9 +242,10 @@ If you have an addition to this list, please submit a pull request.
 - Whisper, multi-lingual speech-to-text.
 - EnCodec, audio compression model.
 - MetaVoice-1B, text-to-speech model.
+- Parler-TTS, text-to-speech model.
 - Computer Vision Models.
   - DINOv2, ConvMixer, EfficientNet, ResNet, ViT, VGG, RepVGG, ConvNeXT,
-    ConvNeXTv2, MobileOne, EfficientVit (MSRA), MobileNetv4.
+    ConvNeXTv2, MobileOne, EfficientVit (MSRA), MobileNetv4, Hiera, FastViT.
 - yolo-v3, yolo-v8.
 - Segment-Anything Model (SAM).
 - SegFormer.
8 changes: 4 additions & 4 deletions candle-book/src/inference/hub.md
@@ -11,8 +11,8 @@ Then let's start by downloading the [model file](https://huggingface.co/bert-bas

 ```rust
 # extern crate candle_core;
-# extern crate hf_hub;
-use hf_hub::api::sync::Api;
+# extern crate candle_hf_hub;
+use candle_hf_hub::api::sync::Api;
 use candle_core::Device;

 let api = Api::new().unwrap();
@@ -50,8 +50,8 @@ Now that we have our weights, we can use them in our bert architecture:
 ```rust
 # extern crate candle_core;
 # extern crate candle_nn;
-# extern crate hf_hub;
-# use hf_hub::api::sync::Api;
+# extern crate candle_hf_hub;
+# use candle_hf_hub::api::sync::Api;
 #
 # let api = Api::new().unwrap();
 # let repo = api.model("bert-base-uncased".to_string());
3 changes: 2 additions & 1 deletion candle-core/Cargo.toml
@@ -18,6 +18,7 @@ metal = { workspace = true, optional = true}
 cudarc = { workspace = true, optional = true }
 gemm = { workspace = true }
 half = { workspace = true }
+float8 = { workspace = true }
 intel-mkl-src = { workspace = true, optional = true }
 libc = { workspace = true, optional = true }
 memmap2 = { workspace = true }
@@ -39,7 +40,7 @@ criterion = { workspace = true }

 [features]
 default = []
-cuda = ["cudarc", "dep:candle-kernels"]
+cuda = ["cudarc", "dep:candle-kernels", "float8/mistralrs_cudarc_fork"]
 cudnn = ["cuda", "cudarc/cudnn"]
 mkl = ["dep:libc", "dep:intel-mkl-src"]
 accelerate = ["dep:libc", "dep:accelerate-src"]
7 changes: 5 additions & 2 deletions candle-core/benches/benchmarks/mod.rs
@@ -20,13 +20,16 @@ impl BenchDevice for Device {
             Device::Cpu => Ok(()),
             Device::Cuda(device) => {
                 #[cfg(feature = "cuda")]
-                return Ok(device.synchronize()?);
+                {
+                    use cuda::WrapErr;
+                    return Ok(device.synchronize().w()?);
+                }
                 #[cfg(not(feature = "cuda"))]
                 panic!("Cuda device without cuda feature enabled: {:?}", device)
             }
             Device::Metal(device) => {
                 #[cfg(feature = "metal")]
-                return Ok(device.wait_until_completed()?);
+                return device.wait_until_completed();
                 #[cfg(not(feature = "metal"))]
                 panic!("Metal device without metal feature enabled: {:?}", device)
             }
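
The cuda arm now routes the driver error through a `.w()` adapter instead of relying on a `From` impl — this ties into the error-ergonomics work in commit ad84486. The trait itself isn't shown in this diff; below is a self-contained reconstruction of the idiom as I understand it (an illustrative assumption, not candle's exact definition):

```rust
// Hypothetical sketch of the WrapErr idiom: adapt any displayable error
// type into the local error enum at the call site via a one-method trait.
#[derive(Debug)]
enum Error {
    Msg(String),
}

type Result<T> = std::result::Result<T, Error>;

trait WrapErr<T> {
    fn w(self) -> Result<T>;
}

impl<T, E: std::fmt::Display> WrapErr<T> for std::result::Result<T, E> {
    fn w(self) -> Result<T> {
        self.map_err(|e| Error::Msg(e.to_string()))
    }
}

fn main() {
    let r: std::result::Result<i32, std::num::ParseIntError> = "42".parse();
    let v = r.w().unwrap(); // the error, if any, is now the local Error type
    println!("{v}");
}
```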
17 changes: 16 additions & 1 deletion candle-core/src/backend.rs
@@ -89,9 +89,23 @@ pub trait BackendStorage: Sized {
         _: usize,
     ) -> Result<Self>;

-    fn matmul(
+    #[allow(clippy::too_many_arguments)]
+    fn matmul_with_alpha_beta(
+        &self,
+        _: &Self,
+        _: &mut Self,
+        _: Option<f64>,
+        _: (usize, usize, usize, usize),
+        _: &Layout,
+        _: &Layout,
+        _: &Layout,
+    ) -> Result<()>;
+
+    #[allow(clippy::too_many_arguments)]
+    fn matmul_with_alpha(
         &self,
         _: &Self,
+        _: Option<f64>,
         _: (usize, usize, usize, usize),
         _: &Layout,
         _: &Layout,
@@ -144,6 +158,7 @@ pub trait BackendDevice: Sized + std::fmt::Debug + Clone {
     fn rand_normal(&self, _: &Shape, _: DType, _: f64, _: f64) -> Result<Self::Storage>;

     fn set_seed(&self, _: u64) -> Result<()>;
+    fn get_current_seed(&self) -> Result<u64>;

     /// Synchronize should block until all the operations on the device are completed.
     fn synchronize(&self) -> Result<()>;
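
A note on semantics, since the hunk only shows signatures: going by the BLAS/cuBLASLt-style naming used throughout the commit list ("Add sdpa function with cublaslt", "Add matmul_alpha"), `matmul_with_alpha_beta` presumably accumulates a scaled product into the buffer passed as `&mut Self` (hence `Result<()>`), while `matmul_with_alpha` returns fresh storage; `None` for the optional scalar would default the scale to 1, and the `beta` in the name suggests the accumulator term with β fixed at 1 given the single `Option<f64>`. This reading is inferred from the signatures, not stated in the diff:

```latex
% Assumed contract for the two new backend entry points, scale \alpha optional:
% matmul_with_alpha_beta writes into the &mut Self accumulator,
% matmul_with_alpha returns new storage.
\[
C \leftarrow \alpha\,(A B) + C, \qquad D = \alpha\,(A B)
\]
```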
9 changes: 7 additions & 2 deletions candle-core/src/backprop.rs
@@ -623,9 +623,9 @@ impl Tensor {
                 }
                 Op::Unary(arg, UnaryOp::Silu) => {
                     let sum_grad = grads.or_insert(arg)?;
-                    // d/dx silu = sigmoid(x) * (1 + x * (1 - sigmoid(x)))
+                    // d/dx silu = sigmoid(x) * (1 + x * (1 - sigmoid(x))) = sigmoid(x) * (1 - node) + node
                     let sigmoid_arg = (arg.neg()?.exp()? + 1.)?.recip()?;
-                    let silu_grad = (&sigmoid_arg * (1. + (arg * (1. - &sigmoid_arg)?)?)?)?;
+                    let silu_grad = &sigmoid_arg * (1. - *node) + *node;
                     *sum_grad = sum_grad.add(&(&grad * silu_grad)?)?
                 }
                 Op::Elu(arg, alpha) => {
@@ -756,4 +756,9 @@ impl GradStore {
         };
         Ok(grad)
     }
+
+    /// Get the tensor ids of the stored gradient tensors
+    pub fn get_ids(&self) -> impl Iterator<Item = &TensorId> {
+        self.0.keys()
+    }
 }
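
The SiLU change above is the point of commit c0a559d ("optimize gradient for silu a bit"): with y = silu(x) = x·σ(x) already computed as the node's forward output, the gradient can reuse it instead of re-deriving x(1 − σ(x)). The identity being relied on, written out (my expansion, not part of the diff):

```latex
\frac{d}{dx}\,\bigl(x\,\sigma(x)\bigr)
  = \sigma(x) + x\,\sigma(x)\bigl(1 - \sigma(x)\bigr)  % product rule, \sigma' = \sigma(1 - \sigma)
  = \sigma(x)\bigl(1 + x(1 - \sigma(x))\bigr)          % the old comment's form
  = \sigma(x)\,(1 - y) + y                             % substitute y = x\,\sigma(x)
```

The new `GradStore::get_ids` accessor in the second hunk can be exercised like this — a minimal sketch assuming the fork keeps upstream candle's `Var`/`backward` API:

```rust
use candle_core::{Device, Tensor, Var};

fn main() -> candle_core::Result<()> {
    let x = Var::from_tensor(&Tensor::new(&[1f32, 2., 3.], &Device::Cpu)?)?;
    let loss = x.as_tensor().sqr()?.sum_all()?;
    let grads = loss.backward()?;
    // get_ids exposes the keys of the internal map, i.e. every tensor
    // for which a gradient was accumulated during this backward pass.
    for id in grads.get_ids() {
        println!("gradient stored for tensor id {id:?}");
    }
    Ok(())
}
```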
16 changes: 16 additions & 0 deletions candle-core/src/convert.rs
@@ -1,5 +1,6 @@
 //! Implement conversion traits for tensors
 use crate::{DType, Device, Error, Tensor, WithDType};
+use float8::F8E4M3;
 use half::{bf16, f16, slice::HalfFloatSliceExt};
 use std::convert::TryFrom;

@@ -130,6 +131,16 @@ impl Tensor {
                     f.write_u32::<LittleEndian>(v)?
                 }
             }
+            DType::I16 => {
+                for v in vs.to_vec1::<i16>()? {
+                    f.write_i16::<LittleEndian>(v)?
+                }
+            }
+            DType::I32 => {
+                for v in vs.to_vec1::<i32>()? {
+                    f.write_i32::<LittleEndian>(v)?
+                }
+            }
             DType::I64 => {
                 for v in vs.to_vec1::<i64>()? {
                     f.write_i64::<LittleEndian>(v)?
@@ -139,6 +150,11 @@ impl Tensor {
                 let vs = vs.to_vec1::<u8>()?;
                 f.write_all(&vs)?;
             }
+            DType::F8E4M3 => {
+                for v in vs.to_vec1::<F8E4M3>()? {
+                    f.write_u8(v.to_bits())?
+                }
+            }
         }
         Ok(())
     }
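
Since `F8E4M3` occupies a single byte, `to_bits` needs no endianness handling, unlike the `write_i16`/`write_i32` arms above. A quick round-trip sketch against the `float8` crate — `to_bits` appears in the hunk, while `from_bits`/`from_f32`/`to_f32` are assumed from the crate's half-style API:

```rust
use float8::F8E4M3;

fn main() {
    let v = F8E4M3::from_f32(1.5);
    let byte = v.to_bits(); // one u8; exactly what the writer above emits
    let back = F8E4M3::from_bits(byte);
    // 1.5 is exactly representable in e4m3, so the round-trip is lossless.
    assert_eq!(v.to_f32(), back.to_f32());
    println!("0x{byte:02x} round-trips to {}", back.to_f32());
}
```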