From 5c4c351a1a0729f8e7a88e6895379bb952ed1a07 Mon Sep 17 00:00:00 2001 From: Jack O'Connor Date: Sun, 18 Aug 2024 11:16:13 -0700 Subject: [PATCH] make xof_many fall back to compress_xof instead of portable code --- .github/workflows/ci.yml | 20 ++++++++++++++++++++ c/blake3_dispatch.c | 4 +++- c/blake3_impl.h | 3 +++ src/platform.rs | 13 +++++++++++-- src/portable.rs | 4 ++++ 5 files changed, 41 insertions(+), 3 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index d7c5de48c..b031602b6 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -126,6 +126,26 @@ jobs: - name: cargo test C bindings intrinsics run: cargo test --features=prefer_intrinsics working-directory: ./c/blake3_c_rust_bindings + - name: cargo test C bindings no AVX-512 + run: cargo test + working-directory: ./c/blake3_c_rust_bindings + env: + CFLAGS: -DBLAKE3_NO_AVX512 + - name: cargo test C bindings no AVX2 + run: cargo test + working-directory: ./c/blake3_c_rust_bindings + env: + CFLAGS: -DBLAKE3_NO_AVX512 -DBLAKE3_NO_AVX2 + - name: cargo test C bindings no SSE41 + run: cargo test + working-directory: ./c/blake3_c_rust_bindings + env: + CFLAGS: -DBLAKE3_NO_AVX512 -DBLAKE3_NO_AVX2 -DBLAKE3_NO_SSE41 + - name: cargo test C bindings no SSE2 + run: cargo test + working-directory: ./c/blake3_c_rust_bindings + env: + CFLAGS: -DBLAKE3_NO_AVX512 -DBLAKE3_NO_AVX2 -DBLAKE3_NO_SSE41 -DBLAKE3_NO_SSE2 # Reference impl doc test. - name: reference impl doc test run: cargo test diff --git a/c/blake3_dispatch.c b/c/blake3_dispatch.c index cf5bad7ee..5c76b14b6 100644 --- a/c/blake3_dispatch.c +++ b/c/blake3_dispatch.c @@ -241,7 +241,9 @@ void blake3_xof_many(const uint32_t cv[8], } #endif #endif - blake3_xof_many_portable(cv, block, block_len, counter, flags, out, outblocks); + for(size_t i = 0; i < outblocks; ++i) { + blake3_compress_xof(cv, block, block_len, counter + i, flags, out + 64*i); + } } void blake3_hash_many(const uint8_t *const *inputs, size_t num_inputs, diff --git a/c/blake3_impl.h b/c/blake3_impl.h index abd754677..3da773b95 100644 --- a/c/blake3_impl.h +++ b/c/blake3_impl.h @@ -222,6 +222,9 @@ void blake3_compress_xof_portable(const uint32_t cv[8], uint8_t block_len, uint64_t counter, uint8_t flags, uint8_t out[64]); +// This function is test-only. When blake3_xof_many doesn't have an optimized implementation, +// it loops over blake3_compress_xof instead of falling back to this, so it still benefits +// from compress optimizations. void blake3_xof_many_portable(const uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags, diff --git a/src/platform.rs b/src/platform.rs index 590a77cc2..cd8ef63d2 100644 --- a/src/platform.rs +++ b/src/platform.rs @@ -282,7 +282,7 @@ impl Platform { cv: &CVWords, block: &[u8; BLOCK_LEN], block_len: u8, - counter: u64, + mut counter: u64, flags: u8, out: &mut [u8], ) { @@ -299,7 +299,16 @@ impl Platform { Platform::AVX512 => unsafe { crate::avx512::xof_many(cv, block, block_len, counter, flags, out) }, - _ => crate::portable::xof_many(cv, block, block_len, counter, flags, out), + _ => { + // For platforms without an optimized xof_many, fall back to a loop over + // compress_xof. This is still faster than portable code. + for out_block in out.chunks_exact_mut(BLOCK_LEN) { + // TODO: Use array_chunks_mut here once that's stable. + let out_array: &mut [u8; BLOCK_LEN] = out_block.try_into().unwrap(); + *out_array = self.compress_xof(cv, block, block_len, counter, flags); + counter += 1; + } + } } } diff --git a/src/portable.rs b/src/portable.rs index 4181f279b..35b5f5d44 100644 --- a/src/portable.rs +++ b/src/portable.rs @@ -177,6 +177,10 @@ pub fn hash_many( } } +// This function is test-only. When platform::xof_many() doesn't have an optimized implementation, +// it loops over platform::compress_xof() instead of falling back to this, so it still benefits +// from compress optimizations. +#[cfg(test)] pub fn xof_many( cv: &CVWords, block: &[u8; BLOCK_LEN],