From 26bddf58420f40b7f9e7d67f390d527e56c4610d Mon Sep 17 00:00:00 2001 From: Ralf Jung Date: Sun, 25 Feb 2024 17:28:03 +0100 Subject: [PATCH 1/3] non-temporal stores: use inline assembly --- crates/core_arch/src/x86/avx.rs | 18 +++++++++++++++--- crates/core_arch/src/x86/avx512f.rs | 18 +++++++++++++++--- crates/core_arch/src/x86/mod.rs | 2 +- crates/core_arch/src/x86/sse.rs | 6 +++++- crates/core_arch/src/x86/sse2.rs | 22 +++++++++++++++++----- crates/core_arch/src/x86_64/sse2.rs | 11 ++++++----- 6 files changed, 59 insertions(+), 18 deletions(-) diff --git a/crates/core_arch/src/x86/avx.rs b/crates/core_arch/src/x86/avx.rs index fba2901601..3108aa4d4a 100644 --- a/crates/core_arch/src/x86/avx.rs +++ b/crates/core_arch/src/x86/avx.rs @@ -1718,7 +1718,11 @@ pub unsafe fn _mm256_lddqu_si256(mem_addr: *const __m256i) -> __m256i { #[cfg_attr(test, assert_instr(vmovntps))] // FIXME vmovntdq #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_stream_si256(mem_addr: *mut __m256i, a: __m256i) { - intrinsics::nontemporal_store(mem_addr, a); + crate::arch::asm!( + "vmovntps [{mem_addr}], {a}", + mem_addr = in(reg) mem_addr, + a = in(ymm_reg) a, + ); } /// Moves double-precision values from a 256-bit vector of `[4 x double]` @@ -1741,7 +1745,11 @@ pub unsafe fn _mm256_stream_si256(mem_addr: *mut __m256i, a: __m256i) { #[stable(feature = "simd_x86", since = "1.27.0")] #[allow(clippy::cast_ptr_alignment)] pub unsafe fn _mm256_stream_pd(mem_addr: *mut f64, a: __m256d) { - intrinsics::nontemporal_store(mem_addr as *mut __m256d, a); + crate::arch::asm!( + "vmovntps [{mem_addr}], {a}", + mem_addr = in(reg) mem_addr, + a = in(ymm_reg) a, + ); } /// Moves single-precision floating point values from a 256-bit vector @@ -1765,7 +1773,11 @@ pub unsafe fn _mm256_stream_pd(mem_addr: *mut f64, a: __m256d) { #[stable(feature = "simd_x86", since = "1.27.0")] #[allow(clippy::cast_ptr_alignment)] pub unsafe fn _mm256_stream_ps(mem_addr: *mut f32, a: __m256) { - intrinsics::nontemporal_store(mem_addr as *mut __m256, a); + crate::arch::asm!( + "vmovntps [{mem_addr}], {a}", + mem_addr = in(reg) mem_addr, + a = in(ymm_reg) a, + ); } /// Computes the approximate reciprocal of packed single-precision (32-bit) diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index a37cda1c51..e714af50f1 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -28014,7 +28014,11 @@ pub unsafe fn _mm_mask_testn_epi64_mask(k: __mmask8, a: __m128i, b: __m128i) -> #[cfg_attr(test, assert_instr(vmovntps))] #[allow(clippy::cast_ptr_alignment)] pub unsafe fn _mm512_stream_ps(mem_addr: *mut f32, a: __m512) { - intrinsics::nontemporal_store(mem_addr as *mut __m512, a); + crate::arch::asm!( + "vmovntps [{mem_addr}], {a}", + mem_addr = in(reg) mem_addr, + a = in(zmm_reg) a, + ); } /// Store 512-bits (composed of 8 packed double-precision (64-bit) floating-point elements) from a into memory using a non-temporal memory hint. mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated. @@ -28035,7 +28039,11 @@ pub unsafe fn _mm512_stream_ps(mem_addr: *mut f32, a: __m512) { #[cfg_attr(test, assert_instr(vmovntps))] //should be vmovntpd #[allow(clippy::cast_ptr_alignment)] pub unsafe fn _mm512_stream_pd(mem_addr: *mut f64, a: __m512d) { - intrinsics::nontemporal_store(mem_addr as *mut __m512d, a); + crate::arch::asm!( + "vmovntps [{mem_addr}], {a}", + mem_addr = in(reg) mem_addr, + a = in(zmm_reg) a, + ); } /// Store 512-bits of integer data from a into memory using a non-temporal memory hint. mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated. @@ -28056,7 +28064,11 @@ pub unsafe fn _mm512_stream_pd(mem_addr: *mut f64, a: __m512d) { #[cfg_attr(test, assert_instr(vmovntps))] //should be vmovntdq #[allow(clippy::cast_ptr_alignment)] pub unsafe fn _mm512_stream_si512(mem_addr: *mut i64, a: __m512i) { - intrinsics::nontemporal_store(mem_addr as *mut __m512i, a); + crate::arch::asm!( + "vmovntps [{mem_addr}], {a}", + mem_addr = in(reg) mem_addr, + a = in(zmm_reg) a, + ); } /// Sets packed 32-bit integers in `dst` with the supplied values. diff --git a/crates/core_arch/src/x86/mod.rs b/crates/core_arch/src/x86/mod.rs index e04b7910d2..e30816b508 100644 --- a/crates/core_arch/src/x86/mod.rs +++ b/crates/core_arch/src/x86/mod.rs @@ -2,7 +2,7 @@ #[allow(unused_imports)] use crate::marker::Sized; -use crate::{intrinsics, mem::transmute}; +use crate::mem::transmute; #[macro_use] mod macros; diff --git a/crates/core_arch/src/x86/sse.rs b/crates/core_arch/src/x86/sse.rs index bee8291dcc..ca560dc779 100644 --- a/crates/core_arch/src/x86/sse.rs +++ b/crates/core_arch/src/x86/sse.rs @@ -2002,7 +2002,11 @@ extern "C" { #[stable(feature = "simd_x86", since = "1.27.0")] #[allow(clippy::cast_ptr_alignment)] pub unsafe fn _mm_stream_ps(mem_addr: *mut f32, a: __m128) { - intrinsics::nontemporal_store(mem_addr as *mut __m128, a); + crate::arch::asm!( + "movntps [{mem_addr}], {a}", + mem_addr = in(reg) mem_addr, + a = in(xmm_reg) a, + ); } #[cfg(test)] diff --git a/crates/core_arch/src/x86/sse2.rs b/crates/core_arch/src/x86/sse2.rs index 70750e371d..320f26b3f4 100644 --- a/crates/core_arch/src/x86/sse2.rs +++ b/crates/core_arch/src/x86/sse2.rs @@ -1327,11 +1327,15 @@ pub unsafe fn _mm_storel_epi64(mem_addr: *mut __m128i, a: __m128i) { /// /// See [`_mm_sfence`] for details. #[inline] -#[target_feature(enable = "sse2")] +#[target_feature(enable = "sse,sse2")] #[cfg_attr(test, assert_instr(movntps))] // FIXME movntdq #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_stream_si128(mem_addr: *mut __m128i, a: __m128i) { - intrinsics::nontemporal_store(mem_addr, a); + crate::arch::asm!( + "movntps [{mem_addr}], {a}", + mem_addr = in(reg) mem_addr, + a = in(xmm_reg) a, + ); } /// Stores a 32-bit integer value in the specified memory location. @@ -1353,7 +1357,11 @@ pub unsafe fn _mm_stream_si128(mem_addr: *mut __m128i, a: __m128i) { #[cfg_attr(test, assert_instr(movnti))] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_stream_si32(mem_addr: *mut i32, a: i32) { - intrinsics::nontemporal_store(mem_addr, a); + crate::arch::asm!( + "movnti [{mem_addr}], {a:e}", // `:e` for 32bit value + mem_addr = in(reg) mem_addr, + a = in(reg) a, + ); } /// Returns a vector where the low element is extracted from `a` and its upper @@ -2543,12 +2551,16 @@ pub unsafe fn _mm_loadl_pd(a: __m128d, mem_addr: *const f64) -> __m128d { /// /// See [`_mm_sfence`] for details. #[inline] -#[target_feature(enable = "sse2")] +#[target_feature(enable = "sse,sse2")] #[cfg_attr(test, assert_instr(movntps))] // FIXME movntpd #[stable(feature = "simd_x86", since = "1.27.0")] #[allow(clippy::cast_ptr_alignment)] pub unsafe fn _mm_stream_pd(mem_addr: *mut f64, a: __m128d) { - intrinsics::nontemporal_store(mem_addr as *mut __m128d, a); + crate::arch::asm!( + "movntps [{mem_addr}], {a}", + mem_addr = in(reg) mem_addr, + a = in(xmm_reg) a, + ); } /// Stores the lower 64 bits of a 128-bit vector of `[2 x double]` to a diff --git a/crates/core_arch/src/x86_64/sse2.rs b/crates/core_arch/src/x86_64/sse2.rs index ea3dbc41bc..55d856afec 100644 --- a/crates/core_arch/src/x86_64/sse2.rs +++ b/crates/core_arch/src/x86_64/sse2.rs @@ -1,9 +1,6 @@ //! `x86_64`'s Streaming SIMD Extensions 2 (SSE2) -use crate::{ - core_arch::x86::*, - intrinsics::{self, simd::*}, -}; +use crate::{core_arch::x86::*, intrinsics::simd::*}; #[cfg(test)] use stdarch_test::assert_instr; @@ -81,7 +78,11 @@ pub unsafe fn _mm_cvttsd_si64x(a: __m128d) -> i64 { #[cfg_attr(test, assert_instr(movnti))] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_stream_si64(mem_addr: *mut i64, a: i64) { - intrinsics::nontemporal_store(mem_addr, a); + crate::arch::asm!( + "movnti [{mem_addr}], {a}", + mem_addr = in(reg) mem_addr, + a = in(reg) a, + ); } /// Returns a vector whose lowest element is `a` and all higher elements are From bf0d02404302e0e6a00ef1fa058452700dbd7c86 Mon Sep 17 00:00:00 2001 From: Ralf Jung Date: Wed, 28 Feb 2024 18:32:05 +0100 Subject: [PATCH 2/3] fix test_mm512_stream_ps test --- crates/core_arch/src/x86/avx512f.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index e714af50f1..252f826e35 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -54385,9 +54385,9 @@ mod tests { #[simd_test(enable = "avx512f")] unsafe fn test_mm512_stream_ps() { - #[repr(align(32))] + #[repr(align(64))] struct Memory { - pub data: [f32; 16], + pub data: [f32; 16], // 64 bytes } let a = _mm512_set1_ps(7.0); let mut mem = Memory { data: [-1.0; 16] }; From 3e10d094e4659398fea9b97a059856baf2862b6a Mon Sep 17 00:00:00 2001 From: Ralf Jung Date: Fri, 21 Jun 2024 16:36:47 +0200 Subject: [PATCH 3/3] set asm attributes --- crates/core_arch/src/x86/avx.rs | 3 +++ crates/core_arch/src/x86/avx512f.rs | 3 +++ crates/core_arch/src/x86/sse.rs | 1 + crates/core_arch/src/x86/sse2.rs | 3 +++ crates/core_arch/src/x86_64/sse2.rs | 1 + 5 files changed, 11 insertions(+) diff --git a/crates/core_arch/src/x86/avx.rs b/crates/core_arch/src/x86/avx.rs index 3108aa4d4a..82fe0acd23 100644 --- a/crates/core_arch/src/x86/avx.rs +++ b/crates/core_arch/src/x86/avx.rs @@ -1722,6 +1722,7 @@ pub unsafe fn _mm256_stream_si256(mem_addr: *mut __m256i, a: __m256i) { "vmovntps [{mem_addr}], {a}", mem_addr = in(reg) mem_addr, a = in(ymm_reg) a, + options(nostack, preserves_flags), ); } @@ -1749,6 +1750,7 @@ pub unsafe fn _mm256_stream_pd(mem_addr: *mut f64, a: __m256d) { "vmovntps [{mem_addr}], {a}", mem_addr = in(reg) mem_addr, a = in(ymm_reg) a, + options(nostack, preserves_flags), ); } @@ -1777,6 +1779,7 @@ pub unsafe fn _mm256_stream_ps(mem_addr: *mut f32, a: __m256) { "vmovntps [{mem_addr}], {a}", mem_addr = in(reg) mem_addr, a = in(ymm_reg) a, + options(nostack, preserves_flags), ); } diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index 252f826e35..886c533d79 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -28018,6 +28018,7 @@ pub unsafe fn _mm512_stream_ps(mem_addr: *mut f32, a: __m512) { "vmovntps [{mem_addr}], {a}", mem_addr = in(reg) mem_addr, a = in(zmm_reg) a, + options(nostack, preserves_flags), ); } @@ -28043,6 +28044,7 @@ pub unsafe fn _mm512_stream_pd(mem_addr: *mut f64, a: __m512d) { "vmovntps [{mem_addr}], {a}", mem_addr = in(reg) mem_addr, a = in(zmm_reg) a, + options(nostack, preserves_flags), ); } @@ -28068,6 +28070,7 @@ pub unsafe fn _mm512_stream_si512(mem_addr: *mut i64, a: __m512i) { "vmovntps [{mem_addr}], {a}", mem_addr = in(reg) mem_addr, a = in(zmm_reg) a, + options(nostack, preserves_flags), ); } diff --git a/crates/core_arch/src/x86/sse.rs b/crates/core_arch/src/x86/sse.rs index ca560dc779..ecba71fb58 100644 --- a/crates/core_arch/src/x86/sse.rs +++ b/crates/core_arch/src/x86/sse.rs @@ -2006,6 +2006,7 @@ pub unsafe fn _mm_stream_ps(mem_addr: *mut f32, a: __m128) { "movntps [{mem_addr}], {a}", mem_addr = in(reg) mem_addr, a = in(xmm_reg) a, + options(nostack, preserves_flags), ); } diff --git a/crates/core_arch/src/x86/sse2.rs b/crates/core_arch/src/x86/sse2.rs index 320f26b3f4..639ea78894 100644 --- a/crates/core_arch/src/x86/sse2.rs +++ b/crates/core_arch/src/x86/sse2.rs @@ -1335,6 +1335,7 @@ pub unsafe fn _mm_stream_si128(mem_addr: *mut __m128i, a: __m128i) { "movntps [{mem_addr}], {a}", mem_addr = in(reg) mem_addr, a = in(xmm_reg) a, + options(nostack, preserves_flags), ); } @@ -1361,6 +1362,7 @@ pub unsafe fn _mm_stream_si32(mem_addr: *mut i32, a: i32) { "movnti [{mem_addr}], {a:e}", // `:e` for 32bit value mem_addr = in(reg) mem_addr, a = in(reg) a, + options(nostack, preserves_flags), ); } @@ -2560,6 +2562,7 @@ pub unsafe fn _mm_stream_pd(mem_addr: *mut f64, a: __m128d) { "movntps [{mem_addr}], {a}", mem_addr = in(reg) mem_addr, a = in(xmm_reg) a, + options(nostack, preserves_flags), ); } diff --git a/crates/core_arch/src/x86_64/sse2.rs b/crates/core_arch/src/x86_64/sse2.rs index 55d856afec..22788b275a 100644 --- a/crates/core_arch/src/x86_64/sse2.rs +++ b/crates/core_arch/src/x86_64/sse2.rs @@ -82,6 +82,7 @@ pub unsafe fn _mm_stream_si64(mem_addr: *mut i64, a: i64) { "movnti [{mem_addr}], {a}", mem_addr = in(reg) mem_addr, a = in(reg) a, + options(nostack, preserves_flags), ); }