From 9684ebf1bf80b9eb8aa2924da6b9748fb96d6750 Mon Sep 17 00:00:00 2001
From: Diggory Hardy
Date: Sat, 11 Sep 2021 16:36:35 +0100
Subject: [PATCH 1/5] fill_via_chunks: use safe code via chunks_exact_mut on BE

This (specifically, using chunks_exact_mut) actually improves performance
substantially.
---
 rand_core/src/impls.rs | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/rand_core/src/impls.rs b/rand_core/src/impls.rs
index 2588a72ea3f..f16f5b5b8d2 100644
--- a/rand_core/src/impls.rs
+++ b/rand_core/src/impls.rs
@@ -58,9 +58,8 @@ macro_rules! fill_via_chunks {
         let chunk_size_u8 = min($src.len() * SIZE, $dst.len());
         let chunk_size = (chunk_size_u8 + SIZE - 1) / SIZE;
 
-        // The following can be replaced with safe code, but unfortunately it's
-        // ca. 8% slower.
         if cfg!(target_endian = "little") {
+            // On LE we can do a simple copy, which is 25-50% faster:
             unsafe {
                 core::ptr::copy_nonoverlapping(
                     $src.as_ptr() as *const u8,
@@ -68,15 +67,16 @@ macro_rules! fill_via_chunks {
                     chunk_size_u8);
             }
         } else {
-            for (&n, chunk) in $src.iter().zip($dst.chunks_mut(SIZE)) {
-                let tmp = n.to_le();
-                let src_ptr = &tmp as *const $ty as *const u8;
-                unsafe {
-                    core::ptr::copy_nonoverlapping(
-                        src_ptr,
-                        chunk.as_mut_ptr(),
-                        chunk.len());
-                }
+            // This code is valid on all arches, but slower than the above:
+            let mut i = 0;
+            let mut iter = $dst[..chunk_size_u8].chunks_exact_mut(SIZE);
+            while let Some(chunk) = iter.next() {
+                chunk.copy_from_slice(&$src[i].to_le_bytes());
+                i += 1;
+            }
+            let chunk = iter.into_remainder();
+            if !chunk.is_empty() {
+                chunk.copy_from_slice(&$src[i].to_le_bytes()[..chunk.len()]);
             }
         }
 

From 9972046a118844b9d9ea759e292de08f26761b7d Mon Sep 17 00:00:00 2001
From: Diggory Hardy
Date: Mon, 13 Sep 2021 09:30:14 +0100
Subject: [PATCH 2/5] fill_via_chunks: make a generic function

---
 rand_core/src/impls.rs | 76 +++++++++++++++++++++++++-----------------
 1 file changed, 46 insertions(+), 30 deletions(-)

diff --git a/rand_core/src/impls.rs b/rand_core/src/impls.rs
index f16f5b5b8d2..bd21b7c731e 100644
--- a/rand_core/src/impls.rs
+++ b/rand_core/src/impls.rs
@@ -52,36 +52,52 @@ pub fn fill_bytes_via_next<R: RngCore + ?Sized>(rng: &mut R, dest: &mut [u8]) {
     }
 }
 
-macro_rules! fill_via_chunks {
-    ($src:expr, $dst:expr, $ty:ty) => {{
-        const SIZE: usize = core::mem::size_of::<$ty>();
-        let chunk_size_u8 = min($src.len() * SIZE, $dst.len());
-        let chunk_size = (chunk_size_u8 + SIZE - 1) / SIZE;
+trait ToLe: Copy {
+    type Bytes: AsRef<[u8]>;
+    fn to_le_bytes(self) -> Self::Bytes;
+}
+impl ToLe for u32 {
+    type Bytes = [u8; 4];
+    fn to_le_bytes(self) -> Self::Bytes {
+        self.to_le_bytes()
+    }
+}
+impl ToLe for u64 {
+    type Bytes = [u8; 8];
+    fn to_le_bytes(self) -> Self::Bytes {
+        self.to_le_bytes()
+    }
+}
+
+fn fill_via_chunks<T: ToLe>(src: &[T], dest: &mut [u8]) -> (usize, usize) {
+    let size = core::mem::size_of::<T>();
+    let chunk_size_u8 = min(src.len() * size, dest.len());
+    let chunk_size = (chunk_size_u8 + size - 1) / size;
 
-        if cfg!(target_endian = "little") {
-            // On LE we can do a simple copy, which is 25-50% faster:
-            unsafe {
-                core::ptr::copy_nonoverlapping(
-                    $src.as_ptr() as *const u8,
-                    $dst.as_mut_ptr(),
-                    chunk_size_u8);
-            }
-        } else {
-            // This code is valid on all arches, but slower than the above:
-            let mut i = 0;
-            let mut iter = $dst[..chunk_size_u8].chunks_exact_mut(SIZE);
-            while let Some(chunk) = iter.next() {
-                chunk.copy_from_slice(&$src[i].to_le_bytes());
-                i += 1;
-            }
-            let chunk = iter.into_remainder();
-            if !chunk.is_empty() {
-                chunk.copy_from_slice(&$src[i].to_le_bytes()[..chunk.len()]);
-            }
-        }
+    if cfg!(target_endian = "little") {
+        // On LE we can do a simple copy, which is 25-50% faster:
+        unsafe {
+            core::ptr::copy_nonoverlapping(
+                src.as_ptr() as *const u8,
+                dest.as_mut_ptr(),
+                chunk_size_u8,
+            );
+        }
+    } else {
+        // This code is valid on all arches, but slower than the above:
+        let mut i = 0;
+        let mut iter = dest[..chunk_size_u8].chunks_exact_mut(size);
+        while let Some(chunk) = iter.next() {
+            chunk.copy_from_slice(src[i].to_le_bytes().as_ref());
+            i += 1;
+        }
+        let chunk = iter.into_remainder();
+        if !chunk.is_empty() {
+            chunk.copy_from_slice(&src[i].to_le_bytes().as_ref()[..chunk.len()]);
+        }
+    }
 
-        (chunk_size, chunk_size_u8)
-    }};
+    (chunk_size, chunk_size_u8)
 }
 
 /// Implement `fill_bytes` by reading chunks from the output buffer of a block
@@ -115,7 +131,7 @@ macro_rules! fill_via_chunks {
 /// }
 /// ```
 pub fn fill_via_u32_chunks(src: &[u32], dest: &mut [u8]) -> (usize, usize) {
-    fill_via_chunks!(src, dest, u32)
+    fill_via_chunks(src, dest)
 }
 
 /// Implement `fill_bytes` by reading chunks from the output buffer of a block
@@ -129,7 +145,7 @@ pub fn fill_via_u32_chunks(src: &[u32], dest: &mut [u8]) -> (usize, usize) {
 ///
 /// See `fill_via_u32_chunks` for an example.
 pub fn fill_via_u64_chunks(src: &[u64], dest: &mut [u8]) -> (usize, usize) {
-    fill_via_chunks!(src, dest, u64)
+    fill_via_chunks(src, dest)
 }
 
 /// Implement `next_u32` via `fill_bytes`, little-endian order.
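The portable branch introduced in the two patches above relies on `chunks_exact_mut` plus `into_remainder` instead of raw pointer copies. The sketch below is not part of the patch series; it is a standalone illustration of the same technique, hard-coded to `u32` for brevity, and `fill_le_bytes_portable` is a hypothetical name:

```rust
// Serialise `u32` words to little-endian bytes: whole words via
// `chunks_exact_mut`, then a partial trailing chunk via `into_remainder`.
fn fill_le_bytes_portable(src: &[u32], dest: &mut [u8]) -> usize {
    let byte_len = core::cmp::min(src.len() * 4, dest.len());
    let mut i = 0;
    let mut iter = dest[..byte_len].chunks_exact_mut(4);
    while let Some(chunk) = iter.next() {
        // Full 4-byte chunk: copy one whole word.
        chunk.copy_from_slice(&src[i].to_le_bytes());
        i += 1;
    }
    let rem = iter.into_remainder();
    if !rem.is_empty() {
        // Partial chunk: copy only the leading bytes of the next word.
        rem.copy_from_slice(&src[i].to_le_bytes()[..rem.len()]);
    }
    byte_len
}

fn main() {
    let src = [0x0403_0201u32, 0x0807_0605];
    let mut dest = [0u8; 6]; // deliberately not a multiple of 4
    let n = fill_le_bytes_portable(&src, &mut dest);
    assert_eq!(n, 6);
    assert_eq!(dest, [1, 2, 3, 4, 5, 6]);
}
```

A destination whose length is not a multiple of four exercises the `into_remainder` path, which copies only the leading bytes of the final word; a `while let` loop is used because `into_remainder` consumes the iterator.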
From 93ade1a009b66476491f0521a56e5dd97a4791bc Mon Sep 17 00:00:00 2001
From: Diggory Hardy
Date: Tue, 14 Sep 2021 08:11:53 +0100
Subject: [PATCH 3/5] fill_via_chunks: better value names

---
 rand_core/src/impls.rs | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/rand_core/src/impls.rs b/rand_core/src/impls.rs
index bd21b7c731e..7c09b5488ac 100644
--- a/rand_core/src/impls.rs
+++ b/rand_core/src/impls.rs
@@ -71,8 +71,8 @@ impl ToLe for u64 {
 
 fn fill_via_chunks<T: ToLe>(src: &[T], dest: &mut [u8]) -> (usize, usize) {
     let size = core::mem::size_of::<T>();
-    let chunk_size_u8 = min(src.len() * size, dest.len());
-    let chunk_size = (chunk_size_u8 + size - 1) / size;
+    let byte_len = min(src.len() * size, dest.len());
+    let num_chunks = (byte_len + size - 1) / size;
 
     if cfg!(target_endian = "little") {
         // On LE we can do a simple copy, which is 25-50% faster:
@@ -80,13 +80,13 @@ fn fill_via_chunks<T: ToLe>(src: &[T], dest: &mut [u8]) -> (usize, usize) {
             core::ptr::copy_nonoverlapping(
                 src.as_ptr() as *const u8,
                 dest.as_mut_ptr(),
-                chunk_size_u8,
+                byte_len,
            );
        }
    } else {
        // This code is valid on all arches, but slower than the above:
        let mut i = 0;
-        let mut iter = dest[..chunk_size_u8].chunks_exact_mut(size);
+        let mut iter = dest[..byte_len].chunks_exact_mut(size);
        while let Some(chunk) = iter.next() {
            chunk.copy_from_slice(src[i].to_le_bytes().as_ref());
            i += 1;
@@ -97,7 +97,7 @@ fn fill_via_chunks<T: ToLe>(src: &[T], dest: &mut [u8]) -> (usize, usize) {
        }
    }
 
-    (chunk_size, chunk_size_u8)
+    (num_chunks, byte_len)
 }
 
 /// Implement `fill_bytes` by reading chunks from the output buffer of a block

From 19b7a76481c23497aff33fe7024cac4564d5196e Mon Sep 17 00:00:00 2001
From: Diggory Hardy
Date: Tue, 14 Sep 2021 08:17:26 +0100
Subject: [PATCH 4/5] fill_via_chunks: make ToLe an unsafe trait

---
 rand_core/src/impls.rs | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/rand_core/src/impls.rs b/rand_core/src/impls.rs
index 7c09b5488ac..1105094221e 100644
--- a/rand_core/src/impls.rs
+++ b/rand_core/src/impls.rs
@@ -52,17 +52,19 @@ pub fn fill_bytes_via_next<R: RngCore + ?Sized>(rng: &mut R, dest: &mut [u8]) {
     }
 }
 
-trait ToLe: Copy {
+/// Contract: implementing type must be memory-safe to observe as a byte array
+/// (implies no uninitialised padding).
+unsafe trait ToLe: Copy {
     type Bytes: AsRef<[u8]>;
     fn to_le_bytes(self) -> Self::Bytes;
 }
-impl ToLe for u32 {
+unsafe impl ToLe for u32 {
     type Bytes = [u8; 4];
     fn to_le_bytes(self) -> Self::Bytes {
         self.to_le_bytes()
     }
 }
-impl ToLe for u64 {
+unsafe impl ToLe for u64 {
     type Bytes = [u8; 8];
     fn to_le_bytes(self) -> Self::Bytes {
         self.to_le_bytes()

From 34a8f13d863a79be9d41f2ef63dc3b85b156c9dc Mon Sep 17 00:00:00 2001
From: Diggory Hardy
Date: Wed, 15 Sep 2021 09:27:11 +0100
Subject: [PATCH 5/5] Replace ToLe with Observable, including as_byte_slice method

---
 rand_core/src/impls.rs | 31 ++++++++++++++++++-------------
 1 file changed, 18 insertions(+), 13 deletions(-)

diff --git a/rand_core/src/impls.rs b/rand_core/src/impls.rs
index 1105094221e..12838dd138d 100644
--- a/rand_core/src/impls.rs
+++ b/rand_core/src/impls.rs
@@ -52,39 +52,44 @@ pub fn fill_bytes_via_next<R: RngCore + ?Sized>(rng: &mut R, dest: &mut [u8]) {
     }
 }
 
-/// Contract: implementing type must be memory-safe to observe as a byte array
-/// (implies no uninitialised padding).
-unsafe trait ToLe: Copy {
+trait Observable: Copy {
     type Bytes: AsRef<[u8]>;
     fn to_le_bytes(self) -> Self::Bytes;
+
+    // Contract: observing self is memory-safe (implies no uninitialised padding)
+    fn as_byte_slice(x: &[Self]) -> &[u8];
 }
-unsafe impl ToLe for u32 {
+impl Observable for u32 {
     type Bytes = [u8; 4];
     fn to_le_bytes(self) -> Self::Bytes {
         self.to_le_bytes()
     }
+    fn as_byte_slice(x: &[Self]) -> &[u8] {
+        let ptr = x.as_ptr() as *const u8;
+        let len = x.len() * core::mem::size_of::<Self>();
+        unsafe { core::slice::from_raw_parts(ptr, len) }
+    }
 }
-unsafe impl ToLe for u64 {
+impl Observable for u64 {
     type Bytes = [u8; 8];
     fn to_le_bytes(self) -> Self::Bytes {
         self.to_le_bytes()
     }
+    fn as_byte_slice(x: &[Self]) -> &[u8] {
+        let ptr = x.as_ptr() as *const u8;
+        let len = x.len() * core::mem::size_of::<Self>();
+        unsafe { core::slice::from_raw_parts(ptr, len) }
+    }
 }
 
-fn fill_via_chunks<T: ToLe>(src: &[T], dest: &mut [u8]) -> (usize, usize) {
+fn fill_via_chunks<T: Observable>(src: &[T], dest: &mut [u8]) -> (usize, usize) {
     let size = core::mem::size_of::<T>();
     let byte_len = min(src.len() * size, dest.len());
     let num_chunks = (byte_len + size - 1) / size;
 
     if cfg!(target_endian = "little") {
         // On LE we can do a simple copy, which is 25-50% faster:
-        unsafe {
-            core::ptr::copy_nonoverlapping(
-                src.as_ptr() as *const u8,
-                dest.as_mut_ptr(),
-                byte_len,
-            );
-        }
+        dest[..byte_len].copy_from_slice(&T::as_byte_slice(&src[..num_chunks])[..byte_len]);
     } else {
         // This code is valid on all arches, but slower than the above:
         let mut i = 0;
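The safe little-endian fast path in the final patch depends on viewing a `&[u32]` or `&[u64]` as raw bytes, which is what `as_byte_slice` provides. The following is a minimal standalone sketch of that idea, outside the patch series, using a hypothetical `word_bytes` helper rather than the `Observable` trait itself:

```rust
// View a `&[u32]` as `&[u8]` so it can be copied in one step on LE targets.
fn word_bytes(x: &[u32]) -> &[u8] {
    let ptr = x.as_ptr() as *const u8;
    let len = x.len() * core::mem::size_of::<u32>();
    // SAFETY: u32 has no padding and every byte of a u32 is initialised,
    // so observing the words as bytes is sound; ptr/len stay in bounds.
    unsafe { core::slice::from_raw_parts(ptr, len) }
}

fn main() {
    let src = [0x0403_0201u32, 0x0807_0605];
    let mut dest = [0u8; 8];
    // On a little-endian target this single copy is equivalent to writing
    // each word's to_le_bytes() in sequence.
    dest.copy_from_slice(&word_bytes(&src)[..dest.len()]);
    if cfg!(target_endian = "little") {
        assert_eq!(dest, [1, 2, 3, 4, 5, 6, 7, 8]);
    }
}
```

On a big-endian target the byte order of such a view would differ from `to_le_bytes`, which is why the patch keeps the per-word fallback branch for other architectures.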